brilliant_web_scraper 0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +4 -0
  3. data/README.md +31 -0
  4. data/brilliant_web_scraper-1.0.0.gem +0 -0
  5. data/brilliant_web_scraper-1.0.gem +0 -0
  6. data/brilliant_web_scraper.gemspec +30 -0
  7. data/lib/brilliant_web_scraper.rb +55 -0
  8. data/lib/parsers/description_helper.rb +28 -0
  9. data/lib/parsers/emails.rb +30 -0
  10. data/lib/parsers/facebook_profile.rb +11 -0
  11. data/lib/parsers/instagram_profile.rb +11 -0
  12. data/lib/parsers/linkedin_profile.rb +11 -0
  13. data/lib/parsers/meta_description.rb +13 -0
  14. data/lib/parsers/org_description.rb +13 -0
  15. data/lib/parsers/phone_numbers.rb +34 -0
  16. data/lib/parsers/pinterest_profile.rb +11 -0
  17. data/lib/parsers/redirected_to.rb +29 -0
  18. data/lib/parsers/title.rb +13 -0
  19. data/lib/parsers/twitter_description.rb +13 -0
  20. data/lib/parsers/twitter_profile.rb +11 -0
  21. data/lib/parsers/unescape_html_helper.rb +17 -0
  22. data/lib/parsers/vimeo_profile.rb +11 -0
  23. data/lib/parsers/youtube_channel.rb +29 -0
  24. data/lib/scraper/errors.rb +19 -0
  25. data/lib/scraper/scrape_exceptions.rb +49 -0
  26. data/lib/scraper/scrape_helper.rb +59 -0
  27. data/lib/scraper/scrape_request.rb +29 -0
  28. data/lib/version.rb +6 -0
  29. data/spec/lib/parsers/description_helper_spec.rb +24 -0
  30. data/spec/lib/parsers/emails_spec.rb +60 -0
  31. data/spec/lib/parsers/facebook_profile_spec.rb +77 -0
  32. data/spec/lib/parsers/instagram_profile_spec.rb +45 -0
  33. data/spec/lib/parsers/linkedin_profile_spec.rb +43 -0
  34. data/spec/lib/parsers/meta_description_spec.rb +321 -0
  35. data/spec/lib/parsers/org_description_spec.rb +316 -0
  36. data/spec/lib/parsers/phone_numbers_spec.rb +69 -0
  37. data/spec/lib/parsers/pinterest_profile_spec.rb +44 -0
  38. data/spec/lib/parsers/redirected_to_spec.rb +207 -0
  39. data/spec/lib/parsers/title_spec.rb +87 -0
  40. data/spec/lib/parsers/twitter_description_spec.rb +314 -0
  41. data/spec/lib/parsers/twitter_profile_spec.rb +59 -0
  42. data/spec/lib/parsers/unescape_html_helper_spec.rb +0 -0
  43. data/spec/lib/parsers/vimeo_profile_spec.rb +43 -0
  44. data/spec/lib/parsers/youtube_profile_spec.rb +82 -0
  45. data/spec/lib/scraper/brilliant_web_scrape_test.rb +66 -0
  46. data/spec/lib/scraper/scrape_request_test.rb +34 -0
  47. data/spec/spec_helper.rb +111 -0
  48. data/spec/vcr/encoding_compatibility_error.yml +316 -0
  49. data/spec/vcr/invalid_byte_sequence_utf_8.yml +2383 -0
  50. data/spec/vcr/no_valid_data_to_scrape.yml +109 -0
  51. data/spec/vcr/non_html_scrape.yml +163 -0
  52. data/spec/vcr/valid_scrape_response.yml +696 -0
  53. metadata +250 -0
@@ -0,0 +1,316 @@
1
+ require 'spec_helper'
2
+
3
# Examples for grep_org_description (mixed in from OrgDescription).
#
# Each example feeds an HTML fragment to the parser and checks either that it
# yields nil (no usable og:description) or the entity-decoded description text.
# The two nested groups differ only in attribute order inside the <meta> tag:
# key attribute before `content`, or `content` before the key attribute.
describe 'Organisation Description' do
  # NOTE(review): DummyTestClass is a top-level constant and the other parser
  # specs reopen the same name, so included modules accumulate on one class
  # when the suite runs together. Left unchanged here because it is unclear
  # whether the parsers rely on helpers mixed in that way -- confirm before
  # switching to an anonymous class.
  class DummyTestClass
    include OrgDescription
  end

  let(:dummy_object) { DummyTestClass.new }

  # Description exactly as written in the markup (entities still encoded).
  let(:hired_markup) do
    'With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront.'
  end
  # The same description after entity decoding, i.e. what the parser must return.
  let(:hired_description) do
    'With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.'
  end

  # Contains a raw apostrophe and a raw ampersand, so markup and expected text are identical.
  let(:wilentz_description) do
    "Wilentz Goldman & Spitzer is one of New Jersey's largest law firms. Our lawyers proudly serve our clients in nearly every aspect of law."
  end

  # Single-quoted attribute value holding double quotes and encoded apostrophes.
  let(:mac_markup) do
    'Whether you&#039;re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what&#039;s new, what&#039;s best and how to make the most out of the products you love.'
  end
  let(:mac_description) do
    'Whether you\'re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what\'s new, what\'s best and how to make the most out of the products you love.'
  end

  let(:pulse_title) do
    'Internet Service Providers Chennai | Internet Telephony Service Providers Chennai | Internet bandwidth Providers Chennai - Pulse Telesystems Pvt Ltd'
  end

  it 'should return nil for invalid inputs' do
    expect(dummy_object.grep_org_description('')).to be_nil
    expect(dummy_object.grep_org_description(nil)).to be_nil
  end

  describe 'Name key first organization description tag' do
    it 'should return nil for no organization description tag presence' do
      html = <<~HTML
        <head>
        <meta charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=edge">
        <meta name="viewport" content="width=device-width, initial-scale=1">
        <title>#{pulse_title}</title>
        <meta name="google-site-verification" content="h2NvZnvL9v536RUYH3jney-9V8JRBGESmzH5-ph0EM4">
        <link href="http://www.pulse.in/new/wp-content/themes/pulse/css/bootstrap.min.css" rel="stylesheet" type="text/css">
        </head>
      HTML
      expect(dummy_object.grep_org_description(html)).to be_nil
    end

    it 'should return nil when content part is empty' do
      html = <<~HTML
        <head>
        <meta http-equiv="X-UA-Compatible" content="IE=edge">
        <meta property="og:description" content="">
        <title>#{pulse_title}</title>
        <link href="http://www.pulse.in/new/wp-content/themes/pulse/css/bootstrap.min.css" rel="stylesheet" type="text/css">
        </head>
      HTML
      expect(dummy_object.grep_org_description(html)).to be_nil
    end

    it 'should return description from valid tag' do
      html = <<~HTML
        <head>
        <meta content="#{hired_markup}" property="og:description">
        <meta content="" property="uid">
        <meta property="og:description" content="#{hired_markup}">
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no" name="viewport">
        </head>
      HTML
      expect(dummy_object.grep_org_description(html)).to eq(hired_description)
    end

    it 'should return description even tag is multilined and partially encoded' do
      html = <<~HTML
        <head>
        <meta content="#{hired_markup}" property="og:description">
        <meta content="" property="uid">
        <meta property="og:description" content="#{hired_markup}">
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
        name="viewport">
        </head>
      HTML
      expect(dummy_object.grep_org_description(html)).to eq(hired_description)
    end

    it 'should parse meta tag even it is partially single quoted' do
      # The key attribute opens with a single quote and closes with a double quote.
      html = <<~HTML
        <head>
        <meta content="#{hired_markup}" property="og:description">
        <meta content="" property="uid">
        <meta property=\'og:description" content="#{hired_markup}">
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
        name="viewport">
        </head>
      HTML
      expect(dummy_object.grep_org_description(html)).to eq(hired_description)
    end

    it 'should parse meta tag even it is having other attributes defined' do
      html = <<~HTML
        <head>
        <meta content="#{hired_markup}" property="og:description">
        <meta content="" property="uid">
        <meta class="metadescription" property=\'og:description" content="#{hired_markup}" charset="UTF-8">
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
        name="viewport">
        </head>
      HTML
      expect(dummy_object.grep_org_description(html)).to eq(hired_description)
    end

    it 'should parse meta tag with itemprop as description key' do
      html = <<~HTML
        <head>
        <meta content="#{hired_markup}" property="og:description">
        <meta content="" property="uid">
        <meta class="metadescription" itemprop=\'og:description" content="#{hired_markup}" charset="UTF-8">
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
        name="viewport">
        </head>
      HTML
      expect(dummy_object.grep_org_description(html)).to eq(hired_description)
    end

    it 'should parse even name/itemprop key content is improperly assigned' do
      # The key attribute value is not quoted at all.
      html = <<~HTML
        <head>
        <meta content="#{hired_markup}" property="og:description">
        <meta content="" property="uid">
        <meta class="metadescription" property=og:description content="#{hired_markup}" charset="UTF-8" />
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
        name="viewport">
        </head>
      HTML
      expect(dummy_object.grep_org_description(html)).to eq(hired_description)
    end

    it 'should bring description having single quote' do
      html = <<~HTML
        <html lang="en">
        <head>
        <META charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
        <meta name="viewport" content="width=device-width, initial-scale=1" />
        <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
        <meta property="og:description" content="#{wilentz_description}" />
        </head>
        <html>
      HTML
      expect(dummy_object.grep_org_description(html)).to eq(wilentz_description)
    end

    it 'should bring description having double quote' do
      html = <<~HTML
        <html lang="en">
        <head>
        <META charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
        <meta name="viewport" content="width=device-width, initial-scale=1" />
        <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
        <meta property="og:description" content='#{mac_markup}' />
        </head>
        <html>
      HTML
      expect(dummy_object.grep_org_description(html)).to eq(mac_description)
    end

    it "should bring description even some other meta tag is empty" do
      html = <<~HTML
        <html lang="en">
        <head>
        <META charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
        <meta name="viewport" content="width=device-width, initial-scale=1" />
        <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
        <meta property="og:description" content="">
        <meta property="og:description" content='#{mac_markup}' />
        </head>
        <html>
      HTML
      expect(dummy_object.grep_org_description(html)).to eq(mac_description)
    end
  end

  describe 'Content key first organization description tag' do
    it 'should return nil when content part is empty' do
      html = <<~HTML
        <head>
        <meta http-equiv="X-UA-Compatible" content="IE=edge">
        <meta content="" property="og:description">
        <meta content='' property="og:description">
        <title>#{pulse_title}</title>
        <link href="http://www.pulse.in/new/wp-content/themes/pulse/css/bootstrap.min.css" rel="stylesheet" type="text/css">
        </head>
      HTML
      expect(dummy_object.grep_org_description(html)).to be_nil
    end

    it 'should return description from valid tag' do
      html = <<~HTML
        <head>
        <meta content="" property="uid">
        <meta content="#{hired_markup}" property="og:description">
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no" name="viewport">
        </head>
      HTML
      expect(dummy_object.grep_org_description(html)).to eq(hired_description)
    end

    it 'should return description even tag is multilined and partially encoded' do
      html = <<~HTML
        <head>
        <meta content="" property="uid">
        <meta content="#{hired_markup}" property="og:description" >
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
        name="viewport">
        </head>
      HTML
      expect(dummy_object.grep_org_description(html)).to eq(hired_description)
    end

    it 'should parse meta tag even it is partially single quoted' do
      html = <<~HTML
        <head>
        <meta content="" property="uid">
        <meta content="#{hired_markup}" property=\'og:description">
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
        name="viewport">
        </head>
      HTML
      expect(dummy_object.grep_org_description(html)).to eq(hired_description)
    end

    it 'should parse meta tag even it is having other attributes defined' do
      html = <<~HTML
        <head>
        <meta content="" property="uid">
        <meta class="metadescription" content="#{hired_markup}" charset="UTF-8" property=\'og:description">
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
        name="viewport">
        </head>
      HTML
      expect(dummy_object.grep_org_description(html)).to eq(hired_description)
    end

    it 'should parse meta tag with itemprop as description key' do
      html = <<~HTML
        <head>
        <meta content="" property="uid">
        <meta class="metadescription" content="#{hired_markup}" itemprop=\'og:description" charset="UTF-8">
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
        name="viewport">
        </head>
      HTML
      expect(dummy_object.grep_org_description(html)).to eq(hired_description)
    end

    it 'should parse even name/itemprop key content is improperly assigned' do
      html = <<~HTML
        <head>
        <meta content="" property="uid">
        <meta class="metadescription" content="#{hired_markup}" charset="UTF-8" property=og:description />
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
        name="viewport">
        </head>
      HTML
      expect(dummy_object.grep_org_description(html)).to eq(hired_description)
    end

    it 'should bring description having single quote' do
      html = <<~HTML
        <html lang="en">
        <head>
        <META charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
        <meta name="viewport" content="width=device-width, initial-scale=1" />
        <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
        <meta content="#{wilentz_description}" property="og:description" />
        </head>
        <html>
      HTML
      expect(dummy_object.grep_org_description(html)).to eq(wilentz_description)
    end

    it 'should bring description having double quote' do
      html = <<~HTML
        <html lang="en">
        <head>
        <META charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
        <meta name="viewport" content="width=device-width, initial-scale=1" />
        <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
        <meta content='#{mac_markup}' property="og:description" />
        </head>
        <html>
      HTML
      expect(dummy_object.grep_org_description(html)).to eq(mac_description)
    end

    it "should bring description even some other meta tag is empty" do
      html = <<~HTML
        <html lang="en">
        <head>
        <META charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
        <meta name="viewport" content="width=device-width, initial-scale=1" />
        <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
        <meta content="" property="og:description">
        <meta content='#{mac_markup}' property="og:description"/>
        </head>
        <html>
      HTML
      expect(dummy_object.grep_org_description(html)).to eq(mac_description)
    end
  end
end
@@ -0,0 +1,69 @@
1
+ require 'spec_helper'
2
+
3
# Examples for grep_phone_numbers (mixed in from PhoneNumbers).
#
# The fixtures are anchors whose href uses the `tel:` scheme in many shapes:
# template placeholders, entity- or percent-encoded values, unquoted
# attributes, and stray scheme prefixes.
describe 'PhoneNumbers' do
  class DummyTestClass
    include PhoneNumbers
  end

  let(:dummy_object) { DummyTestClass.new }

  it 'should return nil for invalid input' do
    [nil, ''].each do |input|
      expect(dummy_object.grep_phone_numbers(input)).to be_nil
    end
  end

  it 'should give []' do
    # Only template placeholders or no tel: href at all -- nothing to extract.
    markup = <<~HTML
      <a href="tel:{{ location1Phone }}">
      <a href="tel:{{pdModel.preferredDealer.phoneNumbers[0].CompleteNumber.$}}">
      <a href="tel:[value]" data-store-phone data-attr-replace="href"></a>
      <a class="a--reset main-nav--mobile__link--secondary ">Grab us on Live Chat</a>
    HTML
    expect(dummy_object.grep_phone_numbers(markup)).to eq([])
  end

  it 'should grep organization phoneNumbers' do
    markup = <<~HTML
      <a href="tel: &#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&#x9;01598 760 700&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;" class="button tel">Tel: 01598 760 700</a>
      <a href="tel: +1 (856) 393-2082">+1 (856) 393-2082</a>
      <a href="tel: 800%20843%203269" data-interaction-context data-interaction-type="phone" data-interaction-name="800 843 3269" class="cta" tabindex="0">
      <span class="cta-content">
      <span class="cta-text" tabindex="-1">
      Call Now 800 843 3269
      </span>
      </span>
      </a>
      <a href="tel://1-859-422-6000">859.422.6000</a>
      <a href="tel:&#43;18009634816" class="btn btn-tertiary" data-tag="call">Call</a>
      <a href="tel:%2B44-161-468-1234">0161 468 1234</a>
      <a class="navbar-left" href="tel:781-788-8180 Ext. 4"> <b>Call Sales</b> 781-788-8180 Ext. 4 </a>
      <a href="tel:8663033809">Call</a>
      <a href="tel:http://484-373-7700"><span class="icon fa fa-phone"></span>484-373-7700</a>
      <a href="tel:877.720.0411" data-organic="877.720.0411" data-metro="877.720.0411" data-display="877.720.0411" data-paid="877.720.0411" class="phone"><span class="number">877.720.0411</span></a>
      <a href="tel:1866INTRALINKS">1-866-INTRALINKS</a>
      <a href=tel:1888%20810%207464 itemprop=url>Call</a>
      <a href=tel:18664946627 style="color: inherit; display: inline;">1 (866) 4-WINMAR</a>
      <a href=tel:312-379-9329 class=phone>312-379-9329</a>
      <a href=tel:312-379-9329 class=phone>312-379-9329</a>
      <a\ndata-animsition-out=none href=tel:01722412512>Tel: 01722 412512</a>

    HTML

    # The trailing spaces on three entries below are part of the expected
    # values (they come from the unquoted href fixtures); the repeated
    # 312-379-9329 anchor is expected only once.
    expected = [
      "01598 760 700",
      "+1 (856) 393-2082",
      "800 843 3269",
      "1-859-422-6000",
      "+18009634816",
      "+44-161-468-1234",
      "781-788-8180 Ext. 4",
      "8663033809",
      "484-373-7700",
      "877.720.0411",
      "1866INTRALINKS",
      "1888 810 7464 ",
      "18664946627 ",
      "312-379-9329 ",
      "01722412512"
    ]
    actual = dummy_object.grep_phone_numbers(markup)
    expect(actual).to eq(expected)
  end
end
@@ -0,0 +1,44 @@
1
+ require 'spec_helper'
2
+
3
# Examples for grep_pinterest_profile (mixed in from PinterestProfile).
#
# Covers nil/empty input, Pinterest URLs that must be ignored (bare domain,
# feed, tracking, asset, pin-creation and policy links), and profile URLs
# that must be returned without query strings or trailing entities.
describe 'Pinterest Profile' do
  # NOTE(review): DummyTestClass is a top-level constant and the other parser
  # specs reopen the same name; kept as-is so the set of mixed-in modules
  # does not change -- confirm before isolating it.
  class DummyTestClass
    include PinterestProfile
  end

  let(:dummy_object) { DummyTestClass.new }

  it 'should return nil for invalid input' do
    expect(dummy_object.grep_pinterest_profile(nil)).to be_nil
    expect(dummy_object.grep_pinterest_profile('')).to be_nil
  end

  it 'should not grep below url format' do
    html = <<~HTML
      <a href="http://pinterest.com/#" style="color: white;" class="fa fa-pinterest"></a>
      <a href="http://pinterest.com/" style="color: white;" class="fa fa-pinterest"></a>
      <a href="https://www.pinterest.com/feed/" style="color: white;" class="fa fa-pinterest"></a>
      <a href="https://ct.pinterest.com/v3/?tid=2620913945757&amp;noscript=1" style="color: white;" class="fa fa-pinterest"></a>
      <a href="http://assets.pinterest.com/js/pinmarklet.js?r=" style="color: white;" class="fa fa-pinterest"></a>
      <a href="http://pinterest.com/pin/create/bookmarklet/" style="color: white;" class="fa fa-pinterest"></a>
      <a href="http://uk.pinterest.com/pin/create/bookmarklet/" style="color: white;" class="fa fa-pinterest"></a>
      <a href="https://ct.pinterest.com/?tid=8KRsk0UkbVS&value=0.00&quantity=1" style="color: white;" class="fa fa-pinterest"></a>
      <a href="https://policy.pinterest.com/cookies" style="color: white;" class="fa fa-pinterest"></a>
    HTML
    expect(dummy_object.grep_pinterest_profile(html)).to eq([])
  end

  it 'should grep organization pinterest profiles' do
    html = <<~HTML
      <a href="http://www.pinterest.com/blogher" target="_blank">pinterest</a>
      <a href="http://pinterest.com/orientaltrading?cm_sp=Footer-_-SocialLinks-_-Pinterest" target="_blank">pinterest</a>
      <a href="http://pinterest.com/poppin&quot;" target="_blank">pinterest</a>
    HTML
    expected_pinterest_profiles = [
      'http://www.pinterest.com/blogher',
      'http://pinterest.com/orientaltrading',
      'http://pinterest.com/poppin'
    ]
    # Parse once and assert on that result; previously the result was stored
    # in an unused local and the parser was invoked a second time.
    pinterest_profiles = dummy_object.grep_pinterest_profile(html)
    expect(pinterest_profiles).to eq(expected_pinterest_profiles)
  end
end
@@ -0,0 +1,207 @@
1
+ require 'spec_helper'
2
+
3
# Examples for grep_redirected_to_url (mixed in from RedirectedTo).
#
# The parser is expected to return the first non-empty URL found in either a
# <link rel="canonical" href=...> tag or an og:url meta tag, regardless of
# attribute order, quoting style or extra attributes, and nil otherwise.
describe 'Website Redirected To' do
  # NOTE(review): DummyTestClass is a top-level constant and the other parser
  # specs reopen the same name; kept as-is so the set of mixed-in modules
  # does not change -- confirm before isolating it.
  class DummyTestClass
    include RedirectedTo
  end

  let(:dummy_object) { DummyTestClass.new }

  it 'should return nil for invalid input' do
    expect(dummy_object.grep_redirected_to_url(nil)).to be_nil
    expect(dummy_object.grep_redirected_to_url('')).to be_nil
  end

  describe 'Website grep from link tag' do
    describe 'rel attribute first ' do
      it 'should return nil when canonical url is empty' do
        html = <<~HTML
          <link rel="canonical" href="">
          <link rel="canonical" href=''>
        HTML
        expect(dummy_object.grep_redirected_to_url(html)).to be_nil
      end

      it 'should grep website' do
        html = <<~HTML
          <link rel="canonical" href="">
          <link rel="canonical" href='https://www.apple.com/'>
        HTML
        expect(dummy_object.grep_redirected_to_url(html)).to eq('https://www.apple.com/')
      end

      it 'should grep website even with extra attributes' do
        html = <<~HTML
          <link rel="canonical" href="" itemprop="current_url">
          <link rel="canonical" href='https://www.apple.com/'
          itemprop="current_url" >
        HTML
        expect(dummy_object.grep_redirected_to_url(html)).to eq('https://www.apple.com/')
      end
    end

    describe 'href attribute first' do
      it 'should return nil when canonical url is empty' do
        html = <<~HTML
          <link href="" rel="canonical" >
          <link href='' rel="canonical" >
        HTML
        expect(dummy_object.grep_redirected_to_url(html)).to be_nil
      end

      it 'should grep website' do
        html = <<~HTML
          <link rel="canonical" href="">
          <link href='https://www.apple.com/' rel="canonical">
        HTML
        expect(dummy_object.grep_redirected_to_url(html)).to eq('https://www.apple.com/')
      end

      it 'should grep website even with extra attributes' do
        html = <<~HTML
          <link href="" itemprop="current_url" rel="canonical">
          <link href='https://www.apple.com/' rel="canonical"
          itemprop="current_url" >
        HTML
        expect(dummy_object.grep_redirected_to_url(html)).to eq('https://www.apple.com/')
      end
    end
  end

  describe 'Website grep from organization URL' do
    describe 'property attribute first ' do
      it 'should return nil when canonical url is empty' do
        html = <<~HTML
          <meta property="og:url" content="" />
          <meta property="og:url" content='' />
        HTML
        expect(dummy_object.grep_redirected_to_url(html)).to be_nil
      end

      it 'should grep website' do
        html = <<~HTML
          <link property="og:url" content="">
          <meta property="og:url" content="https://www.dieppe.ca/fr/index.aspx" />
        HTML
        expect(dummy_object.grep_redirected_to_url(html)).to eq('https://www.dieppe.ca/fr/index.aspx')
      end

      it 'should grep website even with extra attributes' do
        # NOTE(review): "calss" in the first tag is reproduced verbatim from
        # the fixture; unclear whether the misspelling is deliberate.
        html = <<~HTML
          <link property="og:url" content="" calss="og-url">
          <meta property="og:url" content='https://www.dieppe.ca/fr/index.aspx'
          class="og-url" />
        HTML
        expect(dummy_object.grep_redirected_to_url(html)).to eq('https://www.dieppe.ca/fr/index.aspx')
      end
    end

    describe 'content attribute first ' do
      it 'should return nil when canonical url is empty' do
        html = <<~HTML
          <meta content="" property="og:url" />
          <meta content='' property="og:url"/>
        HTML
        expect(dummy_object.grep_redirected_to_url(html)).to be_nil
      end

      it 'should grep website' do
        html = <<~HTML
          <link content="" property="og:url" >
          <meta content="https://www.dieppe.ca/fr/index.aspx" property="og:url" />
        HTML
        expect(dummy_object.grep_redirected_to_url(html)).to eq('https://www.dieppe.ca/fr/index.aspx')
      end

      it 'should grep website even with extra attributes' do
        # NOTE(review): "calss" is reproduced verbatim from the fixture.
        html = <<~HTML
          <link content="" calss="og-url" property="og:url">
          <meta content='https://www.dieppe.ca/fr/index.aspx'
          class="og-url" property="og:url" />
        HTML
        expect(dummy_object.grep_redirected_to_url(html)).to eq('https://www.dieppe.ca/fr/index.aspx')
      end
    end
  end

  describe 'grep website' do
    # Example name no longer starts with a second "it" (it read "it it should ..." in reports).
    it 'should return nil when link or og:url is absent' do
      html = <<~HTML
        <head>
        <meta charset="utf-8">
        <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
        <meta http-equiv="x-ua-compatible" content="ie=edge">
        <title>Techmologic | index</title>
        <!-- Font Awesome -->
        <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.6.0/css/font-awesome.min.css">
        <!-- Bootstrap core CSS -->
        <link href="css/bootstrap.min.css" rel="stylesheet">
        <!-- Material Design Bootstrap -->
        <link href="css/mdb.min.css" rel="stylesheet">
        <!-- Your custom styles (optional) -->
        <link href="css/style.css" rel="stylesheet">
        </head>
      HTML
      expect(dummy_object.grep_redirected_to_url(html)).to be_nil
    end

    it 'should grep one of canonical or og:url' do
      html = <<~HTML
        <head>
        <meta charset="utf-8">
        <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
        <meta http-equiv="x-ua-compatible" content="ie=edge">
        <title>Techmologic | index</title>
        <link rel="canonical" href="">
        <meta property="og:url" content="" />
        <!-- Font Awesome -->
        <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.6.0/css/font-awesome.min.css">
        <!-- Bootstrap core CSS -->
        <link href="css/bootstrap.min.css" rel="stylesheet">
        <!-- Material Design Bootstrap -->
        <link href="css/mdb.min.css" rel="stylesheet">
        <!-- Your custom styles (optional) -->
        <link href="css/style.css" rel="stylesheet">
        <link rel="canonical" href="http://techmologics.com/">
        <meta property="og:url" content="http://techmologics.com/" />
        </head>
      HTML
      expect(dummy_object.grep_redirected_to_url(html)).to eq('http://techmologics.com/')
    end

    it 'should grep one of canonical or og:url whatever its position' do
      html = <<~HTML
        <head>
        <meta charset="utf-8">
        <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
        <meta http-equiv="x-ua-compatible" content="ie=edge">
        <title>Techmologic | index</title>
        <link href="" rel="canonical">
        <meta content="" property="og:url"/>
        <!-- Font Awesome -->
        <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.6.0/css/font-awesome.min.css">
        <!-- Bootstrap core CSS -->
        <link href="css/bootstrap.min.css" rel="stylesheet">
        <!-- Material Design Bootstrap -->
        <link href="css/mdb.min.css" rel="stylesheet">
        <!-- Your custom styles (optional) -->
        <link href="css/style.css" rel="stylesheet">
        <link href="http://techmologics.com/" rel="canonical" class="canonical">
        <meta content="http://techmologics.com/" property="og:url"/>
        </head>
      HTML
      expect(dummy_object.grep_redirected_to_url(html)).to eq('http://techmologics.com/')
    end
  end
end