brilliant_web_scraper 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. checksums.yaml +7 -0
  2. data/Gemfile +4 -0
  3. data/README.md +31 -0
  4. data/brilliant_web_scraper-1.0.0.gem +0 -0
  5. data/brilliant_web_scraper-1.0.gem +0 -0
  6. data/brilliant_web_scraper.gemspec +30 -0
  7. data/lib/brilliant_web_scraper.rb +55 -0
  8. data/lib/parsers/description_helper.rb +28 -0
  9. data/lib/parsers/emails.rb +30 -0
  10. data/lib/parsers/facebook_profile.rb +11 -0
  11. data/lib/parsers/instagram_profile.rb +11 -0
  12. data/lib/parsers/linkedin_profile.rb +11 -0
  13. data/lib/parsers/meta_description.rb +13 -0
  14. data/lib/parsers/org_description.rb +13 -0
  15. data/lib/parsers/phone_numbers.rb +34 -0
  16. data/lib/parsers/pinterest_profile.rb +11 -0
  17. data/lib/parsers/redirected_to.rb +29 -0
  18. data/lib/parsers/title.rb +13 -0
  19. data/lib/parsers/twitter_description.rb +13 -0
  20. data/lib/parsers/twitter_profile.rb +11 -0
  21. data/lib/parsers/unescape_html_helper.rb +17 -0
  22. data/lib/parsers/vimeo_profile.rb +11 -0
  23. data/lib/parsers/youtube_channel.rb +29 -0
  24. data/lib/scraper/errors.rb +19 -0
  25. data/lib/scraper/scrape_exceptions.rb +49 -0
  26. data/lib/scraper/scrape_helper.rb +59 -0
  27. data/lib/scraper/scrape_request.rb +29 -0
  28. data/lib/version.rb +6 -0
  29. data/spec/lib/parsers/description_helper_spec.rb +24 -0
  30. data/spec/lib/parsers/emails_spec.rb +60 -0
  31. data/spec/lib/parsers/facebook_profile_spec.rb +77 -0
  32. data/spec/lib/parsers/instagram_profile_spec.rb +45 -0
  33. data/spec/lib/parsers/linkedin_profile_spec.rb +43 -0
  34. data/spec/lib/parsers/meta_description_spec.rb +321 -0
  35. data/spec/lib/parsers/org_description_spec.rb +316 -0
  36. data/spec/lib/parsers/phone_numbers_spec.rb +69 -0
  37. data/spec/lib/parsers/pinterest_profile_spec.rb +44 -0
  38. data/spec/lib/parsers/redirected_to_spec.rb +207 -0
  39. data/spec/lib/parsers/title_spec.rb +87 -0
  40. data/spec/lib/parsers/twitter_description_spec.rb +314 -0
  41. data/spec/lib/parsers/twitter_profile_spec.rb +59 -0
  42. data/spec/lib/parsers/unescape_html_helper_spec.rb +0 -0
  43. data/spec/lib/parsers/vimeo_profile_spec.rb +43 -0
  44. data/spec/lib/parsers/youtube_profile_spec.rb +82 -0
  45. data/spec/lib/scraper/brilliant_web_scrape_test.rb +66 -0
  46. data/spec/lib/scraper/scrape_request_test.rb +34 -0
  47. data/spec/spec_helper.rb +111 -0
  48. data/spec/vcr/encoding_compatibility_error.yml +316 -0
  49. data/spec/vcr/invalid_byte_sequence_utf_8.yml +2383 -0
  50. data/spec/vcr/no_valid_data_to_scrape.yml +109 -0
  51. data/spec/vcr/non_html_scrape.yml +163 -0
  52. data/spec/vcr/valid_scrape_response.yml +696 -0
  53. metadata +250 -0
@@ -0,0 +1,316 @@
1
+ require 'spec_helper'
2
+
3
+ describe 'Orgnisation Description' do
4
+
5
+ class DummyTestClass
6
+ include OrgDescription
7
+ end
8
+ let(:dummy_object) { DummyTestClass.new }
9
+
10
+ it 'should return nil for invalid inputs' do
11
+ expect(dummy_object.grep_org_description('')).to be_nil
12
+ expect(dummy_object.grep_org_description(nil)).to be_nil
13
+ end
14
+ describe 'Name key first organization description tag' do
15
+ it 'should return nil for no organization description tag presence' do
16
+ no_org_description = <<~HTML
17
+ <head>
18
+ <meta charset="utf-8">
19
+ <meta http-equiv="X-UA-Compatible" content="IE=edge">
20
+ <meta name="viewport" content="width=device-width, initial-scale=1">
21
+ <title>Internet Service Providers Chennai | Internet Telephony Service Providers Chennai | Internet bandwidth Providers Chennai - Pulse Telesystems Pvt Ltd</title>
22
+ <meta name="google-site-verification" content="h2NvZnvL9v536RUYH3jney-9V8JRBGESmzH5-ph0EM4">
23
+ <link href="http://www.pulse.in/new/wp-content/themes/pulse/css/bootstrap.min.css" rel="stylesheet" type="text/css">
24
+ </head>
25
+ HTML
26
+ meta_description = dummy_object.grep_org_description(no_org_description.to_s)
27
+ expect(meta_description).to be_nil
28
+ end
29
+
30
+ it 'should return nil when content part is empty' do
31
+ no_meta_description = <<~HTML
32
+ <head>
33
+ <meta http-equiv="X-UA-Compatible" content="IE=edge">
34
+ <meta property="og:description" content="">
35
+ <title>Internet Service Providers Chennai | Internet Telephony Service Providers Chennai | Internet bandwidth Providers Chennai - Pulse Telesystems Pvt Ltd</title>
36
+ <link href="http://www.pulse.in/new/wp-content/themes/pulse/css/bootstrap.min.css" rel="stylesheet" type="text/css">
37
+ </head>
38
+ HTML
39
+ meta_description = dummy_object.grep_org_description(no_meta_description.to_s)
40
+ expect(meta_description).to be_nil
41
+ end
42
+
43
+ it 'should return description from valid tag' do
44
+ html = <<~HTML
45
+ <head>
46
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property="og:description">
47
+ <meta content="" property="uid">
48
+ <meta property="og:description" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront.">
49
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no" name="viewport">
50
+ </head>
51
+ HTML
52
+ meta_description = dummy_object.grep_org_description(html.to_s)
53
+ expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
54
+ end
55
+
56
+ it 'should return description even tag is multilined and partially encoded' do
57
+ html = <<~HTML
58
+ <head>
59
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property="og:description">
60
+ <meta content="" property="uid">
61
+ <meta property="og:description" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront.">
62
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
63
+ name="viewport">
64
+ </head>
65
+ HTML
66
+ meta_description = dummy_object.grep_org_description(html.to_s)
67
+ expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
68
+ end
69
+
70
+ it 'should parse meta tag even it is partially single quoted' do
71
+ html = <<~HTML
72
+ <head>
73
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property="og:description">
74
+ <meta content="" property="uid">
75
+ <meta property=\'og:description" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront.">
76
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
77
+ name="viewport">
78
+ </head>
79
+ HTML
80
+ meta_description = dummy_object.grep_org_description(html.to_s)
81
+ expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
82
+ end
83
+
84
+ it 'should parse meta tag even it is having other attributes defined' do
85
+ html = <<~HTML
86
+ <head>
87
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property="og:description">
88
+ <meta content="" property="uid">
89
+ <meta class="metadescription" property=\'og:description" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." charset="UTF-8">
90
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
91
+ name="viewport">
92
+ </head>
93
+ HTML
94
+ meta_description = dummy_object.grep_org_description(html.to_s)
95
+ expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
96
+ end
97
+
98
+ it 'should parse meta tag with itemprop as description key' do
99
+ html = <<~HTML
100
+ <head>
101
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property="og:description">
102
+ <meta content="" property="uid">
103
+ <meta class="metadescription" itemprop=\'og:description" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." charset="UTF-8">
104
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
105
+ name="viewport">
106
+ </head>
107
+ HTML
108
+ meta_description = dummy_object.grep_org_description(html.to_s)
109
+ expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
110
+ end
111
+
112
+ it 'should parse even name/itemprop key content is improperly assigned' do
113
+ html = <<~HTML
114
+ <head>
115
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property="og:description">
116
+ <meta content="" property="uid">
117
+ <meta class="metadescription" property=og:description content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." charset="UTF-8" />
118
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
119
+ name="viewport">
120
+ </head>
121
+ HTML
122
+ meta_description = dummy_object.grep_org_description(html.to_s)
123
+ expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
124
+ end
125
+
126
+ it 'should bring description having single quote' do
127
+ html = <<~HTML
128
+ <html lang="en">
129
+ <head>
130
+ <META charset="utf-8">
131
+ <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
132
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
133
+ <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
134
+ <meta property="og:description" content="Wilentz Goldman & Spitzer is one of New Jersey's largest law firms. Our lawyers proudly serve our clients in nearly every aspect of law." />
135
+ </head>
136
+ <html>
137
+ HTML
138
+ meta_description = dummy_object.grep_org_description(html.to_s)
139
+ expect(meta_description).to eq('Wilentz Goldman & Spitzer is one of New Jersey\'s largest law firms. Our lawyers proudly serve our clients in nearly every aspect of law.')
140
+ end
141
+
142
+ it 'should bring description having double quote' do
143
+ html = <<~HTML
144
+ <html lang="en">
145
+ <head>
146
+ <META charset="utf-8">
147
+ <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
148
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
149
+ <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
150
+ <meta property="og:description" content='Whether you&#039;re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what&#039;s new, what&#039;s best and how to make the most out of the products you love.' />
151
+ </head>
152
+ <html>
153
+ HTML
154
+ meta_description = dummy_object.grep_org_description(html.to_s)
155
+ expect(meta_description).to eq('Whether you\'re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what\'s new, what\'s best and how to make the most out of the products you love.')
156
+ end
157
+
158
+ it "should bring description even some other meta tag is empty" do
159
+ html = <<~HTML
160
+ <html lang="en">
161
+ <head>
162
+ <META charset="utf-8">
163
+ <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
164
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
165
+ <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
166
+ <meta property="og:description" content="">
167
+ <meta property="og:description" content='Whether you&#039;re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what&#039;s new, what&#039;s best and how to make the most out of the products you love.' />
168
+ </head>
169
+ <html>
170
+ HTML
171
+ meta_description = dummy_object.grep_org_description(html.to_s)
172
+ expect(meta_description).to eq('Whether you\'re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what\'s new, what\'s best and how to make the most out of the products you love.')
173
+ end
174
+ end
175
+ describe 'Content key first organization description tag' do
176
+ it 'should return nil when content part is empty' do
177
+ no_meta_description = <<~HTML
178
+ <head>
179
+ <meta http-equiv="X-UA-Compatible" content="IE=edge">
180
+ <meta content="" property="og:description">
181
+ <meta content='' property="og:description">
182
+ <title>Internet Service Providers Chennai | Internet Telephony Service Providers Chennai | Internet bandwidth Providers Chennai - Pulse Telesystems Pvt Ltd</title>
183
+ <link href="http://www.pulse.in/new/wp-content/themes/pulse/css/bootstrap.min.css" rel="stylesheet" type="text/css">
184
+ </head>
185
+ HTML
186
+ meta_description = dummy_object.grep_org_description(no_meta_description.to_s)
187
+ expect(meta_description).to be_nil
188
+ end
189
+
190
+ it 'should return description from valid tag' do
191
+ html = <<~HTML
192
+ <head>
193
+ <meta content="" property="uid">
194
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property="og:description">
195
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no" name="viewport">
196
+ </head>
197
+ HTML
198
+ meta_description = dummy_object.grep_org_description(html.to_s)
199
+ expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
200
+ end
201
+
202
+ it 'should return description even tag is multilined and partially encoded' do
203
+ html = <<~HTML
204
+ <head>
205
+ <meta content="" property="uid">
206
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property="og:description" >
207
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
208
+ name="viewport">
209
+ </head>
210
+ HTML
211
+ meta_description = dummy_object.grep_org_description(html.to_s)
212
+ expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
213
+ end
214
+
215
+ it 'should parse meta tag even it is partially single quoted' do
216
+ html = <<~HTML
217
+ <head>
218
+ <meta content="" property="uid">
219
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property=\'og:description">
220
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
221
+ name="viewport">
222
+ </head>
223
+ HTML
224
+ meta_description = dummy_object.grep_org_description(html.to_s)
225
+ expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
226
+ end
227
+
228
+ it 'should parse meta tag even it is having other attributes defined' do
229
+ html = <<~HTML
230
+ <head>
231
+ <meta content="" property="uid">
232
+ <meta class="metadescription" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." charset="UTF-8" property=\'og:description">
233
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
234
+ name="viewport">
235
+ </head>
236
+ HTML
237
+ meta_description = dummy_object.grep_org_description(html.to_s)
238
+ expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
239
+ end
240
+
241
+ it 'should parse meta tag with itemprop as description key' do
242
+ html = <<~HTML
243
+ <head>
244
+ <meta content="" property="uid">
245
+ <meta class="metadescription" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." itemprop=\'og:description" charset="UTF-8">
246
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
247
+ name="viewport">
248
+ </head>
249
+ HTML
250
+ meta_description = dummy_object.grep_org_description(html.to_s)
251
+ expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
252
+ end
253
+
254
+ it 'should parse even name/itemprop key content is improperly assigned' do
255
+ html = <<~HTML
256
+ <head>
257
+ <meta content="" property="uid">
258
+ <meta class="metadescription" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." charset="UTF-8" property=og:description />
259
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
260
+ name="viewport">
261
+ </head>
262
+ HTML
263
+ meta_description = dummy_object.grep_org_description(html.to_s)
264
+ expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
265
+ end
266
+
267
+ it 'should bring description having single quote' do
268
+ html = <<~HTML
269
+ <html lang="en">
270
+ <head>
271
+ <META charset="utf-8">
272
+ <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
273
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
274
+ <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
275
+ <meta content="Wilentz Goldman & Spitzer is one of New Jersey's largest law firms. Our lawyers proudly serve our clients in nearly every aspect of law." property="og:description" />
276
+ </head>
277
+ <html>
278
+ HTML
279
+ meta_description = dummy_object.grep_org_description(html.to_s)
280
+ expect(meta_description).to eq('Wilentz Goldman & Spitzer is one of New Jersey\'s largest law firms. Our lawyers proudly serve our clients in nearly every aspect of law.')
281
+ end
282
+
283
+ it 'should bring description having double quote' do
284
+ html = <<~HTML
285
+ <html lang="en">
286
+ <head>
287
+ <META charset="utf-8">
288
+ <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
289
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
290
+ <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
291
+ <meta content='Whether you&#039;re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what&#039;s new, what&#039;s best and how to make the most out of the products you love.' property="og:description" />
292
+ </head>
293
+ <html>
294
+ HTML
295
+ meta_description = dummy_object.grep_org_description(html.to_s)
296
+ expect(meta_description).to eq('Whether you\'re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what\'s new, what\'s best and how to make the most out of the products you love.')
297
+ end
298
+
299
+ it "should bring description even some other meta tag is empty" do
300
+ html = <<~HTML
301
+ <html lang="en">
302
+ <head>
303
+ <META charset="utf-8">
304
+ <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
305
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
306
+ <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
307
+ <meta content="" property="og:description">
308
+ <meta content='Whether you&#039;re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what&#039;s new, what&#039;s best and how to make the most out of the products you love.' property="og:description"/>
309
+ </head>
310
+ <html>
311
+ HTML
312
+ meta_description = dummy_object.grep_org_description(html.to_s)
313
+ expect(meta_description).to eq('Whether you\'re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what\'s new, what\'s best and how to make the most out of the products you love.')
314
+ end
315
+ end
316
+ end
@@ -0,0 +1,69 @@
1
+ require 'spec_helper'
2
+
3
+ describe 'PhoneNumbers' do
4
+
5
+ class DummyTestClass
6
+ include PhoneNumbers
7
+ end
8
+ let(:dummy_object) { DummyTestClass.new }
9
+
10
+ it 'should return nil for invalid input' do
11
+ expect(dummy_object.grep_phone_numbers(nil)).to be_nil
12
+ expect(dummy_object.grep_phone_numbers('')).to be_nil
13
+ end
14
+
15
+ it 'should give []' do
16
+ html = <<~HTML
17
+ <a href="tel:{{ location1Phone }}">
18
+ <a href="tel:{{pdModel.preferredDealer.phoneNumbers[0].CompleteNumber.$}}">
19
+ <a href="tel:[value]" data-store-phone data-attr-replace="href"></a>
20
+ <a class="a--reset main-nav--mobile__link--secondary ">Grab us on Live Chat</a>
21
+ HTML
22
+ expect(dummy_object.grep_phone_numbers(html.to_s)).to eq([])
23
+ end
24
+ it 'should grep organization phoneNumbers' do # fixture mixes quoted, unquoted, percent-encoded and entity-encoded tel: hrefs
25
+ html = <<~HTML
26
+ <a href="tel: &#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&#x9;01598 760 700&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;" class="button tel">Tel: 01598 760 700</a>
27
+ <a href="tel: +1 (856) 393-2082">+1 (856) 393-2082</a>
28
+ <a href="tel: 800%20843%203269" data-interaction-context data-interaction-type="phone" data-interaction-name="800 843 3269" class="cta" tabindex="0">
29
+ <span class="cta-content">
30
+ <span class="cta-text" tabindex="-1">
31
+ Call Now 800 843 3269
32
+ </span>
33
+ </span>
34
+ </a>
35
+ <a href="tel://1-859-422-6000">859.422.6000</a>
36
+ <a href="tel:&#43;18009634816" class="btn btn-tertiary" data-tag="call">Call</a>
37
+ <a href="tel:%2B44-161-468-1234">0161 468 1234</a>
38
+ <a class="navbar-left" href="tel:781-788-8180 Ext. 4"> <b>Call Sales</b> 781-788-8180 Ext. 4 </a>
39
+ <a href="tel:8663033809">Call</a>
40
+ <a href="tel:http://484-373-7700"><span class="icon fa fa-phone"></span>484-373-7700</a>
41
+ <a href="tel:877.720.0411" data-organic="877.720.0411" data-metro="877.720.0411" data-display="877.720.0411" data-paid="877.720.0411" class="phone"><span class="number">877.720.0411</span></a>
42
+ <a href="tel:1866INTRALINKS">1-866-INTRALINKS</a>
43
+ <a href=tel:1888%20810%207464 itemprop=url>Call</a>
44
+ <a href=tel:18664946627 style="color: inherit; display: inline;">1 (866) 4-WINMAR</a>
45
+ <a href=tel:312-379-9329 class=phone>312-379-9329</a>
46
+ <a href=tel:312-379-9329 class=phone>312-379-9329</a>
47
+ <a\ndata-animsition-out=none href=tel:01722412512>Tel: 01722 412512</a>
48
+
49
+ HTML
50
+ phone_numbers = dummy_object.grep_phone_numbers(html.to_s)
51
+ expected_phone_numbers = [
52
+ "01598 760 700", # surrounding &#xA;/&#x9; entities in the href are not part of the expected value
53
+ "+1 (856) 393-2082",
54
+ "800 843 3269", # %20 in the href is expected back as a space
55
+ "1-859-422-6000", # the "//" after tel: is not part of the expected value
56
+ "+18009634816", # &#43; in the href is expected back as "+"
57
+ "+44-161-468-1234", # %2B in the href is expected back as "+"
58
+ "781-788-8180 Ext. 4",
59
+ "8663033809", "484-373-7700", # second entry: the "http://" after tel: is not part of the expected value
60
+ "877.720.0411",
61
+ "1866INTRALINKS",
62
+ "1888 810 7464 ", # unquoted href followed by another attribute: expected value keeps a trailing space
63
+ "18664946627 ", # same trailing-space expectation as above
64
+ "312-379-9329 ", # this anchor appears twice in the fixture but is expected once
65
+ "01722412512" # unquoted href ending at ">": no trailing space expected
66
+ ]
67
+ expect(phone_numbers).to eq(expected_phone_numbers)
68
+ end
69
+ end
@@ -0,0 +1,44 @@
1
+ require 'spec_helper'
2
+
3
+ describe 'Pinterest Profile' do
4
+
5
+ class DummyTestClass
6
+ include PinterestProfile
7
+ end
8
+ let(:dummy_object) { DummyTestClass.new }
9
+
10
+ it 'should return nil for invalid input' do
11
+ expect(dummy_object.grep_pinterest_profile(nil)).to be_nil
12
+ expect(dummy_object.grep_pinterest_profile('')).to be_nil
13
+ end
14
+
15
+ it 'should not grep below url format' do
16
+ html = <<~HTML
17
+ <a href="http://pinterest.com/#" style="color: white;" class="fa fa-pinterest"></a>
18
+ <a href="http://pinterest.com/" style="color: white;" class="fa fa-pinterest"></a>
19
+ <a href="https://www.pinterest.com/feed/" style="color: white;" class="fa fa-pinterest"></a>
20
+ <a href="https://ct.pinterest.com/v3/?tid=2620913945757&amp;noscript=1" style="color: white;" class="fa fa-pinterest"></a>
21
+ <a href="http://assets.pinterest.com/js/pinmarklet.js?r=" style="color: white;" class="fa fa-pinterest"></a>
22
+ <a href="http://pinterest.com/pin/create/bookmarklet/" style="color: white;" class="fa fa-pinterest"></a>
23
+ <a href="http://uk.pinterest.com/pin/create/bookmarklet/" style="color: white;" class="fa fa-pinterest"></a>
24
+ <a href="https://ct.pinterest.com/?tid=8KRsk0UkbVS&value=0.00&quantity=1" style="color: white;" class="fa fa-pinterest"></a>
25
+ <a href="https://policy.pinterest.com/cookies" style="color: white;" class="fa fa-pinterest"></a>
26
+ HTML
27
+ expect(dummy_object.grep_pinterest_profile(html.to_s)).to eq([])
28
+ end
29
+
30
+ it 'should grep organization pinterest profiles' do
31
+ html = <<~HTML
32
+ <a href="http://www.pinterest.com/blogher" target="_blank">pinterest</a>
33
+ <a href="http://pinterest.com/orientaltrading?cm_sp=Footer-_-SocialLinks-_-Pinterest" target="_blank">pinterest</a>
34
+ <a href="http://pinterest.com/poppin&quot;" target="_blank">pinterest</a>
35
+ HTML
36
+ pinterest_profiles = dummy_object.grep_pinterest_profile(html.to_s)
37
+ expected_pinterest_profiles = [
38
+ 'http://www.pinterest.com/blogher',
39
+ 'http://pinterest.com/orientaltrading',
40
+ 'http://pinterest.com/poppin'
41
+ ]
42
+ expect(dummy_object.grep_pinterest_profile(html.to_s)).to eq(expected_pinterest_profiles)
43
+ end
44
+ end
@@ -0,0 +1,207 @@
1
+ require 'spec_helper'
2
+
3
+ describe 'Website Redirected To' do
4
+
5
+ class DummyTestClass
6
+ include RedirectedTo
7
+ end
8
+ let(:dummy_object) { DummyTestClass.new }
9
+
10
+
11
+ it 'should return nil for invalid input' do
12
+ expect(dummy_object.grep_redirected_to_url(nil)).to be_nil
13
+ expect(dummy_object.grep_redirected_to_url('')).to be_nil
14
+ end
15
+
16
+ describe 'Website grep from link tag' do
17
+ describe 'rel attribute first ' do
18
+
19
+ it 'should return nil when canonical url is empty' do
20
+ html = <<~HTML
21
+ <link rel="canonical" href="">
22
+ <link rel="canonical" href=''>
23
+ HTML
24
+ website = dummy_object.grep_redirected_to_url(html.to_s)
25
+ expect(website).to be_nil
26
+ end
27
+
28
+ it 'should grep website' do
29
+ html = <<~HTML
30
+ <link rel="canonical" href="">
31
+ <link rel="canonical" href='https://www.apple.com/'>
32
+ HTML
33
+ website = dummy_object.grep_redirected_to_url(html.to_s)
34
+ expect(website).to eq('https://www.apple.com/')
35
+ end
36
+
37
+ it 'should grep website even with extra attributes' do
38
+ html = <<~HTML
39
+ <link rel="canonical" href="" itemprop="current_url">
40
+ <link rel="canonical" href='https://www.apple.com/'
41
+ itemprop="current_url" >
42
+ HTML
43
+ website = dummy_object.grep_redirected_to_url(html.to_s)
44
+ expect(website).to eq('https://www.apple.com/')
45
+ end
46
+ end
47
+ describe 'href attribute first' do
48
+ it 'should return nil when canonical url is empty' do
49
+ html = <<~HTML
50
+ <link href="" rel="canonical" >
51
+ <link href='' rel="canonical" >
52
+ HTML
53
+ website = dummy_object.grep_redirected_to_url(html.to_s)
54
+ expect(website).to be_nil
55
+ end
56
+
57
+ it 'should grep website' do
58
+ html = <<~HTML
59
+ <link rel="canonical" href="">
60
+ <link href='https://www.apple.com/' rel="canonical">
61
+ HTML
62
+ website = dummy_object.grep_redirected_to_url(html.to_s)
63
+ expect(website).to eq('https://www.apple.com/')
64
+ end
65
+
66
+ it 'should grep website even with extra attributes' do
67
+ html = <<~HTML
68
+ <link href="" itemprop="current_url" rel="canonical">
69
+ <link href='https://www.apple.com/' rel="canonical"
70
+ itemprop="current_url" >
71
+ HTML
72
+ website = dummy_object.grep_redirected_to_url(html.to_s)
73
+ expect(website).to eq('https://www.apple.com/')
74
+ end
75
+ end
76
+ end
77
+ describe 'Website grep from organization URL' do
78
+ describe 'property attribute first ' do
79
+ it 'should return nil when canonical url is empty' do
80
+ html = <<~HTML
81
+ <meta property="og:url" content="" />
82
+ <meta property="og:url" content='' />
83
+ HTML
84
+ website = dummy_object.grep_redirected_to_url(html.to_s)
85
+ expect(website).to be_nil
86
+ end
87
+
88
+ it 'should grep website' do
89
+ html = <<~HTML
90
+ <link property="og:url" content="">
91
+ <meta property="og:url" content="https://www.dieppe.ca/fr/index.aspx" />
92
+ HTML
93
+ website = dummy_object.grep_redirected_to_url(html.to_s)
94
+ expect(website).to eq('https://www.dieppe.ca/fr/index.aspx')
95
+ end
96
+
97
+ it 'should grep website even with extra attributes' do
98
+ html = <<~HTML
99
+ <link property="og:url" content="" calss="og-url">
100
+ <meta property="og:url" content='https://www.dieppe.ca/fr/index.aspx'
101
+ class="og-url" />
102
+ HTML
103
+ website = dummy_object.grep_redirected_to_url(html.to_s)
104
+ expect(website).to eq('https://www.dieppe.ca/fr/index.aspx')
105
+ end
106
+ end
107
+ describe 'content attribute first ' do
108
+ it 'should return nil when canonical url is empty' do
109
+ html = <<~HTML
110
+ <meta content="" property="og:url" />
111
+ <meta content='' property="og:url"/>
112
+ HTML
113
+ website = dummy_object.grep_redirected_to_url(html.to_s)
114
+ expect(website).to be_nil
115
+ end
116
+
117
+ it 'should grep website' do
118
+ html = <<~HTML
119
+ <link content="" property="og:url" >
120
+ <meta content="https://www.dieppe.ca/fr/index.aspx" property="og:url" />
121
+ HTML
122
+ website = dummy_object.grep_redirected_to_url(html.to_s)
123
+ expect(website).to eq('https://www.dieppe.ca/fr/index.aspx')
124
+ end
125
+
126
+ it 'should grep website even with extra attributes' do
127
+ html = <<~HTML
128
+ <link content="" calss="og-url" property="og:url">
129
+ <meta content='https://www.dieppe.ca/fr/index.aspx'
130
+ class="og-url" property="og:url" />
131
+ HTML
132
+ website = dummy_object.grep_redirected_to_url(html.to_s)
133
+ expect(website).to eq('https://www.dieppe.ca/fr/index.aspx')
134
+ end
135
+ end
136
+ end
137
+ describe 'grep website' do
138
+ it 'it should return nil when link or og:url is absent' do
139
+ html = <<~HTML
140
+ <head>
141
+ <meta charset="utf-8">
142
+ <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
143
+ <meta http-equiv="x-ua-compatible" content="ie=edge">
144
+ <title>Techmologic | index</title>
145
+ <!-- Font Awesome -->
146
+ <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.6.0/css/font-awesome.min.css">
147
+ <!-- Bootstrap core CSS -->
148
+ <link href="css/bootstrap.min.css" rel="stylesheet">
149
+ <!-- Material Design Bootstrap -->
150
+ <link href="css/mdb.min.css" rel="stylesheet">
151
+ <!-- Your custom styles (optional) -->
152
+ <link href="css/style.css" rel="stylesheet">
153
+ </head>
154
+ HTML
155
+ website = dummy_object.grep_redirected_to_url(html.to_s)
156
+ expect(website).to be_nil
157
+ end
158
+ it 'should grep one of canonical or og:url' do
159
+ html = <<~HTML
160
+ <head>
161
+ <meta charset="utf-8">
162
+ <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
163
+ <meta http-equiv="x-ua-compatible" content="ie=edge">
164
+ <title>Techmologic | index</title>
165
+ <link rel="canonical" href="">
166
+ <meta property="og:url" content="" />
167
+ <!-- Font Awesome -->
168
+ <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.6.0/css/font-awesome.min.css">
169
+ <!-- Bootstrap core CSS -->
170
+ <link href="css/bootstrap.min.css" rel="stylesheet">
171
+ <!-- Material Design Bootstrap -->
172
+ <link href="css/mdb.min.css" rel="stylesheet">
173
+ <!-- Your custom styles (optional) -->
174
+ <link href="css/style.css" rel="stylesheet">
175
+ <link rel="canonical" href="http://techmologics.com/">
176
+ <meta property="og:url" content="http://techmologics.com/" />
177
+ </head>
178
+ HTML
179
+ website = dummy_object.grep_redirected_to_url(html.to_s)
180
+ expect(website).to eq('http://techmologics.com/')
181
+ end
182
+ it 'should grep one of canonical or og:url whatever it\'s position' do
183
+ html = <<~HTML
184
+ <head>
185
+ <meta charset="utf-8">
186
+ <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
187
+ <meta http-equiv="x-ua-compatible" content="ie=edge">
188
+ <title>Techmologic | index</title>
189
+ <link href="" rel="canonical">
190
+ <meta content="" property="og:url"/>
191
+ <!-- Font Awesome -->
192
+ <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.6.0/css/font-awesome.min.css">
193
+ <!-- Bootstrap core CSS -->
194
+ <link href="css/bootstrap.min.css" rel="stylesheet">
195
+ <!-- Material Design Bootstrap -->
196
+ <link href="css/mdb.min.css" rel="stylesheet">
197
+ <!-- Your custom styles (optional) -->
198
+ <link href="css/style.css" rel="stylesheet">
199
+ <link href="http://techmologics.com/" rel="canonical" class="canonical">
200
+ <meta content="http://techmologics.com/" property="og:url"/>
201
+ </head>
202
+ HTML
203
+ website = dummy_object.grep_redirected_to_url(html.to_s)
204
+ expect(website).to eq('http://techmologics.com/')
205
+ end
206
+ end
207
+ end