brilliant_web_scraper 0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +4 -0
  3. data/README.md +31 -0
  4. data/brilliant_web_scraper-1.0.0.gem +0 -0
  5. data/brilliant_web_scraper-1.0.gem +0 -0
  6. data/brilliant_web_scraper.gemspec +30 -0
  7. data/lib/brilliant_web_scraper.rb +55 -0
  8. data/lib/parsers/description_helper.rb +28 -0
  9. data/lib/parsers/emails.rb +30 -0
  10. data/lib/parsers/facebook_profile.rb +11 -0
  11. data/lib/parsers/instagram_profile.rb +11 -0
  12. data/lib/parsers/linkedin_profile.rb +11 -0
  13. data/lib/parsers/meta_description.rb +13 -0
  14. data/lib/parsers/org_description.rb +13 -0
  15. data/lib/parsers/phone_numbers.rb +34 -0
  16. data/lib/parsers/pinterest_profile.rb +11 -0
  17. data/lib/parsers/redirected_to.rb +29 -0
  18. data/lib/parsers/title.rb +13 -0
  19. data/lib/parsers/twitter_description.rb +13 -0
  20. data/lib/parsers/twitter_profile.rb +11 -0
  21. data/lib/parsers/unescape_html_helper.rb +17 -0
  22. data/lib/parsers/vimeo_profile.rb +11 -0
  23. data/lib/parsers/youtube_channel.rb +29 -0
  24. data/lib/scraper/errors.rb +19 -0
  25. data/lib/scraper/scrape_exceptions.rb +49 -0
  26. data/lib/scraper/scrape_helper.rb +59 -0
  27. data/lib/scraper/scrape_request.rb +29 -0
  28. data/lib/version.rb +6 -0
  29. data/spec/lib/parsers/description_helper_spec.rb +24 -0
  30. data/spec/lib/parsers/emails_spec.rb +60 -0
  31. data/spec/lib/parsers/facebook_profile_spec.rb +77 -0
  32. data/spec/lib/parsers/instagram_profile_spec.rb +45 -0
  33. data/spec/lib/parsers/linkedin_profile_spec.rb +43 -0
  34. data/spec/lib/parsers/meta_description_spec.rb +321 -0
  35. data/spec/lib/parsers/org_description_spec.rb +316 -0
  36. data/spec/lib/parsers/phone_numbers_spec.rb +69 -0
  37. data/spec/lib/parsers/pinterest_profile_spec.rb +44 -0
  38. data/spec/lib/parsers/redirected_to_spec.rb +207 -0
  39. data/spec/lib/parsers/title_spec.rb +87 -0
  40. data/spec/lib/parsers/twitter_description_spec.rb +314 -0
  41. data/spec/lib/parsers/twitter_profile_spec.rb +59 -0
  42. data/spec/lib/parsers/unescape_html_helper_spec.rb +0 -0
  43. data/spec/lib/parsers/vimeo_profile_spec.rb +43 -0
  44. data/spec/lib/parsers/youtube_profile_spec.rb +82 -0
  45. data/spec/lib/scraper/brilliant_web_scrape_test.rb +66 -0
  46. data/spec/lib/scraper/scrape_request_test.rb +34 -0
  47. data/spec/spec_helper.rb +111 -0
  48. data/spec/vcr/encoding_compatibility_error.yml +316 -0
  49. data/spec/vcr/invalid_byte_sequence_utf_8.yml +2383 -0
  50. data/spec/vcr/no_valid_data_to_scrape.yml +109 -0
  51. data/spec/vcr/non_html_scrape.yml +163 -0
  52. data/spec/vcr/valid_scrape_response.yml +696 -0
  53. metadata +250 -0
@@ -0,0 +1,87 @@
1
+ require 'spec_helper'
2
+
3
+ describe 'Title' do
4
+
5
+ class DummyTestClass
6
+ include Title
7
+ end
8
+ let(:dummy_object) { DummyTestClass.new }
9
+
10
+ it 'should return nil response for invalid inputs' do
11
+ expect(dummy_object.grep_title(nil)).to be_nil
12
+ expect(dummy_object.grep_title('')).to be_nil
13
+ end
14
+
15
+ it 'should return nil for no title presence' do
16
+ no_title_html = <<~HTML
17
+ <meta charset="UTF-8">
18
+ <meta http-equiv="x-ua-compatible" content="ie=edge">
19
+ <meta name="viewport" content="width=device-width, initial-scale=1">
20
+ <title> </title>
21
+ <link href="/on/demandware.static/Sites-Marmot_US-Site/-/default/dw8b6e883e/images/favicon.ico" rel="shortcut icon">
22
+ <meta name="description" content=" Shop the official Marmot online store. Maker of performance outdoor clothing and gear for travel, hiking, camping, snowsports, and more.Marmot">
23
+ HTML
24
+ title = dummy_object.grep_title(no_title_html.to_s)
25
+ expect(title).to eq(nil)
26
+ end
27
+
28
+ it 'should get title even from partially closed title tag' do
29
+ partially_closed_title_tag = <<~HTML
30
+ <title>Harmony (a Mediware company) is now WellSky. You are being redirected to WellSky.com.” /title>
31
+ HTML
32
+ title = dummy_object.grep_title(partially_closed_title_tag.to_s)
33
+ expect(title).to eq('Harmony (a Mediware company) is now WellSky. You are being redirected to WellSky.com.”')
34
+ end
35
+
36
+ it 'should get title even title is multi line' do
37
+ multi_line_title_tag = <<~HTML
38
+ <title>
39
+ Smartphone App Development Company | iPhone iPad App Development | Fredericton, Atlantic Canada | SEO Internet Marketing Website Design
40
+ </title>
41
+ <meta name="robots" content="index, follow" />
42
+ HTML
43
+ title = dummy_object.grep_title(multi_line_title_tag.to_s)
44
+ expect(title).to eq('Smartphone App Development Company | iPhone iPad App Development | Fredericton, Atlantic Canada | SEO Internet Marketing Website Design')
45
+ end
46
+
47
+ it 'should unescape html encodings from title' do
48
+ partially_html_encoded_title = <<~HTML
49
+ <link rel="pingback" href="https://www.idirect.net/xmlrpc.php">
50
+ <title>ST Engineering iDirect &#8211; Shaping the Future of How the World Connects</title>
51
+ <link rel="dns-prefetch" href="//platform.twitter.com">
52
+ HTML
53
+ title = dummy_object.grep_title(partially_html_encoded_title.to_s)
54
+ expect(title).to eq('ST Engineering iDirect – Shaping the Future of How the World Connects')
55
+ end
56
+
57
+ it 'should remove unnecessary white spaces, new lines, tabs from title' do
58
+ extra_spacing_title = <<~HTML
59
+ <meta http-equiv="Content-type" content="text/html; charset=utf-8" /><meta http-equiv="Expires" content="0" /><meta http-equiv='content-language' content='en' /><title>
60
+
61
+ A global technology and services company committed to innovation| Cegedim
62
+
63
+ </title>
64
+ <link rel="stylesheet" type="text/css" href="/_layouts/15/1033/styles/Themable/corev15.css?rev=2bpHeX9U8DH09TB5zpJcsQ%3D%3D"/>
65
+ HTML
66
+ title = dummy_object.grep_title(extra_spacing_title.to_s)
67
+ expect(title).to eq('A global technology and services company committed to innovation| Cegedim')
68
+ end
69
+ it 'should pick very first title available' do
70
+ multiple_title_html = <<~HTML
71
+ <link rel="pingback" href="https://www.idirect.net/xmlrpc.php">
72
+ <title>ST Engineering iDirect &#8211; Shaping the Future of How the World Connects</title>
73
+ <link rel="dns-prefetch" href="//platform.twitter.com">
74
+ <title>Title 2 - ST Engineering iDirect &#8211; Shaping the Future of How the World Connects</title>
75
+ HTML
76
+ title = dummy_object.grep_title(multiple_title_html.to_s)
77
+ expect(title).to eq('ST Engineering iDirect – Shaping the Future of How the World Connects')
78
+ end
79
+ it 'should grep title with extra attributes' do
80
+ html = <<~HTML
81
+ <title data-component-id="AdaptiveHtmlHead_01_6930" data-component-name="adaptiveHtmlHead" data-component-endpoint="/aries-common/v1/adaptiveHtmlHead.comp">Vancouver, Canada Hotel - City Center | Sheraton Vancouver Wall Centre</title>
82
+ HTML
83
+ title = dummy_object.grep_title(html.to_s)
84
+ expect(title).to eq('Vancouver, Canada Hotel - City Center | Sheraton Vancouver Wall Centre')
85
+ end
86
+ end
87
+
@@ -0,0 +1,314 @@
1
+ require 'spec_helper'
2
+
3
+ describe 'Twitter Description' do
4
+
5
+ class DummyTestClass
6
+ include TwitterDescription
7
+ end
8
+ let(:dummy_object) { DummyTestClass.new }
9
+
10
+ it 'should return nil for invalid inputs' do
11
+ expect(dummy_object.grep_twitter_description('')).to be_nil
12
+ expect(dummy_object.grep_twitter_description(nil)).to be_nil
13
+ end
14
+ describe 'Name key first twitter description tag' do
15
+ it 'should return nil for no twitter description tag presence' do
16
+ no_org_description = <<~HTML
17
+ <head>
18
+ <meta charset="utf-8">
19
+ <meta http-equiv="X-UA-Compatible" content="IE=edge">
20
+ <meta name="viewport" content="width=device-width, initial-scale=1">
21
+ <title>Internet Service Providers Chennai | Internet Telephony Service Providers Chennai | Internet bandwidth Providers Chennai - Pulse Telesystems Pvt Ltd</title>
22
+ <meta name="google-site-verification" content="h2NvZnvL9v536RUYH3jney-9V8JRBGESmzH5-ph0EM4">
23
+ <link href="http://www.pulse.in/new/wp-content/themes/pulse/css/bootstrap.min.css" rel="stylesheet" type="text/css">
24
+ </head>
25
+ HTML
26
+ twitter_description = dummy_object.grep_twitter_description(no_org_description.to_s)
27
+ expect(twitter_description).to be_nil
28
+ end
29
+
30
+ it 'should return nil when content part is empty' do
31
+ html = <<~HTML
32
+ <head>
33
+ <meta http-equiv="X-UA-Compatible" content="IE=edge">
34
+ <meta name="twitter:description" content="">
35
+ <title>Internet Service Providers Chennai | Internet Telephony Service Providers Chennai | Internet bandwidth Providers Chennai - Pulse Telesystems Pvt Ltd</title>
36
+ <link href="http://www.pulse.in/new/wp-content/themes/pulse/css/bootstrap.min.css" rel="stylesheet" type="text/css">
37
+ </head>
38
+ HTML
39
+ twitter_description = dummy_object.grep_twitter_description(html.to_s)
40
+ expect(twitter_description).to be_nil
41
+ end
42
+
43
+ it 'should return description from valid tag' do
44
+ html = <<~HTML
45
+ <head>
46
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." name="twitter:description">
47
+ <meta content="" property="uid">
48
+ <meta name="twitter:description" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront.">
49
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no" name="viewport">
50
+ </head>
51
+ HTML
52
+ twitter_description = dummy_object.grep_twitter_description(html.to_s)
53
+ expect(twitter_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
54
+ end
55
+
56
+ it 'should return description even tag is multilined and partially encoded' do
57
+ html = <<~HTML
58
+ <head>
59
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." name="twitter:description">
60
+ <meta content="" property="uid">
61
+ <meta name="twitter:description" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront.">
62
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
63
+ name="viewport">
64
+ </head>
65
+ HTML
66
+ twitter_description = dummy_object.grep_twitter_description(html.to_s)
67
+ expect(twitter_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
68
+ end
69
+
70
+ it 'should parse meta tag even it is partially single quoted' do
71
+ html = <<~HTML
72
+ <head>
73
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." name=\'twitter:description">
74
+ <meta content="" property="uid">
75
+ <meta name="twitter:description" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront.">
76
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
77
+ name="viewport">
78
+ </head>
79
+ HTML
80
+ twitter_description = dummy_object.grep_twitter_description(html.to_s)
81
+ expect(twitter_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
82
+ end
83
+
84
+ it 'should parse meta tag even it is having other attributes defined' do
85
+ html = <<~HTML
86
+ <head>
87
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." name="twitter:description">
88
+ <meta content="" property="uid">
89
+ <meta class="metadescription" property=\'og:description" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." charset="UTF-8">
90
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
91
+ name="viewport">
92
+ </head>
93
+ HTML
94
+ twitter_description = dummy_object.grep_twitter_description(html.to_s)
95
+ expect(twitter_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
96
+ end
97
+
98
+ it 'should parse meta tag with itemprop as description key' do
99
+ html = <<~HTML
100
+ <head>
101
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." itemprop=\'twitter:description">
102
+ <meta content="" property="uid">
103
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
104
+ name="viewport">
105
+ </head>
106
+ HTML
107
+ twitter_description = dummy_object.grep_twitter_description(html.to_s)
108
+ expect(twitter_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
109
+ end
110
+
111
+ it 'should parse even name/itemprop key content is improperly assigned' do
112
+ html = <<~HTML
113
+ <head>
114
+ <meta content="" property="uid">
115
+ <meta class="metadescription" name=twitter:description content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." charset="UTF-8" />
116
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
117
+ name="viewport">
118
+ </head>
119
+ HTML
120
+ twitter_description = dummy_object.grep_twitter_description(html.to_s)
121
+ expect(twitter_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
122
+ end
123
+
124
+ it 'should bring description having single quote' do
125
+ html = <<~HTML
126
+ <html lang="en">
127
+ <head>
128
+ <META charset="utf-8">
129
+ <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
130
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
131
+ <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
132
+ <meta name="twitter:description" content="Wilentz Goldman & Spitzer is one of New Jersey's largest law firms. Our lawyers proudly serve our clients in nearly every aspect of law." />
133
+ </head>
134
+ <html>
135
+ HTML
136
+ twitter_description = dummy_object.grep_twitter_description(html.to_s)
137
+ expect(twitter_description).to eq('Wilentz Goldman & Spitzer is one of New Jersey\'s largest law firms. Our lawyers proudly serve our clients in nearly every aspect of law.')
138
+ end
139
+
140
+ it 'should bring description having double quote' do
141
+ html = <<~HTML
142
+ <html lang="en">
143
+ <head>
144
+ <META charset="utf-8">
145
+ <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
146
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
147
+ <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
148
+ <meta name="twitter:description" content='Whether you&#039;re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what&#039;s new, what&#039;s best and how to make the most out of the products you love.' />
149
+ </head>
150
+ <html>
151
+ HTML
152
+ twitter_description = dummy_object.grep_twitter_description(html.to_s)
153
+ expect(twitter_description).to eq('Whether you\'re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what\'s new, what\'s best and how to make the most out of the products you love.')
154
+ end
155
+
156
+ it "should bring description even some other meta tag is empty" do
157
+ html = <<~HTML
158
+ <html lang="en">
159
+ <head>
160
+ <META charset="utf-8">
161
+ <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
162
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
163
+ <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
164
+ <meta name="twitter:description" content="">
165
+ <meta name="twitter:description" content='Whether you&#039;re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what&#039;s new, what&#039;s best and how to make the most out of the products you love.' />
166
+ </head>
167
+ <html>
168
+ HTML
169
+ twitter_description = dummy_object.grep_twitter_description(html.to_s)
170
+ expect(twitter_description).to eq('Whether you\'re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what\'s new, what\'s best and how to make the most out of the products you love.')
171
+ end
172
+ end
173
+ describe 'Content key first twitter description tag' do
174
+ it 'should return nil when content part is empty' do
175
+ no_twitter_description = <<~HTML
176
+ <head>
177
+ <meta http-equiv="X-UA-Compatible" content="IE=edge">
178
+ <meta content="" name="twitter:description">
179
+ <meta content='' name="twitter:description">
180
+ <title>Internet Service Providers Chennai | Internet Telephony Service Providers Chennai | Internet bandwidth Providers Chennai - Pulse Telesystems Pvt Ltd</title>
181
+ <link href="http://www.pulse.in/new/wp-content/themes/pulse/css/bootstrap.min.css" rel="stylesheet" type="text/css">
182
+ </head>
183
+ HTML
184
+ twitter_description = dummy_object.grep_twitter_description(no_twitter_description.to_s)
185
+ expect(twitter_description).to be_nil
186
+ end
187
+
188
+ it 'should return description from valid tag' do
189
+ html = <<~HTML
190
+ <head>
191
+ <meta content="" property="uid">
192
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." name="twitter:description">
193
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no" name="viewport">
194
+ </head>
195
+ HTML
196
+ twitter_description = dummy_object.grep_twitter_description(html.to_s)
197
+ expect(twitter_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
198
+ end
199
+
200
+ it 'should return description even tag is multilined and partially encoded' do
201
+ html = <<~HTML
202
+ <head>
203
+ <meta content="" property="uid">
204
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." name="twitter:description" >
205
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
206
+ name="viewport">
207
+ </head>
208
+ HTML
209
+ twitter_description = dummy_object.grep_twitter_description(html.to_s)
210
+ expect(twitter_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
211
+ end
212
+
213
+ it 'should parse meta tag even it is partially single quoted' do
214
+ html = <<~HTML
215
+ <head>
216
+ <meta content="" property="uid">
217
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." name=\'twitter:description">
218
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
219
+ name="viewport">
220
+ </head>
221
+ HTML
222
+ twitter_description = dummy_object.grep_twitter_description(html.to_s)
223
+ expect(twitter_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
224
+ end
225
+
226
+ it 'should parse meta tag even it is having other attributes defined' do
227
+ html = <<~HTML
228
+ <head>
229
+ <meta content="" property="uid">
230
+ <meta class="metadescription" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." charset="UTF-8" name=\'twitter:description">
231
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
232
+ name="viewport">
233
+ </head>
234
+ HTML
235
+ twitter_description = dummy_object.grep_twitter_description(html.to_s)
236
+ expect(twitter_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
237
+ end
238
+
239
+ it 'should parse meta tag with itemprop as description key' do
240
+ html = <<~HTML
241
+ <head>
242
+ <meta content="" property="uid">
243
+ <meta class="metadescription" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." itemprop=\'twitter:description" charset="UTF-8">
244
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
245
+ name="viewport">
246
+ </head>
247
+ HTML
248
+ twitter_description = dummy_object.grep_twitter_description(html.to_s)
249
+ expect(twitter_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
250
+ end
251
+
252
+ it 'should parse even name/itemprop key content is improperly assigned' do
253
+ html = <<~HTML
254
+ <head>
255
+ <meta content="" property="uid">
256
+ <meta class="metadescription" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." charset="UTF-8" name=twitter:description />
257
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
258
+ name="viewport">
259
+ </head>
260
+ HTML
261
+ twitter_description = dummy_object.grep_twitter_description(html.to_s)
262
+ expect(twitter_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
263
+ end
264
+
265
+ it 'should bring description having single quote' do
266
+ html = <<~HTML
267
+ <html lang="en">
268
+ <head>
269
+ <META charset="utf-8">
270
+ <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
271
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
272
+ <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
273
+ <meta content="Wilentz Goldman & Spitzer is one of New Jersey's largest law firms. Our lawyers proudly serve our clients in nearly every aspect of law." name="twitter:description" />
274
+ </head>
275
+ <html>
276
+ HTML
277
+ twitter_description = dummy_object.grep_twitter_description(html.to_s)
278
+ expect(twitter_description).to eq('Wilentz Goldman & Spitzer is one of New Jersey\'s largest law firms. Our lawyers proudly serve our clients in nearly every aspect of law.')
279
+ end
280
+
281
+ it 'should bring description having double quote' do
282
+ html = <<~HTML
283
+ <html lang="en">
284
+ <head>
285
+ <META charset="utf-8">
286
+ <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
287
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
288
+ <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
289
+ <meta content='Whether you&#039;re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what&#039;s new, what&#039;s best and how to make the most out of the products you love.' name="twitter:description" />
290
+ </head>
291
+ <html>
292
+ HTML
293
+ twitter_description = dummy_object.grep_twitter_description(html.to_s)
294
+ expect(twitter_description).to eq('Whether you\'re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what\'s new, what\'s best and how to make the most out of the products you love.')
295
+ end
296
+
297
+ it "should bring description even some other meta tag is empty" do
298
+ html = <<~HTML
299
+ <html lang="en">
300
+ <head>
301
+ <META charset="utf-8">
302
+ <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
303
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
304
+ <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
305
+ <meta content="" name="twitter:description">
306
+ <meta content='Whether you&#039;re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what&#039;s new, what&#039;s best and how to make the most out of the products you love.' name="twitter:description"/>
307
+ </head>
308
+ <html>
309
+ HTML
310
+ twitter_description = dummy_object.grep_twitter_description(html.to_s)
311
+ expect(twitter_description).to eq('Whether you\'re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what\'s new, what\'s best and how to make the most out of the products you love.')
312
+ end
313
+ end
314
+ end
@@ -0,0 +1,59 @@
1
+ require 'spec_helper'
2
+
3
+ describe 'Twitter Profile' do
4
+
5
+ class DummyTestClass
6
+ include TwitterProfile
7
+ end
8
+ let(:dummy_object) { DummyTestClass.new }
9
+
10
+ it 'should return nil for invalid input' do
11
+ expect(dummy_object.grep_twitter_profile(nil)).to be_nil
12
+ expect(dummy_object.grep_twitter_profile('')).to be_nil
13
+ end
14
+
15
+ it 'should not grep any non profile url' do
16
+ html = <<~HTML
17
+ <a href="http://twitter.com/download/iphone\\" target="_blank">
18
+ <a href="http://twitter.com/%user_screen_name%/statuses/%id%" target="_blank">
19
+ <a href="https://twitter.com/i/web/status/1116404686133686272" target="_blank">
20
+ <a href="https://twitter.com/" target="_blank">
21
+ <a href="http://twitter.com/\'+reply.substring(1)+" target="_blank">
22
+ <a href="http://twitter.com/#" target="_blank">
23
+ <a href="https://twitter.com/intent/tweet?text=https://www.facebook.com/ChoosePremiere/photos/a.10151220913587649/10157236078952649/?type=3'," target="_blank">
24
+ <a href="https://twitter.com/share?url=https://dirigoagency.com/" target="_blank">
25
+ <a href="https://twitter.com/search?q=%23solicitors&src=hash" target="_blank">
26
+ <a href="https://twitter.com/hashtag/salisburysalutes?src=hash" target="_blank">
27
+ <a href="https://twitter.com/privacy" target="_blank">
28
+ <a href="https://twitter.com/home?status=Hey" target="_blank">
29
+ <a href="https://twitter.com/statuses/1113546402863312896" target="_blank">
30
+ <a href="https://twitter.com/login" target="_blank">
31
+ <a href=" http://twitter.com/share/" target="_blank">
32
+ <a href="https://twitter.com/#!/Farmer_Brothers" target="_blank">
33
+ <a href="http://twitter.com/javascripts/blogger.js" target="_blank">
34
+ HTML
35
+ expect(dummy_object.grep_twitter_profile(html.to_s)).to eq([])
36
+ end
37
+
38
+ it 'should grep valid urls' do
39
+ html = <<~HTML
40
+ <a href="http://twitter.com/_titaniumrings" target="_blank">
41
+ <a href="http://twitter.com/_Titaniumrings" target="_blank">
42
+ <a href="http://twitter.com/@clindiatwitt" target="_blank">
43
+ <a href="http://twitter.com/8of12" target="_blank">
44
+ <a href="http://twitter.com/AAB_Accountants" target="_blank">
45
+ <a href="http://twitter.com/SundanceCompany/statuses/1148708421308485637" target="_blank">
46
+ HTML
47
+ twitter_profiles = dummy_object.grep_twitter_profile(html.to_s)
48
+
49
+ expected_profiles = [
50
+ "http://twitter.com/_titaniumrings",
51
+ "http://twitter.com/_Titaniumrings",
52
+ "http://twitter.com/@clindiatwitt",
53
+ "http://twitter.com/8of12",
54
+ "http://twitter.com/AAB_Accountants",
55
+ "http://twitter.com/SundanceCompany/statuses/1148708421308485637"
56
+ ]
57
+ expect(twitter_profiles).to eq(expected_profiles)
58
+ end
59
+ end