brilliant_web_scraper 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. checksums.yaml +7 -0
  2. data/Gemfile +4 -0
  3. data/README.md +31 -0
  4. data/brilliant_web_scraper-1.0.0.gem +0 -0
  5. data/brilliant_web_scraper-1.0.gem +0 -0
  6. data/brilliant_web_scraper.gemspec +30 -0
  7. data/lib/brilliant_web_scraper.rb +55 -0
  8. data/lib/parsers/description_helper.rb +28 -0
  9. data/lib/parsers/emails.rb +30 -0
  10. data/lib/parsers/facebook_profile.rb +11 -0
  11. data/lib/parsers/instagram_profile.rb +11 -0
  12. data/lib/parsers/linkedin_profile.rb +11 -0
  13. data/lib/parsers/meta_description.rb +13 -0
  14. data/lib/parsers/org_description.rb +13 -0
  15. data/lib/parsers/phone_numbers.rb +34 -0
  16. data/lib/parsers/pinterest_profile.rb +11 -0
  17. data/lib/parsers/redirected_to.rb +29 -0
  18. data/lib/parsers/title.rb +13 -0
  19. data/lib/parsers/twitter_description.rb +13 -0
  20. data/lib/parsers/twitter_profile.rb +11 -0
  21. data/lib/parsers/unescape_html_helper.rb +17 -0
  22. data/lib/parsers/vimeo_profile.rb +11 -0
  23. data/lib/parsers/youtube_channel.rb +29 -0
  24. data/lib/scraper/errors.rb +19 -0
  25. data/lib/scraper/scrape_exceptions.rb +49 -0
  26. data/lib/scraper/scrape_helper.rb +59 -0
  27. data/lib/scraper/scrape_request.rb +29 -0
  28. data/lib/version.rb +6 -0
  29. data/spec/lib/parsers/description_helper_spec.rb +24 -0
  30. data/spec/lib/parsers/emails_spec.rb +60 -0
  31. data/spec/lib/parsers/facebook_profile_spec.rb +77 -0
  32. data/spec/lib/parsers/instagram_profile_spec.rb +45 -0
  33. data/spec/lib/parsers/linkedin_profile_spec.rb +43 -0
  34. data/spec/lib/parsers/meta_description_spec.rb +321 -0
  35. data/spec/lib/parsers/org_description_spec.rb +316 -0
  36. data/spec/lib/parsers/phone_numbers_spec.rb +69 -0
  37. data/spec/lib/parsers/pinterest_profile_spec.rb +44 -0
  38. data/spec/lib/parsers/redirected_to_spec.rb +207 -0
  39. data/spec/lib/parsers/title_spec.rb +87 -0
  40. data/spec/lib/parsers/twitter_description_spec.rb +314 -0
  41. data/spec/lib/parsers/twitter_profile_spec.rb +59 -0
  42. data/spec/lib/parsers/unescape_html_helper_spec.rb +0 -0
  43. data/spec/lib/parsers/vimeo_profile_spec.rb +43 -0
  44. data/spec/lib/parsers/youtube_profile_spec.rb +82 -0
  45. data/spec/lib/scraper/brilliant_web_scrape_test.rb +66 -0
  46. data/spec/lib/scraper/scrape_request_test.rb +34 -0
  47. data/spec/spec_helper.rb +111 -0
  48. data/spec/vcr/encoding_compatibility_error.yml +316 -0
  49. data/spec/vcr/invalid_byte_sequence_utf_8.yml +2383 -0
  50. data/spec/vcr/no_valid_data_to_scrape.yml +109 -0
  51. data/spec/vcr/non_html_scrape.yml +163 -0
  52. data/spec/vcr/valid_scrape_response.yml +696 -0
  53. metadata +250 -0
@@ -0,0 +1,87 @@
1
+ require 'spec_helper'
2
+
3
+ describe 'Title' do
4
+
5
+ # Host object that mixes in the Title parser. It is built from an anonymous
+ # class on purpose: a `class` statement inside `describe` creates a global
+ # constant, and any other spec using the same constant name reopens it rather
+ # than redefining it, so modules from unrelated specs would accumulate on one
+ # class and could mask a missing method here.
+ let(:dummy_object) { Class.new { include Title }.new }
9
+
10
+ it 'should return nil response for invalid inputs' do
11
+ expect(dummy_object.grep_title(nil)).to be_nil
12
+ expect(dummy_object.grep_title('')).to be_nil
13
+ end
14
+
15
+ it 'should return nil for no title presence' do
16
+ no_title_html = <<~HTML
17
+ <meta charset="UTF-8">
18
+ <meta http-equiv="x-ua-compatible" content="ie=edge">
19
+ <meta name="viewport" content="width=device-width, initial-scale=1">
20
+ <title> </title>
21
+ <link href="/on/demandware.static/Sites-Marmot_US-Site/-/default/dw8b6e883e/images/favicon.ico" rel="shortcut icon">
22
+ <meta name="description" content=" Shop the official Marmot online store. Maker of performance outdoor clothing and gear for travel, hiking, camping, snowsports, and more.Marmot">
23
+ HTML
24
+ title = dummy_object.grep_title(no_title_html.to_s)
25
+ expect(title).to eq(nil)
26
+ end
27
+
28
+ it 'should get title even from partially closed title tag' do
29
+ partially_closed_title_tag = <<~HTML
30
+ <title>Harmony (a Mediware company) is now WellSky. You are being redirected to WellSky.com.” /title>
31
+ HTML
32
+ title = dummy_object.grep_title(partially_closed_title_tag.to_s)
33
+ expect(title).to eq('Harmony (a Mediware company) is now WellSky. You are being redirected to WellSky.com.”')
34
+ end
35
+
36
+ it 'should get title even title is multi line' do
37
+ multi_line_title_tag = <<~HTML
38
+ <title>
39
+ Smartphone App Development Company | iPhone iPad App Development | Fredericton, Atlantic Canada | SEO Internet Marketing Website Design
40
+ </title>
41
+ <meta name="robots" content="index, follow" />
42
+ HTML
43
+ title = dummy_object.grep_title(multi_line_title_tag.to_s)
44
+ expect(title).to eq('Smartphone App Development Company | iPhone iPad App Development | Fredericton, Atlantic Canada | SEO Internet Marketing Website Design')
45
+ end
46
+
47
+ it 'should unescpe html encodings from title' do
48
+ partially_html_encoded_title = <<~HTML
49
+ <link rel="pingback" href="https://www.idirect.net/xmlrpc.php">
50
+ <title>ST Engineering iDirect &#8211; Shaping the Future of How the World Connects</title>
51
+ <link rel="dns-prefetch" href="//platform.twitter.com">
52
+ HTML
53
+ title = dummy_object.grep_title(partially_html_encoded_title.to_s)
54
+ expect(title).to eq('ST Engineering iDirect – Shaping the Future of How the World Connects')
55
+ end
56
+
57
+ it 'should remove unnecessary white spaces, new lines, tabs from title' do
58
+ extra_spacing_title = <<~HTML
59
+ <meta http-equiv="Content-type" content="text/html; charset=utf-8" /><meta http-equiv="Expires" content="0" /><meta http-equiv='content-language' content='en' /><title>
60
+
61
+ A global technology and services company committed to innovation| Cegedim
62
+
63
+ </title>
64
+ <link rel="stylesheet" type="text/css" href="/_layouts/15/1033/styles/Themable/corev15.css?rev=2bpHeX9U8DH09TB5zpJcsQ%3D%3D"/>
65
+ HTML
66
+ title = dummy_object.grep_title(extra_spacing_title.to_s)
67
+ expect(title).to eq('A global technology and services company committed to innovation| Cegedim')
68
+ end
69
+ it 'should pick very first title available' do
70
+ multiple_title_html = <<~HTML
71
+ <link rel="pingback" href="https://www.idirect.net/xmlrpc.php">
72
+ <title>ST Engineering iDirect &#8211; Shaping the Future of How the World Connects</title>
73
+ <link rel="dns-prefetch" href="//platform.twitter.com">
74
+ <title>Title 2 - ST Engineering iDirect &#8211; Shaping the Future of How the World Connects</title>
75
+ HTML
76
+ title = dummy_object.grep_title(multiple_title_html.to_s)
77
+ expect(title).to eq('ST Engineering iDirect – Shaping the Future of How the World Connects')
78
+ end
79
+ it 'should grep title with extra atrributes' do
80
+ html = <<~HTML
81
+ <title data-component-id="AdaptiveHtmlHead_01_6930" data-component-name="adaptiveHtmlHead" data-component-endpoint="/aries-common/v1/adaptiveHtmlHead.comp">Vancouver, Canada Hotel - City Center | Sheraton Vancouver Wall Centre</title>
82
+ HTML
83
+ title = dummy_object.grep_title(html.to_s)
84
+ expect(title).to eq('Vancouver, Canada Hotel - City Center | Sheraton Vancouver Wall Centre')
85
+ end
86
+ end
87
+
@@ -0,0 +1,314 @@
1
+ require 'spec_helper'
2
+
3
+ describe 'Twitter Description' do
4
+
5
+ # Host object that mixes in the TwitterDescription parser. It is built from an
+ # anonymous class on purpose: a `class` statement inside `describe` creates a
+ # global constant, and any other spec using the same constant name reopens it
+ # rather than redefining it, so modules from unrelated specs would accumulate
+ # on one class and could mask a missing method here.
+ let(:dummy_object) { Class.new { include TwitterDescription }.new }
9
+
10
+ it 'should return nil for invalid inputs' do
11
+ expect(dummy_object.grep_twitter_description('')).to be_nil
12
+ expect(dummy_object.grep_twitter_description(nil)).to be_nil
13
+ end
14
+ describe 'Name key first twitter description tag' do
15
+ it 'should return nil for no twitter description tag presence' do
16
+ no_org_description = <<~HTML
17
+ <head>
18
+ <meta charset="utf-8">
19
+ <meta http-equiv="X-UA-Compatible" content="IE=edge">
20
+ <meta name="viewport" content="width=device-width, initial-scale=1">
21
+ <title>Internet Service Providers Chennai | Internet Telephony Service Providers Chennai | Internet bandwidth Providers Chennai - Pulse Telesystems Pvt Ltd</title>
22
+ <meta name="google-site-verification" content="h2NvZnvL9v536RUYH3jney-9V8JRBGESmzH5-ph0EM4">
23
+ <link href="http://www.pulse.in/new/wp-content/themes/pulse/css/bootstrap.min.css" rel="stylesheet" type="text/css">
24
+ </head>
25
+ HTML
26
+ twitter_description = dummy_object.grep_twitter_description(no_org_description.to_s)
27
+ expect(twitter_description).to be_nil
28
+ end
29
+
30
+ it 'should return nil when content part is empty' do
31
+ html = <<~HTML
32
+ <head>
33
+ <meta http-equiv="X-UA-Compatible" content="IE=edge">
34
+ <meta name="twitter:description" content="">
35
+ <title>Internet Service Providers Chennai | Internet Telephony Service Providers Chennai | Internet bandwidth Providers Chennai - Pulse Telesystems Pvt Ltd</title>
36
+ <link href="http://www.pulse.in/new/wp-content/themes/pulse/css/bootstrap.min.css" rel="stylesheet" type="text/css">
37
+ </head>
38
+ HTML
39
+ twitter_description = dummy_object.grep_twitter_description(html.to_s)
40
+ expect(twitter_description).to be_nil
41
+ end
42
+
43
+ it 'should return description from valid tag' do
44
+ html = <<~HTML
45
+ <head>
46
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." name="twitter:description">
47
+ <meta content="" property="uid">
48
+ <meta name="twitter:description" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront.">
49
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no" name="viewport">
50
+ </head>
51
+ HTML
52
+ twitter_description = dummy_object.grep_twitter_description(html.to_s)
53
+ expect(twitter_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
54
+ end
55
+
56
+ it 'should return description even tag is multilined and partially encoded' do
57
+ html = <<~HTML
58
+ <head>
59
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." name="twitter:description">
60
+ <meta content="" property="uid">
61
+ <meta name="twitter:description" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront.">
62
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
63
+ name="viewport">
64
+ </head>
65
+ HTML
66
+ twitter_description = dummy_object.grep_twitter_description(html.to_s)
67
+ expect(twitter_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
68
+ end
69
+
70
+ it 'should parse meta tag even it is partially single quoted' do
71
+ html = <<~HTML
72
+ <head>
73
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." name=\'twitter:description">
74
+ <meta content="" property="uid">
75
+ <meta name="twitter:description" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront.">
76
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
77
+ name="viewport">
78
+ </head>
79
+ HTML
80
+ twitter_description = dummy_object.grep_twitter_description(html.to_s)
81
+ expect(twitter_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
82
+ end
83
+
84
+ it 'should parse meta tag even it is having other attributes defined' do
85
+ html = <<~HTML
86
+ <head>
87
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." name="twitter:description">
88
+ <meta content="" property="uid">
89
+ <meta class="metadescription" property=\'og:description" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." charset="UTF-8">
90
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
91
+ name="viewport">
92
+ </head>
93
+ HTML
94
+ twitter_description = dummy_object.grep_twitter_description(html.to_s)
95
+ expect(twitter_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
96
+ end
97
+
98
+ it 'should parse meta tag with itemprop as description key' do
99
+ html = <<~HTML
100
+ <head>
101
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." itemprop=\'twitter:description">
102
+ <meta content="" property="uid">
103
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
104
+ name="viewport">
105
+ </head>
106
+ HTML
107
+ twitter_description = dummy_object.grep_twitter_description(html.to_s)
108
+ expect(twitter_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
109
+ end
110
+
111
+ it 'should parse even name/itemprop key content is improperly assigned' do
112
+ html = <<~HTML
113
+ <head>
114
+ <meta content="" property="uid">
115
+ <meta class="metadescription" name=twitter:description content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." charset="UTF-8" />
116
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
117
+ name="viewport">
118
+ </head>
119
+ HTML
120
+ twitter_description = dummy_object.grep_twitter_description(html.to_s)
121
+ expect(twitter_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
122
+ end
123
+
124
+ it 'should bring description having single quote' do
125
+ html = <<~HTML
126
+ <html lang="en">
127
+ <head>
128
+ <META charset="utf-8">
129
+ <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
130
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
131
+ <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
132
+ <meta name="twitter:description" content="Wilentz Goldman & Spitzer is one of New Jersey's largest law firms. Our lawyers proudly serve our clients in nearly every aspect of law." />
133
+ </head>
134
+ <html>
135
+ HTML
136
+ twitter_description = dummy_object.grep_twitter_description(html.to_s)
137
+ expect(twitter_description).to eq('Wilentz Goldman & Spitzer is one of New Jersey\'s largest law firms. Our lawyers proudly serve our clients in nearly every aspect of law.')
138
+ end
139
+
140
+ it 'should bring description having double quote' do
141
+ html = <<~HTML
142
+ <html lang="en">
143
+ <head>
144
+ <META charset="utf-8">
145
+ <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
146
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
147
+ <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
148
+ <meta name="twitter:description" content='Whether you&#039;re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what&#039;s new, what&#039;s best and how to make the most out of the products you love.' />
149
+ </head>
150
+ <html>
151
+ HTML
152
+ twitter_description = dummy_object.grep_twitter_description(html.to_s)
153
+ expect(twitter_description).to eq('Whether you\'re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what\'s new, what\'s best and how to make the most out of the products you love.')
154
+ end
155
+
156
+ it "should bring description even some other meta tag is empty" do
157
+ html = <<~HTML
158
+ <html lang="en">
159
+ <head>
160
+ <META charset="utf-8">
161
+ <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
162
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
163
+ <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
164
+ <meta name="twitter:description" content="">
165
+ <meta name="twitter:description" content='Whether you&#039;re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what&#039;s new, what&#039;s best and how to make the most out of the products you love.' />
166
+ </head>
167
+ <html>
168
+ HTML
169
+ twitter_description = dummy_object.grep_twitter_description(html.to_s)
170
+ expect(twitter_description).to eq('Whether you\'re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what\'s new, what\'s best and how to make the most out of the products you love.')
171
+ end
172
+ end
173
+ describe 'Content key first organization description tag' do
174
+ it 'should return nil when content part is empty' do
175
+ no_twitter_description = <<~HTML
176
+ <head>
177
+ <meta http-equiv="X-UA-Compatible" content="IE=edge">
178
+ <meta content="" name="twitter:description">
179
+ <meta content='' name="twitter:description">
180
+ <title>Internet Service Providers Chennai | Internet Telephony Service Providers Chennai | Internet bandwidth Providers Chennai - Pulse Telesystems Pvt Ltd</title>
181
+ <link href="http://www.pulse.in/new/wp-content/themes/pulse/css/bootstrap.min.css" rel="stylesheet" type="text/css">
182
+ </head>
183
+ HTML
184
+ twitter_description = dummy_object.grep_twitter_description(no_twitter_description.to_s)
185
+ expect(twitter_description).to be_nil
186
+ end
187
+
188
+ it 'should return description from valid tag' do
189
+ html = <<~HTML
190
+ <head>
191
+ <meta content="" property="uid">
192
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." name="twitter:description">
193
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no" name="viewport">
194
+ </head>
195
+ HTML
196
+ twitter_description = dummy_object.grep_twitter_description(html.to_s)
197
+ expect(twitter_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
198
+ end
199
+
200
+ it 'should return description even tag is multilined and partially encoded' do
201
+ html = <<~HTML
202
+ <head>
203
+ <meta content="" property="uid">
204
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." name="twitter:description" >
205
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
206
+ name="viewport">
207
+ </head>
208
+ HTML
209
+ twitter_description = dummy_object.grep_twitter_description(html.to_s)
210
+ expect(twitter_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
211
+ end
212
+
213
+ it 'should parse meta tag even it is partially single quoted' do
214
+ html = <<~HTML
215
+ <head>
216
+ <meta content="" property="uid">
217
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." name=\'twitter:description">
218
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
219
+ name="viewport">
220
+ </head>
221
+ HTML
222
+ twitter_description = dummy_object.grep_twitter_description(html.to_s)
223
+ expect(twitter_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
224
+ end
225
+
226
+ it 'should parse meta tag even it is having other attributes defined' do
227
+ html = <<~HTML
228
+ <head>
229
+ <meta content="" property="uid">
230
+ <meta class="metadescription" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." charset="UTF-8" name=\'twitter:description">
231
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
232
+ name="viewport">
233
+ </head>
234
+ HTML
235
+ twitter_description = dummy_object.grep_twitter_description(html.to_s)
236
+ expect(twitter_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
237
+ end
238
+
239
+ it 'should parse meta tag with itemprop as description key' do
240
+ html = <<~HTML
241
+ <head>
242
+ <meta content="" property="uid">
243
+ <meta class="metadescription" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." itemprop=\'twitter:description" charset="UTF-8">
244
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
245
+ name="viewport">
246
+ </head>
247
+ HTML
248
+ twitter_description = dummy_object.grep_twitter_description(html.to_s)
249
+ expect(twitter_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
250
+ end
251
+
252
+ it 'should parse even name/itemprop key content is improperly assigned' do
253
+ html = <<~HTML
254
+ <head>
255
+ <meta content="" property="uid">
256
+ <meta class="metadescription" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." charset="UTF-8" name=twitter:description />
257
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
258
+ name="viewport">
259
+ </head>
260
+ HTML
261
+ twitter_description = dummy_object.grep_twitter_description(html.to_s)
262
+ expect(twitter_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
263
+ end
264
+
265
+ it 'should bring description having single quote' do
266
+ html = <<~HTML
267
+ <html lang="en">
268
+ <head>
269
+ <META charset="utf-8">
270
+ <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
271
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
272
+ <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
273
+ <meta content="Wilentz Goldman & Spitzer is one of New Jersey's largest law firms. Our lawyers proudly serve our clients in nearly every aspect of law." name="twitter:description" />
274
+ </head>
275
+ <html>
276
+ HTML
277
+ twitter_description = dummy_object.grep_twitter_description(html.to_s)
278
+ expect(twitter_description).to eq('Wilentz Goldman & Spitzer is one of New Jersey\'s largest law firms. Our lawyers proudly serve our clients in nearly every aspect of law.')
279
+ end
280
+
281
+ it 'should bring description having double quote' do
282
+ html = <<~HTML
283
+ <html lang="en">
284
+ <head>
285
+ <META charset="utf-8">
286
+ <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
287
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
288
+ <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
289
+ <meta content='Whether you&#039;re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what&#039;s new, what&#039;s best and how to make the most out of the products you love.' name="twitter:description" />
290
+ </head>
291
+ <html>
292
+ HTML
293
+ twitter_description = dummy_object.grep_twitter_description(html.to_s)
294
+ expect(twitter_description).to eq('Whether you\'re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what\'s new, what\'s best and how to make the most out of the products you love.')
295
+ end
296
+
297
+ it "should bring description even some other meta tag is empty" do
298
+ html = <<~HTML
299
+ <html lang="en">
300
+ <head>
301
+ <META charset="utf-8">
302
+ <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
303
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
304
+ <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
305
+ <meta content="" name="twitter:description">
306
+ <meta content='Whether you&#039;re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what&#039;s new, what&#039;s best and how to make the most out of the products you love.' name="twitter:description"/>
307
+ </head>
308
+ <html>
309
+ HTML
310
+ twitter_description = dummy_object.grep_twitter_description(html.to_s)
311
+ expect(twitter_description).to eq('Whether you\'re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what\'s new, what\'s best and how to make the most out of the products you love.')
312
+ end
313
+ end
314
+ end
@@ -0,0 +1,59 @@
1
+ require 'spec_helper'
2
+
3
+ describe 'Twitter Profile' do
4
+
5
+ # Host object that mixes in the TwitterProfile parser. It is built from an
+ # anonymous class on purpose: a `class` statement inside `describe` creates a
+ # global constant, and any other spec using the same constant name reopens it
+ # rather than redefining it, so modules from unrelated specs would accumulate
+ # on one class and could mask a missing method here.
+ let(:dummy_object) { Class.new { include TwitterProfile }.new }
9
+
10
+ it 'should return nil for invalid input' do
11
+ expect(dummy_object.grep_twitter_profile(nil)).to be_nil
12
+ expect(dummy_object.grep_twitter_profile('')).to be_nil
13
+ end
14
+
15
+ it 'should not grep any non profile url' do
16
+ html = <<~HTML
17
+ <a href="http://twitter.com/download/iphone\\" target="_blank">
18
+ <a href="http://twitter.com/%user_screen_name%/statuses/%id%" target="_blank">
19
+ <a href="https://twitter.com/i/web/status/1116404686133686272" target="_blank">
20
+ <a href="https://twitter.com/" target="_blank">
21
+ <a href="http://twitter.com/\'+reply.substring(1)+" target="_blank">
22
+ <a href="http://twitter.com/#" target="_blank">
23
+ <a href="https://twitter.com/intent/tweet?text=https://www.facebook.com/ChoosePremiere/photos/a.10151220913587649/10157236078952649/?type=3'," target="_blank">
24
+ <a href="https://twitter.com/share?url=https://dirigoagency.com/" target="_blank">
25
+ <a href="https://twitter.com/search?q=%23solicitors&src=hash" target="_blank">
26
+ <a href="https://twitter.com/hashtag/salisburysalutes?src=hash" target="_blank">
27
+ <a href="https://twitter.com/privacy" target="_blank">
28
+ <a href="https://twitter.com/home?status=Hey" target="_blank">
29
+ <a href="https://twitter.com/statuses/1113546402863312896" target="_blank">
30
+ <a href="https://twitter.com/login" target="_blank">
31
+ <a href=" http://twitter.com/share/" target="_blank">
32
+ <a href="https://twitter.com/#!/Farmer_Brothers" target="_blank">
33
+ <a href="http://twitter.com/javascripts/blogger.js" target="_blank">
34
+ HTML
35
+ expect(dummy_object.grep_twitter_profile(html.to_s)).to eq([])
36
+ end
37
+
38
+ it 'should grep valid urls' do
39
+ html = <<~HTML
40
+ <a href="http://twitter.com/_titaniumrings" target="_blank">
41
+ <a href="http://twitter.com/_Titaniumrings" target="_blank">
42
+ <a href="http://twitter.com/@clindiatwitt" target="_blank">
43
+ <a href="http://twitter.com/8of12" target="_blank">
44
+ <a href="http://twitter.com/AAB_Accountants" target="_blank">
45
+ <a href="http://twitter.com/SundanceCompany/statuses/1148708421308485637" target="_blank">
46
+ HTML
47
+ twitter_profiles = dummy_object.grep_twitter_profile(html.to_s)
48
+
49
+ expected_profiles = [
50
+ "http://twitter.com/_titaniumrings",
51
+ "http://twitter.com/_Titaniumrings",
52
+ "http://twitter.com/@clindiatwitt",
53
+ "http://twitter.com/8of12",
54
+ "http://twitter.com/AAB_Accountants",
55
+ "http://twitter.com/SundanceCompany/statuses/1148708421308485637"
56
+ ]
57
+ expect(twitter_profiles).to eq(expected_profiles)
58
+ end
59
+ end