brilliant_web_scraper 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +4 -0
- data/README.md +31 -0
- data/brilliant_web_scraper-1.0.0.gem +0 -0
- data/brilliant_web_scraper-1.0.gem +0 -0
- data/brilliant_web_scraper.gemspec +30 -0
- data/lib/brilliant_web_scraper.rb +55 -0
- data/lib/parsers/description_helper.rb +28 -0
- data/lib/parsers/emails.rb +30 -0
- data/lib/parsers/facebook_profile.rb +11 -0
- data/lib/parsers/instagram_profile.rb +11 -0
- data/lib/parsers/linkedin_profile.rb +11 -0
- data/lib/parsers/meta_description.rb +13 -0
- data/lib/parsers/org_description.rb +13 -0
- data/lib/parsers/phone_numbers.rb +34 -0
- data/lib/parsers/pinterest_profile.rb +11 -0
- data/lib/parsers/redirected_to.rb +29 -0
- data/lib/parsers/title.rb +13 -0
- data/lib/parsers/twitter_description.rb +13 -0
- data/lib/parsers/twitter_profile.rb +11 -0
- data/lib/parsers/unescape_html_helper.rb +17 -0
- data/lib/parsers/vimeo_profile.rb +11 -0
- data/lib/parsers/youtube_channel.rb +29 -0
- data/lib/scraper/errors.rb +19 -0
- data/lib/scraper/scrape_exceptions.rb +49 -0
- data/lib/scraper/scrape_helper.rb +59 -0
- data/lib/scraper/scrape_request.rb +29 -0
- data/lib/version.rb +6 -0
- data/spec/lib/parsers/description_helper_spec.rb +24 -0
- data/spec/lib/parsers/emails_spec.rb +60 -0
- data/spec/lib/parsers/facebook_profile_spec.rb +77 -0
- data/spec/lib/parsers/instagram_profile_spec.rb +45 -0
- data/spec/lib/parsers/linkedin_profile_spec.rb +43 -0
- data/spec/lib/parsers/meta_description_spec.rb +321 -0
- data/spec/lib/parsers/org_description_spec.rb +316 -0
- data/spec/lib/parsers/phone_numbers_spec.rb +69 -0
- data/spec/lib/parsers/pinterest_profile_spec.rb +44 -0
- data/spec/lib/parsers/redirected_to_spec.rb +207 -0
- data/spec/lib/parsers/title_spec.rb +87 -0
- data/spec/lib/parsers/twitter_description_spec.rb +314 -0
- data/spec/lib/parsers/twitter_profile_spec.rb +59 -0
- data/spec/lib/parsers/unescape_html_helper_spec.rb +0 -0
- data/spec/lib/parsers/vimeo_profile_spec.rb +43 -0
- data/spec/lib/parsers/youtube_profile_spec.rb +82 -0
- data/spec/lib/scraper/brilliant_web_scrape_test.rb +66 -0
- data/spec/lib/scraper/scrape_request_test.rb +34 -0
- data/spec/spec_helper.rb +111 -0
- data/spec/vcr/encoding_compatibility_error.yml +316 -0
- data/spec/vcr/invalid_byte_sequence_utf_8.yml +2383 -0
- data/spec/vcr/no_valid_data_to_scrape.yml +109 -0
- data/spec/vcr/non_html_scrape.yml +163 -0
- data/spec/vcr/valid_scrape_response.yml +696 -0
- metadata +250 -0
@@ -0,0 +1,316 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for the OrgDescription parser mixin (grep_org_description).
#
# The examples are split into two groups that mirror the two attribute orders
# a description <meta> tag can be written in: key attribute (property/itemprop)
# before `content`, and `content` before the key attribute.
describe 'Organization Description' do
  # Anonymous host class: unlike a top-level `class DummyTestClass`, it defines
  # no global constant, so mixins from the other parser specs cannot leak into
  # the object under test when the whole suite is loaded in one process.
  let(:dummy_object) { Class.new { include OrgDescription }.new }

  # Expected values shared by several examples below.
  let(:hired_description) do
    'With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.'
  end
  let(:law_firm_description) do
    'Wilentz Goldman & Spitzer is one of New Jersey\'s largest law firms. Our lawyers proudly serve our clients in nearly every aspect of law.'
  end
  let(:mac_description) do
    'Whether you\'re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what\'s new, what\'s best and how to make the most out of the products you love.'
  end

  it 'should return nil for invalid inputs' do
    expect(dummy_object.grep_org_description('')).to be_nil
    expect(dummy_object.grep_org_description(nil)).to be_nil
  end

  describe 'Name key first organization description tag' do
    it 'should return nil for no organization description tag presence' do
      html = <<~HTML
        <head>
        <meta charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=edge">
        <meta name="viewport" content="width=device-width, initial-scale=1">
        <title>Internet Service Providers Chennai | Internet Telephony Service Providers Chennai | Internet bandwidth Providers Chennai - Pulse Telesystems Pvt Ltd</title>
        <meta name="google-site-verification" content="h2NvZnvL9v536RUYH3jney-9V8JRBGESmzH5-ph0EM4">
        <link href="http://www.pulse.in/new/wp-content/themes/pulse/css/bootstrap.min.css" rel="stylesheet" type="text/css">
        </head>
      HTML
      org_description = dummy_object.grep_org_description(html)
      expect(org_description).to be_nil
    end

    it 'should return nil when content part is empty' do
      html = <<~HTML
        <head>
        <meta http-equiv="X-UA-Compatible" content="IE=edge">
        <meta property="og:description" content="">
        <title>Internet Service Providers Chennai | Internet Telephony Service Providers Chennai | Internet bandwidth Providers Chennai - Pulse Telesystems Pvt Ltd</title>
        <link href="http://www.pulse.in/new/wp-content/themes/pulse/css/bootstrap.min.css" rel="stylesheet" type="text/css">
        </head>
      HTML
      org_description = dummy_object.grep_org_description(html)
      expect(org_description).to be_nil
    end

    it 'should return description from valid tag' do
      html = <<~HTML
        <head>
        <meta content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." property="og:description">
        <meta content="" property="uid">
        <meta property="og:description" content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.">
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no" name="viewport">
        </head>
      HTML
      org_description = dummy_object.grep_org_description(html)
      expect(org_description).to eq(hired_description)
    end

    it 'should return description even tag is multilined and partially encoded' do
      html = <<~HTML
        <head>
        <meta content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." property="og:description">
        <meta content="" property="uid">
        <meta property="og:description" content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.">
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
        name="viewport">
        </head>
      HTML
      org_description = dummy_object.grep_org_description(html)
      expect(org_description).to eq(hired_description)
    end

    it 'should parse meta tag even it is partially single quoted' do
      html = <<~HTML
        <head>
        <meta content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." property="og:description">
        <meta content="" property="uid">
        <meta property=\'og:description" content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.">
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
        name="viewport">
        </head>
      HTML
      org_description = dummy_object.grep_org_description(html)
      expect(org_description).to eq(hired_description)
    end

    it 'should parse meta tag even it is having other attributes defined' do
      html = <<~HTML
        <head>
        <meta content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." property="og:description">
        <meta content="" property="uid">
        <meta class="metadescription" property=\'og:description" content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." charset="UTF-8">
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
        name="viewport">
        </head>
      HTML
      org_description = dummy_object.grep_org_description(html)
      expect(org_description).to eq(hired_description)
    end

    it 'should parse meta tag with itemprop as description key' do
      html = <<~HTML
        <head>
        <meta content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." property="og:description">
        <meta content="" property="uid">
        <meta class="metadescription" itemprop=\'og:description" content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." charset="UTF-8">
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
        name="viewport">
        </head>
      HTML
      org_description = dummy_object.grep_org_description(html)
      expect(org_description).to eq(hired_description)
    end

    it 'should parse even name/itemprop key content is improperly assigned' do
      html = <<~HTML
        <head>
        <meta content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." property="og:description">
        <meta content="" property="uid">
        <meta class="metadescription" property=og:description content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." charset="UTF-8" />
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
        name="viewport">
        </head>
      HTML
      org_description = dummy_object.grep_org_description(html)
      expect(org_description).to eq(hired_description)
    end

    it 'should bring description having single quote' do
      html = <<~HTML
        <html lang="en">
        <head>
        <META charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
        <meta name="viewport" content="width=device-width, initial-scale=1" />
        <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
        <meta property="og:description" content="Wilentz Goldman & Spitzer is one of New Jersey's largest law firms. Our lawyers proudly serve our clients in nearly every aspect of law." />
        </head>
        <html>
      HTML
      org_description = dummy_object.grep_org_description(html)
      expect(org_description).to eq(law_firm_description)
    end

    it 'should bring description having double quote' do
      html = <<~HTML
        <html lang="en">
        <head>
        <META charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
        <meta name="viewport" content="width=device-width, initial-scale=1" />
        <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
        <meta property="og:description" content='Whether you're a Mac "die-hard" or an iPad "newbie" we give you the scoop on what's new, what's best and how to make the most out of the products you love.' />
        </head>
        <html>
      HTML
      org_description = dummy_object.grep_org_description(html)
      expect(org_description).to eq(mac_description)
    end

    it "should bring description even some other meta tag is empty" do
      html = <<~HTML
        <html lang="en">
        <head>
        <META charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
        <meta name="viewport" content="width=device-width, initial-scale=1" />
        <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
        <meta property="og:description" content="">
        <meta property="og:description" content='Whether you're a Mac "die-hard" or an iPad "newbie" we give you the scoop on what's new, what's best and how to make the most out of the products you love.' />
        </head>
        <html>
      HTML
      org_description = dummy_object.grep_org_description(html)
      expect(org_description).to eq(mac_description)
    end
  end

  describe 'Content key first organization description tag' do
    it 'should return nil when content part is empty' do
      html = <<~HTML
        <head>
        <meta http-equiv="X-UA-Compatible" content="IE=edge">
        <meta content="" property="og:description">
        <meta content='' property="og:description">
        <title>Internet Service Providers Chennai | Internet Telephony Service Providers Chennai | Internet bandwidth Providers Chennai - Pulse Telesystems Pvt Ltd</title>
        <link href="http://www.pulse.in/new/wp-content/themes/pulse/css/bootstrap.min.css" rel="stylesheet" type="text/css">
        </head>
      HTML
      org_description = dummy_object.grep_org_description(html)
      expect(org_description).to be_nil
    end

    it 'should return description from valid tag' do
      html = <<~HTML
        <head>
        <meta content="" property="uid">
        <meta content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." property="og:description">
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no" name="viewport">
        </head>
      HTML
      org_description = dummy_object.grep_org_description(html)
      expect(org_description).to eq(hired_description)
    end

    it 'should return description even tag is multilined and partially encoded' do
      html = <<~HTML
        <head>
        <meta content="" property="uid">
        <meta content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." property="og:description" >
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
        name="viewport">
        </head>
      HTML
      org_description = dummy_object.grep_org_description(html)
      expect(org_description).to eq(hired_description)
    end

    it 'should parse meta tag even it is partially single quoted' do
      html = <<~HTML
        <head>
        <meta content="" property="uid">
        <meta content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." property=\'og:description">
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
        name="viewport">
        </head>
      HTML
      org_description = dummy_object.grep_org_description(html)
      expect(org_description).to eq(hired_description)
    end

    it 'should parse meta tag even it is having other attributes defined' do
      html = <<~HTML
        <head>
        <meta content="" property="uid">
        <meta class="metadescription" content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." charset="UTF-8" property=\'og:description">
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
        name="viewport">
        </head>
      HTML
      org_description = dummy_object.grep_org_description(html)
      expect(org_description).to eq(hired_description)
    end

    it 'should parse meta tag with itemprop as description key' do
      html = <<~HTML
        <head>
        <meta content="" property="uid">
        <meta class="metadescription" content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." itemprop=\'og:description" charset="UTF-8">
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
        name="viewport">
        </head>
      HTML
      org_description = dummy_object.grep_org_description(html)
      expect(org_description).to eq(hired_description)
    end

    it 'should parse even name/itemprop key content is improperly assigned' do
      html = <<~HTML
        <head>
        <meta content="" property="uid">
        <meta class="metadescription" content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." charset="UTF-8" property=og:description />
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
        name="viewport">
        </head>
      HTML
      org_description = dummy_object.grep_org_description(html)
      expect(org_description).to eq(hired_description)
    end

    it 'should bring description having single quote' do
      html = <<~HTML
        <html lang="en">
        <head>
        <META charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
        <meta name="viewport" content="width=device-width, initial-scale=1" />
        <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
        <meta content="Wilentz Goldman & Spitzer is one of New Jersey's largest law firms. Our lawyers proudly serve our clients in nearly every aspect of law." property="og:description" />
        </head>
        <html>
      HTML
      org_description = dummy_object.grep_org_description(html)
      expect(org_description).to eq(law_firm_description)
    end

    it 'should bring description having double quote' do
      html = <<~HTML
        <html lang="en">
        <head>
        <META charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
        <meta name="viewport" content="width=device-width, initial-scale=1" />
        <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
        <meta content='Whether you're a Mac "die-hard" or an iPad "newbie" we give you the scoop on what's new, what's best and how to make the most out of the products you love.' property="og:description" />
        </head>
        <html>
      HTML
      org_description = dummy_object.grep_org_description(html)
      expect(org_description).to eq(mac_description)
    end

    it "should bring description even some other meta tag is empty" do
      html = <<~HTML
        <html lang="en">
        <head>
        <META charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
        <meta name="viewport" content="width=device-width, initial-scale=1" />
        <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
        <meta content="" property="og:description">
        <meta content='Whether you're a Mac "die-hard" or an iPad "newbie" we give you the scoop on what's new, what's best and how to make the most out of the products you love.' property="og:description"/>
        </head>
        <html>
      HTML
      org_description = dummy_object.grep_org_description(html)
      expect(org_description).to eq(mac_description)
    end
  end
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for the PhoneNumbers parser mixin (grep_phone_numbers), which is fed
# HTML containing `tel:` links.
describe 'PhoneNumbers' do
  # Anonymous host class: unlike a top-level `class DummyTestClass`, it defines
  # no global constant, so mixins from the other parser specs cannot leak into
  # the object under test when the whole suite is loaded in one process.
  let(:dummy_object) { Class.new { include PhoneNumbers }.new }

  it 'should return nil for invalid input' do
    expect(dummy_object.grep_phone_numbers(nil)).to be_nil
    expect(dummy_object.grep_phone_numbers('')).to be_nil
  end

  # tel: links whose target is a template placeholder rather than a number.
  it 'should give []' do
    html = <<~HTML
      <a href="tel:{{ location1Phone }}">
      <a href="tel:{{pdModel.preferredDealer.phoneNumbers[0].CompleteNumber.$}}">
      <a href="tel:[value]" data-store-phone data-attr-replace="href"></a>
      <a class="a--reset main-nav--mobile__link--secondary ">Grab us on Live Chat</a>
    HTML
    expect(dummy_object.grep_phone_numbers(html)).to eq([])
  end

  it 'should grep organization phoneNumbers' do
    # The first anchor deliberately spans three lines with tab-indented
    # continuation lines; keep its whitespace exactly as written.
    html = <<~HTML
      <a href="tel: 
						01598 760 700
					" class="button tel">Tel: 01598 760 700</a>
      <a href="tel: +1 (856) 393-2082">+1 (856) 393-2082</a>
      <a href="tel: 800%20843%203269" data-interaction-context data-interaction-type="phone" data-interaction-name="800 843 3269" class="cta" tabindex="0">
      <span class="cta-content">
      <span class="cta-text" tabindex="-1">
      Call Now 800 843 3269
      </span>
      </span>
      </a>
      <a href="tel://1-859-422-6000">859.422.6000</a>
      <a href="tel:+18009634816" class="btn btn-tertiary" data-tag="call">Call</a>
      <a href="tel:%2B44-161-468-1234">0161 468 1234</a>
      <a class="navbar-left" href="tel:781-788-8180 Ext. 4"> <b>Call Sales</b> 781-788-8180 Ext. 4 </a>
      <a href="tel:8663033809">Call</a>
      <a href="tel:http://484-373-7700"><span class="icon fa fa-phone"></span>484-373-7700</a>
      <a href="tel:877.720.0411" data-organic="877.720.0411" data-metro="877.720.0411" data-display="877.720.0411" data-paid="877.720.0411" class="phone"><span class="number">877.720.0411</span></a>
      <a href="tel:1866INTRALINKS">1-866-INTRALINKS</a>
      <a href=tel:1888%20810%207464 itemprop=url>Call</a>
      <a href=tel:18664946627 style="color: inherit; display: inline;">1 (866) 4-WINMAR</a>
      <a href=tel:312-379-9329 class=phone>312-379-9329</a>
      <a href=tel:312-379-9329 class=phone>312-379-9329</a>
      <a\ndata-animsition-out=none href=tel:01722412512>Tel: 01722 412512</a>

    HTML
    phone_numbers = dummy_object.grep_phone_numbers(html)
    # One entry per distinct number, in fixture order (the 312-379-9329 anchor
    # appears twice above but is expected once). The three values that end
    # with a space correspond to the unquoted `href=tel:...` fixtures that are
    # followed by another attribute.
    expected_phone_numbers = [
      "01598 760 700",
      "+1 (856) 393-2082",
      "800 843 3269",
      "1-859-422-6000",
      "+18009634816",
      "+44-161-468-1234",
      "781-788-8180 Ext. 4",
      "8663033809",
      "484-373-7700",
      "877.720.0411",
      "1866INTRALINKS",
      "1888 810 7464 ",
      "18664946627 ",
      "312-379-9329 ",
      "01722412512"
    ]
    expect(phone_numbers).to eq(expected_phone_numbers)
  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for the PinterestProfile parser mixin (grep_pinterest_profile).
describe 'Pinterest Profile' do
  # Anonymous host class: unlike a top-level `class DummyTestClass`, it defines
  # no global constant, so mixins from the other parser specs cannot leak into
  # the object under test when the whole suite is loaded in one process.
  let(:dummy_object) { Class.new { include PinterestProfile }.new }

  it 'should return nil for invalid input' do
    expect(dummy_object.grep_pinterest_profile(nil)).to be_nil
    expect(dummy_object.grep_pinterest_profile('')).to be_nil
  end

  # Pinterest URLs that must not be reported as a profile.
  it 'should not grep below url format' do
    html = <<~HTML
      <a href="http://pinterest.com/#" style="color: white;" class="fa fa-pinterest"></a>
      <a href="http://pinterest.com/" style="color: white;" class="fa fa-pinterest"></a>
      <a href="https://www.pinterest.com/feed/" style="color: white;" class="fa fa-pinterest"></a>
      <a href="https://ct.pinterest.com/v3/?tid=2620913945757&noscript=1" style="color: white;" class="fa fa-pinterest"></a>
      <a href="http://assets.pinterest.com/js/pinmarklet.js?r=" style="color: white;" class="fa fa-pinterest"></a>
      <a href="http://pinterest.com/pin/create/bookmarklet/" style="color: white;" class="fa fa-pinterest"></a>
      <a href="http://uk.pinterest.com/pin/create/bookmarklet/" style="color: white;" class="fa fa-pinterest"></a>
      <a href="https://ct.pinterest.com/?tid=8KRsk0UkbVS&value=0.00&quantity=1" style="color: white;" class="fa fa-pinterest"></a>
      <a href="https://policy.pinterest.com/cookies" style="color: white;" class="fa fa-pinterest"></a>
    HTML
    expect(dummy_object.grep_pinterest_profile(html)).to eq([])
  end

  it 'should grep organization pinterest profiles' do
    html = <<~HTML
      <a href="http://www.pinterest.com/blogher" target="_blank">pinterest</a>
      <a href="http://pinterest.com/orientaltrading?cm_sp=Footer-_-SocialLinks-_-Pinterest" target="_blank">pinterest</a>
      <a href="http://pinterest.com/poppin"" target="_blank">pinterest</a>
    HTML
    pinterest_profiles = dummy_object.grep_pinterest_profile(html)
    expected_pinterest_profiles = [
      'http://www.pinterest.com/blogher',
      'http://pinterest.com/orientaltrading',
      'http://pinterest.com/poppin'
    ]
    # Assert on the result computed above instead of invoking the parser a
    # second time and leaving the local unused.
    expect(pinterest_profiles).to eq(expected_pinterest_profiles)
  end
end
|
@@ -0,0 +1,207 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Examples for RedirectedTo#grep_redirected_to_url.
#
# The method is given nil, an empty string, or an HTML snippet and is expected
# to return either nil or the URL carried by a <link rel="canonical"> tag or a
# <meta property="og:url"> tag, whatever the order of the attributes.
describe 'Website Redirected To' do
  # Anonymous host class: only RedirectedTo is mixed in and no global constant
  # is declared, so this file cannot share state with other spec files that
  # build their own host objects.
  # NOTE(review): if RedirectedTo relies on helper methods that it does not
  # include itself, they must be mixed in here explicitly - confirm against
  # lib/parsers/redirected_to.rb.
  let(:dummy_object) { Class.new { include RedirectedTo }.new }

  it 'should return nil for invalid input' do
    expect(dummy_object.grep_redirected_to_url(nil)).to be_nil
    expect(dummy_object.grep_redirected_to_url('')).to be_nil
  end

  # <link> tags carrying rel="canonical".
  describe 'Website grep from link tag' do
    describe 'rel attribute first ' do
      it 'should return nil when canonical url is empty' do
        html = <<~HTML
          <link rel="canonical" href="">
          <link rel="canonical" href=''>
        HTML
        website = dummy_object.grep_redirected_to_url(html.to_s)
        expect(website).to be_nil
      end

      it 'should grep website' do
        html = <<~HTML
          <link rel="canonical" href="">
          <link rel="canonical" href='https://www.apple.com/'>
        HTML
        website = dummy_object.grep_redirected_to_url(html.to_s)
        expect(website).to eq('https://www.apple.com/')
      end

      it 'should grep website even with extra attributes' do
        # The second tag is deliberately split across two lines.
        html = <<~HTML
          <link rel="canonical" href="" itemprop="current_url">
          <link rel="canonical" href='https://www.apple.com/'
          itemprop="current_url" >
        HTML
        website = dummy_object.grep_redirected_to_url(html.to_s)
        expect(website).to eq('https://www.apple.com/')
      end
    end

    describe 'href attribute first' do
      it 'should return nil when canonical url is empty' do
        html = <<~HTML
          <link href="" rel="canonical" >
          <link href='' rel="canonical" >
        HTML
        website = dummy_object.grep_redirected_to_url(html.to_s)
        expect(website).to be_nil
      end

      it 'should grep website' do
        html = <<~HTML
          <link rel="canonical" href="">
          <link href='https://www.apple.com/' rel="canonical">
        HTML
        website = dummy_object.grep_redirected_to_url(html.to_s)
        expect(website).to eq('https://www.apple.com/')
      end

      it 'should grep website even with extra attributes' do
        # The second tag is deliberately split across two lines.
        html = <<~HTML
          <link href="" itemprop="current_url" rel="canonical">
          <link href='https://www.apple.com/' rel="canonical"
          itemprop="current_url" >
        HTML
        website = dummy_object.grep_redirected_to_url(html.to_s)
        expect(website).to eq('https://www.apple.com/')
      end
    end
  end

  # <meta> tags carrying property="og:url".
  describe 'Website grep from organization URL' do
    describe 'property attribute first ' do
      it 'should return nil when og:url is empty' do
        html = <<~HTML
          <meta property="og:url" content="" />
          <meta property="og:url" content='' />
        HTML
        website = dummy_object.grep_redirected_to_url(html.to_s)
        expect(website).to be_nil
      end

      it 'should grep website' do
        html = <<~HTML
          <link property="og:url" content="">
          <meta property="og:url" content="https://www.dieppe.ca/fr/index.aspx" />
        HTML
        website = dummy_object.grep_redirected_to_url(html.to_s)
        expect(website).to eq('https://www.dieppe.ca/fr/index.aspx')
      end

      it 'should grep website even with extra attributes' do
        # The <meta> tag is deliberately split across two lines.
        html = <<~HTML
          <link property="og:url" content="" calss="og-url">
          <meta property="og:url" content='https://www.dieppe.ca/fr/index.aspx'
          class="og-url" />
        HTML
        website = dummy_object.grep_redirected_to_url(html.to_s)
        expect(website).to eq('https://www.dieppe.ca/fr/index.aspx')
      end
    end

    describe 'content attribute first ' do
      it 'should return nil when og:url is empty' do
        html = <<~HTML
          <meta content="" property="og:url" />
          <meta content='' property="og:url"/>
        HTML
        website = dummy_object.grep_redirected_to_url(html.to_s)
        expect(website).to be_nil
      end

      it 'should grep website' do
        html = <<~HTML
          <link content="" property="og:url" >
          <meta content="https://www.dieppe.ca/fr/index.aspx" property="og:url" />
        HTML
        website = dummy_object.grep_redirected_to_url(html.to_s)
        expect(website).to eq('https://www.dieppe.ca/fr/index.aspx')
      end

      it 'should grep website even with extra attributes' do
        # The <meta> tag is deliberately split across two lines.
        html = <<~HTML
          <link content="" calss="og-url" property="og:url">
          <meta content='https://www.dieppe.ca/fr/index.aspx'
          class="og-url" property="og:url" />
        HTML
        website = dummy_object.grep_redirected_to_url(html.to_s)
        expect(website).to eq('https://www.dieppe.ca/fr/index.aspx')
      end
    end
  end

  # Whole <head> sections mixing stylesheet links with canonical / og:url tags.
  describe 'grep website' do
    it 'should return nil when link or og:url is absent' do
      # Only stylesheet <link> tags and unrelated <meta> tags are present.
      html = <<~HTML
        <head>
        <meta charset="utf-8">
        <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
        <meta http-equiv="x-ua-compatible" content="ie=edge">
        <title>Techmologic | index</title>
        <!-- Font Awesome -->
        <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.6.0/css/font-awesome.min.css">
        <!-- Bootstrap core CSS -->
        <link href="css/bootstrap.min.css" rel="stylesheet">
        <!-- Material Design Bootstrap -->
        <link href="css/mdb.min.css" rel="stylesheet">
        <!-- Your custom styles (optional) -->
        <link href="css/style.css" rel="stylesheet">
        </head>
      HTML
      website = dummy_object.grep_redirected_to_url(html.to_s)
      expect(website).to be_nil
    end

    it 'should grep one of canonical or og:url' do
      # Empty canonical / og:url tags come first; the populated ones come last.
      html = <<~HTML
        <head>
        <meta charset="utf-8">
        <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
        <meta http-equiv="x-ua-compatible" content="ie=edge">
        <title>Techmologic | index</title>
        <link rel="canonical" href="">
        <meta property="og:url" content="" />
        <!-- Font Awesome -->
        <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.6.0/css/font-awesome.min.css">
        <!-- Bootstrap core CSS -->
        <link href="css/bootstrap.min.css" rel="stylesheet">
        <!-- Material Design Bootstrap -->
        <link href="css/mdb.min.css" rel="stylesheet">
        <!-- Your custom styles (optional) -->
        <link href="css/style.css" rel="stylesheet">
        <link rel="canonical" href="http://techmologics.com/">
        <meta property="og:url" content="http://techmologics.com/" />
        </head>
      HTML
      website = dummy_object.grep_redirected_to_url(html.to_s)
      expect(website).to eq('http://techmologics.com/')
    end

    it 'should grep one of canonical or og:url whatever its position' do
      # Same layout as the previous example, with href / content written
      # before rel / property.
      html = <<~HTML
        <head>
        <meta charset="utf-8">
        <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
        <meta http-equiv="x-ua-compatible" content="ie=edge">
        <title>Techmologic | index</title>
        <link href="" rel="canonical">
        <meta content="" property="og:url"/>
        <!-- Font Awesome -->
        <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.6.0/css/font-awesome.min.css">
        <!-- Bootstrap core CSS -->
        <link href="css/bootstrap.min.css" rel="stylesheet">
        <!-- Material Design Bootstrap -->
        <link href="css/mdb.min.css" rel="stylesheet">
        <!-- Your custom styles (optional) -->
        <link href="css/style.css" rel="stylesheet">
        <link href="http://techmologics.com/" rel="canonical" class="canonical">
        <meta content="http://techmologics.com/" property="og:url"/>
        </head>
      HTML
      website = dummy_object.grep_redirected_to_url(html.to_s)
      expect(website).to eq('http://techmologics.com/')
    end
  end
end
|