brilliant_web_scraper 0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Gemfile +4 -0
- data/README.md +31 -0
- data/brilliant_web_scraper-1.0.0.gem +0 -0
- data/brilliant_web_scraper-1.0.gem +0 -0
- data/brilliant_web_scraper.gemspec +30 -0
- data/lib/brilliant_web_scraper.rb +55 -0
- data/lib/parsers/description_helper.rb +28 -0
- data/lib/parsers/emails.rb +30 -0
- data/lib/parsers/facebook_profile.rb +11 -0
- data/lib/parsers/instagram_profile.rb +11 -0
- data/lib/parsers/linkedin_profile.rb +11 -0
- data/lib/parsers/meta_description.rb +13 -0
- data/lib/parsers/org_description.rb +13 -0
- data/lib/parsers/phone_numbers.rb +34 -0
- data/lib/parsers/pinterest_profile.rb +11 -0
- data/lib/parsers/redirected_to.rb +29 -0
- data/lib/parsers/title.rb +13 -0
- data/lib/parsers/twitter_description.rb +13 -0
- data/lib/parsers/twitter_profile.rb +11 -0
- data/lib/parsers/unescape_html_helper.rb +17 -0
- data/lib/parsers/vimeo_profile.rb +11 -0
- data/lib/parsers/youtube_channel.rb +29 -0
- data/lib/scraper/errors.rb +19 -0
- data/lib/scraper/scrape_exceptions.rb +49 -0
- data/lib/scraper/scrape_helper.rb +59 -0
- data/lib/scraper/scrape_request.rb +29 -0
- data/lib/version.rb +6 -0
- data/spec/lib/parsers/description_helper_spec.rb +24 -0
- data/spec/lib/parsers/emails_spec.rb +60 -0
- data/spec/lib/parsers/facebook_profile_spec.rb +77 -0
- data/spec/lib/parsers/instagram_profile_spec.rb +45 -0
- data/spec/lib/parsers/linkedin_profile_spec.rb +43 -0
- data/spec/lib/parsers/meta_description_spec.rb +321 -0
- data/spec/lib/parsers/org_description_spec.rb +316 -0
- data/spec/lib/parsers/phone_numbers_spec.rb +69 -0
- data/spec/lib/parsers/pinterest_profile_spec.rb +44 -0
- data/spec/lib/parsers/redirected_to_spec.rb +207 -0
- data/spec/lib/parsers/title_spec.rb +87 -0
- data/spec/lib/parsers/twitter_description_spec.rb +314 -0
- data/spec/lib/parsers/twitter_profile_spec.rb +59 -0
- data/spec/lib/parsers/unescape_html_helper_spec.rb +0 -0
- data/spec/lib/parsers/vimeo_profile_spec.rb +43 -0
- data/spec/lib/parsers/youtube_profile_spec.rb +82 -0
- data/spec/lib/scraper/brilliant_web_scrape_test.rb +66 -0
- data/spec/lib/scraper/scrape_request_test.rb +34 -0
- data/spec/spec_helper.rb +111 -0
- data/spec/vcr/encoding_compatibility_error.yml +316 -0
- data/spec/vcr/invalid_byte_sequence_utf_8.yml +2383 -0
- data/spec/vcr/no_valid_data_to_scrape.yml +109 -0
- data/spec/vcr/non_html_scrape.yml +163 -0
- data/spec/vcr/valid_scrape_response.yml +696 -0
- metadata +250 -0
@@ -0,0 +1,316 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for `grep_org_description`, the og:description extractor mixed in
# from OrgDescription.
describe 'Organisation Description' do
  # An anonymous host class is used instead of `class DummyTestClass`.
  # A `class` statement inside a `describe` block defines a top-level
  # constant, so each spec file that reopened DummyTestClass kept adding its
  # own mixin to the same shared class.
  let(:dummy_object) { Class.new { include OrgDescription }.new }

  # Expected descriptions, shared by the examples below.
  let(:hired_description) do
    'With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.'
  end
  let(:wilentz_description) do
    'Wilentz Goldman & Spitzer is one of New Jersey\'s largest law firms. Our lawyers proudly serve our clients in nearly every aspect of law.'
  end
  let(:mac_description) do
    'Whether you\'re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what\'s new, what\'s best and how to make the most out of the products you love.'
  end

  it 'should return nil for invalid inputs' do
    expect(dummy_object.grep_org_description('')).to be_nil
    expect(dummy_object.grep_org_description(nil)).to be_nil
  end

  # Fixtures in this group put the property/itemprop attribute before content.
  describe 'Name key first organization description tag' do
    it 'should return nil for no organization description tag presence' do
      html = <<~HTML
        <head>
        <meta charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=edge">
        <meta name="viewport" content="width=device-width, initial-scale=1">
        <title>Internet Service Providers Chennai | Internet Telephony Service Providers Chennai | Internet bandwidth Providers Chennai - Pulse Telesystems Pvt Ltd</title>
        <meta name="google-site-verification" content="h2NvZnvL9v536RUYH3jney-9V8JRBGESmzH5-ph0EM4">
        <link href="http://www.pulse.in/new/wp-content/themes/pulse/css/bootstrap.min.css" rel="stylesheet" type="text/css">
        </head>
      HTML
      description = dummy_object.grep_org_description(html)
      expect(description).to be_nil
    end

    it 'should return nil when content part is empty' do
      html = <<~HTML
        <head>
        <meta http-equiv="X-UA-Compatible" content="IE=edge">
        <meta property="og:description" content="">
        <title>Internet Service Providers Chennai | Internet Telephony Service Providers Chennai | Internet bandwidth Providers Chennai - Pulse Telesystems Pvt Ltd</title>
        <link href="http://www.pulse.in/new/wp-content/themes/pulse/css/bootstrap.min.css" rel="stylesheet" type="text/css">
        </head>
      HTML
      description = dummy_object.grep_org_description(html)
      expect(description).to be_nil
    end

    it 'should return description from valid tag' do
      html = <<~HTML
        <head>
        <meta content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." property="og:description">
        <meta content="" property="uid">
        <meta property="og:description" content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.">
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no" name="viewport">
        </head>
      HTML
      description = dummy_object.grep_org_description(html)
      expect(description).to eq(hired_description)
    end

    # NOTE(review): the example name says "partially encoded", but this
    # fixture contains no HTML entities as written — confirm the fixture
    # still exercises entity decoding.
    it 'should return description even tag is multilined and partially encoded' do
      html = <<~HTML
        <head>
        <meta content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." property="og:description">
        <meta content="" property="uid">
        <meta property="og:description" content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.">
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
        name="viewport">
        </head>
      HTML
      description = dummy_object.grep_org_description(html)
      expect(description).to eq(hired_description)
    end

    it 'should parse meta tag even it is partially single quoted' do
      html = <<~HTML
        <head>
        <meta content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." property="og:description">
        <meta content="" property="uid">
        <meta property=\'og:description" content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.">
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
        name="viewport">
        </head>
      HTML
      description = dummy_object.grep_org_description(html)
      expect(description).to eq(hired_description)
    end

    it 'should parse meta tag even it is having other attributes defined' do
      html = <<~HTML
        <head>
        <meta content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." property="og:description">
        <meta content="" property="uid">
        <meta class="metadescription" property=\'og:description" content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." charset="UTF-8">
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
        name="viewport">
        </head>
      HTML
      description = dummy_object.grep_org_description(html)
      expect(description).to eq(hired_description)
    end

    it 'should parse meta tag with itemprop as description key' do
      html = <<~HTML
        <head>
        <meta content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." property="og:description">
        <meta content="" property="uid">
        <meta class="metadescription" itemprop=\'og:description" content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." charset="UTF-8">
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
        name="viewport">
        </head>
      HTML
      description = dummy_object.grep_org_description(html)
      expect(description).to eq(hired_description)
    end

    it 'should parse even name/itemprop key content is improperly assigned' do
      html = <<~HTML
        <head>
        <meta content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." property="og:description">
        <meta content="" property="uid">
        <meta class="metadescription" property=og:description content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." charset="UTF-8" />
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
        name="viewport">
        </head>
      HTML
      description = dummy_object.grep_org_description(html)
      expect(description).to eq(hired_description)
    end

    it 'should bring description having single quote' do
      html = <<~HTML
        <html lang="en">
        <head>
        <META charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
        <meta name="viewport" content="width=device-width, initial-scale=1" />
        <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
        <meta property="og:description" content="Wilentz Goldman & Spitzer is one of New Jersey's largest law firms. Our lawyers proudly serve our clients in nearly every aspect of law." />
        </head>
        <html>
      HTML
      description = dummy_object.grep_org_description(html)
      expect(description).to eq(wilentz_description)
    end

    # NOTE(review): the content attribute below is single-quoted yet holds raw
    # apostrophes and double quotes — verify this is the intended fixture and
    # not entity-encoded text that was decoded somewhere along the way.
    it 'should bring description having double quote' do
      html = <<~HTML
        <html lang="en">
        <head>
        <META charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
        <meta name="viewport" content="width=device-width, initial-scale=1" />
        <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
        <meta property="og:description" content='Whether you're a Mac "die-hard" or an iPad "newbie" we give you the scoop on what's new, what's best and how to make the most out of the products you love.' />
        </head>
        <html>
      HTML
      description = dummy_object.grep_org_description(html)
      expect(description).to eq(mac_description)
    end

    it 'should bring description even some other meta tag is empty' do
      html = <<~HTML
        <html lang="en">
        <head>
        <META charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
        <meta name="viewport" content="width=device-width, initial-scale=1" />
        <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
        <meta property="og:description" content="">
        <meta property="og:description" content='Whether you're a Mac "die-hard" or an iPad "newbie" we give you the scoop on what's new, what's best and how to make the most out of the products you love.' />
        </head>
        <html>
      HTML
      description = dummy_object.grep_org_description(html)
      expect(description).to eq(mac_description)
    end
  end

  # Fixtures in this group put the content attribute before property/itemprop.
  describe 'Content key first organization description tag' do
    it 'should return nil when content part is empty' do
      html = <<~HTML
        <head>
        <meta http-equiv="X-UA-Compatible" content="IE=edge">
        <meta content="" property="og:description">
        <meta content='' property="og:description">
        <title>Internet Service Providers Chennai | Internet Telephony Service Providers Chennai | Internet bandwidth Providers Chennai - Pulse Telesystems Pvt Ltd</title>
        <link href="http://www.pulse.in/new/wp-content/themes/pulse/css/bootstrap.min.css" rel="stylesheet" type="text/css">
        </head>
      HTML
      description = dummy_object.grep_org_description(html)
      expect(description).to be_nil
    end

    it 'should return description from valid tag' do
      html = <<~HTML
        <head>
        <meta content="" property="uid">
        <meta content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." property="og:description">
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no" name="viewport">
        </head>
      HTML
      description = dummy_object.grep_org_description(html)
      expect(description).to eq(hired_description)
    end

    # NOTE(review): as in the first group, no HTML entities appear in this
    # fixture despite the "partially encoded" wording — confirm.
    it 'should return description even tag is multilined and partially encoded' do
      html = <<~HTML
        <head>
        <meta content="" property="uid">
        <meta content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." property="og:description" >
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
        name="viewport">
        </head>
      HTML
      description = dummy_object.grep_org_description(html)
      expect(description).to eq(hired_description)
    end

    it 'should parse meta tag even it is partially single quoted' do
      html = <<~HTML
        <head>
        <meta content="" property="uid">
        <meta content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." property=\'og:description">
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
        name="viewport">
        </head>
      HTML
      description = dummy_object.grep_org_description(html)
      expect(description).to eq(hired_description)
    end

    it 'should parse meta tag even it is having other attributes defined' do
      html = <<~HTML
        <head>
        <meta content="" property="uid">
        <meta class="metadescription" content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." charset="UTF-8" property=\'og:description">
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
        name="viewport">
        </head>
      HTML
      description = dummy_object.grep_org_description(html)
      expect(description).to eq(hired_description)
    end

    it 'should parse meta tag with itemprop as description key' do
      html = <<~HTML
        <head>
        <meta content="" property="uid">
        <meta class="metadescription" content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." itemprop=\'og:description" charset="UTF-8">
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
        name="viewport">
        </head>
      HTML
      description = dummy_object.grep_org_description(html)
      expect(description).to eq(hired_description)
    end

    it 'should parse even name/itemprop key content is improperly assigned' do
      html = <<~HTML
        <head>
        <meta content="" property="uid">
        <meta class="metadescription" content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." charset="UTF-8" property=og:description />
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
        name="viewport">
        </head>
      HTML
      description = dummy_object.grep_org_description(html)
      expect(description).to eq(hired_description)
    end

    it 'should bring description having single quote' do
      html = <<~HTML
        <html lang="en">
        <head>
        <META charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
        <meta name="viewport" content="width=device-width, initial-scale=1" />
        <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
        <meta content="Wilentz Goldman & Spitzer is one of New Jersey's largest law firms. Our lawyers proudly serve our clients in nearly every aspect of law." property="og:description" />
        </head>
        <html>
      HTML
      description = dummy_object.grep_org_description(html)
      expect(description).to eq(wilentz_description)
    end

    # NOTE(review): single-quoted content attribute with raw apostrophes
    # inside — verify against the intended fixture.
    it 'should bring description having double quote' do
      html = <<~HTML
        <html lang="en">
        <head>
        <META charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
        <meta name="viewport" content="width=device-width, initial-scale=1" />
        <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
        <meta content='Whether you're a Mac "die-hard" or an iPad "newbie" we give you the scoop on what's new, what's best and how to make the most out of the products you love.' property="og:description" />
        </head>
        <html>
      HTML
      description = dummy_object.grep_org_description(html)
      expect(description).to eq(mac_description)
    end

    it 'should bring description even some other meta tag is empty' do
      html = <<~HTML
        <html lang="en">
        <head>
        <META charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
        <meta name="viewport" content="width=device-width, initial-scale=1" />
        <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
        <meta content="" property="og:description">
        <meta content='Whether you're a Mac "die-hard" or an iPad "newbie" we give you the scoop on what's new, what's best and how to make the most out of the products you love.' property="og:description"/>
        </head>
        <html>
      HTML
      description = dummy_object.grep_org_description(html)
      expect(description).to eq(mac_description)
    end
  end
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for `grep_phone_numbers`, the `tel:` link extractor mixed in from
# PhoneNumbers.
describe 'PhoneNumbers' do
  # An anonymous host class is used instead of `class DummyTestClass`.
  # A `class` statement inside a `describe` block defines a top-level
  # constant, so each spec file that reopened DummyTestClass kept adding its
  # own mixin to the same shared class.
  let(:dummy_object) { Class.new { include PhoneNumbers }.new }

  it 'should return nil for invalid input' do
    expect(dummy_object.grep_phone_numbers(nil)).to be_nil
    expect(dummy_object.grep_phone_numbers('')).to be_nil
  end

  # Template placeholders and anchors without a tel: href must yield nothing.
  it 'should give []' do
    html = <<~HTML
      <a href="tel:{{ location1Phone }}">
      <a href="tel:{{pdModel.preferredDealer.phoneNumbers[0].CompleteNumber.$}}">
      <a href="tel:[value]" data-store-phone data-attr-replace="href"></a>
      <a class="a--reset main-nav--mobile__link--secondary ">Grab us on Live Chat</a>
    HTML
    expect(dummy_object.grep_phone_numbers(html)).to eq([])
  end

  # The fixture mixes quoted, unquoted, percent-encoded, multi-line and
  # scheme-prefixed tel: hrefs. The first link spans three lines and its
  # continuation lines start with tab characters. The blank line before the
  # heredoc terminator is part of the fixture.
  it 'should grep organization phoneNumbers' do
    html = <<~HTML
      <a href="tel: 
						01598 760 700
					" class="button tel">Tel: 01598 760 700</a>
      <a href="tel: +1 (856) 393-2082">+1 (856) 393-2082</a>
      <a href="tel: 800%20843%203269" data-interaction-context data-interaction-type="phone" data-interaction-name="800 843 3269" class="cta" tabindex="0">
      <span class="cta-content">
      <span class="cta-text" tabindex="-1">
      Call Now 800 843 3269
      </span>
      </span>
      </a>
      <a href="tel://1-859-422-6000">859.422.6000</a>
      <a href="tel:+18009634816" class="btn btn-tertiary" data-tag="call">Call</a>
      <a href="tel:%2B44-161-468-1234">0161 468 1234</a>
      <a class="navbar-left" href="tel:781-788-8180 Ext. 4"> <b>Call Sales</b> 781-788-8180 Ext. 4 </a>
      <a href="tel:8663033809">Call</a>
      <a href="tel:http://484-373-7700"><span class="icon fa fa-phone"></span>484-373-7700</a>
      <a href="tel:877.720.0411" data-organic="877.720.0411" data-metro="877.720.0411" data-display="877.720.0411" data-paid="877.720.0411" class="phone"><span class="number">877.720.0411</span></a>
      <a href="tel:1866INTRALINKS">1-866-INTRALINKS</a>
      <a href=tel:1888%20810%207464 itemprop=url>Call</a>
      <a href=tel:18664946627 style="color: inherit; display: inline;">1 (866) 4-WINMAR</a>
      <a href=tel:312-379-9329 class=phone>312-379-9329</a>
      <a href=tel:312-379-9329 class=phone>312-379-9329</a>
      <a\ndata-animsition-out=none href=tel:01722412512>Tel: 01722 412512</a>

    HTML
    phone_numbers = dummy_object.grep_phone_numbers(html)
    # NOTE(review): three expected values end in a space, all from unquoted
    # href fixtures — confirm this is the intended contract and not an
    # un-stripped match being pinned by the spec.
    expected_phone_numbers = [
      '01598 760 700',
      '+1 (856) 393-2082',
      '800 843 3269',
      '1-859-422-6000',
      '+18009634816',
      '+44-161-468-1234',
      '781-788-8180 Ext. 4',
      '8663033809',
      '484-373-7700',
      '877.720.0411',
      '1866INTRALINKS',
      '1888 810 7464 ',
      '18664946627 ',
      '312-379-9329 ',
      '01722412512'
    ]
    expect(phone_numbers).to eq(expected_phone_numbers)
  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for `grep_pinterest_profile`, the Pinterest link extractor mixed in
# from PinterestProfile.
describe 'Pinterest Profile' do
  # An anonymous host class is used instead of `class DummyTestClass`.
  # A `class` statement inside a `describe` block defines a top-level
  # constant, so each spec file that reopened DummyTestClass kept adding its
  # own mixin to the same shared class.
  let(:dummy_object) { Class.new { include PinterestProfile }.new }

  it 'should return nil for invalid input' do
    expect(dummy_object.grep_pinterest_profile(nil)).to be_nil
    expect(dummy_object.grep_pinterest_profile('')).to be_nil
  end

  # Bare domain, feed, tracking, asset, bookmarklet and policy URLs are
  # expected to be ignored.
  it 'should not grep below url format' do
    html = <<~HTML
      <a href="http://pinterest.com/#" style="color: white;" class="fa fa-pinterest"></a>
      <a href="http://pinterest.com/" style="color: white;" class="fa fa-pinterest"></a>
      <a href="https://www.pinterest.com/feed/" style="color: white;" class="fa fa-pinterest"></a>
      <a href="https://ct.pinterest.com/v3/?tid=2620913945757&noscript=1" style="color: white;" class="fa fa-pinterest"></a>
      <a href="http://assets.pinterest.com/js/pinmarklet.js?r=" style="color: white;" class="fa fa-pinterest"></a>
      <a href="http://pinterest.com/pin/create/bookmarklet/" style="color: white;" class="fa fa-pinterest"></a>
      <a href="http://uk.pinterest.com/pin/create/bookmarklet/" style="color: white;" class="fa fa-pinterest"></a>
      <a href="https://ct.pinterest.com/?tid=8KRsk0UkbVS&value=0.00&quantity=1" style="color: white;" class="fa fa-pinterest"></a>
      <a href="https://policy.pinterest.com/cookies" style="color: white;" class="fa fa-pinterest"></a>
    HTML
    expect(dummy_object.grep_pinterest_profile(html)).to eq([])
  end

  # The expected URLs carry no query string, and the third fixture link has
  # a doubled closing quote after the profile URL.
  it 'should grep organization pinterest profiles' do
    html = <<~HTML
      <a href="http://www.pinterest.com/blogher" target="_blank">pinterest</a>
      <a href="http://pinterest.com/orientaltrading?cm_sp=Footer-_-SocialLinks-_-Pinterest" target="_blank">pinterest</a>
      <a href="http://pinterest.com/poppin"" target="_blank">pinterest</a>
    HTML
    expected_pinterest_profiles = [
      'http://www.pinterest.com/blogher',
      'http://pinterest.com/orientaltrading',
      'http://pinterest.com/poppin'
    ]
    # Parse once and assert on that result; the earlier version computed
    # the profiles into a local it never used and then parsed a second time.
    pinterest_profiles = dummy_object.grep_pinterest_profile(html)
    expect(pinterest_profiles).to eq(expected_pinterest_profiles)
  end
end
|
@@ -0,0 +1,207 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for RedirectedTo#grep_redirected_to_url.
#
# The examples feed raw HTML to the parser and check that it returns the first
# non-empty URL found in a <link rel="canonical"> tag or an og:url <meta> tag,
# independent of attribute order, quoting style, or extra attributes, and that
# it returns nil when no such URL is present.
describe 'Website Redirected To' do
  # An anonymous class keeps the mixin local to this spec. A named
  # `class DummyTestClass` declared inside a describe block becomes a
  # top-level constant, so same-named classes in other spec files would
  # reopen it and pile up their mixins on one shared class.
  let(:dummy_object) { Class.new { include RedirectedTo }.new }

  it 'should return nil for invalid input' do
    expect(dummy_object.grep_redirected_to_url(nil)).to be_nil
    expect(dummy_object.grep_redirected_to_url('')).to be_nil
  end

  describe 'Website grep from link tag' do
    describe 'rel attribute first ' do
      it 'should return nil when canonical url is empty' do
        html = <<~HTML
          <link rel="canonical" href="">
          <link rel="canonical" href=''>
        HTML
        website = dummy_object.grep_redirected_to_url(html)
        expect(website).to be_nil
      end

      it 'should grep website' do
        # The empty canonical link comes first and must be skipped.
        html = <<~HTML
          <link rel="canonical" href="">
          <link rel="canonical" href='https://www.apple.com/'>
        HTML
        website = dummy_object.grep_redirected_to_url(html)
        expect(website).to eq('https://www.apple.com/')
      end

      it 'should grep website even with extra attributes' do
        # The populated tag spans two lines and carries an extra attribute.
        html = <<~HTML
          <link rel="canonical" href="" itemprop="current_url">
          <link rel="canonical" href='https://www.apple.com/'
          itemprop="current_url" >
        HTML
        website = dummy_object.grep_redirected_to_url(html)
        expect(website).to eq('https://www.apple.com/')
      end
    end

    describe 'href attribute first' do
      it 'should return nil when canonical url is empty' do
        html = <<~HTML
          <link href="" rel="canonical" >
          <link href='' rel="canonical" >
        HTML
        website = dummy_object.grep_redirected_to_url(html)
        expect(website).to be_nil
      end

      it 'should grep website' do
        html = <<~HTML
          <link rel="canonical" href="">
          <link href='https://www.apple.com/' rel="canonical">
        HTML
        website = dummy_object.grep_redirected_to_url(html)
        expect(website).to eq('https://www.apple.com/')
      end

      it 'should grep website even with extra attributes' do
        html = <<~HTML
          <link href="" itemprop="current_url" rel="canonical">
          <link href='https://www.apple.com/' rel="canonical"
          itemprop="current_url" >
        HTML
        website = dummy_object.grep_redirected_to_url(html)
        expect(website).to eq('https://www.apple.com/')
      end
    end
  end

  describe 'Website grep from organization URL' do
    describe 'property attribute first ' do
      it 'should return nil when canonical url is empty' do
        html = <<~HTML
          <meta property="og:url" content="" />
          <meta property="og:url" content='' />
        HTML
        website = dummy_object.grep_redirected_to_url(html)
        expect(website).to be_nil
      end

      it 'should grep website' do
        html = <<~HTML
          <link property="og:url" content="">
          <meta property="og:url" content="https://www.dieppe.ca/fr/index.aspx" />
        HTML
        website = dummy_object.grep_redirected_to_url(html)
        expect(website).to eq('https://www.dieppe.ca/fr/index.aspx')
      end

      it 'should grep website even with extra attributes' do
        html = <<~HTML
          <link property="og:url" content="" calss="og-url">
          <meta property="og:url" content='https://www.dieppe.ca/fr/index.aspx'
          class="og-url" />
        HTML
        website = dummy_object.grep_redirected_to_url(html)
        expect(website).to eq('https://www.dieppe.ca/fr/index.aspx')
      end
    end

    describe 'content attribute first ' do
      it 'should return nil when canonical url is empty' do
        html = <<~HTML
          <meta content="" property="og:url" />
          <meta content='' property="og:url"/>
        HTML
        website = dummy_object.grep_redirected_to_url(html)
        expect(website).to be_nil
      end

      it 'should grep website' do
        html = <<~HTML
          <link content="" property="og:url" >
          <meta content="https://www.dieppe.ca/fr/index.aspx" property="og:url" />
        HTML
        website = dummy_object.grep_redirected_to_url(html)
        expect(website).to eq('https://www.dieppe.ca/fr/index.aspx')
      end

      it 'should grep website even with extra attributes' do
        html = <<~HTML
          <link content="" calss="og-url" property="og:url">
          <meta content='https://www.dieppe.ca/fr/index.aspx'
          class="og-url" property="og:url" />
        HTML
        website = dummy_object.grep_redirected_to_url(html)
        expect(website).to eq('https://www.dieppe.ca/fr/index.aspx')
      end
    end
  end

  describe 'grep website' do
    it 'should return nil when link or og:url is absent' do
      # Only stylesheet <link> tags and unrelated <meta> tags are present.
      html = <<~HTML
        <head>
        <meta charset="utf-8">
        <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
        <meta http-equiv="x-ua-compatible" content="ie=edge">
        <title>Techmologic | index</title>
        <!-- Font Awesome -->
        <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.6.0/css/font-awesome.min.css">
        <!-- Bootstrap core CSS -->
        <link href="css/bootstrap.min.css" rel="stylesheet">
        <!-- Material Design Bootstrap -->
        <link href="css/mdb.min.css" rel="stylesheet">
        <!-- Your custom styles (optional) -->
        <link href="css/style.css" rel="stylesheet">
        </head>
      HTML
      website = dummy_object.grep_redirected_to_url(html)
      expect(website).to be_nil
    end

    it 'should grep one of canonical or og:url' do
      # Empty canonical/og:url tags precede the populated ones; both populated
      # tags carry the same URL, so either source satisfies the expectation.
      html = <<~HTML
        <head>
        <meta charset="utf-8">
        <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
        <meta http-equiv="x-ua-compatible" content="ie=edge">
        <title>Techmologic | index</title>
        <link rel="canonical" href="">
        <meta property="og:url" content="" />
        <!-- Font Awesome -->
        <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.6.0/css/font-awesome.min.css">
        <!-- Bootstrap core CSS -->
        <link href="css/bootstrap.min.css" rel="stylesheet">
        <!-- Material Design Bootstrap -->
        <link href="css/mdb.min.css" rel="stylesheet">
        <!-- Your custom styles (optional) -->
        <link href="css/style.css" rel="stylesheet">
        <link rel="canonical" href="http://techmologics.com/">
        <meta property="og:url" content="http://techmologics.com/" />
        </head>
      HTML
      website = dummy_object.grep_redirected_to_url(html)
      expect(website).to eq('http://techmologics.com/')
    end

    it 'should grep one of canonical or og:url whatever its position' do
      # Same document as the previous example, but with the URL attribute
      # (href/content) written before rel/property on each tag.
      html = <<~HTML
        <head>
        <meta charset="utf-8">
        <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
        <meta http-equiv="x-ua-compatible" content="ie=edge">
        <title>Techmologic | index</title>
        <link href="" rel="canonical">
        <meta content="" property="og:url"/>
        <!-- Font Awesome -->
        <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.6.0/css/font-awesome.min.css">
        <!-- Bootstrap core CSS -->
        <link href="css/bootstrap.min.css" rel="stylesheet">
        <!-- Material Design Bootstrap -->
        <link href="css/mdb.min.css" rel="stylesheet">
        <!-- Your custom styles (optional) -->
        <link href="css/style.css" rel="stylesheet">
        <link href="http://techmologics.com/" rel="canonical" class="canonical">
        <meta content="http://techmologics.com/" property="og:url"/>
        </head>
      HTML
      website = dummy_object.grep_redirected_to_url(html)
      expect(website).to eq('http://techmologics.com/')
    end
  end
end
|