brilliant_web_scraper 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +4 -0
- data/README.md +31 -0
- data/brilliant_web_scraper-1.0.0.gem +0 -0
- data/brilliant_web_scraper-1.0.gem +0 -0
- data/brilliant_web_scraper.gemspec +30 -0
- data/lib/brilliant_web_scraper.rb +55 -0
- data/lib/parsers/description_helper.rb +28 -0
- data/lib/parsers/emails.rb +30 -0
- data/lib/parsers/facebook_profile.rb +11 -0
- data/lib/parsers/instagram_profile.rb +11 -0
- data/lib/parsers/linkedin_profile.rb +11 -0
- data/lib/parsers/meta_description.rb +13 -0
- data/lib/parsers/org_description.rb +13 -0
- data/lib/parsers/phone_numbers.rb +34 -0
- data/lib/parsers/pinterest_profile.rb +11 -0
- data/lib/parsers/redirected_to.rb +29 -0
- data/lib/parsers/title.rb +13 -0
- data/lib/parsers/twitter_description.rb +13 -0
- data/lib/parsers/twitter_profile.rb +11 -0
- data/lib/parsers/unescape_html_helper.rb +17 -0
- data/lib/parsers/vimeo_profile.rb +11 -0
- data/lib/parsers/youtube_channel.rb +29 -0
- data/lib/scraper/errors.rb +19 -0
- data/lib/scraper/scrape_exceptions.rb +49 -0
- data/lib/scraper/scrape_helper.rb +59 -0
- data/lib/scraper/scrape_request.rb +29 -0
- data/lib/version.rb +6 -0
- data/spec/lib/parsers/description_helper_spec.rb +24 -0
- data/spec/lib/parsers/emails_spec.rb +60 -0
- data/spec/lib/parsers/facebook_profile_spec.rb +77 -0
- data/spec/lib/parsers/instagram_profile_spec.rb +45 -0
- data/spec/lib/parsers/linkedin_profile_spec.rb +43 -0
- data/spec/lib/parsers/meta_description_spec.rb +321 -0
- data/spec/lib/parsers/org_description_spec.rb +316 -0
- data/spec/lib/parsers/phone_numbers_spec.rb +69 -0
- data/spec/lib/parsers/pinterest_profile_spec.rb +44 -0
- data/spec/lib/parsers/redirected_to_spec.rb +207 -0
- data/spec/lib/parsers/title_spec.rb +87 -0
- data/spec/lib/parsers/twitter_description_spec.rb +314 -0
- data/spec/lib/parsers/twitter_profile_spec.rb +59 -0
- data/spec/lib/parsers/unescape_html_helper_spec.rb +0 -0
- data/spec/lib/parsers/vimeo_profile_spec.rb +43 -0
- data/spec/lib/parsers/youtube_profile_spec.rb +82 -0
- data/spec/lib/scraper/brilliant_web_scrape_test.rb +66 -0
- data/spec/lib/scraper/scrape_request_test.rb +34 -0
- data/spec/spec_helper.rb +111 -0
- data/spec/vcr/encoding_compatibility_error.yml +316 -0
- data/spec/vcr/invalid_byte_sequence_utf_8.yml +2383 -0
- data/spec/vcr/no_valid_data_to_scrape.yml +109 -0
- data/spec/vcr/non_html_scrape.yml +163 -0
- data/spec/vcr/valid_scrape_response.yml +696 -0
- metadata +250 -0
@@ -0,0 +1,87 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for the Title parser mixin. They pin down that `grep_title`:
#   * returns nil for nil/empty input and for a whitespace-only <title>,
#   * tolerates a malformed closing tag and attributes on the opening tag,
#   * collapses surrounding whitespace/newlines,
#   * returns the first <title> when several are present.
describe 'Title' do
  # Anonymous host class: mixes in only the parser under test. A named
  # `class DummyTestClass` declared here would become a top-level constant
  # that every other spec file reopens, so the object under test would
  # accumulate unrelated parser modules once the whole suite is loaded.
  let(:dummy_object) { Class.new { include Title }.new }

  it 'should return nil response for invalid inputs' do
    expect(dummy_object.grep_title(nil)).to be_nil
    expect(dummy_object.grep_title('')).to be_nil
  end

  it 'should return nil for no title presence' do
    no_title_html = <<~HTML
      <meta charset="UTF-8">
      <meta http-equiv="x-ua-compatible" content="ie=edge">
      <meta name="viewport" content="width=device-width, initial-scale=1">
      <title> </title>
      <link href="/on/demandware.static/Sites-Marmot_US-Site/-/default/dw8b6e883e/images/favicon.ico" rel="shortcut icon">
      <meta name="description" content=" Shop the official Marmot online store. Maker of performance outdoor clothing and gear for travel, hiking, camping, snowsports, and more.Marmot">
    HTML
    title = dummy_object.grep_title(no_title_html)
    expect(title).to be_nil
  end

  it 'should get title even from partially closed title tag' do
    partially_closed_title_tag = <<~HTML
      <title>Harmony (a Mediware company) is now WellSky. You are being redirected to WellSky.com.” /title>
    HTML
    title = dummy_object.grep_title(partially_closed_title_tag)
    expect(title).to eq('Harmony (a Mediware company) is now WellSky. You are being redirected to WellSky.com.”')
  end

  it 'should get title even title is multi line' do
    multi_line_title_tag = <<~HTML
      <title>
      Smartphone App Development Company | iPhone iPad App Development | Fredericton, Atlantic Canada | SEO Internet Marketing Website Design
      </title>
      <meta name="robots" content="index, follow" />
    HTML
    title = dummy_object.grep_title(multi_line_title_tag)
    expect(title).to eq('Smartphone App Development Company | iPhone iPad App Development | Fredericton, Atlantic Canada | SEO Internet Marketing Website Design')
  end

  # NOTE(review): the fixture below shows a literal en dash rather than an
  # HTML entity; presumably the original markup carried an encoded dash.
  # Confirm the fixture still exercises the unescaping path.
  it 'should unescape html encodings from title' do
    partially_html_encoded_title = <<~HTML
      <link rel="pingback" href="https://www.idirect.net/xmlrpc.php">
      <title>ST Engineering iDirect – Shaping the Future of How the World Connects</title>
      <link rel="dns-prefetch" href="//platform.twitter.com">
    HTML
    title = dummy_object.grep_title(partially_html_encoded_title)
    expect(title).to eq('ST Engineering iDirect – Shaping the Future of How the World Connects')
  end

  it 'should remove unnecessary white spaces, new lines, tabs from title' do
    extra_spacing_title = <<~HTML
      <meta http-equiv="Content-type" content="text/html; charset=utf-8" /><meta http-equiv="Expires" content="0" /><meta http-equiv='content-language' content='en' /><title>

      A global technology and services company committed to innovation| Cegedim

      </title>
      <link rel="stylesheet" type="text/css" href="/_layouts/15/1033/styles/Themable/corev15.css?rev=2bpHeX9U8DH09TB5zpJcsQ%3D%3D"/>
    HTML
    title = dummy_object.grep_title(extra_spacing_title)
    expect(title).to eq('A global technology and services company committed to innovation| Cegedim')
  end

  it 'should pick very first title available' do
    multiple_title_html = <<~HTML
      <link rel="pingback" href="https://www.idirect.net/xmlrpc.php">
      <title>ST Engineering iDirect – Shaping the Future of How the World Connects</title>
      <link rel="dns-prefetch" href="//platform.twitter.com">
      <title>Title 2 - ST Engineering iDirect – Shaping the Future of How the World Connects</title>
    HTML
    title = dummy_object.grep_title(multiple_title_html)
    expect(title).to eq('ST Engineering iDirect – Shaping the Future of How the World Connects')
  end

  it 'should grep title with extra attributes' do
    html = <<~HTML
      <title data-component-id="AdaptiveHtmlHead_01_6930" data-component-name="adaptiveHtmlHead" data-component-endpoint="/aries-common/v1/adaptiveHtmlHead.comp">Vancouver, Canada Hotel - City Center | Sheraton Vancouver Wall Centre</title>
    HTML
    title = dummy_object.grep_title(html)
    expect(title).to eq('Vancouver, Canada Hotel - City Center | Sheraton Vancouver Wall Centre')
  end
end
|
87
|
+
|
@@ -0,0 +1,314 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for the TwitterDescription parser mixin. `grep_twitter_description`
# is expected to return the content of a twitter:description <meta> tag, or
# nil when the input is blank, the tag is absent, or its content is empty.
# The two nested groups cover both attribute orders: key attribute
# (name/itemprop) before `content`, and `content` before the key attribute.
describe 'Twitter Description' do
  # Anonymous host class: mixes in only the parser under test. A named
  # `class DummyTestClass` declared here would become a top-level constant
  # that every other spec file reopens, so the object under test would
  # accumulate unrelated parser modules once the whole suite is loaded.
  let(:dummy_object) { Class.new { include TwitterDescription }.new }

  it 'should return nil for invalid inputs' do
    expect(dummy_object.grep_twitter_description('')).to be_nil
    expect(dummy_object.grep_twitter_description(nil)).to be_nil
  end

  describe 'Name key first twitter description tag' do
    it 'should return nil for no twitter description tag presence' do
      no_twitter_description = <<~HTML
        <head>
        <meta charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=edge">
        <meta name="viewport" content="width=device-width, initial-scale=1">
        <title>Internet Service Providers Chennai | Internet Telephony Service Providers Chennai | Internet bandwidth Providers Chennai - Pulse Telesystems Pvt Ltd</title>
        <meta name="google-site-verification" content="h2NvZnvL9v536RUYH3jney-9V8JRBGESmzH5-ph0EM4">
        <link href="http://www.pulse.in/new/wp-content/themes/pulse/css/bootstrap.min.css" rel="stylesheet" type="text/css">
        </head>
      HTML
      twitter_description = dummy_object.grep_twitter_description(no_twitter_description)
      expect(twitter_description).to be_nil
    end

    it 'should return nil when content part is empty' do
      html = <<~HTML
        <head>
        <meta http-equiv="X-UA-Compatible" content="IE=edge">
        <meta name="twitter:description" content="">
        <title>Internet Service Providers Chennai | Internet Telephony Service Providers Chennai | Internet bandwidth Providers Chennai - Pulse Telesystems Pvt Ltd</title>
        <link href="http://www.pulse.in/new/wp-content/themes/pulse/css/bootstrap.min.css" rel="stylesheet" type="text/css">
        </head>
      HTML
      twitter_description = dummy_object.grep_twitter_description(html)
      expect(twitter_description).to be_nil
    end

    it 'should return description from valid tag' do
      html = <<~HTML
        <head>
        <meta content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." name="twitter:description">
        <meta content="" property="uid">
        <meta name="twitter:description" content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.">
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no" name="viewport">
        </head>
      HTML
      twitter_description = dummy_object.grep_twitter_description(html)
      expect(twitter_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
    end

    it 'should return description even tag is multilined and partially encoded' do
      html = <<~HTML
        <head>
        <meta content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." name="twitter:description">
        <meta content="" property="uid">
        <meta name="twitter:description" content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.">
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
        name="viewport">
        </head>
      HTML
      twitter_description = dummy_object.grep_twitter_description(html)
      expect(twitter_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
    end

    it 'should parse meta tag even it is partially single quoted' do
      html = <<~HTML
        <head>
        <meta content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." name=\'twitter:description">
        <meta content="" property="uid">
        <meta name="twitter:description" content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.">
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
        name="viewport">
        </head>
      HTML
      twitter_description = dummy_object.grep_twitter_description(html)
      expect(twitter_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
    end

    # NOTE(review): the tag carrying the extra attributes below is keyed
    # `og:description`, not `twitter:description` (looks like a leftover from
    # the org description spec). The expectation is presumably satisfied by
    # the first <meta> tag instead; confirm which tag this case should cover.
    it 'should parse meta tag even it is having other attributes defined' do
      html = <<~HTML
        <head>
        <meta content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." name="twitter:description">
        <meta content="" property="uid">
        <meta class="metadescription" property=\'og:description" content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." charset="UTF-8">
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
        name="viewport">
        </head>
      HTML
      twitter_description = dummy_object.grep_twitter_description(html)
      expect(twitter_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
    end

    it 'should parse meta tag with itemprop as description key' do
      html = <<~HTML
        <head>
        <meta content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." itemprop=\'twitter:description">
        <meta content="" property="uid">
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
        name="viewport">
        </head>
      HTML
      twitter_description = dummy_object.grep_twitter_description(html)
      expect(twitter_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
    end

    it 'should parse even name/itemprop key content is improperly assigned' do
      html = <<~HTML
        <head>
        <meta content="" property="uid">
        <meta class="metadescription" name=twitter:description content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." charset="UTF-8" />
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
        name="viewport">
        </head>
      HTML
      twitter_description = dummy_object.grep_twitter_description(html)
      expect(twitter_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
    end

    it 'should bring description having single quote' do
      html = <<~HTML
        <html lang="en">
        <head>
        <META charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
        <meta name="viewport" content="width=device-width, initial-scale=1" />
        <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
        <meta name="twitter:description" content="Wilentz Goldman & Spitzer is one of New Jersey's largest law firms. Our lawyers proudly serve our clients in nearly every aspect of law." />
        </head>
        <html>
      HTML
      twitter_description = dummy_object.grep_twitter_description(html)
      expect(twitter_description).to eq('Wilentz Goldman & Spitzer is one of New Jersey\'s largest law firms. Our lawyers proudly serve our clients in nearly every aspect of law.')
    end

    it 'should bring description having double quote' do
      html = <<~HTML
        <html lang="en">
        <head>
        <META charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
        <meta name="viewport" content="width=device-width, initial-scale=1" />
        <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
        <meta name="twitter:description" content='Whether you're a Mac "die-hard" or an iPad "newbie" we give you the scoop on what's new, what's best and how to make the most out of the products you love.' />
        </head>
        <html>
      HTML
      twitter_description = dummy_object.grep_twitter_description(html)
      expect(twitter_description).to eq('Whether you\'re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what\'s new, what\'s best and how to make the most out of the products you love.')
    end

    it 'should bring description even some other meta tag is empty' do
      html = <<~HTML
        <html lang="en">
        <head>
        <META charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
        <meta name="viewport" content="width=device-width, initial-scale=1" />
        <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
        <meta name="twitter:description" content="">
        <meta name="twitter:description" content='Whether you're a Mac "die-hard" or an iPad "newbie" we give you the scoop on what's new, what's best and how to make the most out of the products you love.' />
        </head>
        <html>
      HTML
      twitter_description = dummy_object.grep_twitter_description(html)
      expect(twitter_description).to eq('Whether you\'re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what\'s new, what\'s best and how to make the most out of the products you love.')
    end
  end

  describe 'Content key first twitter description tag' do
    it 'should return nil when content part is empty' do
      no_twitter_description = <<~HTML
        <head>
        <meta http-equiv="X-UA-Compatible" content="IE=edge">
        <meta content="" name="twitter:description">
        <meta content='' name="twitter:description">
        <title>Internet Service Providers Chennai | Internet Telephony Service Providers Chennai | Internet bandwidth Providers Chennai - Pulse Telesystems Pvt Ltd</title>
        <link href="http://www.pulse.in/new/wp-content/themes/pulse/css/bootstrap.min.css" rel="stylesheet" type="text/css">
        </head>
      HTML
      twitter_description = dummy_object.grep_twitter_description(no_twitter_description)
      expect(twitter_description).to be_nil
    end

    it 'should return description from valid tag' do
      html = <<~HTML
        <head>
        <meta content="" property="uid">
        <meta content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." name="twitter:description">
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no" name="viewport">
        </head>
      HTML
      twitter_description = dummy_object.grep_twitter_description(html)
      expect(twitter_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
    end

    it 'should return description even tag is multilined and partially encoded' do
      html = <<~HTML
        <head>
        <meta content="" property="uid">
        <meta content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." name="twitter:description" >
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
        name="viewport">
        </head>
      HTML
      twitter_description = dummy_object.grep_twitter_description(html)
      expect(twitter_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
    end

    it 'should parse meta tag even it is partially single quoted' do
      html = <<~HTML
        <head>
        <meta content="" property="uid">
        <meta content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." name=\'twitter:description">
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
        name="viewport">
        </head>
      HTML
      twitter_description = dummy_object.grep_twitter_description(html)
      expect(twitter_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
    end

    it 'should parse meta tag even it is having other attributes defined' do
      html = <<~HTML
        <head>
        <meta content="" property="uid">
        <meta class="metadescription" content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." charset="UTF-8" name=\'twitter:description">
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
        name="viewport">
        </head>
      HTML
      twitter_description = dummy_object.grep_twitter_description(html)
      expect(twitter_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
    end

    it 'should parse meta tag with itemprop as description key' do
      html = <<~HTML
        <head>
        <meta content="" property="uid">
        <meta class="metadescription" content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." itemprop=\'twitter:description" charset="UTF-8">
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
        name="viewport">
        </head>
      HTML
      twitter_description = dummy_object.grep_twitter_description(html)
      expect(twitter_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
    end

    it 'should parse even name/itemprop key content is improperly assigned' do
      html = <<~HTML
        <head>
        <meta content="" property="uid">
        <meta class="metadescription" content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." charset="UTF-8" name=twitter:description />
        <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
        name="viewport">
        </head>
      HTML
      twitter_description = dummy_object.grep_twitter_description(html)
      expect(twitter_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
    end

    it 'should bring description having single quote' do
      html = <<~HTML
        <html lang="en">
        <head>
        <META charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
        <meta name="viewport" content="width=device-width, initial-scale=1" />
        <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
        <meta content="Wilentz Goldman & Spitzer is one of New Jersey's largest law firms. Our lawyers proudly serve our clients in nearly every aspect of law." name="twitter:description" />
        </head>
        <html>
      HTML
      twitter_description = dummy_object.grep_twitter_description(html)
      expect(twitter_description).to eq('Wilentz Goldman & Spitzer is one of New Jersey\'s largest law firms. Our lawyers proudly serve our clients in nearly every aspect of law.')
    end

    it 'should bring description having double quote' do
      html = <<~HTML
        <html lang="en">
        <head>
        <META charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
        <meta name="viewport" content="width=device-width, initial-scale=1" />
        <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
        <meta content='Whether you're a Mac "die-hard" or an iPad "newbie" we give you the scoop on what's new, what's best and how to make the most out of the products you love.' name="twitter:description" />
        </head>
        <html>
      HTML
      twitter_description = dummy_object.grep_twitter_description(html)
      expect(twitter_description).to eq('Whether you\'re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what\'s new, what\'s best and how to make the most out of the products you love.')
    end

    it 'should bring description even some other meta tag is empty' do
      html = <<~HTML
        <html lang="en">
        <head>
        <META charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
        <meta name="viewport" content="width=device-width, initial-scale=1" />
        <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
        <meta content="" name="twitter:description">
        <meta content='Whether you're a Mac "die-hard" or an iPad "newbie" we give you the scoop on what's new, what's best and how to make the most out of the products you love.' name="twitter:description"/>
        </head>
        <html>
      HTML
      twitter_description = dummy_object.grep_twitter_description(html)
      expect(twitter_description).to eq('Whether you\'re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what\'s new, what\'s best and how to make the most out of the products you love.')
    end
  end
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for the TwitterProfile parser mixin. `grep_twitter_profile` is
# expected to return nil for blank input, an empty array when the HTML only
# links to non-profile Twitter pages (share, intent, search, login, ...), and
# the list of matched profile URLs, in document order, otherwise.
describe 'Twitter Profile' do
  # Anonymous host class: mixes in only the parser under test. A named
  # `class DummyTestClass` declared here would become a top-level constant
  # that every other spec file reopens, so the object under test would
  # accumulate unrelated parser modules once the whole suite is loaded.
  let(:dummy_object) { Class.new { include TwitterProfile }.new }

  it 'should return nil for invalid input' do
    expect(dummy_object.grep_twitter_profile(nil)).to be_nil
    expect(dummy_object.grep_twitter_profile('')).to be_nil
  end

  it 'should not grep any non profile url' do
    html = <<~HTML
      <a href="http://twitter.com/download/iphone\\" target="_blank">
      <a href="http://twitter.com/%user_screen_name%/statuses/%id%" target="_blank">
      <a href="https://twitter.com/i/web/status/1116404686133686272" target="_blank">
      <a href="https://twitter.com/" target="_blank">
      <a href="http://twitter.com/\'+reply.substring(1)+" target="_blank">
      <a href="http://twitter.com/#" target="_blank">
      <a href="https://twitter.com/intent/tweet?text=https://www.facebook.com/ChoosePremiere/photos/a.10151220913587649/10157236078952649/?type=3'," target="_blank">
      <a href="https://twitter.com/share?url=https://dirigoagency.com/" target="_blank">
      <a href="https://twitter.com/search?q=%23solicitors&src=hash" target="_blank">
      <a href="https://twitter.com/hashtag/salisburysalutes?src=hash" target="_blank">
      <a href="https://twitter.com/privacy" target="_blank">
      <a href="https://twitter.com/home?status=Hey" target="_blank">
      <a href="https://twitter.com/statuses/1113546402863312896" target="_blank">
      <a href="https://twitter.com/login" target="_blank">
      <a href=" http://twitter.com/share/" target="_blank">
      <a href="https://twitter.com/#!/Farmer_Brothers" target="_blank">
      <a href="http://twitter.com/javascripts/blogger.js" target="_blank">
    HTML
    expect(dummy_object.grep_twitter_profile(html)).to eq([])
  end

  it 'should grep valid urls' do
    html = <<~HTML
      <a href="http://twitter.com/_titaniumrings" target="_blank">
      <a href="http://twitter.com/_Titaniumrings" target="_blank">
      <a href="http://twitter.com/@clindiatwitt" target="_blank">
      <a href="http://twitter.com/8of12" target="_blank">
      <a href="http://twitter.com/AAB_Accountants" target="_blank">
      <a href="http://twitter.com/SundanceCompany/statuses/1148708421308485637" target="_blank">
    HTML
    twitter_profiles = dummy_object.grep_twitter_profile(html)

    expected_profiles = [
      'http://twitter.com/_titaniumrings',
      'http://twitter.com/_Titaniumrings',
      'http://twitter.com/@clindiatwitt',
      'http://twitter.com/8of12',
      'http://twitter.com/AAB_Accountants',
      'http://twitter.com/SundanceCompany/statuses/1148708421308485637'
    ]
    expect(twitter_profiles).to eq(expected_profiles)
  end
end
|