brilliant_web_scraper 0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Gemfile +4 -0
- data/README.md +31 -0
- data/brilliant_web_scraper-1.0.0.gem +0 -0
- data/brilliant_web_scraper-1.0.gem +0 -0
- data/brilliant_web_scraper.gemspec +30 -0
- data/lib/brilliant_web_scraper.rb +55 -0
- data/lib/parsers/description_helper.rb +28 -0
- data/lib/parsers/emails.rb +30 -0
- data/lib/parsers/facebook_profile.rb +11 -0
- data/lib/parsers/instagram_profile.rb +11 -0
- data/lib/parsers/linkedin_profile.rb +11 -0
- data/lib/parsers/meta_description.rb +13 -0
- data/lib/parsers/org_description.rb +13 -0
- data/lib/parsers/phone_numbers.rb +34 -0
- data/lib/parsers/pinterest_profile.rb +11 -0
- data/lib/parsers/redirected_to.rb +29 -0
- data/lib/parsers/title.rb +13 -0
- data/lib/parsers/twitter_description.rb +13 -0
- data/lib/parsers/twitter_profile.rb +11 -0
- data/lib/parsers/unescape_html_helper.rb +17 -0
- data/lib/parsers/vimeo_profile.rb +11 -0
- data/lib/parsers/youtube_channel.rb +29 -0
- data/lib/scraper/errors.rb +19 -0
- data/lib/scraper/scrape_exceptions.rb +49 -0
- data/lib/scraper/scrape_helper.rb +59 -0
- data/lib/scraper/scrape_request.rb +29 -0
- data/lib/version.rb +6 -0
- data/spec/lib/parsers/description_helper_spec.rb +24 -0
- data/spec/lib/parsers/emails_spec.rb +60 -0
- data/spec/lib/parsers/facebook_profile_spec.rb +77 -0
- data/spec/lib/parsers/instagram_profile_spec.rb +45 -0
- data/spec/lib/parsers/linkedin_profile_spec.rb +43 -0
- data/spec/lib/parsers/meta_description_spec.rb +321 -0
- data/spec/lib/parsers/org_description_spec.rb +316 -0
- data/spec/lib/parsers/phone_numbers_spec.rb +69 -0
- data/spec/lib/parsers/pinterest_profile_spec.rb +44 -0
- data/spec/lib/parsers/redirected_to_spec.rb +207 -0
- data/spec/lib/parsers/title_spec.rb +87 -0
- data/spec/lib/parsers/twitter_description_spec.rb +314 -0
- data/spec/lib/parsers/twitter_profile_spec.rb +59 -0
- data/spec/lib/parsers/unescape_html_helper_spec.rb +0 -0
- data/spec/lib/parsers/vimeo_profile_spec.rb +43 -0
- data/spec/lib/parsers/youtube_profile_spec.rb +82 -0
- data/spec/lib/scraper/brilliant_web_scrape_test.rb +66 -0
- data/spec/lib/scraper/scrape_request_test.rb +34 -0
- data/spec/spec_helper.rb +111 -0
- data/spec/vcr/encoding_compatibility_error.yml +316 -0
- data/spec/vcr/invalid_byte_sequence_utf_8.yml +2383 -0
- data/spec/vcr/no_valid_data_to_scrape.yml +109 -0
- data/spec/vcr/non_html_scrape.yml +163 -0
- data/spec/vcr/valid_scrape_response.yml +696 -0
- metadata +250 -0
@@ -0,0 +1,19 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'nesty'
|
4
|
+
|
5
|
+
# Raise error as WebScraper Error
|
6
|
+
module WebScraper
|
7
|
+
# Include Nesty so a wrapped error keeps the stack trace of the original failure
|
8
|
+
class Error < StandardError
|
9
|
+
include Nesty::NestedError
|
10
|
+
end
|
11
|
+
|
12
|
+
class TimeoutError < Error; end
|
13
|
+
|
14
|
+
class RequestError < Error; end
|
15
|
+
|
16
|
+
class ParserError < Error; end
|
17
|
+
|
18
|
+
class NonHtmlError < ParserError; end
|
19
|
+
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# List all the possible exceptions
|
4
|
+
module ScrapeExceptions
|
5
|
+
GENERAL_EXCEPTIONS = [
|
6
|
+
URI::InvalidURIError,
|
7
|
+
RestClient::NotAcceptable,
|
8
|
+
RestClient::BadGateway,
|
9
|
+
RestClient::URITooLong,
|
10
|
+
Encoding::CompatibilityError,
|
11
|
+
RestClient::SeeOther,
|
12
|
+
RestClient::LoopDetected,
|
13
|
+
RestClient::PermanentRedirect,
|
14
|
+
RestClient::Locked,
|
15
|
+
RestClient::MethodNotAllowed,
|
16
|
+
RestClient::NotImplemented,
|
17
|
+
RestClient::PaymentRequired,
|
18
|
+
RestClient::TooManyRequests,
|
19
|
+
RestClient::RangeNotSatisfiable,
|
20
|
+
Errno::ENETUNREACH,
|
21
|
+
RestClient::Conflict,
|
22
|
+
RestClient::ProxyAuthenticationRequired,
|
23
|
+
Net::HTTPBadResponse,
|
24
|
+
Errno::ECONNREFUSED,
|
25
|
+
Errno::ECONNRESET,
|
26
|
+
Errno::EHOSTUNREACH,
|
27
|
+
Errno::EINVAL,
|
28
|
+
OpenSSL::SSL::SSLError,
|
29
|
+
RestClient::BadRequest,
|
30
|
+
RestClient::Forbidden,
|
31
|
+
RestClient::GatewayTimeout,
|
32
|
+
RestClient::Gone,
|
33
|
+
RestClient::InternalServerError,
|
34
|
+
RestClient::MovedPermanently,
|
35
|
+
RestClient::NotFound,
|
36
|
+
RestClient::RequestFailed,
|
37
|
+
RestClient::ServerBrokeConnection,
|
38
|
+
RestClient::ServiceUnavailable,
|
39
|
+
RestClient::SSLCertificateNotVerified,
|
40
|
+
RestClient::Unauthorized,
|
41
|
+
SocketError
|
42
|
+
].freeze
|
43
|
+
|
44
|
+
TIMEOUT_EXCEPTIONS = [
|
45
|
+
RestClient::Exceptions::OpenTimeout,
|
46
|
+
RestClient::Exceptions::ReadTimeout,
|
47
|
+
RestClient::RequestTimeout
|
48
|
+
].freeze
|
49
|
+
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Scrapes below data
|
4
|
+
# @Title
|
5
|
+
# @Descriptions
|
6
|
+
# @Social Profiles
|
7
|
+
# @Contact Details
|
8
|
+
module ScrapeHelper
  # Requests +url+ and extracts title, descriptions, social profiles and
  # contact details from the response body.
  #
  # @param url [String] address of the page to scrape
  # @param read_timeout [Numeric] forwarded unchanged to ScrapeRequest.new
  # @param connection_timeout [Numeric] forwarded unchanged to ScrapeRequest.new
  # @return [Hash] with keys :web_request_duration, :response_scrape_duraton
  #   (both wall-clock seconds from Benchmark) and :scrape_data (see #grep_data)
  # @raise [WebScraper::ParserError] when parsing raises ArgumentError a second
  #   time after the body was re-encoded, or raises Encoding::CompatibilityError
  def perform_scrape(url, read_timeout, connection_timeout)
    response = nil
    request_duration = Benchmark.measure do
      response = ScrapeRequest.new(url, read_timeout, connection_timeout)
    end.real

    # Work on the body string from here on. The retry path below must re-encode
    # the text that is parsed, not the response object: re-encoding the response
    # and then calling `.body` on the result only works if the re-encoded value
    # still responds to `body`.
    body = response.body
    attempts = 0
    begin
      scrape_data = nil
      scrape_duration = Benchmark.measure { scrape_data = grep_data(body) }.real

      {
        web_request_duration: request_duration,
        # NOTE: the key is misspelled ("duraton"); kept as-is because callers
        # may already read it under this name.
        response_scrape_duraton: scrape_duration,
        scrape_data: scrape_data
      }
    rescue ArgumentError => e
      # The parsers raise ArgumentError on invalid byte sequences. Retry exactly
      # once after replacing the invalid bytes with '?'.
      attempts += 1
      raise WebScraper::ParserError, e.message if attempts > 1

      # Round-trip through UTF-16BE so invalid bytes are replaced, then return
      # to UTF-8 for the parsers.
      body = body.encode('UTF-16be', invalid: :replace, replace: '?').encode('UTF-8')
      retry
    rescue Encoding::CompatibilityError => e
      raise WebScraper::ParserError, e.message
    end
  end

  private

  # Runs every parser over the same HTML string.
  #
  # @param response [String] HTML body to parse
  # @return [Hash] one entry per parser, keyed by the name of the scraped field
  def grep_data(response)
    {
      title: grep_title(response),
      meta_description: grep_meta_description(response),
      org_description: grep_org_description(response),
      twitter_description: grep_twitter_description(response),
      twitter_profile: grep_twitter_profile(response),
      linkedin_profile: grep_linkedin_profile(response),
      facebook_profile: grep_facebook_profile(response),
      instagram_profile: grep_instagram_profile(response),
      vimeo_profile: grep_vimeo_profile(response),
      pinterest_profile: grep_pinterest_profile(response),
      youtube_channel: grep_youtube_channel(response),
      emails: grep_emails(response),
      phone_numbers: grep_phone_numbers(response),
      redirected_to: grep_redirected_to_url(response)
    }
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Makes the actual scrape request; returns the response or raises an exception
|
4
|
+
module ScrapeRequest
  extend ScrapeExceptions

  # NOTE: keep the method inside `class << self`. The exception lists are
  # referenced as bare constants; they are expected to come from
  # ScrapeExceptions, which `extend` makes visible to the singleton class only.
  class << self
    # Performs a GET request for +url+ and returns the response when the
    # server declares a text/html content type.
    #
    # @param url [String] address to fetch
    # @param read_timeout [Numeric] seconds to wait for response data
    # @param connection_timeout [Numeric] seconds to wait for the connection to open
    # @return [Object] whatever RestClient::Request.execute returns
    # @raise [WebScraper::NonHtmlError] when the content type is not text/html
    # @raise [WebScraper::TimeoutError] for any class in TIMEOUT_EXCEPTIONS
    # @raise [WebScraper::RequestError] for any class in GENERAL_EXCEPTIONS
    def new(url, read_timeout, connection_timeout)
      params_hash = {
        method: :get,
        url: url,
        read_timeout: read_timeout,
        # rest-client names the connect timeout :open_timeout; it has no
        # documented :connection_timeout option, so under that key the
        # caller's value was not applied.
        open_timeout: connection_timeout,
        # Ask for an uncompressed body.
        headers: { 'accept-encoding': 'identity' }
      }
      response = RestClient::Request.execute(params_hash)
      content_type = response.headers[:content_type]
      # `=~` is kept on purpose: it also tolerates a missing (nil) content type.
      return response if content_type =~ %r{text\s*/\s*html}i

      exception_message = "Invalid response format received: #{content_type}"
      raise WebScraper::NonHtmlError, exception_message
    rescue *TIMEOUT_EXCEPTIONS => e
      raise WebScraper::TimeoutError, e.message
    rescue *GENERAL_EXCEPTIONS => e
      raise WebScraper::RequestError, e.message
    end
  end
end
|
data/lib/version.rb
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe 'DescriptionHelper' do
|
4
|
+
|
5
|
+
class DummyTestClass
|
6
|
+
include DescriptionHelper
|
7
|
+
end
|
8
|
+
let(:dummy_object) { DummyTestClass.new }
|
9
|
+
|
10
|
+
# None of these candidates is a usable description (blank or a lone separator),
# so the helper is expected to return nil.
it 'should return nil for invalid description' do
  descriptions = [" ", "", "|", "-"]
  # `*[descriptions]` splatted a one-element array, which is the same as
  # passing `descriptions` directly.
  expect(dummy_object.send(:parse_description, descriptions)).to be_nil
end
|
14
|
+
|
15
|
+
it 'should return valid description' do
|
16
|
+
descriptions = [
|
17
|
+
'2019年趣味幽默猜生肖,1358884不像看图找生肖,2019年看图猜生肖网站,2019看图找生肖83期,2019看图找生肖,2019看图找生肖109期,2019看图猜生肖买马,2019看图猜生肖买,2019全年看图找生肖图',
|
18
|
+
"-"
|
19
|
+
]
|
20
|
+
expect(
|
21
|
+
dummy_object.send(:parse_description, *[descriptions])
|
22
|
+
).to eq('2019年趣味幽默猜生肖,1358884不像看图找生肖,2019年看图猜生肖网站,2019看图找生肖83期,2019看图找生肖,2019看图找生肖109期,2019看图猜生肖买马,2019看图猜生肖买,2019全年看图找生肖图')
|
23
|
+
end
|
24
|
+
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe 'Emails' do
|
4
|
+
|
5
|
+
class DummyTestClass
|
6
|
+
include Emails
|
7
|
+
end
|
8
|
+
let(:dummy_object) { DummyTestClass.new }
|
9
|
+
|
10
|
+
it 'should return nil for invalid input' do
|
11
|
+
expect(dummy_object.grep_emails(nil)).to be_nil
|
12
|
+
expect(dummy_object.grep_emails('')).to be_nil
|
13
|
+
end
|
14
|
+
|
15
|
+
it 'should give []' do
|
16
|
+
html = <<~HTML
|
17
|
+
<a href="mailto:abc@example.com">abc@example.com</a>
|
18
|
+
<a href="mailto:example@mail.com">example@email.com</a>
|
19
|
+
<a href="mailto:name@domain.com">name@domain.com</a>
|
20
|
+
<a href="mailto:name@company.com">name@company.com</a>
|
21
|
+
<a href="mailto:you@youremail.com">you@youremail.com</a>
|
22
|
+
<a href="mailto:your@emailaddress.com">your@emailaddress.com</a>
|
23
|
+
<a href="mailto:yourname@yourdomain.com">yourname@yourdomain.com</a>
|
24
|
+
<a href="mailto:yourname@yourcompany.com">yourname@yourcompany.com</a>
|
25
|
+
<a href="mailto:YOU@EMAILADRESS.COM">YOU@EMAILADRESS.COM</a>
|
26
|
+
<a href="mailto:you@address.com">you@address.com</a>
|
27
|
+
<a href="mailto:xxx@yyy.zzz">xxx@yyy.zzz</a>
|
28
|
+
<a href="mailto:test@test.com">test@test.com</a>
|
29
|
+
<a href="mailto:@example.com">@example.com"</a>
|
30
|
+
HTML
|
31
|
+
expect(dummy_object.grep_emails(html.to_s)).to eq([])
|
32
|
+
end
|
33
|
+
it 'should grep organization contact emailaddresses' do
|
34
|
+
html = <<~HTML
|
35
|
+
<a href="mailto:abc@example.com">abc@example.com</a>
|
36
|
+
<a class="fusion-social-network-icon fusion-tooltip " style="color:#ffffff;" href="mailto:wilson@wilson.nb.ca" target="_self" title="Email">
|
37
|
+
<span class="screen-reader-text">Email</span>
|
38
|
+
</a>
|
39
|
+
<div>
|
40
|
+
<br><strong>Mailing address</strong>
|
41
|
+
: 1320 Yonge Street, Toronto, Ontario  M4T 1X2<br><br>
|
42
|
+
<strong>Attendance</strong>: <br>Junior School: 1639attendance@yorkschool.com<br>Middle & Senior School: 1320attendance@yorkschool.com<br><br>
|
43
|
+
</div>
|
44
|
+
<a href="mailto:%20support@switcherstudio.com">
|
45
|
+
<a href="mailto:%20support@switcherstudio.com">
|
46
|
+
<a href="mailto:ekerlow@hellermanllc.com">
|
47
|
+
<a href=\\"mailto:Michael.O%27Brien@idga.org?subject=Editorial%20Calendar%20Contributor\\">
|
48
|
+
HTML
|
49
|
+
emails = dummy_object.grep_emails(html.to_s)
|
50
|
+
expected_emails = [
|
51
|
+
"wilson@wilson.nb.ca",
|
52
|
+
"support@switcherstudio.com",
|
53
|
+
"ekerlow@hellermanllc.com",
|
54
|
+
"michael.o'brien@idga.org",
|
55
|
+
"1639attendance@yorkschool.com",
|
56
|
+
"1320attendance@yorkschool.com"
|
57
|
+
]
|
58
|
+
expect(emails).to eq(expected_emails)
|
59
|
+
end
|
60
|
+
end
|
@@ -0,0 +1,77 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe 'FaceBook Profile' do
|
4
|
+
|
5
|
+
class DummyTestClass
|
6
|
+
include FacebookProfile
|
7
|
+
end
|
8
|
+
let(:dummy_object) { DummyTestClass.new }
|
9
|
+
|
10
|
+
it 'should return nil for invalid input' do
|
11
|
+
expect(dummy_object.grep_facebook_profile(nil)).to be_nil
|
12
|
+
expect(dummy_object.grep_facebook_profile('')).to be_nil
|
13
|
+
end
|
14
|
+
|
15
|
+
it 'should not grep any non profile url' do
|
16
|
+
html = <<~HTML
|
17
|
+
<a href="http://www.facebook.com/2008/fbml" target="_blank" class="sqs-svg-icon--wrapper facebook">
|
18
|
+
<a href="https://www.facebook.com/v2.0/dialog/share" target="_blank" class="sqs-svg-icon--wrapper facebook">
|
19
|
+
<a href="https://www.facebook.com/plugins/video.php?href=https%3A%2F%2Fwww.facebook.com%2FHFXMooseheads%2Fvideos" target="_blank" class="sqs-svg-icon--wrapper facebook">
|
20
|
+
<a href="https://www.facebook.com/search.php" target="_blank" class="sqs-svg-icon--wrapper facebook">
|
21
|
+
<a href="http://www.facebook.com/home.php#/pages/Zend-Technologies/190917412139" target="_blank" class="sqs-svg-icon--wrapper facebook">
|
22
|
+
<img height="1" width="1" style="display:none" src="https://www.facebook.com/tr?id=1501718829946651&ev=PageView&noscript=1"/>
|
23
|
+
<a href="http://www.facebook.com/sharer.php?u=https%3A%2F%2Fbroadreachstaffing.com&t=Broadreach" class="et_social_share" rel="nofollow" data-social_name="facebook" data-post_id="68" data-social_type="share" data-location="sidebar"></a>
|
24
|
+
<a href="https://www.facebook.com/photo.php?fbid=10157409473244808&set=p.10157409473244808&type=3" class="et_social_share" rel="nofollow" data-social_name="facebook" data-post_id="68" data-social_type="share" data-location="sidebar"></a>
|
25
|
+
<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A%2F%2Fwww.facebook.com%2Fchoosepremiere%2Fposts%2F10157307766122649" target="_blank" class="sqs-svg-icon--wrapper facebook">
|
26
|
+
<a href="https://www.facebook.com/dialog/send?display=popup&link=https%3A%2F%2Fsmartcookiemedia.com%2F&redirect_uri=https://smartcookiemedia.com/" target="_blank" class="sqs-svg-icon--wrapper facebook">
|
27
|
+
<a href="https://www.facebook.com/hashtag/beeryoga" target="_blank" class="sqs-svg-icon--wrapper facebook">
|
28
|
+
<a href="http://facebook.com/privacy" title="Facebook Privacy" target="_blank">Facebook</a>
|
29
|
+
<a target="_blank" title="Facebook - Social Gastronomy" href="http://www.facebook.com/home.php#/pages/Social-Gastronomy/187440209207?ref=ts"><img alt="images" src="/images/stories/social/images.jpg" width="30"></a>
|
30
|
+
<a href="http://www.facebook.com/plugins/like.php" target="_blank" class="sqs-svg-icon--wrapper facebook">
|
31
|
+
<iframe src="http://www.facebook.com/plugins/likebox.php?href=https%3A%2F%2Fwww.facebook.com%2Fbandcart&width=220&colorscheme=dark&show_faces=false&stream=false&header=false&height=65" scrolling="no" frameborder="0" style="border:none; overflow:hidden; width:220px; height: 65px;" allowtransparency="true"></iframe>
|
32
|
+
<iframe style="border: none; overflow: hidden;" src="https://www.facebook.com/plugins/page.php?href=https%3A%2F%2Fwww.facebook.com%2FCarterBrothersCompany&tabs=timeline&width=340&height=500&small_header=false&adapt_container_width=true&hide_cover=false&show_facepile=true&appId=210697455750478" width="340" height="500" frameborder="0" scrolling="no"></iframe>
|
33
|
+
<a href="https://www.facebook.com/offsite_event.php" target="_blank" class="sqs-svg-icon--wrapper facebook">
|
34
|
+
<a href="http://www.facebook.com/share.php" target="_blank" class="sqs-svg-icon--wrapper facebook">
|
35
|
+
<a href="https://www.facebook.com/login/device-based/regular/login/?login_attempt=1" target="_blank" class="sqs-svg-icon--wrapper facebook">
|
36
|
+
<a href="https://www.facebook.com/recover/initiate?lwv=110" target="_blank" class="sqs-svg-icon--wrapper facebook">
|
37
|
+
<a href="https://www.facebook.com/help/568137493302217" target="_blank" class="sqs-svg-icon--wrapper facebook">
|
38
|
+
<a href="https://www.facebook.com/help/2687943754764396" target="_blank" class="sqs-svg-icon--wrapper facebook">
|
39
|
+
<a href="https://www.facebook.com/help/www/1573156092981768/" target="_blank" class="sqs-svg-icon--wrapper facebook">
|
40
|
+
<a href="https://facebook.com/security/hsts-pixel.gif?c=3.2.5" target="_blank" class="sqs-svg-icon--wrapper facebook">
|
41
|
+
|
42
|
+
HTML
|
43
|
+
expect(dummy_object.grep_facebook_profile(html.to_s)).to eq([])
|
44
|
+
end
|
45
|
+
|
46
|
+
it 'should grep valid urls' do
|
47
|
+
html = <<~HTML
|
48
|
+
<a href="http://facebook.com/AAEurope"><img alt="Follow us on Facebook - opens external site" src="/content/images/chrome/rebrand/icon-footer-facebook.png"></a>
|
49
|
+
<a target="_blank" href="https://www.facebook.com/pages/Basketball-New-Brunswick/156176001133032?sk=wall" title="Follow us on Facebook">facebook</a>
|
50
|
+
<a class="cff-photo cff-multiple cff-img-layout-4 cff-portrait nofancybox" style="max-width: 540px;" data-cff-page-name="Allied Printing Services" data-cff-post-time="4 days ago" href="https://www.facebook.com/alliedprinting/posts/2323507841068448"> FB Posts</a>
|
51
|
+
<a class="cff-photo cff-multiple cff-img-layout-4 cff-portrait nofancybox" style="max-width: 540px;" data-cff-page-name="Allied Printing Services" data-cff-post-time="4 days ago" href="https://www.facebook.com/groups/1004350633012081/"> FB Posts</a>
|
52
|
+
<a href="https://www.facebook.com/events/116316035951805/"><img alt="Follow us on Facebook - opens external site" src="/content/images/chrome/rebrand/icon-footer-facebook.png"></a>
|
53
|
+
<a href="https://www.facebook.com/arithane.foamroofing"><img alt="Follow us on Facebook - opens external site" src="/content/images/chrome/rebrand/icon-footer-facebook.png"></a>
|
54
|
+
<a href="http://www.facebook.com/pages/Surgical+Information+Systems/75322028321"><img alt="Follow us on Facebook - opens external site" src="/content/images/chrome/rebrand/icon-footer-facebook.png"></a>
|
55
|
+
<a href="https://www.facebook.com/Baylor-School-124353897738/"><img alt="Follow us on Facebook - opens external site" src="/content/images/chrome/rebrand/icon-footer-facebook.png"></a>
|
56
|
+
<a href="http://www.facebook.com/profile.php?id=100000325114186&v=info#!/pages/Blackstone-Counsel/150651724966482" target="_blank">
|
57
|
+
<img class="social" src="facebook.jpg" alt="Facebook"/>
|
58
|
+
</a>
|
59
|
+
<a href="http://facebook.com/profile.php?id=205682532825685" target="_blank"><img class="social" src="facebook.jpg" alt="Facebook"/></a>
|
60
|
+
<a href="http://www.facebook.com/share.php" target="_blank" class="sqs-svg-icon--wrapper facebook">
|
61
|
+
HTML
|
62
|
+
fb_profiles = dummy_object.grep_facebook_profile(html.to_s)
|
63
|
+
expected_profiles = [
|
64
|
+
'http://facebook.com/AAEurope',
|
65
|
+
'https://www.facebook.com/pages/Basketball-New-Brunswick/156176001133032?sk=wall',
|
66
|
+
'https://www.facebook.com/alliedprinting/posts/2323507841068448',
|
67
|
+
'https://www.facebook.com/groups/1004350633012081/',
|
68
|
+
'https://www.facebook.com/events/116316035951805/',
|
69
|
+
'https://www.facebook.com/arithane.foamroofing',
|
70
|
+
'http://www.facebook.com/pages/Surgical+Information+Systems/75322028321',
|
71
|
+
'https://www.facebook.com/Baylor-School-124353897738/',
|
72
|
+
'http://www.facebook.com/profile.php?id=100000325114186',
|
73
|
+
'http://facebook.com/profile.php?id=205682532825685'
|
74
|
+
]
|
75
|
+
expect(fb_profiles).to eq(expected_profiles)
|
76
|
+
end
|
77
|
+
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe 'Instagram Profile' do
|
4
|
+
|
5
|
+
class DummyTestClass
|
6
|
+
include InstagramProfile
|
7
|
+
end
|
8
|
+
let(:dummy_object) { DummyTestClass.new }
|
9
|
+
|
10
|
+
it 'should return nil for invalid input' do
|
11
|
+
expect(dummy_object.grep_instagram_profile(nil)).to be_nil
|
12
|
+
expect(dummy_object.grep_instagram_profile('')).to be_nil
|
13
|
+
end
|
14
|
+
|
15
|
+
it 'should not grep below url format' do
|
16
|
+
html = <<~HTML
|
17
|
+
<a href="http://instagram.com/" style="color: white;" class="fa fa-instagram"></a>
|
18
|
+
<a href="http://instagram.com/#" style="color: white;" class="fa fa-instagram"></a>
|
19
|
+
<a href="https://www.instagram.com/%username%" style="color: white;" class="fa fa-instagram"></a>
|
20
|
+
<a href="https://www.instagram.com/explore/tags/Talent/" style="color: white;" class="fa fa-instagram"></a>
|
21
|
+
HTML
|
22
|
+
expect(dummy_object.grep_instagram_profile(html.to_s)).to eq([])
|
23
|
+
end
|
24
|
+
|
25
|
+
# Profile links, with or without "www", query strings or trailing junk, should
# be reduced to the bare profile URL.
it 'should grep organization instagram profiles' do
  html = <<~HTML
    <a href="https://www.instagram.com/nextgenhealthcare" target="_blank">Instagram</a>
    <a href="https://instagram.com/nextgenhealthcare" target="_blank">Instagram</a>
    <a href="https://www.instagram.com/printed4you.co.uk" target="_blank">Instagram</a>
    <a href="https://www.instagram.com/web_spiders" target="_blank">Instagram</a>
    <a href="http://instagram.com/mccaincanada?ref=badge" target="_blank">Instagram</a>
    <a href="http://instagram.com/mcdermottscholars","target":"_blank"}},"displayMode":"fill"}" target="_blank">Instagram</a>
  HTML
  instagram_profiles = dummy_object.grep_instagram_profile(html.to_s)
  expected_instagram_profiles = [
    'https://www.instagram.com/nextgenhealthcare',
    'https://instagram.com/nextgenhealthcare',
    'https://www.instagram.com/printed4you.co.uk',
    'https://www.instagram.com/web_spiders',
    'http://instagram.com/mccaincanada',
    'http://instagram.com/mcdermottscholars'
  ]
  # Compare with the hand-written expectation. The previous assertion compared
  # the parser output with itself and therefore could never fail.
  expect(instagram_profiles).to eq(expected_instagram_profiles)
end
|
45
|
+
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe 'Linkedin Profile' do
|
4
|
+
|
5
|
+
class DummyTestClass
|
6
|
+
include LinkedinProfile
|
7
|
+
end
|
8
|
+
let(:dummy_object) { DummyTestClass.new }
|
9
|
+
|
10
|
+
it 'should return nil for invalid input' do
|
11
|
+
expect(dummy_object.grep_linkedin_profile(nil)).to be_nil
|
12
|
+
expect(dummy_object.grep_linkedin_profile('')).to be_nil
|
13
|
+
end
|
14
|
+
|
15
|
+
it 'should not grep below url format' do
|
16
|
+
html = <<~HTML
|
17
|
+
<a href="https://www.linkedin.com" style="color: white;" class="fa fa-linkedin"></a>
|
18
|
+
<a href="https://www.linkedin.com/feed/" style="color: white;" class="fa fa-linkedin"></a>
|
19
|
+
<a href="https://www.linkedin.com/mynetwork/" style="color: white;" class="fa fa-linkedin"></a>
|
20
|
+
<a href="https://www.linkedin.com/jobs/" style="color: white;" class="fa fa-linkedin"></a>
|
21
|
+
<a href="https://www.linkedin.com/messaging/" style="color: white;" class="fa fa-linkedin"></a>
|
22
|
+
<a href="https://www.linkedin.com/notifications/" style="color: white;" class="fa fa-linkedin"></a>
|
23
|
+
<a href="https://www.linkedin.com/psettings/" style="color: white;" class="fa fa-linkedin"></a>
|
24
|
+
<a href="https://www.linkedin.com/ca/pet-32/" style="color: white;" class="fa fa-linkedin"></a>
|
25
|
+
HTML
|
26
|
+
expect(dummy_object.grep_linkedin_profile(html.to_s)).to eq([])
|
27
|
+
end
|
28
|
+
|
29
|
+
# Company profile links should be returned without a trailing slash.
it 'should grep organization linkedin profiles' do
  html = <<~HTML
    <a href="https://www.linkedin.com/company/13247248/" target="_blank">Linkedin</a>
    <a href="https://www.linkedin.com/company/m-files-corporation" target="_blank">Linkedin</a>
    <a href="https://www.linkedin.com/company/dataendure" target="_blank">Linkedin</a>
  HTML
  linkedin_profiles = dummy_object.grep_linkedin_profile(html.to_s)
  expected_linkedin_profiles = [
    'https://www.linkedin.com/company/13247248',
    'https://www.linkedin.com/company/m-files-corporation',
    'https://www.linkedin.com/company/dataendure'
  ]
  # Assert on the result computed above instead of running the parser a second time.
  expect(linkedin_profiles).to eq(expected_linkedin_profiles)
end
|
43
|
+
end
|
@@ -0,0 +1,321 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe 'Meta Description' do
|
4
|
+
class DummyTestClass
|
5
|
+
include MetaDescription
|
6
|
+
end
|
7
|
+
let(:dummy_object) { DummyTestClass.new }
|
8
|
+
|
9
|
+
it 'should return nil for invalid inputs' do
|
10
|
+
expect(dummy_object.grep_meta_description('')).to be_nil
|
11
|
+
expect(dummy_object.grep_meta_description(nil)).to be_nil
|
12
|
+
end
|
13
|
+
describe 'Name key first meta description tag' do
|
14
|
+
it 'should return nil for no meta description tag presence' do
|
15
|
+
no_meta_description = <<~HTML
|
16
|
+
<head>
|
17
|
+
<meta charset="utf-8">
|
18
|
+
<meta http-equiv="X-UA-Compatible" content="IE=edge">
|
19
|
+
<meta name="viewport" content="width=device-width, initial-scale=1">
|
20
|
+
<title>Internet Service Providers Chennai | Internet Telephony Service Providers Chennai | Internet bandwidth Providers Chennai - Pulse Telesystems Pvt Ltd</title>
|
21
|
+
<meta name="google-site-verification" content="h2NvZnvL9v536RUYH3jney-9V8JRBGESmzH5-ph0EM4">
|
22
|
+
<link href="http://www.pulse.in/new/wp-content/themes/pulse/css/bootstrap.min.css" rel="stylesheet" type="text/css">
|
23
|
+
</head>
|
24
|
+
HTML
|
25
|
+
meta_description = dummy_object.grep_meta_description(no_meta_description.to_s)
|
26
|
+
expect(meta_description).to be_nil
|
27
|
+
end
|
28
|
+
|
29
|
+
it 'should return nil when content part is empty' do
|
30
|
+
no_meta_description = <<~HTML
|
31
|
+
<head>
|
32
|
+
<meta http-equiv="X-UA-Compatible" content="IE=edge">
|
33
|
+
<meta name="description" content="">
|
34
|
+
<title>Internet Service Providers Chennai | Internet Telephony Service Providers Chennai | Internet bandwidth Providers Chennai - Pulse Telesystems Pvt Ltd</title>
|
35
|
+
<link href="http://www.pulse.in/new/wp-content/themes/pulse/css/bootstrap.min.css" rel="stylesheet" type="text/css">
|
36
|
+
</head>
|
37
|
+
HTML
|
38
|
+
meta_description = dummy_object.grep_meta_description(no_meta_description.to_s)
|
39
|
+
expect(meta_description).to be_nil
|
40
|
+
end
|
41
|
+
|
42
|
+
it 'should return description from valid tag' do
|
43
|
+
html = <<~HTML
|
44
|
+
<head>
|
45
|
+
<meta content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." property="og:description">
|
46
|
+
<meta content="" property="uid">
|
47
|
+
<meta name="description" content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.">
|
48
|
+
<meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no" name="viewport">
|
49
|
+
</head>
|
50
|
+
HTML
|
51
|
+
meta_description = dummy_object.grep_meta_description(html.to_s)
|
52
|
+
expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
|
53
|
+
end
|
54
|
+
|
55
|
+
it 'should return description even tag is multilined and partially encoded' do
|
56
|
+
html = <<~HTML
|
57
|
+
<head>
|
58
|
+
<meta content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." property="og:description">
|
59
|
+
<meta content="" property="uid">
|
60
|
+
<meta name="description" content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.">
|
61
|
+
<meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
|
62
|
+
name="viewport">
|
63
|
+
</head>
|
64
|
+
HTML
|
65
|
+
meta_description = dummy_object.grep_meta_description(html.to_s)
|
66
|
+
expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
|
67
|
+
end
|
68
|
+
|
69
|
+
it 'should parse meta tag even it is partially single quoted' do
|
70
|
+
html = <<~HTML
|
71
|
+
<head>
|
72
|
+
<meta content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." property="og:description">
|
73
|
+
<meta content="" property="uid">
|
74
|
+
<meta name=\'description" content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.">
|
75
|
+
<meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
|
76
|
+
name="viewport">
|
77
|
+
</head>
|
78
|
+
HTML
|
79
|
+
meta_description = dummy_object.grep_meta_description(html.to_s)
|
80
|
+
expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
|
81
|
+
end
|
82
|
+
|
83
|
+
it 'should parse meta tag even it is having other attributes defined' do
|
84
|
+
html = <<~HTML
|
85
|
+
<head>
|
86
|
+
<meta content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." property="og:description">
|
87
|
+
<meta content="" property="uid">
|
88
|
+
<meta class="metadescription" name=\'description" content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." charset="UTF-8">
|
89
|
+
<meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
|
90
|
+
name="viewport">
|
91
|
+
</head>
|
92
|
+
HTML
|
93
|
+
meta_description = dummy_object.grep_meta_description(html.to_s)
|
94
|
+
expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
|
95
|
+
end
|
96
|
+
|
97
|
+
it 'should parse meta tag with itemprop as description key' do
|
98
|
+
html = <<~HTML
|
99
|
+
<head>
|
100
|
+
<meta content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." property="og:description">
|
101
|
+
<meta content="" property="uid">
|
102
|
+
<meta class="metadescription" itemprop=\'description" content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." charset="UTF-8">
|
103
|
+
<meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
|
104
|
+
name="viewport">
|
105
|
+
</head>
|
106
|
+
HTML
|
107
|
+
meta_description = dummy_object.grep_meta_description(html.to_s)
|
108
|
+
expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
|
109
|
+
end
|
110
|
+
|
111
|
+
# Fixture: the description tag lists its key before its content and writes the
# key without quotes (name=description). The example asserts that
# grep_meta_description returns the description sentence.
# NOTE(review): the og:description tag carries the same sentence, so the
# assertion alone does not show which tag was matched.
it 'should parse even name/itemprop key content is improperly assigned' do
|
112
|
+
html = <<~HTML
|
113
|
+
<head>
|
114
|
+
<meta content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." property="og:description">
|
115
|
+
<meta content="" property="uid">
|
116
|
+
<meta class="metadescription" name=description content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." charset="UTF-8" />
|
117
|
+
<meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
|
118
|
+
name="viewport">
|
119
|
+
</head>
|
120
|
+
HTML
|
121
|
+
meta_description = dummy_object.grep_meta_description(html.to_s)
|
122
|
+
expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
|
123
|
+
end
|
124
|
+
|
125
|
+
# Fixture: a name-first description tag whose double-quoted content contains an
# apostrophe (Jersey's). The example expects the whole sentence back, with the
# apostrophe intact.
# NOTE(review): the fixture ends with <html> rather than </html> - presumably a
# typo in the fixture; confirm it is not intentional.
it 'should bring description having single quote' do
|
126
|
+
html = <<~HTML
|
127
|
+
<html lang="en">
|
128
|
+
<head>
|
129
|
+
<META charset="utf-8">
|
130
|
+
<meta http-equiv="X-UA-Compatible" content="IE=Edge" />
|
131
|
+
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
132
|
+
<title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
|
133
|
+
<meta name="description" content="Wilentz Goldman & Spitzer is one of New Jersey's largest law firms. Our lawyers proudly serve our clients in nearly every aspect of law." />
|
134
|
+
</head>
|
135
|
+
<html>
|
136
|
+
HTML
|
137
|
+
meta_description = dummy_object.grep_meta_description(html.to_s)
|
138
|
+
expect(meta_description).to eq('Wilentz Goldman & Spitzer is one of New Jersey\'s largest law firms. Our lawyers proudly serve our clients in nearly every aspect of law.')
|
139
|
+
end
|
140
|
+
|
141
|
+
# Fixture: a name-first description tag whose single-quoted content contains
# double-quoted words ("die-hard", "newbie"). The example expects the full
# sentence back, with both quote styles preserved.
# NOTE(review): the apostrophes inside the single-quoted content value are shown
# raw in this listing; presumably they were entity-encoded in the original
# file - confirm before relying on this fixture text.
it 'should bring description having double quote' do
|
142
|
+
html = <<~HTML
|
143
|
+
<html lang="en">
|
144
|
+
<head>
|
145
|
+
<META charset="utf-8">
|
146
|
+
<meta http-equiv="X-UA-Compatible" content="IE=Edge" />
|
147
|
+
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
148
|
+
<title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
|
149
|
+
<meta name="description" content='Whether you're a Mac "die-hard" or an iPad "newbie" we give you the scoop on what's new, what's best and how to make the most out of the products you love.' />
|
150
|
+
</head>
|
151
|
+
<html>
|
152
|
+
HTML
|
153
|
+
meta_description = dummy_object.grep_meta_description(html.to_s)
|
154
|
+
expect(meta_description).to eq('Whether you\'re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what\'s new, what\'s best and how to make the most out of the products you love.')
|
155
|
+
end
|
156
|
+
|
157
|
+
# Fixture: two name-first description tags; the first has empty content and the
# second carries the sentence. The example expects the sentence from the
# populated tag rather than nil or an empty string.
# NOTE(review): the apostrophes inside the single-quoted content value are shown
# raw in this listing; presumably entity-encoded in the original file - confirm.
it "should bring description even some other meta tag is empty" do
|
158
|
+
html = <<~HTML
|
159
|
+
<html lang="en">
|
160
|
+
<head>
|
161
|
+
<META charset="utf-8">
|
162
|
+
<meta http-equiv="X-UA-Compatible" content="IE=Edge" />
|
163
|
+
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
164
|
+
<title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
|
165
|
+
<meta name="description" content="">
|
166
|
+
<meta name="description" content='Whether you're a Mac "die-hard" or an iPad "newbie" we give you the scoop on what's new, what's best and how to make the most out of the products you love.' />
|
167
|
+
</head>
|
168
|
+
<html>
|
169
|
+
HTML
|
170
|
+
meta_description = dummy_object.grep_meta_description(html.to_s)
|
171
|
+
expect(meta_description).to eq('Whether you\'re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what\'s new, what\'s best and how to make the most out of the products you love.')
|
172
|
+
end
|
173
|
+
end
|
174
|
+
describe 'Content key first meta description tag' do
|
175
|
+
# Fixture: two content-first description tags, both with empty content (one
# double-quoted, one single-quoted), alongside unrelated head elements. The
# example expects grep_meta_description to return nil.
it 'should return nil when content part is empty' do
|
176
|
+
no_meta_description = <<~HTML
|
177
|
+
<head>
|
178
|
+
<meta http-equiv="X-UA-Compatible" content="IE=edge">
|
179
|
+
<meta content="" name="description">
|
180
|
+
<meta content='' name="description">
|
181
|
+
<title>Internet Service Providers Chennai | Internet Telephony Service Providers Chennai | Internet bandwidth Providers Chennai - Pulse Telesystems Pvt Ltd</title>
|
182
|
+
<link href="http://www.pulse.in/new/wp-content/themes/pulse/css/bootstrap.min.css" rel="stylesheet" type="text/css">
|
183
|
+
</head>
|
184
|
+
HTML
|
185
|
+
meta_description = dummy_object.grep_meta_description(no_meta_description.to_s)
|
186
|
+
expect(meta_description).to be_nil
|
187
|
+
end
|
188
|
+
|
189
|
+
# Fixture: a well-formed description tag with content placed before
# name="description", next to og:description, uid and viewport tags. The example
# expects the description sentence back.
# NOTE(review): the og:description tag carries the same sentence, so the
# assertion alone does not show which tag was matched.
it 'should return description from valid tag' do
|
190
|
+
html = <<~HTML
|
191
|
+
<head>
|
192
|
+
<meta content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." property="og:description">
|
193
|
+
<meta content="" property="uid">
|
194
|
+
<meta content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." name="description">
|
195
|
+
<meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no" name="viewport">
|
196
|
+
</head>
|
197
|
+
HTML
|
198
|
+
meta_description = dummy_object.grep_meta_description(html.to_s)
|
199
|
+
expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
|
200
|
+
end
|
201
|
+
|
202
|
+
# Fixture: a content-first description tag (note the space before its closing
# '>') followed by a viewport tag split across two lines. The example expects
# the description sentence back with plain '&' characters.
# NOTE(review): in this listing the description tag sits on one line and shows
# no encoded entities, although the example name mentions both; presumably the
# rendering collapsed/decoded them - confirm against the original spec file.
it 'should return description even tag is multilined and partially encoded' do
|
203
|
+
html = <<~HTML
|
204
|
+
<head>
|
205
|
+
<meta content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." property="og:description">
|
206
|
+
<meta content="" property="uid">
|
207
|
+
<meta content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." name="description" >
|
208
|
+
<meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
|
209
|
+
name="viewport">
|
210
|
+
</head>
|
211
|
+
HTML
|
212
|
+
meta_description = dummy_object.grep_meta_description(html.to_s)
|
213
|
+
expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
|
214
|
+
end
|
215
|
+
|
216
|
+
# Fixture: a content-first description tag whose key value is mis-quoted - it
# opens with a backslash and single quote and closes with a double quote
# (name=\'description"). The example expects the description sentence back
# despite the mismatched quoting.
it 'should parse meta tag even it is partially single quoted' do
|
217
|
+
html = <<~HTML
|
218
|
+
<head>
|
219
|
+
<meta content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." property="og:description">
|
220
|
+
<meta content="" property="uid">
|
221
|
+
<meta content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." name=\'description">
|
222
|
+
<meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
|
223
|
+
name="viewport">
|
224
|
+
</head>
|
225
|
+
HTML
|
226
|
+
meta_description = dummy_object.grep_meta_description(html.to_s)
|
227
|
+
expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
|
228
|
+
end
|
229
|
+
|
230
|
+
# Fixture: a description tag with a class attribute before content and a charset
# attribute between content and the (mis-quoted) name key. The example expects
# the description sentence back even with the extra attributes present.
it 'should parse meta tag even it is having other attributes defined' do
|
231
|
+
html = <<~HTML
|
232
|
+
<head>
|
233
|
+
<meta content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." property="og:description">
|
234
|
+
<meta content="" property="uid">
|
235
|
+
<meta class="metadescription" content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." charset="UTF-8" name=\'description">
|
236
|
+
<meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
|
237
|
+
name="viewport">
|
238
|
+
</head>
|
239
|
+
HTML
|
240
|
+
meta_description = dummy_object.grep_meta_description(html.to_s)
|
241
|
+
expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
|
242
|
+
end
|
243
|
+
|
244
|
+
# Fixture: a content-first description tag keyed by itemprop instead of name
# (itemprop=\'description", mis-quoted), followed by a charset attribute. The
# example expects the description sentence back.
it 'should parse meta tag with itemprop as description key' do
|
245
|
+
html = <<~HTML
|
246
|
+
<head>
|
247
|
+
<meta content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." property="og:description">
|
248
|
+
<meta content="" property="uid">
|
249
|
+
<meta class="metadescription" content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." itemprop=\'description" charset="UTF-8">
|
250
|
+
<meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
|
251
|
+
name="viewport">
|
252
|
+
</head>
|
253
|
+
HTML
|
254
|
+
meta_description = dummy_object.grep_meta_description(html.to_s)
|
255
|
+
expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
|
256
|
+
end
|
257
|
+
|
258
|
+
# Fixture: a self-closing description tag with content first and the key written
# without quotes at the very end (name=description />). The example expects the
# description sentence back.
it 'should parse even name/itemprop key content is improperly assigned' do
|
259
|
+
html = <<~HTML
|
260
|
+
<head>
|
261
|
+
<meta content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." property="og:description">
|
262
|
+
<meta content="" property="uid">
|
263
|
+
<meta class="metadescription" content="With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront." charset="UTF-8" name=description />
|
264
|
+
<meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
|
265
|
+
name="viewport">
|
266
|
+
</head>
|
267
|
+
HTML
|
268
|
+
meta_description = dummy_object.grep_meta_description(html.to_s)
|
269
|
+
expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
|
270
|
+
end
|
271
|
+
|
272
|
+
# Fixture: a content-first description tag whose double-quoted content contains
# an apostrophe (Jersey's). The example expects the whole sentence back, with
# the apostrophe intact.
# NOTE(review): the fixture ends with <html> rather than </html> - presumably a
# typo in the fixture; confirm it is not intentional.
it 'should bring description having single quote' do
|
273
|
+
html = <<~HTML
|
274
|
+
<html lang="en">
|
275
|
+
<head>
|
276
|
+
<META charset="utf-8">
|
277
|
+
<meta http-equiv="X-UA-Compatible" content="IE=Edge" />
|
278
|
+
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
279
|
+
<title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
|
280
|
+
<meta content="Wilentz Goldman & Spitzer is one of New Jersey's largest law firms. Our lawyers proudly serve our clients in nearly every aspect of law." name="description" />
|
281
|
+
</head>
|
282
|
+
<html>
|
283
|
+
HTML
|
284
|
+
meta_description = dummy_object.grep_meta_description(html.to_s)
|
285
|
+
expect(meta_description).to eq('Wilentz Goldman & Spitzer is one of New Jersey\'s largest law firms. Our lawyers proudly serve our clients in nearly every aspect of law.')
|
286
|
+
end
|
287
|
+
|
288
|
+
# Fixture: a content-first description tag whose single-quoted content contains
# double-quoted words ("die-hard", "newbie"). The example expects the full
# sentence back, with both quote styles preserved.
# NOTE(review): the apostrophes inside the single-quoted content value are shown
# raw in this listing; presumably they were entity-encoded in the original
# file - confirm before relying on this fixture text.
it 'should bring description having double quote' do
|
289
|
+
html = <<~HTML
|
290
|
+
<html lang="en">
|
291
|
+
<head>
|
292
|
+
<META charset="utf-8">
|
293
|
+
<meta http-equiv="X-UA-Compatible" content="IE=Edge" />
|
294
|
+
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
295
|
+
<title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
|
296
|
+
<meta content='Whether you're a Mac "die-hard" or an iPad "newbie" we give you the scoop on what's new, what's best and how to make the most out of the products you love.' name="description" />
|
297
|
+
</head>
|
298
|
+
<html>
|
299
|
+
HTML
|
300
|
+
meta_description = dummy_object.grep_meta_description(html.to_s)
|
301
|
+
expect(meta_description).to eq('Whether you\'re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what\'s new, what\'s best and how to make the most out of the products you love.')
|
302
|
+
end
|
303
|
+
|
304
|
+
# Fixture: two content-first description tags; the first has empty content and
# the second carries the sentence. The example expects the sentence from the
# populated tag rather than nil or an empty string.
# NOTE(review): the apostrophes inside the single-quoted content value are shown
# raw in this listing; presumably entity-encoded in the original file - confirm.
it "should bring description even some other meta tag is empty" do
|
305
|
+
html = <<~HTML
|
306
|
+
<html lang="en">
|
307
|
+
<head>
|
308
|
+
<META charset="utf-8">
|
309
|
+
<meta http-equiv="X-UA-Compatible" content="IE=Edge" />
|
310
|
+
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
311
|
+
<title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
|
312
|
+
<meta content="" name="description">
|
313
|
+
<meta content='Whether you're a Mac "die-hard" or an iPad "newbie" we give you the scoop on what's new, what's best and how to make the most out of the products you love.' name="description"/>
|
314
|
+
</head>
|
315
|
+
<html>
|
316
|
+
HTML
|
317
|
+
meta_description = dummy_object.grep_meta_description(html.to_s)
|
318
|
+
expect(meta_description).to eq('Whether you\'re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what\'s new, what\'s best and how to make the most out of the products you love.')
|
319
|
+
end
|
320
|
+
end
|
321
|
+
end
|