brilliant_web_scraper 0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +4 -0
  3. data/README.md +31 -0
  4. data/brilliant_web_scraper-1.0.0.gem +0 -0
  5. data/brilliant_web_scraper-1.0.gem +0 -0
  6. data/brilliant_web_scraper.gemspec +30 -0
  7. data/lib/brilliant_web_scraper.rb +55 -0
  8. data/lib/parsers/description_helper.rb +28 -0
  9. data/lib/parsers/emails.rb +30 -0
  10. data/lib/parsers/facebook_profile.rb +11 -0
  11. data/lib/parsers/instagram_profile.rb +11 -0
  12. data/lib/parsers/linkedin_profile.rb +11 -0
  13. data/lib/parsers/meta_description.rb +13 -0
  14. data/lib/parsers/org_description.rb +13 -0
  15. data/lib/parsers/phone_numbers.rb +34 -0
  16. data/lib/parsers/pinterest_profile.rb +11 -0
  17. data/lib/parsers/redirected_to.rb +29 -0
  18. data/lib/parsers/title.rb +13 -0
  19. data/lib/parsers/twitter_description.rb +13 -0
  20. data/lib/parsers/twitter_profile.rb +11 -0
  21. data/lib/parsers/unescape_html_helper.rb +17 -0
  22. data/lib/parsers/vimeo_profile.rb +11 -0
  23. data/lib/parsers/youtube_channel.rb +29 -0
  24. data/lib/scraper/errors.rb +19 -0
  25. data/lib/scraper/scrape_exceptions.rb +49 -0
  26. data/lib/scraper/scrape_helper.rb +59 -0
  27. data/lib/scraper/scrape_request.rb +29 -0
  28. data/lib/version.rb +6 -0
  29. data/spec/lib/parsers/description_helper_spec.rb +24 -0
  30. data/spec/lib/parsers/emails_spec.rb +60 -0
  31. data/spec/lib/parsers/facebook_profile_spec.rb +77 -0
  32. data/spec/lib/parsers/instagram_profile_spec.rb +45 -0
  33. data/spec/lib/parsers/linkedin_profile_spec.rb +43 -0
  34. data/spec/lib/parsers/meta_description_spec.rb +321 -0
  35. data/spec/lib/parsers/org_description_spec.rb +316 -0
  36. data/spec/lib/parsers/phone_numbers_spec.rb +69 -0
  37. data/spec/lib/parsers/pinterest_profile_spec.rb +44 -0
  38. data/spec/lib/parsers/redirected_to_spec.rb +207 -0
  39. data/spec/lib/parsers/title_spec.rb +87 -0
  40. data/spec/lib/parsers/twitter_description_spec.rb +314 -0
  41. data/spec/lib/parsers/twitter_profile_spec.rb +59 -0
  42. data/spec/lib/parsers/unescape_html_helper_spec.rb +0 -0
  43. data/spec/lib/parsers/vimeo_profile_spec.rb +43 -0
  44. data/spec/lib/parsers/youtube_profile_spec.rb +82 -0
  45. data/spec/lib/scraper/brilliant_web_scrape_test.rb +66 -0
  46. data/spec/lib/scraper/scrape_request_test.rb +34 -0
  47. data/spec/spec_helper.rb +111 -0
  48. data/spec/vcr/encoding_compatibility_error.yml +316 -0
  49. data/spec/vcr/invalid_byte_sequence_utf_8.yml +2383 -0
  50. data/spec/vcr/no_valid_data_to_scrape.yml +109 -0
  51. data/spec/vcr/non_html_scrape.yml +163 -0
  52. data/spec/vcr/valid_scrape_response.yml +696 -0
  53. metadata +250 -0
@@ -0,0 +1,19 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'nesty'
4
+
5
# Namespace holding every error class this gem raises to its callers.
module WebScraper
  # Root of the gem's error hierarchy; rescue this to catch any gem error.
  # Mixes in Nesty::NestedError from the `nesty` gem.
  # NOTE(review): presumably nesty keeps the originally rescued exception and
  # its stack trace attached to the re-raised error - confirm in nesty's docs.
  class Error < StandardError
    include Nesty::NestedError
  end

  # Raised by ScrapeRequest when the HTTP request times out.
  class TimeoutError < Error
  end

  # Raised by ScrapeRequest for the non-timeout request failures it rescues.
  class RequestError < Error
  end

  # Raised when the fetched response cannot be parsed.
  class ParserError < Error
  end

  # Raised by ScrapeRequest when the response content type is not text/html.
  class NonHtmlError < ParserError
  end
end
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
# Catalogue of the low-level exceptions that ScrapeRequest rescues and
# re-raises as WebScraper errors. Both lists are frozen arrays of exception
# classes, intended to be splatted into a `rescue` clause.
module ScrapeExceptions
  # Non-timeout failures (HTTP status errors, socket/network errors, SSL and
  # URI problems). ScrapeRequest maps these to WebScraper::RequestError.
  GENERAL_EXCEPTIONS = [
    URI::InvalidURIError,
    RestClient::NotAcceptable,
    RestClient::BadGateway,
    RestClient::URITooLong,
    Encoding::CompatibilityError,
    RestClient::SeeOther,
    RestClient::LoopDetected,
    RestClient::PermanentRedirect,
    RestClient::Locked,
    RestClient::MethodNotAllowed,
    RestClient::NotImplemented,
    RestClient::PaymentRequired,
    RestClient::TooManyRequests,
    RestClient::RangeNotSatisfiable,
    Errno::ENETUNREACH,
    RestClient::Conflict,
    RestClient::ProxyAuthenticationRequired,
    Net::HTTPBadResponse,
    Errno::ECONNREFUSED,
    Errno::ECONNRESET,
    Errno::EHOSTUNREACH,
    Errno::EINVAL,
    OpenSSL::SSL::SSLError,
    RestClient::BadRequest,
    RestClient::Forbidden,
    RestClient::GatewayTimeout,
    RestClient::Gone,
    RestClient::InternalServerError,
    RestClient::MovedPermanently,
    RestClient::NotFound,
    RestClient::RequestFailed,
    RestClient::ServerBrokeConnection,
    RestClient::ServiceUnavailable,
    RestClient::SSLCertificateNotVerified,
    RestClient::Unauthorized,
    SocketError
  ].freeze

  # Timeout failures. ScrapeRequest rescues these before GENERAL_EXCEPTIONS
  # and maps them to WebScraper::TimeoutError.
  TIMEOUT_EXCEPTIONS = [
    RestClient::Exceptions::OpenTimeout,
    RestClient::Exceptions::ReadTimeout,
    RestClient::RequestTimeout
  ].freeze
end
@@ -0,0 +1,59 @@
1
+ # frozen_string_literal: true
2
+
3
# Fetches a page and extracts the following data from its HTML:
# @Title
# @Descriptions
# @Social Profiles
# @Contact Details
module ScrapeHelper
  # Requests +url+ through ScrapeRequest and runs every grep_* parser on the
  # response body.
  #
  # @param url [String] page to fetch
  # @param read_timeout [Numeric] forwarded to ScrapeRequest.new
  # @param connection_timeout [Numeric] forwarded to ScrapeRequest.new
  # @return [Hash] with keys
  #   :web_request_duration    - wall-clock seconds spent in ScrapeRequest.new
  #   :response_scrape_duraton - wall-clock seconds spent parsing (the key's
  #                              spelling is kept as-is for existing callers)
  #   :scrape_data             - the hash built by #grep_data
  # @raise [WebScraper::ParserError] when parsing raises ArgumentError a
  #   second time (after one re-encoding attempt) or raises
  #   Encoding::CompatibilityError
  # Errors raised by ScrapeRequest.new are not rescued here.
  def perform_scrape(url, read_timeout, connection_timeout)
    response = nil
    request_duration = Benchmark.measure do
      response = ScrapeRequest.new(url, read_timeout, connection_timeout)
    end.real

    # Work on the body string from here on: the retry path must re-encode
    # text, and must not rely on the response object itself being a String.
    html = response.body
    retried = false
    begin
      scrape_data = nil
      scrape_duration = Benchmark.measure { scrape_data = grep_data(html) }.real

      {
        web_request_duration: request_duration,
        response_scrape_duraton: scrape_duration,
        scrape_data: scrape_data
      }
    rescue ArgumentError => e
      # NOTE(review): presumably "invalid byte sequence" errors raised while
      # matching malformed text - confirm against the parsers.
      raise WebScraper::ParserError, e.message if retried

      retried = true
      # Round-trip through UTF-16BE, replacing invalid bytes with '?', then
      # come back to UTF-8 and parse once more.
      html = html.encode('UTF-16be', invalid: :replace, replace: '?')
                 .encode('UTF-8')
      retry
    rescue Encoding::CompatibilityError => e
      raise WebScraper::ParserError, e.message
    end
  end

  private

  # Runs each parser on the HTML text and collects the results.
  #
  # @param html [String] response body
  # @return [Hash] one entry per parser, keyed by the scraped field name
  def grep_data(html)
    {
      title: grep_title(html),
      meta_description: grep_meta_description(html),
      org_description: grep_org_description(html),
      twitter_description: grep_twitter_description(html),
      twitter_profile: grep_twitter_profile(html),
      linkedin_profile: grep_linkedin_profile(html),
      facebook_profile: grep_facebook_profile(html),
      instagram_profile: grep_instagram_profile(html),
      vimeo_profile: grep_vimeo_profile(html),
      pinterest_profile: grep_pinterest_profile(html),
      youtube_channel: grep_youtube_channel(html),
      emails: grep_emails(html),
      phone_numbers: grep_phone_numbers(html),
      redirected_to: grep_redirected_to_url(html)
    }
  end
end
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
# Makes the actual scrape request: returns the response when it is HTML,
# otherwise raises one of the WebScraper errors.
module ScrapeRequest
  extend ScrapeExceptions

  class << self
    # Performs an HTTP GET via RestClient.
    #
    # @param url [String] address to fetch
    # @param read_timeout [Numeric] passed to RestClient as :read_timeout
    # @param connection_timeout [Numeric] passed to RestClient as
    #   :open_timeout (RestClient's option name for the connect timeout)
    # @return the RestClient response, only when its Content-Type header
    #   matches text/html (case-insensitive)
    # @raise [WebScraper::NonHtmlError] for any other (or missing) content type
    # @raise [WebScraper::TimeoutError] for the TIMEOUT_EXCEPTIONS classes
    # @raise [WebScraper::RequestError] for the GENERAL_EXCEPTIONS classes
    def new(url, read_timeout, connection_timeout)
      response = RestClient::Request.execute(
        method: :get,
        url: url,
        read_timeout: read_timeout,
        # RestClient only reads :timeout, :open_timeout and :read_timeout; a
        # :connection_timeout key would be ignored.
        open_timeout: connection_timeout,
        headers: { 'accept-encoding': 'identity' }
      )
      content_type = response.headers[:content_type]
      return response if content_type =~ %r{(?i)text\s*\/\s*html}

      raise WebScraper::NonHtmlError, "Invalid response format received: #{content_type}"
    rescue *TIMEOUT_EXCEPTIONS => e
      raise WebScraper::TimeoutError, e.message
    rescue *GENERAL_EXCEPTIONS => e
      raise WebScraper::RequestError, e.message
    end
  end
end
data/lib/version.rb ADDED
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
# Gem namespace; this file only contributes the version constant.
module WebScraper
  # Current version number of the gem, as a string.
  VERSION = '0.1'
end
@@ -0,0 +1,24 @@
1
+ require 'spec_helper'
2
+
3
# Specs for DescriptionHelper#parse_description (called through `send`).
describe 'DescriptionHelper' do
  # Anonymous host class: avoids declaring a global DummyTestClass constant
  # that every other spec file would reopen and extend.
  let(:dummy_object) { Class.new { include DescriptionHelper }.new }

  it 'should return nil for invalid description' do
    descriptions = [" ", "", "|", "-"]
    expect(dummy_object.send(:parse_description, descriptions)).to be_nil
  end

  it 'should return valid description' do
    descriptions = [
      '2019年趣味幽默猜生肖,1358884不像看图找生肖,2019年看图猜生肖网站,2019看图找生肖83期,2019看图找生肖,2019看图找生肖109期,2019看图猜生肖买马,2019看图猜生肖买,2019全年看图找生肖图',
      "-"
    ]
    expect(
      dummy_object.send(:parse_description, descriptions)
    ).to eq('2019年趣味幽默猜生肖,1358884不像看图找生肖,2019年看图猜生肖网站,2019看图找生肖83期,2019看图找生肖,2019看图找生肖109期,2019看图猜生肖买马,2019看图猜生肖买,2019全年看图找生肖图')
  end
end
@@ -0,0 +1,60 @@
1
+ require 'spec_helper'
2
+
3
# Specs for Emails#grep_emails.
describe 'Emails' do
  # Anonymous host class: avoids declaring a global DummyTestClass constant
  # that every other spec file would reopen and extend.
  let(:dummy_object) { Class.new { include Emails }.new }

  it 'should return nil for invalid input' do
    expect(dummy_object.grep_emails(nil)).to be_nil
    expect(dummy_object.grep_emails('')).to be_nil
  end

  # Every address below is expected to be rejected by the parser.
  it 'should give []' do
    html = <<~HTML
      <a href="mailto:abc@example.com">abc@example.com</a>
      <a href="mailto:example@mail.com">example@email.com</a>
      <a href="mailto:name@domain.com">name@domain.com</a>
      <a href="mailto:name@company.com">name@company.com</a>
      <a href="mailto:you@youremail.com">you@youremail.com</a>
      <a href="mailto:your@emailaddress.com">your@emailaddress.com</a>
      <a href="mailto:yourname@yourdomain.com">yourname@yourdomain.com</a>
      <a href="mailto:yourname@yourcompany.com">yourname@yourcompany.com</a>
      <a href="mailto:YOU@EMAILADRESS.COM">YOU@EMAILADRESS.COM</a>
      <a href="mailto:you@address.com">you@address.com</a>
      <a href="mailto:xxx@yyy.zzz">xxx@yyy.zzz</a>
      <a href="mailto:test@test.com">test@test.com</a>
      <a href="mailto:@example.com">@example.com"</a>
    HTML
    expect(dummy_object.grep_emails(html.to_s)).to eq([])
  end

  it 'should grep organization contact emailaddresses' do
    html = <<~HTML
      <a href="mailto:abc@example.com">abc@example.com</a>
      <a class="fusion-social-network-icon fusion-tooltip " style="color:#ffffff;" href="mailto:&#119;ils&#111;&#110;&#064;&#119;&#105;&#108;&#115;on&#046;n&#098;&#046;c&#097;" target="_self" title="Email">
      <span class="screen-reader-text">Email</span>
      </a>
      <div>
      <br><strong>Mailing address</strong>
      : 1320 Yonge Street, Toronto, Ontario&#160; M4T 1X2<br><br>
      <strong>Attendance</strong>:&#160;<br>Junior School: 1639attendance@yorkschool.com<br>Middle &amp; Senior School: 1320attendance@yorkschool.com<br><br>
      </div>
      <a href="mailto:%20support@switcherstudio.com">
      <a href="mailto:%20support@switcherstudio.com">
      <a href="mailto:ekerlow@hellermanllc.com">
      &lt;a href=\\&quot;mailto:Michael.O%27Brien@idga.org?subject=Editorial%20Calendar%20Contributor\\&quot;&gt
    HTML
    emails = dummy_object.grep_emails(html.to_s)
    expected_emails = [
      "wilson@wilson.nb.ca",
      "support@switcherstudio.com",
      "ekerlow@hellermanllc.com",
      "michael.o'brien@idga.org",
      "1639attendance@yorkschool.com",
      "1320attendance@yorkschool.com"
    ]
    expect(emails).to eq(expected_emails)
  end
end
@@ -0,0 +1,77 @@
1
+ require 'spec_helper'
2
+
3
# Specs for FacebookProfile#grep_facebook_profile.
describe 'FaceBook Profile' do
  # Anonymous host class: avoids declaring a global DummyTestClass constant
  # that every other spec file would reopen and extend.
  let(:dummy_object) { Class.new { include FacebookProfile }.new }

  it 'should return nil for invalid input' do
    expect(dummy_object.grep_facebook_profile(nil)).to be_nil
    expect(dummy_object.grep_facebook_profile('')).to be_nil
  end

  # Share dialogs, plugins, help pages, tracking pixels etc. are not profiles.
  it 'should not grep any non profile url' do
    html = <<~HTML
      <a href="http://www.facebook.com/2008/fbml" target="_blank" class="sqs-svg-icon--wrapper facebook">
      <a href="https://www.facebook.com/v2.0/dialog/share" target="_blank" class="sqs-svg-icon--wrapper facebook">
      <a href="https://www.facebook.com/plugins/video.php?href=https%3A%2F%2Fwww.facebook.com%2FHFXMooseheads%2Fvideos" target="_blank" class="sqs-svg-icon--wrapper facebook">
      <a href="https://www.facebook.com/search.php" target="_blank" class="sqs-svg-icon--wrapper facebook">
      <a href="http://www.facebook.com/home.php#/pages/Zend-Technologies/190917412139" target="_blank" class="sqs-svg-icon--wrapper facebook">
      <img height="1" width="1" style="display:none" src="https://www.facebook.com/tr?id=1501718829946651&ev=PageView&noscript=1"/>
      <a href="http://www.facebook.com/sharer.php?u=https%3A%2F%2Fbroadreachstaffing.com&#038;t=Broadreach" class="et_social_share" rel="nofollow" data-social_name="facebook" data-post_id="68" data-social_type="share" data-location="sidebar"></a>
      <a href="https://www.facebook.com/photo.php?fbid=10157409473244808&set=p.10157409473244808&type=3" class="et_social_share" rel="nofollow" data-social_name="facebook" data-post_id="68" data-social_type="share" data-location="sidebar"></a>
      <a href="https://www.facebook.com/sharer/sharer.php?u=https%3A%2F%2Fwww.facebook.com%2Fchoosepremiere%2Fposts%2F10157307766122649" target="_blank" class="sqs-svg-icon--wrapper facebook">
      <a href="https://www.facebook.com/dialog/send?display=popup&#038;link=https%3A%2F%2Fsmartcookiemedia.com%2F&#038;redirect_uri=https://smartcookiemedia.com/" target="_blank" class="sqs-svg-icon--wrapper facebook">
      <a href="https://www.facebook.com/hashtag/beeryoga" target="_blank" class="sqs-svg-icon--wrapper facebook">
      <a href="http://facebook.com/privacy" title="Facebook Privacy" target="_blank">Facebook</a>
      <a target="_blank" title="Facebook - Social Gastronomy" href="http://www.facebook.com/home.php#/pages/Social-Gastronomy/187440209207?ref=ts"><img alt="images" src="/images/stories/social/images.jpg" width="30"></a>
      <a href="http://www.facebook.com/plugins/like.php" target="_blank" class="sqs-svg-icon--wrapper facebook">
      <iframe src="http://www.facebook.com/plugins/likebox.php?href=https%3A%2F%2Fwww.facebook.com%2Fbandcart&amp;width=220&amp;colorscheme=dark&amp;show_faces=false&amp;stream=false&amp;header=false&amp;height=65" scrolling="no" frameborder="0" style="border:none; overflow:hidden; width:220px; height: 65px;" allowtransparency="true"></iframe>
      <iframe style="border: none; overflow: hidden;" src="https://www.facebook.com/plugins/page.php?href=https%3A%2F%2Fwww.facebook.com%2FCarterBrothersCompany&tabs=timeline&width=340&height=500&small_header=false&adapt_container_width=true&hide_cover=false&show_facepile=true&appId=210697455750478" width="340" height="500" frameborder="0" scrolling="no"></iframe>
      <a href="https://www.facebook.com/offsite_event.php" target="_blank" class="sqs-svg-icon--wrapper facebook">
      <a href="http://www.facebook.com/share.php" target="_blank" class="sqs-svg-icon--wrapper facebook">
      <a href="https://www.facebook.com/login/device-based/regular/login/?login_attempt=1" target="_blank" class="sqs-svg-icon--wrapper facebook">
      <a href="https://www.facebook.com/recover/initiate?lwv=110" target="_blank" class="sqs-svg-icon--wrapper facebook">
      <a href="https://www.facebook.com/help/568137493302217" target="_blank" class="sqs-svg-icon--wrapper facebook">
      <a href="https://www.facebook.com/help/2687943754764396" target="_blank" class="sqs-svg-icon--wrapper facebook">
      <a href="https://www.facebook.com/help/www/1573156092981768/" target="_blank" class="sqs-svg-icon--wrapper facebook">
      <a href="https://facebook.com/security/hsts-pixel.gif?c=3.2.5" target="_blank" class="sqs-svg-icon--wrapper facebook">

    HTML
    expect(dummy_object.grep_facebook_profile(html.to_s)).to eq([])
  end

  it 'should grep valid urls' do
    html = <<~HTML
      <a href="http://facebook.com/AAEurope"><img alt="Follow us on Facebook - opens external site" src="/content/images/chrome/rebrand/icon-footer-facebook.png"></a>
      <a target="_blank" href="https://www.facebook.com/pages/Basketball-New-Brunswick/156176001133032?sk=wall" title="Follow us on Facebook">facebook</a>
      <a class="cff-photo cff-multiple cff-img-layout-4 cff-portrait nofancybox" style="max-width: 540px;" data-cff-page-name="Allied Printing Services" data-cff-post-time="4 days ago" href="https://www.facebook.com/alliedprinting/posts/2323507841068448"> FB Posts</a>
      <a class="cff-photo cff-multiple cff-img-layout-4 cff-portrait nofancybox" style="max-width: 540px;" data-cff-page-name="Allied Printing Services" data-cff-post-time="4 days ago" href="https://www.facebook.com/groups/1004350633012081/"> FB Posts</a>
      <a href="https://www.facebook.com/events/116316035951805/"><img alt="Follow us on Facebook - opens external site" src="/content/images/chrome/rebrand/icon-footer-facebook.png"></a>
      <a href="https://www.facebook.com/arithane.foamroofing"><img alt="Follow us on Facebook - opens external site" src="/content/images/chrome/rebrand/icon-footer-facebook.png"></a>
      <a href="http://www.facebook.com/pages/Surgical+Information+Systems/75322028321"><img alt="Follow us on Facebook - opens external site" src="/content/images/chrome/rebrand/icon-footer-facebook.png"></a>
      <a href="https://www.facebook.com/Baylor-School-124353897738/"><img alt="Follow us on Facebook - opens external site" src="/content/images/chrome/rebrand/icon-footer-facebook.png"></a>
      <a href="http://www.facebook.com/profile.php?id=100000325114186&v=info#!/pages/Blackstone-Counsel/150651724966482" target="_blank">
      <img class="social" src="facebook.jpg" alt="Facebook"/>
      </a>
      <a href="http://facebook.com/profile.php?id=205682532825685" target="_blank"><img class="social" src="facebook.jpg" alt="Facebook"/></a>
      <a href="http://www.facebook.com/share.php" target="_blank" class="sqs-svg-icon--wrapper facebook">
    HTML
    fb_profiles = dummy_object.grep_facebook_profile(html.to_s)
    expected_profiles = [
      'http://facebook.com/AAEurope',
      'https://www.facebook.com/pages/Basketball-New-Brunswick/156176001133032?sk=wall',
      'https://www.facebook.com/alliedprinting/posts/2323507841068448',
      'https://www.facebook.com/groups/1004350633012081/',
      'https://www.facebook.com/events/116316035951805/',
      'https://www.facebook.com/arithane.foamroofing',
      'http://www.facebook.com/pages/Surgical+Information+Systems/75322028321',
      'https://www.facebook.com/Baylor-School-124353897738/',
      'http://www.facebook.com/profile.php?id=100000325114186',
      'http://facebook.com/profile.php?id=205682532825685'
    ]
    expect(fb_profiles).to eq(expected_profiles)
  end
end
@@ -0,0 +1,45 @@
1
+ require 'spec_helper'
2
+
3
# Specs for InstagramProfile#grep_instagram_profile.
describe 'Instagram Profile' do
  # Anonymous host class: avoids declaring a global DummyTestClass constant
  # that every other spec file would reopen and extend.
  let(:dummy_object) { Class.new { include InstagramProfile }.new }

  it 'should return nil for invalid input' do
    expect(dummy_object.grep_instagram_profile(nil)).to be_nil
    expect(dummy_object.grep_instagram_profile('')).to be_nil
  end

  it 'should not grep below url format' do
    html = <<~HTML
      <a href="http://instagram.com/" style="color: white;" class="fa fa-instagram"></a>
      <a href="http://instagram.com/#" style="color: white;" class="fa fa-instagram"></a>
      <a href="https://www.instagram.com/%username%" style="color: white;" class="fa fa-instagram"></a>
      <a href="https://www.instagram.com/explore/tags/Talent/" style="color: white;" class="fa fa-instagram"></a>
    HTML
    expect(dummy_object.grep_instagram_profile(html.to_s)).to eq([])
  end

  it 'should grep organization instagram profiles' do
    html = <<~HTML
      <a href="https://www.instagram.com/nextgenhealthcare" target="_blank">Instagram</a>
      <a href="https://instagram.com/nextgenhealthcare" target="_blank">Instagram</a>
      <a href="https://www.instagram.com/printed4you.co.uk" target="_blank">Instagram</a>
      <a href="https://www.instagram.com/web_spiders" target="_blank">Instagram</a>
      <a href="http://instagram.com/mccaincanada?ref=badge" target="_blank">Instagram</a>
      <a href="http://instagram.com/mcdermottscholars&quot;,&quot;target&quot;:&quot;_blank&quot;}},&quot;displayMode&quot;:&quot;fill&quot;}" target="_blank">Instagram</a>
    HTML
    instagram_profiles = dummy_object.grep_instagram_profile(html.to_s)
    expected_instagram_profiles = [
      'https://www.instagram.com/nextgenhealthcare',
      'https://instagram.com/nextgenhealthcare',
      'https://www.instagram.com/printed4you.co.uk',
      'https://www.instagram.com/web_spiders',
      'http://instagram.com/mccaincanada',
      'http://instagram.com/mcdermottscholars'
    ]
    # Compare against the expected list; comparing the parser output with a
    # second call to the parser could never fail.
    expect(instagram_profiles).to eq(expected_instagram_profiles)
  end
end
@@ -0,0 +1,43 @@
1
+ require 'spec_helper'
2
+
3
# Specs for LinkedinProfile#grep_linkedin_profile.
describe 'Linkedin Profile' do
  # Anonymous host class: avoids declaring a global DummyTestClass constant
  # that every other spec file would reopen and extend.
  let(:dummy_object) { Class.new { include LinkedinProfile }.new }

  it 'should return nil for invalid input' do
    expect(dummy_object.grep_linkedin_profile(nil)).to be_nil
    expect(dummy_object.grep_linkedin_profile('')).to be_nil
  end

  it 'should not grep below url format' do
    html = <<~HTML
      <a href="https://www.linkedin.com" style="color: white;" class="fa fa-linkedin"></a>
      <a href="https://www.linkedin.com/feed/" style="color: white;" class="fa fa-linkedin"></a>
      <a href="https://www.linkedin.com/mynetwork/" style="color: white;" class="fa fa-linkedin"></a>
      <a href="https://www.linkedin.com/jobs/" style="color: white;" class="fa fa-linkedin"></a>
      <a href="https://www.linkedin.com/messaging/" style="color: white;" class="fa fa-linkedin"></a>
      <a href="https://www.linkedin.com/notifications/" style="color: white;" class="fa fa-linkedin"></a>
      <a href="https://www.linkedin.com/psettings/" style="color: white;" class="fa fa-linkedin"></a>
      <a href="https://www.linkedin.com/ca/pet-32/" style="color: white;" class="fa fa-linkedin"></a>
    HTML
    expect(dummy_object.grep_linkedin_profile(html.to_s)).to eq([])
  end

  it 'should grep organization linkedin profiles' do
    html = <<~HTML
      <a href="https://www.linkedin.com/company/13247248/" target="_blank">Linkedin</a>
      <a href="https://www.linkedin.com/company/m-files-corporation" target="_blank">Linkedin</a>
      <a href="https://www.linkedin.com/company/dataendure" target="_blank">Linkedin</a>
    HTML
    expected_linkedin_profiles = [
      'https://www.linkedin.com/company/13247248',
      'https://www.linkedin.com/company/m-files-corporation',
      'https://www.linkedin.com/company/dataendure'
    ]
    expect(dummy_object.grep_linkedin_profile(html.to_s)).to eq(expected_linkedin_profiles)
  end
end
@@ -0,0 +1,321 @@
1
+ require 'spec_helper'
2
+
3
+ describe 'Meta Description' do
4
+ class DummyTestClass
5
+ include MetaDescription
6
+ end
7
+ let(:dummy_object) { DummyTestClass.new }
8
+
9
+ it 'should return nil for invalid inputs' do
10
+ expect(dummy_object.grep_meta_description('')).to be_nil
11
+ expect(dummy_object.grep_meta_description(nil)).to be_nil
12
+ end
13
+ describe 'Name key first meta description tag' do
14
+ it 'should return nil for no meta description tag presence' do
15
+ no_meta_description = <<~HTML
16
+ <head>
17
+ <meta charset="utf-8">
18
+ <meta http-equiv="X-UA-Compatible" content="IE=edge">
19
+ <meta name="viewport" content="width=device-width, initial-scale=1">
20
+ <title>Internet Service Providers Chennai | Internet Telephony Service Providers Chennai | Internet bandwidth Providers Chennai - Pulse Telesystems Pvt Ltd</title>
21
+ <meta name="google-site-verification" content="h2NvZnvL9v536RUYH3jney-9V8JRBGESmzH5-ph0EM4">
22
+ <link href="http://www.pulse.in/new/wp-content/themes/pulse/css/bootstrap.min.css" rel="stylesheet" type="text/css">
23
+ </head>
24
+ HTML
25
+ meta_description = dummy_object.grep_meta_description(no_meta_description.to_s)
26
+ expect(meta_description).to be_nil
27
+ end
28
+
29
+ it 'should return nil when content part is empty' do
30
+ no_meta_description = <<~HTML
31
+ <head>
32
+ <meta http-equiv="X-UA-Compatible" content="IE=edge">
33
+ <meta name="description" content="">
34
+ <title>Internet Service Providers Chennai | Internet Telephony Service Providers Chennai | Internet bandwidth Providers Chennai - Pulse Telesystems Pvt Ltd</title>
35
+ <link href="http://www.pulse.in/new/wp-content/themes/pulse/css/bootstrap.min.css" rel="stylesheet" type="text/css">
36
+ </head>
37
+ HTML
38
+ meta_description = dummy_object.grep_meta_description(no_meta_description.to_s)
39
+ expect(meta_description).to be_nil
40
+ end
41
+
42
+ it 'should return description from valid tag' do
43
+ html = <<~HTML
44
+ <head>
45
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property="og:description">
46
+ <meta content="" property="uid">
47
+ <meta name="description" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront.">
48
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no" name="viewport">
49
+ </head>
50
+ HTML
51
+ meta_description = dummy_object.grep_meta_description(html.to_s)
52
+ expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
53
+ end
54
+
55
+ it 'should return description even tag is multilined and partially encoded' do
56
+ html = <<~HTML
57
+ <head>
58
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property="og:description">
59
+ <meta content="" property="uid">
60
+ <meta name="description" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront.">
61
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
62
+ name="viewport">
63
+ </head>
64
+ HTML
65
+ meta_description = dummy_object.grep_meta_description(html.to_s)
66
+ expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
67
+ end
68
+
69
+ it 'should parse meta tag even it is partially single quoted' do
70
+ html = <<~HTML
71
+ <head>
72
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property="og:description">
73
+ <meta content="" property="uid">
74
+ <meta name=\'description" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront.">
75
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
76
+ name="viewport">
77
+ </head>
78
+ HTML
79
+ meta_description = dummy_object.grep_meta_description(html.to_s)
80
+ expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
81
+ end
82
+
83
+ it 'should parse meta tag even it is having other attributes defined' do
84
+ html = <<~HTML
85
+ <head>
86
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property="og:description">
87
+ <meta content="" property="uid">
88
+ <meta class="metadescription" name=\'description" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." charset="UTF-8">
89
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
90
+ name="viewport">
91
+ </head>
92
+ HTML
93
+ meta_description = dummy_object.grep_meta_description(html.to_s)
94
+ expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
95
+ end
96
+
97
+ it 'should parse meta tag with itemprop as description key' do
98
+ html = <<~HTML
99
+ <head>
100
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property="og:description">
101
+ <meta content="" property="uid">
102
+ <meta class="metadescription" itemprop=\'description" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." charset="UTF-8">
103
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
104
+ name="viewport">
105
+ </head>
106
+ HTML
107
+ meta_description = dummy_object.grep_meta_description(html.to_s)
108
+ expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
109
+ end
110
+
111
+ it 'should parse even name/itemprop key content is improperly assigned' do
112
+ html = <<~HTML
113
+ <head>
114
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property="og:description">
115
+ <meta content="" property="uid">
116
+ <meta class="metadescription" name=description content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." charset="UTF-8" />
117
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
118
+ name="viewport">
119
+ </head>
120
+ HTML
121
+ meta_description = dummy_object.grep_meta_description(html.to_s)
122
+ expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
123
+ end
124
+
125
+ it 'should bring description having single quote' do
126
+ html = <<~HTML
127
+ <html lang="en">
128
+ <head>
129
+ <META charset="utf-8">
130
+ <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
131
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
132
+ <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
133
+ <meta name="description" content="Wilentz Goldman & Spitzer is one of New Jersey's largest law firms. Our lawyers proudly serve our clients in nearly every aspect of law." />
134
+ </head>
135
+ <html>
136
+ HTML
137
+ meta_description = dummy_object.grep_meta_description(html.to_s)
138
+ expect(meta_description).to eq('Wilentz Goldman & Spitzer is one of New Jersey\'s largest law firms. Our lawyers proudly serve our clients in nearly every aspect of law.')
139
+ end
140
+
141
# Fixture: a name-first description tag whose content attribute is
# single-quoted, contains literal double quotes, and encodes apostrophes as
# &#039; entities. The expected string has the entities decoded to apostrophes
# and the double quotes kept as-is.
+ it 'should bring description having double quote' do
142
+ html = <<~HTML
143
+ <html lang="en">
144
+ <head>
145
+ <META charset="utf-8">
146
+ <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
147
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
148
+ <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
149
+ <meta name="description" content='Whether you&#039;re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what&#039;s new, what&#039;s best and how to make the most out of the products you love.' />
150
+ </head>
151
+ <html>
152
+ HTML
153
+ meta_description = dummy_object.grep_meta_description(html.to_s)
154
+ expect(meta_description).to eq('Whether you\'re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what\'s new, what\'s best and how to make the most out of the products you love.')
155
+ end
156
+
157
# Fixture: two name-first description tags, the first with empty content and
# the second populated. The example expects the populated tag's text (with
# &#039; decoded) rather than nil or an empty string.
+ it "should bring description even some other meta tag is empty" do
158
+ html = <<~HTML
159
+ <html lang="en">
160
+ <head>
161
+ <META charset="utf-8">
162
+ <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
163
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
164
+ <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
165
+ <meta name="description" content="">
166
+ <meta name="description" content='Whether you&#039;re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what&#039;s new, what&#039;s best and how to make the most out of the products you love.' />
167
+ </head>
168
+ <html>
169
+ HTML
170
+ meta_description = dummy_object.grep_meta_description(html.to_s)
171
+ expect(meta_description).to eq('Whether you\'re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what\'s new, what\'s best and how to make the most out of the products you love.')
172
+ end
173
+ end
174
+ describe 'Content key first meta description tag' do
175
# Fixture: two content-first description tags, both with empty content (one
# double-quoted, one single-quoted). With no non-empty description present,
# the example expects grep_meta_description to return nil.
+ it 'should return nil when content part is empty' do
176
+ no_meta_description = <<~HTML
177
+ <head>
178
+ <meta http-equiv="X-UA-Compatible" content="IE=edge">
179
+ <meta content="" name="description">
180
+ <meta content='' name="description">
181
+ <title>Internet Service Providers Chennai | Internet Telephony Service Providers Chennai | Internet bandwidth Providers Chennai - Pulse Telesystems Pvt Ltd</title>
182
+ <link href="http://www.pulse.in/new/wp-content/themes/pulse/css/bootstrap.min.css" rel="stylesheet" type="text/css">
183
+ </head>
184
+ HTML
185
+ meta_description = dummy_object.grep_meta_description(no_meta_description.to_s)
186
+ expect(meta_description).to be_nil
187
+ end
188
+
189
# Fixture: content-first tags in this order - an og:description property tag,
# an empty "uid" property tag, then a name="description" tag. The expected
# string is the description text with &amp; decoded to "&".
# NOTE(review): the og:description and name="description" tags carry identical
# text, so this example alone cannot show which tag was matched.
+ it 'should return description from valid tag' do
190
+ html = <<~HTML
191
+ <head>
192
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property="og:description">
193
+ <meta content="" property="uid">
194
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." name="description">
195
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no" name="viewport">
196
+ </head>
197
+ HTML
198
+ meta_description = dummy_object.grep_meta_description(html.to_s)
199
+ expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
200
+ end
201
+
202
# Fixture: same content-first layout as the previous example, but the
# description tag has a space before its closing ">" and the following
# viewport tag is split across two lines. The expected string is the
# description text with &amp; decoded to "&".
+ it 'should return description even tag is multilined and partially encoded' do
203
+ html = <<~HTML
204
+ <head>
205
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property="og:description">
206
+ <meta content="" property="uid">
207
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." name="description" >
208
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
209
+ name="viewport">
210
+ </head>
211
+ HTML
212
+ meta_description = dummy_object.grep_meta_description(html.to_s)
213
+ expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
214
+ end
215
+
216
# Fixture: a content-first description tag whose name attribute has mismatched
# quotes - it opens with an apostrophe (written as \' in the heredoc) and
# closes with a double quote. The example expects the description to be
# extracted anyway, with &amp; decoded to "&".
+ it 'should parse meta tag even it is partially single quoted' do
217
+ html = <<~HTML
218
+ <head>
219
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property="og:description">
220
+ <meta content="" property="uid">
221
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." name=\'description">
222
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
223
+ name="viewport">
224
+ </head>
225
+ HTML
226
+ meta_description = dummy_object.grep_meta_description(html.to_s)
227
+ expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
228
+ end
229
+
230
# Fixture: the description tag carries extra attributes around the ones of
# interest - class before content, charset between content and name - and its
# name attribute again has mismatched quotes. The example expects the content
# text with &amp; decoded to "&".
+ it 'should parse meta tag even it is having other attributes defined' do
231
+ html = <<~HTML
232
+ <head>
233
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property="og:description">
234
+ <meta content="" property="uid">
235
+ <meta class="metadescription" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." charset="UTF-8" name=\'description">
236
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
237
+ name="viewport">
238
+ </head>
239
+ HTML
240
+ meta_description = dummy_object.grep_meta_description(html.to_s)
241
+ expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
242
+ end
243
+
244
# Fixture: the description tag is keyed by itemprop instead of name (with
# mismatched quotes on its value) and is followed by a charset attribute.
# The example expects the content text with &amp; decoded to "&".
+ it 'should parse meta tag with itemprop as description key' do
245
+ html = <<~HTML
246
+ <head>
247
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property="og:description">
248
+ <meta content="" property="uid">
249
+ <meta class="metadescription" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." itemprop=\'description" charset="UTF-8">
250
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
251
+ name="viewport">
252
+ </head>
253
+ HTML
254
+ meta_description = dummy_object.grep_meta_description(html.to_s)
255
+ expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
256
+ end
257
+
258
# Fixture: the description tag writes its name value with no quotes at all
# (name=description) and is self-closed with " />". The example expects the
# content text with &amp; decoded to "&".
+ it 'should parse even name/itemprop key content is improperly assigned' do
259
+ html = <<~HTML
260
+ <head>
261
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property="og:description">
262
+ <meta content="" property="uid">
263
+ <meta class="metadescription" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." charset="UTF-8" name=description />
264
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
265
+ name="viewport">
266
+ </head>
267
+ HTML
268
+ meta_description = dummy_object.grep_meta_description(html.to_s)
269
+ expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
270
+ end
271
+
272
# Content-first counterpart of the apostrophe case: the content attribute
# comes before name="description", is double-quoted, and holds a literal
# apostrophe and a bare ampersand. The example expects that text unchanged.
+ it 'should bring description having single quote' do
273
+ html = <<~HTML
274
+ <html lang="en">
275
+ <head>
276
+ <META charset="utf-8">
277
+ <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
278
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
279
+ <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
280
+ <meta content="Wilentz Goldman & Spitzer is one of New Jersey's largest law firms. Our lawyers proudly serve our clients in nearly every aspect of law." name="description" />
281
+ </head>
282
+ <html>
283
+ HTML
284
+ meta_description = dummy_object.grep_meta_description(html.to_s)
285
+ expect(meta_description).to eq('Wilentz Goldman & Spitzer is one of New Jersey\'s largest law firms. Our lawyers proudly serve our clients in nearly every aspect of law.')
286
+ end
287
+
288
# Content-first counterpart of the double-quote case: a single-quoted content
# attribute containing literal double quotes and &#039; entities, placed
# before name="description". The expected string has the entities decoded to
# apostrophes and the double quotes kept.
+ it 'should bring description having double quote' do
289
+ html = <<~HTML
290
+ <html lang="en">
291
+ <head>
292
+ <META charset="utf-8">
293
+ <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
294
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
295
+ <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
296
+ <meta content='Whether you&#039;re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what&#039;s new, what&#039;s best and how to make the most out of the products you love.' name="description" />
297
+ </head>
298
+ <html>
299
+ HTML
300
+ meta_description = dummy_object.grep_meta_description(html.to_s)
301
+ expect(meta_description).to eq('Whether you\'re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what\'s new, what\'s best and how to make the most out of the products you love.')
302
+ end
303
+
304
# Content-first counterpart of the empty-then-populated case: an empty-content
# description tag is followed by a populated one, which closes with "/>" and
# no preceding space. The example expects the populated tag's text with
# &#039; decoded.
+ it "should bring description even some other meta tag is empty" do
305
+ html = <<~HTML
306
+ <html lang="en">
307
+ <head>
308
+ <META charset="utf-8">
309
+ <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
310
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
311
+ <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
312
+ <meta content="" name="description">
313
+ <meta content='Whether you&#039;re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what&#039;s new, what&#039;s best and how to make the most out of the products you love.' name="description"/>
314
+ </head>
315
+ <html>
316
+ HTML
317
+ meta_description = dummy_object.grep_meta_description(html.to_s)
318
+ expect(meta_description).to eq('Whether you\'re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what\'s new, what\'s best and how to make the most out of the products you love.')
319
+ end
320
+ end
321
+ end