brilliant_web_scraper 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +4 -0
  3. data/README.md +31 -0
  4. data/brilliant_web_scraper-1.0.0.gem +0 -0
  5. data/brilliant_web_scraper-1.0.gem +0 -0
  6. data/brilliant_web_scraper.gemspec +30 -0
  7. data/lib/brilliant_web_scraper.rb +55 -0
  8. data/lib/parsers/description_helper.rb +28 -0
  9. data/lib/parsers/emails.rb +30 -0
  10. data/lib/parsers/facebook_profile.rb +11 -0
  11. data/lib/parsers/instagram_profile.rb +11 -0
  12. data/lib/parsers/linkedin_profile.rb +11 -0
  13. data/lib/parsers/meta_description.rb +13 -0
  14. data/lib/parsers/org_description.rb +13 -0
  15. data/lib/parsers/phone_numbers.rb +34 -0
  16. data/lib/parsers/pinterest_profile.rb +11 -0
  17. data/lib/parsers/redirected_to.rb +29 -0
  18. data/lib/parsers/title.rb +13 -0
  19. data/lib/parsers/twitter_description.rb +13 -0
  20. data/lib/parsers/twitter_profile.rb +11 -0
  21. data/lib/parsers/unescape_html_helper.rb +17 -0
  22. data/lib/parsers/vimeo_profile.rb +11 -0
  23. data/lib/parsers/youtube_channel.rb +29 -0
  24. data/lib/scraper/errors.rb +19 -0
  25. data/lib/scraper/scrape_exceptions.rb +49 -0
  26. data/lib/scraper/scrape_helper.rb +59 -0
  27. data/lib/scraper/scrape_request.rb +29 -0
  28. data/lib/version.rb +6 -0
  29. data/spec/lib/parsers/description_helper_spec.rb +24 -0
  30. data/spec/lib/parsers/emails_spec.rb +60 -0
  31. data/spec/lib/parsers/facebook_profile_spec.rb +77 -0
  32. data/spec/lib/parsers/instagram_profile_spec.rb +45 -0
  33. data/spec/lib/parsers/linkedin_profile_spec.rb +43 -0
  34. data/spec/lib/parsers/meta_description_spec.rb +321 -0
  35. data/spec/lib/parsers/org_description_spec.rb +316 -0
  36. data/spec/lib/parsers/phone_numbers_spec.rb +69 -0
  37. data/spec/lib/parsers/pinterest_profile_spec.rb +44 -0
  38. data/spec/lib/parsers/redirected_to_spec.rb +207 -0
  39. data/spec/lib/parsers/title_spec.rb +87 -0
  40. data/spec/lib/parsers/twitter_description_spec.rb +314 -0
  41. data/spec/lib/parsers/twitter_profile_spec.rb +59 -0
  42. data/spec/lib/parsers/unescape_html_helper_spec.rb +0 -0
  43. data/spec/lib/parsers/vimeo_profile_spec.rb +43 -0
  44. data/spec/lib/parsers/youtube_profile_spec.rb +82 -0
  45. data/spec/lib/scraper/brilliant_web_scrape_test.rb +66 -0
  46. data/spec/lib/scraper/scrape_request_test.rb +34 -0
  47. data/spec/spec_helper.rb +111 -0
  48. data/spec/vcr/encoding_compatibility_error.yml +316 -0
  49. data/spec/vcr/invalid_byte_sequence_utf_8.yml +2383 -0
  50. data/spec/vcr/no_valid_data_to_scrape.yml +109 -0
  51. data/spec/vcr/non_html_scrape.yml +163 -0
  52. data/spec/vcr/valid_scrape_response.yml +696 -0
  53. metadata +250 -0
@@ -0,0 +1,19 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'nesty'
4
+
5
# Namespace holding every error type the scraper raises to its callers.
module WebScraper
  # Root of the scraper's error hierarchy.
  # Mixes in Nesty::NestedError, presumably so that the exception which
  # triggered this one (and its stack trace) stays attached - confirm
  # against the nesty gem documentation.
  class Error < StandardError
    include Nesty::NestedError
  end

  # Raised by ScrapeRequest when the request times out.
  class TimeoutError < Error
  end

  # Raised by ScrapeRequest for any other request/transport failure.
  class RequestError < Error
  end

  # Raised by ScrapeHelper when the response body cannot be parsed.
  class ParserError < Error
  end

  # Raised by ScrapeRequest when the response content type is not HTML.
  class NonHtmlError < ParserError
  end
end
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
# Lists of low-level exceptions that ScrapeRequest rescues and translates
# into WebScraper errors.
module ScrapeExceptions
  # Failures re-raised by ScrapeRequest as WebScraper::RequestError.
  # The order of the entries is kept exactly as originally declared.
  GENERAL_EXCEPTIONS = [
    URI::InvalidURIError, RestClient::NotAcceptable, RestClient::BadGateway,
    RestClient::URITooLong, Encoding::CompatibilityError, RestClient::SeeOther,
    RestClient::LoopDetected, RestClient::PermanentRedirect, RestClient::Locked,
    RestClient::MethodNotAllowed, RestClient::NotImplemented,
    RestClient::PaymentRequired, RestClient::TooManyRequests,
    RestClient::RangeNotSatisfiable, Errno::ENETUNREACH, RestClient::Conflict,
    RestClient::ProxyAuthenticationRequired, Net::HTTPBadResponse,
    Errno::ECONNREFUSED, Errno::ECONNRESET, Errno::EHOSTUNREACH, Errno::EINVAL,
    OpenSSL::SSL::SSLError, RestClient::BadRequest, RestClient::Forbidden,
    RestClient::GatewayTimeout, RestClient::Gone,
    RestClient::InternalServerError, RestClient::MovedPermanently,
    RestClient::NotFound, RestClient::RequestFailed,
    RestClient::ServerBrokeConnection, RestClient::ServiceUnavailable,
    RestClient::SSLCertificateNotVerified, RestClient::Unauthorized, SocketError
  ].freeze

  # Failures re-raised by ScrapeRequest as WebScraper::TimeoutError.
  TIMEOUT_EXCEPTIONS = [
    RestClient::Exceptions::OpenTimeout,
    RestClient::Exceptions::ReadTimeout,
    RestClient::RequestTimeout
  ].freeze
end
@@ -0,0 +1,59 @@
1
+ # frozen_string_literal: true
2
+
3
# Scrapes the data below from a fetched web page:
# @Title
# @Descriptions
# @Social Profiles
# @Contact Details
module ScrapeHelper
  # Fetches +url+ through ScrapeRequest and runs every parser on the body.
  #
  # @param url [String] page to fetch
  # @param read_timeout [Numeric] forwarded unchanged to ScrapeRequest.new
  # @param connection_timeout [Numeric] forwarded unchanged to ScrapeRequest.new
  # @return [Hash] with keys :web_request_duration, :response_scrape_duraton
  #   (the misspelling is part of the existing interface and is kept on
  #   purpose) and :scrape_data
  # @raise [WebScraper::ParserError] when parsing still fails after one
  #   re-encoding attempt, or on Encoding::CompatibilityError
  # Errors raised by ScrapeRequest.new are not rescued here.
  def perform_scrape(url, read_timeout, connection_timeout)
    response = nil
    request_duration = Benchmark.measure do
      response = ScrapeRequest.new(url, read_timeout, connection_timeout)
    end.real

    # Keep the body in its own local. The previous code overwrote
    # `response` with the re-encoded String and then called `response.body`
    # again on retry, which fails because a plain String has no #body.
    body = response.body
    retried = false
    begin
      scrape_data = nil
      scrape_duration = Benchmark.measure do
        scrape_data = grep_data(body)
      end.real
    rescue ArgumentError => e
      # Only one repair attempt is made; a second failure is reported.
      raise WebScraper::ParserError, e.message if retried

      retried = true
      # Round-trip through UTF-16BE so invalid byte sequences become '?'.
      body = body.encode('UTF-16be', invalid: :replace, replace: '?')
                 .encode('UTF-8')
      retry
    rescue Encoding::CompatibilityError => e
      raise WebScraper::ParserError, e.message
    end

    {
      web_request_duration: request_duration,
      response_scrape_duraton: scrape_duration,
      scrape_data: scrape_data
    }
  end

  private

  # Runs each grep_* parser on the page body and collects the results.
  # The grep_* methods are not defined in this module; they are expected
  # to be available on the object that includes it.
  #
  # @param response [String] page body
  # @return [Hash] one entry per parser, keyed by the scraped field name
  def grep_data(response)
    {
      title: grep_title(response),
      meta_description: grep_meta_description(response),
      org_description: grep_org_description(response),
      twitter_description: grep_twitter_description(response),
      twitter_profile: grep_twitter_profile(response),
      linkedin_profile: grep_linkedin_profile(response),
      facebook_profile: grep_facebook_profile(response),
      instagram_profile: grep_instagram_profile(response),
      vimeo_profile: grep_vimeo_profile(response),
      pinterest_profile: grep_pinterest_profile(response),
      youtube_channel: grep_youtube_channel(response),
      emails: grep_emails(response),
      phone_numbers: grep_phone_numbers(response),
      redirected_to: grep_redirected_to_url(response)
    }
  end
end
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
# @Makes the actual scrape request; either returns the response or raises
module ScrapeRequest
  extend ScrapeExceptions
  class << self
    # Performs a GET on +url+ and returns the response when it is HTML.
    #
    # @param url [String] address to fetch
    # @param read_timeout [Numeric] passed to RestClient as :read_timeout
    # @param connection_timeout [Numeric] passed to RestClient as
    #   :open_timeout
    # @return the RestClient response whose content type matches text/html
    # @raise [WebScraper::NonHtmlError] when the content type is not HTML
    # @raise [WebScraper::TimeoutError] for any of TIMEOUT_EXCEPTIONS
    # @raise [WebScraper::RequestError] for any of GENERAL_EXCEPTIONS
    def new(url, read_timeout, connection_timeout)
      params_hash = {
        method: :get,
        url: url,
        read_timeout: read_timeout,
        # RestClient::Request reads :open_timeout; the former
        # :connection_timeout key was not a recognised option, so the
        # caller's connection timeout was silently ignored.
        open_timeout: connection_timeout,
        headers: { 'accept-encoding': 'identity' }
      }
      response = RestClient::Request.execute(params_hash)
      content_type = response.headers[:content_type]
      return response if content_type =~ %r{(?i)text\s*\/\s*html}

      exception_message = "Invalid response format received: #{content_type}"
      raise WebScraper::NonHtmlError, exception_message
    rescue *TIMEOUT_EXCEPTIONS => e
      # Timeouts are rescued first so they are not reported as RequestError.
      raise WebScraper::TimeoutError, e.message
    rescue *GENERAL_EXCEPTIONS => e
      raise WebScraper::RequestError, e.message
    end
  end
end
data/lib/version.rb ADDED
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
# Exposes the gem's version under the WebScraper namespace.
module WebScraper
  # Current release number.
  VERSION = '0.1'
end
@@ -0,0 +1,24 @@
1
+ require 'spec_helper'
2
+
3
describe 'DescriptionHelper' do

  # NOTE(review): a class declared inside a describe block is a top-level
  # constant, and the other spec groups reopen the same DummyTestClass name.
  # Left unchanged here; consider an anonymous class once it is confirmed
  # that no spec relies on the shared constant.
  class DummyTestClass
    include DescriptionHelper
  end
  let(:dummy_object) { DummyTestClass.new }

  # parse_description is called through `send`, presumably because the
  # helper is private - confirm against DescriptionHelper.
  it 'should return nil for invalid description' do
    descriptions = [' ', '', '|', '-']
    expect(dummy_object.send(:parse_description, descriptions)).to be_nil
  end

  it 'should return valid description' do
    valid_description = '2019年趣味幽默猜生肖,1358884不像看图找生肖,2019年看图猜生肖网站,2019看图找生肖83期,2019看图找生肖,2019看图找生肖109期,2019看图猜生肖买马,2019看图猜生肖买,2019全年看图找生肖图'
    descriptions = [valid_description, '-']
    expect(
      dummy_object.send(:parse_description, descriptions)
    ).to eq(valid_description)
  end
end
@@ -0,0 +1,60 @@
1
+ require 'spec_helper'
2
+
3
describe 'Emails' do

  # Host object that exposes the Emails parser to the examples.
  class DummyTestClass
    include Emails
  end
  let(:dummy_object) { DummyTestClass.new }

  it 'should return nil for invalid input' do
    [nil, ''].each do |blank_input|
      expect(dummy_object.grep_emails(blank_input)).to be_nil
    end
  end

  # Every address in this page is expected to be rejected by the parser.
  it 'should give []' do
    page = <<~HTML
      <a href="mailto:abc@example.com">abc@example.com</a>
      <a href="mailto:example@mail.com">example@email.com</a>
      <a href="mailto:name@domain.com">name@domain.com</a>
      <a href="mailto:name@company.com">name@company.com</a>
      <a href="mailto:you@youremail.com">you@youremail.com</a>
      <a href="mailto:your@emailaddress.com">your@emailaddress.com</a>
      <a href="mailto:yourname@yourdomain.com">yourname@yourdomain.com</a>
      <a href="mailto:yourname@yourcompany.com">yourname@yourcompany.com</a>
      <a href="mailto:YOU@EMAILADRESS.COM">YOU@EMAILADRESS.COM</a>
      <a href="mailto:you@address.com">you@address.com</a>
      <a href="mailto:xxx@yyy.zzz">xxx@yyy.zzz</a>
      <a href="mailto:test@test.com">test@test.com</a>
      <a href="mailto:@example.com">@example.com"</a>
    HTML
    found = dummy_object.grep_emails(page)
    expect(found).to eq([])
  end

  # Mixes entity-encoded, percent-encoded, plain-text and escaped addresses.
  it 'should grep organization contact emailaddresses' do
    page = <<~HTML
      <a href="mailto:abc@example.com">abc@example.com</a>
      <a class="fusion-social-network-icon fusion-tooltip " style="color:#ffffff;" href="mailto:&#119;ils&#111;&#110;&#064;&#119;&#105;&#108;&#115;on&#046;n&#098;&#046;c&#097;" target="_self" title="Email">
      <span class="screen-reader-text">Email</span>
      </a>
      <div>
      <br><strong>Mailing address</strong>
      : 1320 Yonge Street, Toronto, Ontario&#160; M4T 1X2<br><br>
      <strong>Attendance</strong>:&#160;<br>Junior School: 1639attendance@yorkschool.com<br>Middle &amp; Senior School: 1320attendance@yorkschool.com<br><br>
      </div>
      <a href="mailto:%20support@switcherstudio.com">
      <a href="mailto:%20support@switcherstudio.com">
      <a href="mailto:ekerlow@hellermanllc.com">
      &lt;a href=\\&quot;mailto:Michael.O%27Brien@idga.org?subject=Editorial%20Calendar%20Contributor\\&quot;&gt
    HTML
    wanted = [
      'wilson@wilson.nb.ca',
      'support@switcherstudio.com',
      'ekerlow@hellermanllc.com',
      "michael.o'brien@idga.org",
      '1639attendance@yorkschool.com',
      '1320attendance@yorkschool.com'
    ]
    found = dummy_object.grep_emails(page)
    expect(found).to eq(wanted)
  end
end
@@ -0,0 +1,77 @@
1
+ require 'spec_helper'
2
+
3
describe 'FaceBook Profile' do

  # Host object that exposes the FacebookProfile parser to the examples.
  class DummyTestClass
    include FacebookProfile
  end
  let(:dummy_object) { DummyTestClass.new }

  it 'should return nil for invalid input' do
    [nil, ''].each do |blank_input|
      expect(dummy_object.grep_facebook_profile(blank_input)).to be_nil
    end
  end

  # Share dialogs, plugins, help pages and similar links are not profiles.
  it 'should not grep any non profile url' do
    page = <<~HTML
      <a href="http://www.facebook.com/2008/fbml" target="_blank" class="sqs-svg-icon--wrapper facebook">
      <a href="https://www.facebook.com/v2.0/dialog/share" target="_blank" class="sqs-svg-icon--wrapper facebook">
      <a href="https://www.facebook.com/plugins/video.php?href=https%3A%2F%2Fwww.facebook.com%2FHFXMooseheads%2Fvideos" target="_blank" class="sqs-svg-icon--wrapper facebook">
      <a href="https://www.facebook.com/search.php" target="_blank" class="sqs-svg-icon--wrapper facebook">
      <a href="http://www.facebook.com/home.php#/pages/Zend-Technologies/190917412139" target="_blank" class="sqs-svg-icon--wrapper facebook">
      <img height="1" width="1" style="display:none" src="https://www.facebook.com/tr?id=1501718829946651&ev=PageView&noscript=1"/>
      <a href="http://www.facebook.com/sharer.php?u=https%3A%2F%2Fbroadreachstaffing.com&#038;t=Broadreach" class="et_social_share" rel="nofollow" data-social_name="facebook" data-post_id="68" data-social_type="share" data-location="sidebar"></a>
      <a href="https://www.facebook.com/photo.php?fbid=10157409473244808&set=p.10157409473244808&type=3" class="et_social_share" rel="nofollow" data-social_name="facebook" data-post_id="68" data-social_type="share" data-location="sidebar"></a>
      <a href="https://www.facebook.com/sharer/sharer.php?u=https%3A%2F%2Fwww.facebook.com%2Fchoosepremiere%2Fposts%2F10157307766122649" target="_blank" class="sqs-svg-icon--wrapper facebook">
      <a href="https://www.facebook.com/dialog/send?display=popup&#038;link=https%3A%2F%2Fsmartcookiemedia.com%2F&#038;redirect_uri=https://smartcookiemedia.com/" target="_blank" class="sqs-svg-icon--wrapper facebook">
      <a href="https://www.facebook.com/hashtag/beeryoga" target="_blank" class="sqs-svg-icon--wrapper facebook">
      <a href="http://facebook.com/privacy" title="Facebook Privacy" target="_blank">Facebook</a>
      <a target="_blank" title="Facebook - Social Gastronomy" href="http://www.facebook.com/home.php#/pages/Social-Gastronomy/187440209207?ref=ts"><img alt="images" src="/images/stories/social/images.jpg" width="30"></a>
      <a href="http://www.facebook.com/plugins/like.php" target="_blank" class="sqs-svg-icon--wrapper facebook">
      <iframe src="http://www.facebook.com/plugins/likebox.php?href=https%3A%2F%2Fwww.facebook.com%2Fbandcart&amp;width=220&amp;colorscheme=dark&amp;show_faces=false&amp;stream=false&amp;header=false&amp;height=65" scrolling="no" frameborder="0" style="border:none; overflow:hidden; width:220px; height: 65px;" allowtransparency="true"></iframe>
      <iframe style="border: none; overflow: hidden;" src="https://www.facebook.com/plugins/page.php?href=https%3A%2F%2Fwww.facebook.com%2FCarterBrothersCompany&tabs=timeline&width=340&height=500&small_header=false&adapt_container_width=true&hide_cover=false&show_facepile=true&appId=210697455750478" width="340" height="500" frameborder="0" scrolling="no"></iframe>
      <a href="https://www.facebook.com/offsite_event.php" target="_blank" class="sqs-svg-icon--wrapper facebook">
      <a href="http://www.facebook.com/share.php" target="_blank" class="sqs-svg-icon--wrapper facebook">
      <a href="https://www.facebook.com/login/device-based/regular/login/?login_attempt=1" target="_blank" class="sqs-svg-icon--wrapper facebook">
      <a href="https://www.facebook.com/recover/initiate?lwv=110" target="_blank" class="sqs-svg-icon--wrapper facebook">
      <a href="https://www.facebook.com/help/568137493302217" target="_blank" class="sqs-svg-icon--wrapper facebook">
      <a href="https://www.facebook.com/help/2687943754764396" target="_blank" class="sqs-svg-icon--wrapper facebook">
      <a href="https://www.facebook.com/help/www/1573156092981768/" target="_blank" class="sqs-svg-icon--wrapper facebook">
      <a href="https://facebook.com/security/hsts-pixel.gif?c=3.2.5" target="_blank" class="sqs-svg-icon--wrapper facebook">

    HTML
    found = dummy_object.grep_facebook_profile(page)
    expect(found).to eq([])
  end

  # The trailing share.php link must be ignored while real profiles are kept.
  it 'should grep valid urls' do
    page = <<~HTML
      <a href="http://facebook.com/AAEurope"><img alt="Follow us on Facebook - opens external site" src="/content/images/chrome/rebrand/icon-footer-facebook.png"></a>
      <a target="_blank" href="https://www.facebook.com/pages/Basketball-New-Brunswick/156176001133032?sk=wall" title="Follow us on Facebook">facebook</a>
      <a class="cff-photo cff-multiple cff-img-layout-4 cff-portrait nofancybox" style="max-width: 540px;" data-cff-page-name="Allied Printing Services" data-cff-post-time="4 days ago" href="https://www.facebook.com/alliedprinting/posts/2323507841068448"> FB Posts</a>
      <a class="cff-photo cff-multiple cff-img-layout-4 cff-portrait nofancybox" style="max-width: 540px;" data-cff-page-name="Allied Printing Services" data-cff-post-time="4 days ago" href="https://www.facebook.com/groups/1004350633012081/"> FB Posts</a>
      <a href="https://www.facebook.com/events/116316035951805/"><img alt="Follow us on Facebook - opens external site" src="/content/images/chrome/rebrand/icon-footer-facebook.png"></a>
      <a href="https://www.facebook.com/arithane.foamroofing"><img alt="Follow us on Facebook - opens external site" src="/content/images/chrome/rebrand/icon-footer-facebook.png"></a>
      <a href="http://www.facebook.com/pages/Surgical+Information+Systems/75322028321"><img alt="Follow us on Facebook - opens external site" src="/content/images/chrome/rebrand/icon-footer-facebook.png"></a>
      <a href="https://www.facebook.com/Baylor-School-124353897738/"><img alt="Follow us on Facebook - opens external site" src="/content/images/chrome/rebrand/icon-footer-facebook.png"></a>
      <a href="http://www.facebook.com/profile.php?id=100000325114186&v=info#!/pages/Blackstone-Counsel/150651724966482" target="_blank">
      <img class="social" src="facebook.jpg" alt="Facebook"/>
      </a>
      <a href="http://facebook.com/profile.php?id=205682532825685" target="_blank"><img class="social" src="facebook.jpg" alt="Facebook"/></a>
      <a href="http://www.facebook.com/share.php" target="_blank" class="sqs-svg-icon--wrapper facebook">
    HTML
    wanted = [
      'http://facebook.com/AAEurope',
      'https://www.facebook.com/pages/Basketball-New-Brunswick/156176001133032?sk=wall',
      'https://www.facebook.com/alliedprinting/posts/2323507841068448',
      'https://www.facebook.com/groups/1004350633012081/',
      'https://www.facebook.com/events/116316035951805/',
      'https://www.facebook.com/arithane.foamroofing',
      'http://www.facebook.com/pages/Surgical+Information+Systems/75322028321',
      'https://www.facebook.com/Baylor-School-124353897738/',
      'http://www.facebook.com/profile.php?id=100000325114186',
      'http://facebook.com/profile.php?id=205682532825685'
    ]
    found = dummy_object.grep_facebook_profile(page)
    expect(found).to eq(wanted)
  end
end
@@ -0,0 +1,45 @@
1
+ require 'spec_helper'
2
+
3
describe 'Instagram Profile' do

  # Host object that exposes the InstagramProfile parser to the examples.
  class DummyTestClass
    include InstagramProfile
  end
  let(:dummy_object) { DummyTestClass.new }

  it 'should return nil for invalid input' do
    expect(dummy_object.grep_instagram_profile(nil)).to be_nil
    expect(dummy_object.grep_instagram_profile('')).to be_nil
  end

  it 'should not grep below url format' do
    html = <<~HTML
      <a href="http://instagram.com/" style="color: white;" class="fa fa-instagram"></a>
      <a href="http://instagram.com/#" style="color: white;" class="fa fa-instagram"></a>
      <a href="https://www.instagram.com/%username%" style="color: white;" class="fa fa-instagram"></a>
      <a href="https://www.instagram.com/explore/tags/Talent/" style="color: white;" class="fa fa-instagram"></a>
    HTML
    expect(dummy_object.grep_instagram_profile(html.to_s)).to eq([])
  end

  it 'should grep organization instagram profiles' do
    html = <<~HTML
      <a href="https://www.instagram.com/nextgenhealthcare" target="_blank">Instagram</a>
      <a href="https://instagram.com/nextgenhealthcare" target="_blank">Instagram</a>
      <a href="https://www.instagram.com/printed4you.co.uk" target="_blank">Instagram</a>
      <a href="https://www.instagram.com/web_spiders" target="_blank">Instagram</a>
      <a href="http://instagram.com/mccaincanada?ref=badge" target="_blank">Instagram</a>
      <a href="http://instagram.com/mcdermottscholars&quot;,&quot;target&quot;:&quot;_blank&quot;}},&quot;displayMode&quot;:&quot;fill&quot;}" target="_blank">Instagram</a>
    HTML
    instagram_profiles = dummy_object.grep_instagram_profile(html.to_s)
    expected_instagram_profiles = [
      'https://www.instagram.com/nextgenhealthcare',
      'https://instagram.com/nextgenhealthcare',
      'https://www.instagram.com/printed4you.co.uk',
      'https://www.instagram.com/web_spiders',
      'http://instagram.com/mccaincanada',
      'http://instagram.com/mcdermottscholars'
    ]
    # Compare against the expected list. The previous assertion compared the
    # parser output with a second call to the same parser, so it could never
    # fail and the expected list above was unused.
    expect(instagram_profiles).to eq(expected_instagram_profiles)
  end
end
@@ -0,0 +1,43 @@
1
+ require 'spec_helper'
2
+
3
describe 'Linkedin Profile' do

  # Host object that exposes the LinkedinProfile parser to the examples.
  class DummyTestClass
    include LinkedinProfile
  end
  let(:dummy_object) { DummyTestClass.new }

  it 'should return nil for invalid input' do
    expect(dummy_object.grep_linkedin_profile(nil)).to be_nil
    expect(dummy_object.grep_linkedin_profile('')).to be_nil
  end

  it 'should not grep below url format' do
    html = <<~HTML
      <a href="https://www.linkedin.com" style="color: white;" class="fa fa-linkedin"></a>
      <a href="https://www.linkedin.com/feed/" style="color: white;" class="fa fa-linkedin"></a>
      <a href="https://www.linkedin.com/mynetwork/" style="color: white;" class="fa fa-linkedin"></a>
      <a href="https://www.linkedin.com/jobs/" style="color: white;" class="fa fa-linkedin"></a>
      <a href="https://www.linkedin.com/messaging/" style="color: white;" class="fa fa-linkedin"></a>
      <a href="https://www.linkedin.com/notifications/" style="color: white;" class="fa fa-linkedin"></a>
      <a href="https://www.linkedin.com/psettings/" style="color: white;" class="fa fa-linkedin"></a>
      <a href="https://www.linkedin.com/ca/pet-32/" style="color: white;" class="fa fa-linkedin"></a>
    HTML
    expect(dummy_object.grep_linkedin_profile(html.to_s)).to eq([])
  end

  it 'should grep organization linkedin profiles' do
    html = <<~HTML
      <a href="https://www.linkedin.com/company/13247248/" target="_blank">Linkedin</a>
      <a href="https://www.linkedin.com/company/m-files-corporation" target="_blank">Linkedin</a>
      <a href="https://www.linkedin.com/company/dataendure" target="_blank">Linkedin</a>
    HTML
    linkedin_profiles = dummy_object.grep_linkedin_profile(html.to_s)
    expected_linkedin_profiles = [
      'https://www.linkedin.com/company/13247248',
      'https://www.linkedin.com/company/m-files-corporation',
      'https://www.linkedin.com/company/dataendure'
    ]
    # Assert on the result computed above rather than calling the parser a
    # second time and leaving that local unused.
    expect(linkedin_profiles).to eq(expected_linkedin_profiles)
  end
end
@@ -0,0 +1,321 @@
1
+ require 'spec_helper'
2
+
3
+ describe 'Meta Description' do
4
+ class DummyTestClass
5
+ include MetaDescription
6
+ end
7
+ let(:dummy_object) { DummyTestClass.new }
8
+
9
+ it 'should return nil for invalid inputs' do
10
+ expect(dummy_object.grep_meta_description('')).to be_nil
11
+ expect(dummy_object.grep_meta_description(nil)).to be_nil
12
+ end
13
+ describe 'Name key first meta description tag' do
14
+ it 'should return nil for no meta description tag presence' do
15
+ no_meta_description = <<~HTML
16
+ <head>
17
+ <meta charset="utf-8">
18
+ <meta http-equiv="X-UA-Compatible" content="IE=edge">
19
+ <meta name="viewport" content="width=device-width, initial-scale=1">
20
+ <title>Internet Service Providers Chennai | Internet Telephony Service Providers Chennai | Internet bandwidth Providers Chennai - Pulse Telesystems Pvt Ltd</title>
21
+ <meta name="google-site-verification" content="h2NvZnvL9v536RUYH3jney-9V8JRBGESmzH5-ph0EM4">
22
+ <link href="http://www.pulse.in/new/wp-content/themes/pulse/css/bootstrap.min.css" rel="stylesheet" type="text/css">
23
+ </head>
24
+ HTML
25
+ meta_description = dummy_object.grep_meta_description(no_meta_description.to_s)
26
+ expect(meta_description).to be_nil
27
+ end
28
+
29
+ it 'should return nil when content part is empty' do
30
+ no_meta_description = <<~HTML
31
+ <head>
32
+ <meta http-equiv="X-UA-Compatible" content="IE=edge">
33
+ <meta name="description" content="">
34
+ <title>Internet Service Providers Chennai | Internet Telephony Service Providers Chennai | Internet bandwidth Providers Chennai - Pulse Telesystems Pvt Ltd</title>
35
+ <link href="http://www.pulse.in/new/wp-content/themes/pulse/css/bootstrap.min.css" rel="stylesheet" type="text/css">
36
+ </head>
37
+ HTML
38
+ meta_description = dummy_object.grep_meta_description(no_meta_description.to_s)
39
+ expect(meta_description).to be_nil
40
+ end
41
+
42
+ it 'should return description from valid tag' do
43
+ html = <<~HTML
44
+ <head>
45
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property="og:description">
46
+ <meta content="" property="uid">
47
+ <meta name="description" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront.">
48
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no" name="viewport">
49
+ </head>
50
+ HTML
51
+ meta_description = dummy_object.grep_meta_description(html.to_s)
52
+ expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
53
+ end
54
+
55
+ it 'should return description even tag is multilined and partially encoded' do
56
+ html = <<~HTML
57
+ <head>
58
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property="og:description">
59
+ <meta content="" property="uid">
60
+ <meta name="description" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront.">
61
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
62
+ name="viewport">
63
+ </head>
64
+ HTML
65
+ meta_description = dummy_object.grep_meta_description(html.to_s)
66
+ expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
67
+ end
68
+
69
+ it 'should parse meta tag even it is partially single quoted' do
70
+ html = <<~HTML
71
+ <head>
72
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property="og:description">
73
+ <meta content="" property="uid">
74
+ <meta name=\'description" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront.">
75
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
76
+ name="viewport">
77
+ </head>
78
+ HTML
79
+ meta_description = dummy_object.grep_meta_description(html.to_s)
80
+ expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
81
+ end
82
+
83
+ it 'should parse meta tag even it is having other attributes defined' do
84
+ html = <<~HTML
85
+ <head>
86
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property="og:description">
87
+ <meta content="" property="uid">
88
+ <meta class="metadescription" name=\'description" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." charset="UTF-8">
89
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
90
+ name="viewport">
91
+ </head>
92
+ HTML
93
+ meta_description = dummy_object.grep_meta_description(html.to_s)
94
+ expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
95
+ end
96
+
97
+ it 'should parse meta tag with itemprop as description key' do
98
+ html = <<~HTML
99
+ <head>
100
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property="og:description">
101
+ <meta content="" property="uid">
102
+ <meta class="metadescription" itemprop=\'description" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." charset="UTF-8">
103
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
104
+ name="viewport">
105
+ </head>
106
+ HTML
107
+ meta_description = dummy_object.grep_meta_description(html.to_s)
108
+ expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
109
+ end
110
+
111
+ it 'should parse even name/itemprop key content is improperly assigned' do
112
+ html = <<~HTML
113
+ <head>
114
+ <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property="og:description">
115
+ <meta content="" property="uid">
116
+ <meta class="metadescription" name=description content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." charset="UTF-8" />
117
+ <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
118
+ name="viewport">
119
+ </head>
120
+ HTML
121
+ meta_description = dummy_object.grep_meta_description(html.to_s)
122
+ expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
123
+ end
124
+
125
+ it 'should bring description having single quote' do
126
+ html = <<~HTML
127
+ <html lang="en">
128
+ <head>
129
+ <META charset="utf-8">
130
+ <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
131
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
132
+ <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
133
+ <meta name="description" content="Wilentz Goldman & Spitzer is one of New Jersey's largest law firms. Our lawyers proudly serve our clients in nearly every aspect of law." />
134
+ </head>
135
+ <html>
136
+ HTML
137
+ meta_description = dummy_object.grep_meta_description(html.to_s)
138
+ expect(meta_description).to eq('Wilentz Goldman & Spitzer is one of New Jersey\'s largest law firms. Our lawyers proudly serve our clients in nearly every aspect of law.')
139
+ end
140
+
141
# Name-first description tag whose single-quoted content embeds literal double
# quotes and `&#039;` entities; the expected value keeps the double quotes and
# has the entities decoded to apostrophes.
it 'should bring description having double quote' do
  html = <<~HTML
    <html lang="en">
    <head>
    <META charset="utf-8">
    <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
    <meta name="description" content='Whether you&#039;re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what&#039;s new, what&#039;s best and how to make the most out of the products you love.' />
    </head>
    <html>
  HTML
  meta_description = dummy_object.grep_meta_description(html)
  expect(meta_description).to eq('Whether you\'re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what\'s new, what\'s best and how to make the most out of the products you love.')
end
156
+
157
# An empty name-first description tag precedes a populated one; the populated
# tag's text (entities decoded) is the expected result.
it 'should bring description even some other meta tag is empty' do
  html = <<~HTML
    <html lang="en">
    <head>
    <META charset="utf-8">
    <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
    <meta name="description" content="">
    <meta name="description" content='Whether you&#039;re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what&#039;s new, what&#039;s best and how to make the most out of the products you love.' />
    </head>
    <html>
  HTML
  meta_description = dummy_object.grep_meta_description(html)
  expect(meta_description).to eq('Whether you\'re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what\'s new, what\'s best and how to make the most out of the products you love.')
end
173
+ end
174
+ describe 'Content key first meta description tag' do
175
# Both content-first description tags are empty (one double-quoted, one
# single-quoted), so no description is expected.
it 'should return nil when content part is empty' do
  html = <<~HTML
    <head>
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta content="" name="description">
    <meta content='' name="description">
    <title>Internet Service Providers Chennai | Internet Telephony Service Providers Chennai | Internet bandwidth Providers Chennai - Pulse Telesystems Pvt Ltd</title>
    <link href="http://www.pulse.in/new/wp-content/themes/pulse/css/bootstrap.min.css" rel="stylesheet" type="text/css">
    </head>
  HTML
  meta_description = dummy_object.grep_meta_description(html)
  expect(meta_description).to be_nil
end
188
+
189
# A content-first `name="description"` tag sits next to an og:description tag
# and an empty `uid` tag; the expected text has `&amp;` decoded to `&`.
it 'should return description from valid tag' do
  html = <<~HTML
    <head>
    <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property="og:description">
    <meta content="" property="uid">
    <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." name="description">
    <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no" name="viewport">
    </head>
  HTML
  meta_description = dummy_object.grep_meta_description(html)
  expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
end
201
+
202
# The description tag has a space before its closing `>` and the following
# viewport tag is split across two lines; the expected text has `&amp;`
# decoded to `&`.
it 'should return description even tag is multilined and partially encoded' do
  html = <<~HTML
    <head>
    <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property="og:description">
    <meta content="" property="uid">
    <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." name="description" >
    <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
    name="viewport">
    </head>
  HTML
  meta_description = dummy_object.grep_meta_description(html)
  expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
end
215
+
216
# The `name` value opens with a single quote and closes with a double quote
# (the heredoc's `\'` yields a literal apostrophe); the description is still
# expected, with `&amp;` decoded to `&`.
it 'should parse meta tag even it is partially single quoted' do
  html = <<~HTML
    <head>
    <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property="og:description">
    <meta content="" property="uid">
    <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." name=\'description">
    <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
    name="viewport">
    </head>
  HTML
  meta_description = dummy_object.grep_meta_description(html)
  expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
end
229
+
230
# The description tag also carries `class` and `charset` attributes, and its
# `name` value has mismatched quotes; the description is still expected, with
# `&amp;` decoded to `&`.
it 'should parse meta tag even it is having other attributes defined' do
  html = <<~HTML
    <head>
    <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property="og:description">
    <meta content="" property="uid">
    <meta class="metadescription" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." charset="UTF-8" name=\'description">
    <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
    name="viewport">
    </head>
  HTML
  meta_description = dummy_object.grep_meta_description(html)
  expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
end
243
+
244
# The description is keyed by `itemprop` (with mismatched quotes) instead of
# `name`, followed by a `charset` attribute; the expected text has `&amp;`
# decoded to `&`.
it 'should parse meta tag with itemprop as description key' do
  html = <<~HTML
    <head>
    <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property="og:description">
    <meta content="" property="uid">
    <meta class="metadescription" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." itemprop=\'description" charset="UTF-8">
    <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
    name="viewport">
    </head>
  HTML
  meta_description = dummy_object.grep_meta_description(html)
  expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
end
257
+
258
# The `name` value is unquoted (`name=description`) and the tag is
# self-closed with `/>`; the description is still expected, with `&amp;`
# decoded to `&`.
it 'should parse even name/itemprop key content is improperly assigned' do
  html = <<~HTML
    <head>
    <meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property="og:description">
    <meta content="" property="uid">
    <meta class="metadescription" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." charset="UTF-8" name=description />
    <meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
    name="viewport">
    </head>
  HTML
  meta_description = dummy_object.grep_meta_description(html)
  expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
end
271
+
272
# Content-first description tag whose double-quoted content holds an
# apostrophe and a bare `&`; the expected value is that content unchanged.
it 'should bring description having single quote' do
  html = <<~HTML
    <html lang="en">
    <head>
    <META charset="utf-8">
    <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
    <meta content="Wilentz Goldman & Spitzer is one of New Jersey's largest law firms. Our lawyers proudly serve our clients in nearly every aspect of law." name="description" />
    </head>
    <html>
  HTML
  meta_description = dummy_object.grep_meta_description(html)
  expect(meta_description).to eq('Wilentz Goldman & Spitzer is one of New Jersey\'s largest law firms. Our lawyers proudly serve our clients in nearly every aspect of law.')
end
287
+
288
# Content-first description tag whose single-quoted content embeds literal
# double quotes and `&#039;` entities; the expected value keeps the double
# quotes and has the entities decoded to apostrophes.
it 'should bring description having double quote' do
  html = <<~HTML
    <html lang="en">
    <head>
    <META charset="utf-8">
    <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
    <meta content='Whether you&#039;re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what&#039;s new, what&#039;s best and how to make the most out of the products you love.' name="description" />
    </head>
    <html>
  HTML
  meta_description = dummy_object.grep_meta_description(html)
  expect(meta_description).to eq('Whether you\'re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what\'s new, what\'s best and how to make the most out of the products you love.')
end
303
+
304
# An empty content-first description tag precedes a populated one; the
# populated tag's text (entities decoded) is the expected result.
it 'should bring description even some other meta tag is empty' do
  html = <<~HTML
    <html lang="en">
    <head>
    <META charset="utf-8">
    <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
    <meta content="" name="description">
    <meta content='Whether you&#039;re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what&#039;s new, what&#039;s best and how to make the most out of the products you love.' name="description"/>
    </head>
    <html>
  HTML
  meta_description = dummy_object.grep_meta_description(html)
  expect(meta_description).to eq('Whether you\'re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what\'s new, what\'s best and how to make the most out of the products you love.')
end
320
+ end
321
+ end