RubyGems - brilliant_web_scraper - Versions diffs - 0.1 - Mend

brilliant_web_scraper 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (53) hide show

checksums.yaml +7 -0
data/Gemfile +4 -0
data/README.md +31 -0
data/brilliant_web_scraper-1.0.0.gem +0 -0
data/brilliant_web_scraper-1.0.gem +0 -0
data/brilliant_web_scraper.gemspec +30 -0
data/lib/brilliant_web_scraper.rb +55 -0
data/lib/parsers/description_helper.rb +28 -0
data/lib/parsers/emails.rb +30 -0
data/lib/parsers/facebook_profile.rb +11 -0
data/lib/parsers/instagram_profile.rb +11 -0
data/lib/parsers/linkedin_profile.rb +11 -0
data/lib/parsers/meta_description.rb +13 -0
data/lib/parsers/org_description.rb +13 -0
data/lib/parsers/phone_numbers.rb +34 -0
data/lib/parsers/pinterest_profile.rb +11 -0
data/lib/parsers/redirected_to.rb +29 -0
data/lib/parsers/title.rb +13 -0
data/lib/parsers/twitter_description.rb +13 -0
data/lib/parsers/twitter_profile.rb +11 -0
data/lib/parsers/unescape_html_helper.rb +17 -0
data/lib/parsers/vimeo_profile.rb +11 -0
data/lib/parsers/youtube_channel.rb +29 -0
data/lib/scraper/errors.rb +19 -0
data/lib/scraper/scrape_exceptions.rb +49 -0
data/lib/scraper/scrape_helper.rb +59 -0
data/lib/scraper/scrape_request.rb +29 -0
data/lib/version.rb +6 -0
data/spec/lib/parsers/description_helper_spec.rb +24 -0
data/spec/lib/parsers/emails_spec.rb +60 -0
data/spec/lib/parsers/facebook_profile_spec.rb +77 -0
data/spec/lib/parsers/instagram_profile_spec.rb +45 -0
data/spec/lib/parsers/linkedin_profile_spec.rb +43 -0
data/spec/lib/parsers/meta_description_spec.rb +321 -0
data/spec/lib/parsers/org_description_spec.rb +316 -0
data/spec/lib/parsers/phone_numbers_spec.rb +69 -0
data/spec/lib/parsers/pinterest_profile_spec.rb +44 -0
data/spec/lib/parsers/redirected_to_spec.rb +207 -0
data/spec/lib/parsers/title_spec.rb +87 -0
data/spec/lib/parsers/twitter_description_spec.rb +314 -0
data/spec/lib/parsers/twitter_profile_spec.rb +59 -0
data/spec/lib/parsers/unescape_html_helper_spec.rb +0 -0
data/spec/lib/parsers/vimeo_profile_spec.rb +43 -0
data/spec/lib/parsers/youtube_profile_spec.rb +82 -0
data/spec/lib/scraper/brilliant_web_scrape_test.rb +66 -0
data/spec/lib/scraper/scrape_request_test.rb +34 -0
data/spec/spec_helper.rb +111 -0
data/spec/vcr/encoding_compatibility_error.yml +316 -0
data/spec/vcr/invalid_byte_sequence_utf_8.yml +2383 -0
data/spec/vcr/no_valid_data_to_scrape.yml +109 -0
data/spec/vcr/non_html_scrape.yml +163 -0
data/spec/vcr/valid_scrape_response.yml +696 -0
metadata +250 -0

data/lib/scraper/errors.rb ADDED Viewed

@@ -0,0 +1,19 @@
+# frozen_string_literal: true
+require 'nesty'
+# Namespace for the errors raised by the scraper. Every class below descends
+# from WebScraper::Error, so callers can rescue that one class to catch them all.
+module WebScraper
+  # Base error. Includes Nesty::NestedError (from the `nesty` gem required
+  # above); the intent is to keep the stack trace of the underlying failure.
+  class Error < StandardError
+    include Nesty::NestedError
+  end
+  # Raised by ScrapeRequest when the HTTP request times out.
+  class TimeoutError < Error; end
+  # Raised by ScrapeRequest for the other request/network failures it rescues.
+  class RequestError < Error; end
+  # Raised by ScrapeHelper when the response body cannot be parsed.
+  class ParserError < Error; end
+  # Raised by ScrapeRequest when the response content type is not text/html.
+  class NonHtmlError < ParserError; end
+end

data/lib/scraper/scrape_exceptions.rb ADDED Viewed

@@ -0,0 +1,49 @@
+# frozen_string_literal: true
+# Exception classes the scraper translates into its own WebScraper errors.
+# ScrapeRequest extends this module and splats both lists into `rescue` clauses.
+module ScrapeExceptions
+  # Request, network, TLS and encoding failures; ScrapeRequest re-raises these
+  # as WebScraper::RequestError.
+  GENERAL_EXCEPTIONS = [
+    URI::InvalidURIError,
+    RestClient::NotAcceptable,
+    RestClient::BadGateway,
+    RestClient::URITooLong,
+    Encoding::CompatibilityError,
+    RestClient::SeeOther,
+    RestClient::LoopDetected,
+    RestClient::PermanentRedirect,
+    RestClient::Locked,
+    RestClient::MethodNotAllowed,
+    RestClient::NotImplemented,
+    RestClient::PaymentRequired,
+    RestClient::TooManyRequests,
+    RestClient::RangeNotSatisfiable,
+    Errno::ENETUNREACH,
+    RestClient::Conflict,
+    RestClient::ProxyAuthenticationRequired,
+    Net::HTTPBadResponse,
+    Errno::ECONNREFUSED,
+    Errno::ECONNRESET,
+    Errno::EHOSTUNREACH,
+    Errno::EINVAL,
+    OpenSSL::SSL::SSLError,
+    RestClient::BadRequest,
+    RestClient::Forbidden,
+    RestClient::GatewayTimeout,
+    RestClient::Gone,
+    RestClient::InternalServerError,
+    RestClient::MovedPermanently,
+    RestClient::NotFound,
+    RestClient::RequestFailed,
+    RestClient::ServerBrokeConnection,
+    RestClient::ServiceUnavailable,
+    RestClient::SSLCertificateNotVerified,
+    RestClient::Unauthorized,
+    SocketError
+  ].freeze
+  # Timeout failures; ScrapeRequest re-raises these as WebScraper::TimeoutError.
+  # ScrapeRequest rescues this list before GENERAL_EXCEPTIONS.
+  TIMEOUT_EXCEPTIONS = [
+    RestClient::Exceptions::OpenTimeout,
+    RestClient::Exceptions::ReadTimeout,
+    RestClient::RequestTimeout
+  ].freeze
+end

data/lib/scraper/scrape_helper.rb ADDED Viewed

@@ -0,0 +1,59 @@
+# frozen_string_literal: true
+# Scrapes below data
+# @Title
+# @Descriptions
+# @Social Profiles
+# @Contact Details
+module ScrapeHelper
+  def perform_scrape(url, read_timeout, connection_timeout)
+    response = nil
+    request_duration = Benchmark.measure do
+      response = ScrapeRequest.new(url, read_timeout, connection_timeout)
+    end.real
+    retry_count = 0
+    begin
+      scrape_data = nil
+      scrape_duration = Benchmark.measure do
+        scrape_data = grep_data(response.body)
+      end.real
+      data_hash = {
+        web_request_duration: request_duration,
+        response_scrape_duraton: scrape_duration,
+        scrape_data: scrape_data
+      }
+    rescue ArgumentError => e
+      retry_count += 1
+      raise WebScraper::ParserError, e.message if retry_count > 1
+      response = response.encode('UTF-16be', invalid: :replace, replace: '?')
+      response = response.encode('UTF-8')
+      retry
+    rescue Encoding::CompatibilityError => e
+      raise WebScraper::ParserError, e.message
+    end
+    data_hash
+  end
+  private
+  def grep_data(response)
+    {
+      title: grep_title(response),
+      meta_description: grep_meta_description(response),
+      org_description: grep_org_description(response),
+      twitter_description: grep_twitter_description(response),
+      twitter_profile: grep_twitter_profile(response),
+      linkedin_profile: grep_linkedin_profile(response),
+      facebook_profile: grep_facebook_profile(response),
+      instagram_profile: grep_instagram_profile(response),
+      vimeo_profile: grep_vimeo_profile(response),
+      pinterest_profile: grep_pinterest_profile(response),
+      youtube_channel: grep_youtube_channel(response),
+      emails: grep_emails(response),
+      phone_numbers: grep_phone_numbers(response),
+      redirected_to: grep_redirected_to_url(response)
+    }
+  end
+end

data/lib/scraper/scrape_request.rb ADDED Viewed

@@ -0,0 +1,29 @@
+# frozen_string_literal: true
+# @Makes actual scrape request, either raises exception or response
+module ScrapeRequest
+  extend ScrapeExceptions
+  class << self
+    def new(url, read_timeout, connection_timeout)
+      begin
+        params_hash = {
+          method: :get,
+          url: url,
+          read_timeout: read_timeout,
+          connection_timeout: connection_timeout,
+          headers: { 'accept-encoding': 'identity' }
+        }
+        response = RestClient::Request.execute(params_hash)
+        content_type = response.headers[:content_type]
+        return response if content_type =~ %r{(?i)text\s*\/\s*html}
+        exception_message = "Invalid response format received: #{content_type}"
+        raise WebScraper::NonHtmlError, exception_message
+      rescue *TIMEOUT_EXCEPTIONS => e
+        raise WebScraper::TimeoutError, e.message
+      rescue *GENERAL_EXCEPTIONS => e
+        raise WebScraper::RequestError, e.message
+      end
+    end
+  end
+end

data/lib/version.rb ADDED Viewed

@@ -0,0 +1,6 @@
+# frozen_string_literal: true
+# Holds the gem's current version number.
+module WebScraper
+  # Version string of this release.
+  VERSION = '0.1'
+end

data/spec/lib/parsers/description_helper_spec.rb ADDED Viewed

@@ -0,0 +1,24 @@
+require 'spec_helper'
+describe 'DescriptionHelper' do
+  class DummyTestClass
+    include DescriptionHelper
+  end
+  let(:dummy_object) { DummyTestClass.new }
+  it 'it should return nil for inalid description' do
+    descriptions = [" ", "", "|", "-"]
+    expect(dummy_object.send(:parse_description, *[descriptions])).to be_nil
+  end
+  it 'should return valid description' do
+    descriptions = [
+      '2019年趣味幽默猜生肖,1358884不像看图找生肖,2019年看图猜生肖网站,2019看图找生肖83期,2019看图找生肖,2019看图找生肖109期,2019看图猜生肖买马,2019看图猜生肖买,2019全年看图找生肖图',
+      "-"
+      ]
+    expect(
+      dummy_object.send(:parse_description, *[descriptions])
+      ).to eq('2019年趣味幽默猜生肖,1358884不像看图找生肖,2019年看图猜生肖网站,2019看图找生肖83期,2019看图找生肖,2019看图找生肖109期,2019看图猜生肖买马,2019看图猜生肖买,2019全年看图找生肖图')
+  end
+end

data/spec/lib/parsers/emails_spec.rb ADDED Viewed

@@ -0,0 +1,60 @@
+require 'spec_helper'
+describe 'Emails' do
+  class DummyTestClass
+    include Emails
+  end
+  let(:dummy_object) { DummyTestClass.new }
+  it 'should return nil for invalid input' do
+    expect(dummy_object.grep_emails(nil)).to be_nil
+    expect(dummy_object.grep_emails('')).to be_nil
+  end
+  # Every address in this fixture looks like a placeholder/sample value
+  # (example.com, yourdomain.com, test@test.com, ...); the expectation is
+  # that grep_emails returns none of them.
+  it 'should give []' do
+  	html = <<~HTML
+      <a href="mailto:abc@example.com">abc@example.com</a>
+      <a href="mailto:example@mail.com">example@email.com</a>
+      <a href="mailto:name@domain.com">name@domain.com</a>
+      <a href="mailto:name@company.com">name@company.com</a>
+      <a href="mailto:you@youremail.com">you@youremail.com</a>
+      <a href="mailto:your@emailaddress.com">your@emailaddress.com</a>
+      <a href="mailto:yourname@yourdomain.com">yourname@yourdomain.com</a>
+      <a href="mailto:yourname@yourcompany.com">yourname@yourcompany.com</a>
+      <a href="mailto:YOU@EMAILADRESS.COM">YOU@EMAILADRESS.COM</a>
+      <a href="mailto:you@address.com">you@address.com</a>
+      <a href="mailto:xxx@yyy.zzz">xxx@yyy.zzz</a>
+      <a href="mailto:test@test.com">test@test.com</a>
+      <a href="mailto:@example.com">@example.com"</a>
+  	HTML
+  	expect(dummy_object.grep_emails(html.to_s)).to eq([])
+  end
+  it 'should grep organization contact emailaddresses' do
+  	html = <<~HTML
+  		<a href="mailto:abc@example.com">abc@example.com</a>
+      <a class="fusion-social-network-icon fusion-tooltip " style="color:#ffffff;" href="mailto:&#119;ils&#111;&#110;&#064;&#119;&#105;&#108;&#115;on&#046;n&#098;&#046;c&#097;" target="_self" title="Email">
+        <span class="screen-reader-text">Email</span>
+      </a>
+      <div>
+        <br><strong>Mailing address</strong>
+        : 1320 Yonge Street, Toronto, Ontario&#160; M4T 1X2<br><br>
+        <strong>Attendance</strong>:&#160;<br>Junior School: 1639attendance@yorkschool.com<br>Middle &amp; Senior School: 1320attendance@yorkschool.com<br><br>
+      </div>
+      <a href="mailto:%20support@switcherstudio.com">
+      <a href="mailto:%20support@switcherstudio.com">
+      <a href="mailto:ekerlow@hellermanllc.com">
+      &lt;a href=\\&quot;mailto:Michael.O%27Brien@idga.org?subject=Editorial%20Calendar%20Contributor\\&quot;&gt
+    HTML
+  	emails = dummy_object.grep_emails(html.to_s)
+    expected_emails = [
+      "wilson@wilson.nb.ca",
+      "support@switcherstudio.com",
+      "ekerlow@hellermanllc.com",
+      "michael.o'brien@idga.org",
+      "1639attendance@yorkschool.com",
+      "1320attendance@yorkschool.com"
+    ]
+    expect(emails).to eq(expected_emails)
+  end
+end

data/spec/lib/parsers/facebook_profile_spec.rb ADDED Viewed

@@ -0,0 +1,77 @@
+require 'spec_helper'
+describe 'FaceBook Profile' do
+  class DummyTestClass
+    include FacebookProfile
+  end
+  let(:dummy_object) { DummyTestClass.new }
+  it 'should return nil for invalid input' do
+    expect(dummy_object.grep_facebook_profile(nil)).to be_nil
+    expect(dummy_object.grep_facebook_profile('')).to be_nil
+  end
+  it 'should not grep any non profile url' do
+    html = <<~HTML
+      <a href="http://www.facebook.com/2008/fbml" target="_blank" class="sqs-svg-icon--wrapper facebook">
+      <a href="https://www.facebook.com/v2.0/dialog/share" target="_blank" class="sqs-svg-icon--wrapper facebook">
+      <a href="https://www.facebook.com/plugins/video.php?href=https%3A%2F%2Fwww.facebook.com%2FHFXMooseheads%2Fvideos" target="_blank" class="sqs-svg-icon--wrapper facebook">
+      <a href="https://www.facebook.com/search.php" target="_blank" class="sqs-svg-icon--wrapper facebook">
+      <a href="http://www.facebook.com/home.php#/pages/Zend-Technologies/190917412139" target="_blank" class="sqs-svg-icon--wrapper facebook">
+      <img height="1" width="1" style="display:none" src="https://www.facebook.com/tr?id=1501718829946651&ev=PageView&noscript=1"/>
+      <a href="http://www.facebook.com/sharer.php?u=https%3A%2F%2Fbroadreachstaffing.com&#038;t=Broadreach" class="et_social_share" rel="nofollow" data-social_name="facebook" data-post_id="68" data-social_type="share" data-location="sidebar"></a>
+      <a href="https://www.facebook.com/photo.php?fbid=10157409473244808&set=p.10157409473244808&type=3" class="et_social_share" rel="nofollow" data-social_name="facebook" data-post_id="68" data-social_type="share" data-location="sidebar"></a>
+      <a href="https://www.facebook.com/sharer/sharer.php?u=https%3A%2F%2Fwww.facebook.com%2Fchoosepremiere%2Fposts%2F10157307766122649" target="_blank" class="sqs-svg-icon--wrapper facebook">
+      <a href="https://www.facebook.com/dialog/send?display=popup&#038;link=https%3A%2F%2Fsmartcookiemedia.com%2F&#038;redirect_uri=https://smartcookiemedia.com/" target="_blank" class="sqs-svg-icon--wrapper facebook">
+      <a href="https://www.facebook.com/hashtag/beeryoga" target="_blank" class="sqs-svg-icon--wrapper facebook">
+      <a href="http://facebook.com/privacy" title="Facebook Privacy" target="_blank">Facebook</a>
+      <a target="_blank" title="Facebook - Social Gastronomy" href="http://www.facebook.com/home.php#/pages/Social-Gastronomy/187440209207?ref=ts"><img alt="images" src="/images/stories/social/images.jpg" width="30"></a>
+      <a href="http://www.facebook.com/plugins/like.php" target="_blank" class="sqs-svg-icon--wrapper facebook">
+      <iframe src="http://www.facebook.com/plugins/likebox.php?href=https%3A%2F%2Fwww.facebook.com%2Fbandcart&amp;width=220&amp;colorscheme=dark&amp;show_faces=false&amp;stream=false&amp;header=false&amp;height=65" scrolling="no" frameborder="0" style="border:none; overflow:hidden; width:220px; height: 65px;" allowtransparency="true"></iframe>
+      <iframe style="border: none; overflow: hidden;" src="https://www.facebook.com/plugins/page.php?href=https%3A%2F%2Fwww.facebook.com%2FCarterBrothersCompany&tabs=timeline&width=340&height=500&small_header=false&adapt_container_width=true&hide_cover=false&show_facepile=true&appId=210697455750478" width="340" height="500" frameborder="0" scrolling="no"></iframe>
+      <a href="https://www.facebook.com/offsite_event.php" target="_blank" class="sqs-svg-icon--wrapper facebook">
+      <a href="http://www.facebook.com/share.php" target="_blank" class="sqs-svg-icon--wrapper facebook">
+      <a href="https://www.facebook.com/login/device-based/regular/login/?login_attempt=1" target="_blank" class="sqs-svg-icon--wrapper facebook">
+      <a href="https://www.facebook.com/recover/initiate?lwv=110" target="_blank" class="sqs-svg-icon--wrapper facebook">
+      <a href="https://www.facebook.com/help/568137493302217" target="_blank" class="sqs-svg-icon--wrapper facebook">
+      <a href="https://www.facebook.com/help/2687943754764396" target="_blank" class="sqs-svg-icon--wrapper facebook">
+      <a href="https://www.facebook.com/help/www/1573156092981768/" target="_blank" class="sqs-svg-icon--wrapper facebook">
+      <a href="https://facebook.com/security/hsts-pixel.gif?c=3.2.5" target="_blank" class="sqs-svg-icon--wrapper facebook">
+    HTML
+    expect(dummy_object.grep_facebook_profile(html.to_s)).to eq([])
+  end
+  it 'should grep valid urls' do
+    html = <<~HTML
+      <a href="http://facebook.com/AAEurope"><img alt="Follow us on Facebook - opens external site" src="/content/images/chrome/rebrand/icon-footer-facebook.png"></a>
+      <a target="_blank" href="https://www.facebook.com/pages/Basketball-New-Brunswick/156176001133032?sk=wall" title="Follow us on Facebook">facebook</a>
+      <a class="cff-photo cff-multiple cff-img-layout-4 cff-portrait nofancybox" style="max-width: 540px;"  data-cff-page-name="Allied Printing Services" data-cff-post-time="4 days ago" href="https://www.facebook.com/alliedprinting/posts/2323507841068448"> FB Posts</a>
+      <a class="cff-photo cff-multiple cff-img-layout-4 cff-portrait nofancybox" style="max-width: 540px;"  data-cff-page-name="Allied Printing Services" data-cff-post-time="4 days ago" href="https://www.facebook.com/groups/1004350633012081/"> FB Posts</a>
+      <a href="https://www.facebook.com/events/116316035951805/"><img alt="Follow us on Facebook - opens external site" src="/content/images/chrome/rebrand/icon-footer-facebook.png"></a>
+      <a href="https://www.facebook.com/arithane.foamroofing"><img alt="Follow us on Facebook - opens external site" src="/content/images/chrome/rebrand/icon-footer-facebook.png"></a>
+      <a href="http://www.facebook.com/pages/Surgical+Information+Systems/75322028321"><img alt="Follow us on Facebook - opens external site" src="/content/images/chrome/rebrand/icon-footer-facebook.png"></a>
+      <a href="https://www.facebook.com/Baylor-School-124353897738/"><img alt="Follow us on Facebook - opens external site" src="/content/images/chrome/rebrand/icon-footer-facebook.png"></a>
+      <a href="http://www.facebook.com/profile.php?id=100000325114186&v=info#!/pages/Blackstone-Counsel/150651724966482" target="_blank">
+        <img class="social"  src="facebook.jpg" alt="Facebook"/>
+      </a>
+      <a href="http://facebook.com/profile.php?id=205682532825685" target="_blank"><img class="social"  src="facebook.jpg" alt="Facebook"/></a>
+      <a href="http://www.facebook.com/share.php" target="_blank" class="sqs-svg-icon--wrapper facebook">
+    HTML
+    fb_profiles = dummy_object.grep_facebook_profile(html.to_s)
+    expected_profiles = [
+      'http://facebook.com/AAEurope',
+      'https://www.facebook.com/pages/Basketball-New-Brunswick/156176001133032?sk=wall',
+      'https://www.facebook.com/alliedprinting/posts/2323507841068448',
+      'https://www.facebook.com/groups/1004350633012081/',
+      'https://www.facebook.com/events/116316035951805/',
+      'https://www.facebook.com/arithane.foamroofing',
+      'http://www.facebook.com/pages/Surgical+Information+Systems/75322028321',
+      'https://www.facebook.com/Baylor-School-124353897738/',
+      'http://www.facebook.com/profile.php?id=100000325114186',
+      'http://facebook.com/profile.php?id=205682532825685'
+    ]
+    expect(fb_profiles).to eq(expected_profiles)
+  end
+end

data/spec/lib/parsers/instagram_profile_spec.rb ADDED Viewed

@@ -0,0 +1,45 @@
+require 'spec_helper'
+describe 'Instagram Profile' do
+  class DummyTestClass
+    include InstagramProfile
+  end
+  let(:dummy_object) { DummyTestClass.new }
+  it 'should return nil for invalid input' do
+    expect(dummy_object.grep_instagram_profile(nil)).to be_nil
+    expect(dummy_object.grep_instagram_profile('')).to be_nil
+  end
+  it 'should not grep below url format' do
+  	html = <<~HTML
+      <a href="http://instagram.com/" style="color: white;" class="fa fa-instagram"></a>
+  		<a href="http://instagram.com/#" style="color: white;" class="fa fa-instagram"></a>
+  		<a href="https://www.instagram.com/%username%" style="color: white;" class="fa fa-instagram"></a>
+  		<a href="https://www.instagram.com/explore/tags/Talent/" style="color: white;" class="fa fa-instagram"></a>
+  	HTML
+  	expect(dummy_object.grep_instagram_profile(html.to_s)).to eq([])
+  end
+  it 'should grep organization instagram profiles' do
+  	html = <<~HTML
+  		<a href="https://www.instagram.com/nextgenhealthcare" target="_blank">Instagram</a>
+  		<a href="https://instagram.com/nextgenhealthcare" target="_blank">Instagram</a>
+  		<a href="https://www.instagram.com/printed4you.co.uk" target="_blank">Instagram</a>
+      <a href="https://www.instagram.com/web_spiders" target="_blank">Instagram</a>
+      <a href="http://instagram.com/mccaincanada?ref=badge" target="_blank">Instagram</a>
+      <a href="http://instagram.com/mcdermottscholars&quot;,&quot;target&quot;:&quot;_blank&quot;}},&quot;displayMode&quot;:&quot;fill&quot;}" target="_blank">Instagram</a>
+    HTML
+  	instagram_profiles = dummy_object.grep_instagram_profile(html.to_s)
+  	expected_instagram_profiles = [
+  		'https://www.instagram.com/nextgenhealthcare',
+  		'https://instagram.com/nextgenhealthcare',
+  		'https://www.instagram.com/printed4you.co.uk',
+      'https://www.instagram.com/web_spiders',
+      'http://instagram.com/mccaincanada',
+      'http://instagram.com/mcdermottscholars'
+  	]
+  	expect(dummy_object.grep_instagram_profile(html.to_s)).to eq(instagram_profiles)
+  end
+end

data/spec/lib/parsers/linkedin_profile_spec.rb ADDED Viewed

@@ -0,0 +1,43 @@
+require 'spec_helper'
+describe 'Linkedin Profile' do
+  class DummyTestClass
+    include LinkedinProfile
+  end
+  let(:dummy_object) { DummyTestClass.new }
+  it 'should return nil for invalid input' do
+    expect(dummy_object.grep_linkedin_profile(nil)).to be_nil
+    expect(dummy_object.grep_linkedin_profile('')).to be_nil
+  end
+  it 'should not grep below url format' do
+  	html = <<~HTML
+  		<a href="https://www.linkedin.com" style="color: white;" class="fa fa-linkedin"></a>
+  		<a href="https://www.linkedin.com/feed/" style="color: white;" class="fa fa-linkedin"></a>
+  		<a href="https://www.linkedin.com/mynetwork/" style="color: white;" class="fa fa-linkedin"></a>
+  		<a href="https://www.linkedin.com/jobs/" style="color: white;" class="fa fa-linkedin"></a>
+  		<a href="https://www.linkedin.com/messaging/" style="color: white;" class="fa fa-linkedin"></a>
+  		<a href="https://www.linkedin.com/notifications/" style="color: white;" class="fa fa-linkedin"></a>
+  		<a href="https://www.linkedin.com/psettings/" style="color: white;" class="fa fa-linkedin"></a>
+  		<a href="https://www.linkedin.com/ca/pet-32/" style="color: white;" class="fa fa-linkedin"></a>
+  	HTML
+  	expect(dummy_object.grep_linkedin_profile(html.to_s)).to eq([])
+  end
+  it 'should grep organization linkedin profiles' do
+  	html = <<~HTML
+  		<a href="https://www.linkedin.com/company/13247248/" target="_blank">Linkedin</a>
+  		<a href="https://www.linkedin.com/company/m-files-corporation" target="_blank">Linkedin</a>
+  		<a href="https://www.linkedin.com/company/dataendure" target="_blank">Linkedin</a>
+  	HTML
+  	linkedin_profiles = dummy_object.grep_linkedin_profile(html.to_s)
+  	expected_linkedin_profiles = [
+  		'https://www.linkedin.com/company/13247248',
+  		'https://www.linkedin.com/company/m-files-corporation',
+  		'https://www.linkedin.com/company/dataendure'
+  	]
+  	expect(dummy_object.grep_linkedin_profile(html.to_s)).to eq(expected_linkedin_profiles)
+  end
+end

data/spec/lib/parsers/meta_description_spec.rb ADDED Viewed

@@ -0,0 +1,321 @@
+require 'spec_helper'
+describe 'Meta Description' do
+  class DummyTestClass
+  	include MetaDescription
+	end
+  let(:dummy_object) { DummyTestClass.new }
+  it 'should return nil for invalid inputs' do
+  	expect(dummy_object.grep_meta_description('')).to be_nil
+  	expect(dummy_object.grep_meta_description(nil)).to be_nil
+  end
+  describe 'Name key first meta description tag' do
+  	it 'should return nil for no meta description tag presence' do
+  		no_meta_description = <<~HTML
+  			<head>
+					<meta charset="utf-8">
+					<meta http-equiv="X-UA-Compatible" content="IE=edge">
+					<meta name="viewport" content="width=device-width, initial-scale=1">
+					<title>Internet Service Providers Chennai | Internet Telephony Service Providers Chennai | Internet bandwidth Providers Chennai - Pulse Telesystems Pvt Ltd</title>
+					<meta name="google-site-verification" content="h2NvZnvL9v536RUYH3jney-9V8JRBGESmzH5-ph0EM4">
+					<link href="http://www.pulse.in/new/wp-content/themes/pulse/css/bootstrap.min.css" rel="stylesheet" type="text/css">
+				</head>
+  		HTML
+  		meta_description = dummy_object.grep_meta_description(no_meta_description.to_s)
+  		expect(meta_description).to be_nil
+  	end
+  	it 'should return nil when content part is empty' do
+  		no_meta_description = <<~HTML
+  			<head>
+					<meta http-equiv="X-UA-Compatible" content="IE=edge">
+					<meta name="description" content="">
+					<title>Internet Service Providers Chennai | Internet Telephony Service Providers Chennai | Internet bandwidth Providers Chennai - Pulse Telesystems Pvt Ltd</title>
+					<link href="http://www.pulse.in/new/wp-content/themes/pulse/css/bootstrap.min.css" rel="stylesheet" type="text/css">
+				</head>
+  		HTML
+  		meta_description = dummy_object.grep_meta_description(no_meta_description.to_s)
+  		expect(meta_description).to be_nil
+  	end
+  	it 'should return description from valid tag' do
+  		html = <<~HTML
+  			<head>
+					<meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property="og:description">
+					<meta content="" property="uid">
+					<meta name="description" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront.">
+					<meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no" name="viewport">
+  			</head>
+  		HTML
+  		meta_description = dummy_object.grep_meta_description(html.to_s)
+  		expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
+  	end
+  	it 'should return description even tag is multilined and partially encoded' do
+  		html = <<~HTML
+  			<head>
+					<meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property="og:description">
+					<meta content="" property="uid">
+					<meta name="description" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront.">
+					<meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
+					name="viewport">
+  			</head>
+  		HTML
+  		meta_description = dummy_object.grep_meta_description(html.to_s)
+  		expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
+  	end
+  	it 'should parse meta tag even it is partially single quoted' do
+  		html = <<~HTML
+  			<head>
+					<meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property="og:description">
+					<meta content="" property="uid">
+					<meta name=\'description" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront.">
+					<meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
+					name="viewport">
+  			</head>
+  		HTML
+  		meta_description = dummy_object.grep_meta_description(html.to_s)
+  		expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
+  	end
+  	it 'should parse meta tag even it is having other attributes defined' do
+  		html = <<~HTML
+  			<head>
+					<meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property="og:description">
+					<meta content="" property="uid">
+					<meta class="metadescription" name=\'description" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." charset="UTF-8">
+					<meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
+					name="viewport">
+  			</head>
+  		HTML
+  		meta_description = dummy_object.grep_meta_description(html.to_s)
+  		expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
+  	end
+  	it 'should parse meta tag with itemprop as description key' do
+  		html = <<~HTML
+  			<head>
+					<meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property="og:description">
+					<meta content="" property="uid">
+					<meta class="metadescription" itemprop=\'description" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." charset="UTF-8">
+					<meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
+					name="viewport">
+  			</head>
+  		HTML
+  		meta_description = dummy_object.grep_meta_description(html.to_s)
+  		expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
+  	end
+  	it 'should parse even name/itemprop key content is improperly assigned' do
+  		html = <<~HTML
+  			<head>
+					<meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property="og:description">
+					<meta content="" property="uid">
+					<meta class="metadescription" name=description content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." charset="UTF-8" />
+					<meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
+					name="viewport">
+  			</head>
+  		HTML
+  		meta_description = dummy_object.grep_meta_description(html.to_s)
+  		expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
+  	end
+  	it 'should bring description having single quote' do
+  		html = <<~HTML
+  			<html lang="en">
+					<head>
+						<META charset="utf-8">
+						<meta http-equiv="X-UA-Compatible" content="IE=Edge" />
+						<meta name="viewport" content="width=device-width, initial-scale=1" />
+						<title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
+						<meta name="description" content="Wilentz Goldman & Spitzer is one of New Jersey's largest law firms. Our lawyers proudly serve our clients in nearly every aspect of law." />
+  				</head>
+  			<html>
+  		HTML
+  		meta_description = dummy_object.grep_meta_description(html.to_s)
+  		expect(meta_description).to eq('Wilentz Goldman & Spitzer is one of New Jersey\'s largest law firms. Our lawyers proudly serve our clients in nearly every aspect of law.')
+  	end
+  	it 'should bring description having double quote' do
+  		html = <<~HTML
+  			<html lang="en">
+					<head>
+						<META charset="utf-8">
+						<meta http-equiv="X-UA-Compatible" content="IE=Edge" />
+						<meta name="viewport" content="width=device-width, initial-scale=1" />
+						<title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
+  					<meta name="description" content='Whether you&#039;re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what&#039;s new, what&#039;s best and how to make the most out of the products you love.' />
+  				</head>
+  			<html>
+  		HTML
+  		meta_description = dummy_object.grep_meta_description(html.to_s)
+  		expect(meta_description).to eq('Whether you\'re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what\'s new, what\'s best and how to make the most out of the products you love.')
+  	end
+  	it "should bring description even some other meta tag is empty" do
+  		html = <<~HTML
+  			<html lang="en">
+					<head>
+						<META charset="utf-8">
+						<meta http-equiv="X-UA-Compatible" content="IE=Edge" />
+						<meta name="viewport" content="width=device-width, initial-scale=1" />
+						<title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
+  					<meta name="description" content="">
+  					<meta name="description" content='Whether you&#039;re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what&#039;s new, what&#039;s best and how to make the most out of the products you love.' />
+  				</head>
+  			<html>
+  		HTML
+  		meta_description = dummy_object.grep_meta_description(html.to_s)
+  		expect(meta_description).to eq('Whether you\'re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what\'s new, what\'s best and how to make the most out of the products you love.')
+  	end
+  end
+  describe 'Content key first meta description tag' do
+  	it 'should return nil when content part is empty' do
+  		no_meta_description = <<~HTML
+  			<head>
+					<meta http-equiv="X-UA-Compatible" content="IE=edge">
+					<meta content="" name="description">
+					<meta content='' name="description">
+					<title>Internet Service Providers Chennai | Internet Telephony Service Providers Chennai | Internet bandwidth Providers Chennai - Pulse Telesystems Pvt Ltd</title>
+					<link href="http://www.pulse.in/new/wp-content/themes/pulse/css/bootstrap.min.css" rel="stylesheet" type="text/css">
+				</head>
+  		HTML
+  		meta_description = dummy_object.grep_meta_description(no_meta_description.to_s)
+  		expect(meta_description).to be_nil
+  	end
+  	it 'should return description from valid tag' do
+  		html = <<~HTML
+  			<head>
+					<meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property="og:description">
+					<meta content="" property="uid">
+					<meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." name="description">
+					<meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no" name="viewport">
+  			</head>
+  		HTML
+  		meta_description = dummy_object.grep_meta_description(html.to_s)
+  		expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
+  	end
+  	it 'should return description even tag is multilined and partially encoded' do
+  		html = <<~HTML
+  			<head>
+					<meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property="og:description">
+					<meta content="" property="uid">
+					<meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." name="description" >
+					<meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
+					name="viewport">
+  			</head>
+  		HTML
+  		meta_description = dummy_object.grep_meta_description(html.to_s)
+  		expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
+  	end
+  	it 'should parse meta tag even it is partially single quoted' do
+  		html = <<~HTML
+  			<head>
+					<meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property="og:description">
+					<meta content="" property="uid">
+					<meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." name=\'description">
+					<meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
+					name="viewport">
+  			</head>
+  		HTML
+  		meta_description = dummy_object.grep_meta_description(html.to_s)
+  		expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
+  	end
+  	it 'should parse meta tag even it is having other attributes defined' do
+  		html = <<~HTML
+  			<head>
+					<meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property="og:description">
+					<meta content="" property="uid">
+					<meta class="metadescription" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." charset="UTF-8" name=\'description">
+					<meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
+					name="viewport">
+  			</head>
+  		HTML
+  		meta_description = dummy_object.grep_meta_description(html.to_s)
+  		expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
+  	end
+  	it 'should parse meta tag with itemprop as description key' do
+  		html = <<~HTML
+  			<head>
+					<meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property="og:description">
+					<meta content="" property="uid">
+					<meta class="metadescription" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." itemprop=\'description" charset="UTF-8">
+					<meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
+					name="viewport">
+  			</head>
+  		HTML
+  		meta_description = dummy_object.grep_meta_description(html.to_s)
+  		expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
+  	end
+  	it 'should parse even name/itemprop key content is improperly assigned' do
+  		html = <<~HTML
+  			<head>
+					<meta content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." property="og:description">
+					<meta content="" property="uid">
+					<meta class="metadescription" content="With Hired your job search has never been easier! Simply create a profile &amp; vetted companies compete for you, reaching out with salary &amp; equity upfront." charset="UTF-8" name=description />
+					<meta content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"
+					name="viewport">
+  			</head>
+  		HTML
+  		meta_description = dummy_object.grep_meta_description(html.to_s)
+  		expect(meta_description).to eq('With Hired your job search has never been easier! Simply create a profile & vetted companies compete for you, reaching out with salary & equity upfront.')
+  	end
+  	it 'should bring description having single quote' do
+  		html = <<~HTML
+  			<html lang="en">
+					<head>
+						<META charset="utf-8">
+						<meta http-equiv="X-UA-Compatible" content="IE=Edge" />
+						<meta name="viewport" content="width=device-width, initial-scale=1" />
+						<title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
+						<meta content="Wilentz Goldman & Spitzer is one of New Jersey's largest law firms. Our lawyers proudly serve our clients in nearly every aspect of law." name="description"	/>
+  				</head>
+  			<html>
+  		HTML
+  		meta_description = dummy_object.grep_meta_description(html.to_s)
+  		expect(meta_description).to eq('Wilentz Goldman & Spitzer is one of New Jersey\'s largest law firms. Our lawyers proudly serve our clients in nearly every aspect of law.')
+  	end
+  	it 'should bring description having double quote' do
+  		html = <<~HTML
+  			<html lang="en">
+					<head>
+						<META charset="utf-8">
+						<meta http-equiv="X-UA-Compatible" content="IE=Edge" />
+						<meta name="viewport" content="width=device-width, initial-scale=1" />
+						<title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
+  					<meta content='Whether you&#039;re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what&#039;s new, what&#039;s best and how to make the most out of the products you love.' name="description" />
+  				</head>
+  			<html>
+  		HTML
+  		meta_description = dummy_object.grep_meta_description(html.to_s)
+  		expect(meta_description).to eq('Whether you\'re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what\'s new, what\'s best and how to make the most out of the products you love.')
+  	end
+  	it "should bring description even some other meta tag is empty" do
+  		html = <<~HTML
+  			<html lang="en">
+					<head>
+						<META charset="utf-8">
+						<meta http-equiv="X-UA-Compatible" content="IE=Edge" />
+						<meta name="viewport" content="width=device-width, initial-scale=1" />
+						<title>Wilentz, Goldman & Spitzer: NJ, NYC and PA Full-Service Law Firm</title>
+  					<meta content="" name="description">
+  					<meta content='Whether you&#039;re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what&#039;s new, what&#039;s best and how to make the most out of the products you love.' name="description"/>
+  				</head>
+  			<html>
+  		HTML
+  		meta_description = dummy_object.grep_meta_description(html.to_s)
+  		expect(meta_description).to eq('Whether you\'re a Mac "die-hard" or an iPad "newbie" we give you the scoop on what\'s new, what\'s best and how to make the most out of the products you love.')
+  	end
+  end
+end