brilliant_web_scraper 0.1 → 0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: efbe9d1a0688fd10e200d972b56c3e2ec86203f1
4
- data.tar.gz: 20cce1c52197f11dcea73813831bb4172829ddaa
3
+ metadata.gz: 9ad5219a19dcfc311bed756a83d82fd3758bd71a
4
+ data.tar.gz: c085eb2a96b8eb503cd44edc87821823cf0ad965
5
5
  SHA512:
6
- metadata.gz: 638c34f7efbc963613f4bb841abbf183bf134ee3197bebc99f9403ba7864befd44243f53092a9aa3ba7ea58314475b61d6671816e8e3f8ef4deb7f49b6f0ef52
7
- data.tar.gz: f91110f69e8228de408aa0c35050fe6137fac22bdb93ff86be3c70d380e1cf57534f50f78e5d585cb63e421ff0fc51aa04089d38e8a86ab3a6ca305659dc909a
6
+ metadata.gz: 4a5c3c9dd78f3e123b0c279a04c2d0d262c58aef8d0086668c37aea8e52b6ec8da98d6d242debddf600922dcfed3d377bd159e62fc9bb1e7390b1e62e881eb2b
7
+ data.tar.gz: de051e60d7b90bde7984871d1b39e52f46be6366910591f0ec31c434a41037a9cc1274989dd50755cbf6e03d0e9f498a3271aad173b12a8ea3fd6a578eb8fbc9
@@ -0,0 +1,90 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ brilliant_web_scraper (0.2)
5
+ charlock_holmes (~> 0.7.6)
6
+ nesty (~> 1.0, >= 1.0.1)
7
+ rest-client (~> 2.0, >= 2.0.2)
8
+
9
+ GEM
10
+ remote: http://rubygems.org/
11
+ specs:
12
+ addressable (2.6.0)
13
+ public_suffix (>= 2.0.2, < 4.0)
14
+ ast (2.4.0)
15
+ charlock_holmes (0.7.6)
16
+ coderay (1.1.2)
17
+ crack (0.4.3)
18
+ safe_yaml (~> 1.0.0)
19
+ diff-lcs (1.3)
20
+ domain_name (0.5.20190701)
21
+ unf (>= 0.0.5, < 1.0.0)
22
+ hashdiff (1.0.0)
23
+ http-accept (1.7.0)
24
+ http-cookie (1.0.3)
25
+ domain_name (~> 0.5)
26
+ jaro_winkler (1.5.3)
27
+ method_source (0.9.2)
28
+ mime-types (3.2.2)
29
+ mime-types-data (~> 3.2015)
30
+ mime-types-data (3.2019.0331)
31
+ nesty (1.0.2)
32
+ netrc (0.11.0)
33
+ parallel (1.17.0)
34
+ parser (2.6.3.0)
35
+ ast (~> 2.4.0)
36
+ pry (0.12.2)
37
+ coderay (~> 1.1.0)
38
+ method_source (~> 0.9.0)
39
+ public_suffix (3.1.1)
40
+ rainbow (3.0.0)
41
+ rest-client (2.1.0)
42
+ http-accept (>= 1.7.0, < 2.0)
43
+ http-cookie (>= 1.0.2, < 2.0)
44
+ mime-types (>= 1.16, < 4.0)
45
+ netrc (~> 0.8)
46
+ rspec (3.8.0)
47
+ rspec-core (~> 3.8.0)
48
+ rspec-expectations (~> 3.8.0)
49
+ rspec-mocks (~> 3.8.0)
50
+ rspec-core (3.8.2)
51
+ rspec-support (~> 3.8.0)
52
+ rspec-expectations (3.8.4)
53
+ diff-lcs (>= 1.2.0, < 2.0)
54
+ rspec-support (~> 3.8.0)
55
+ rspec-mocks (3.8.1)
56
+ diff-lcs (>= 1.2.0, < 2.0)
57
+ rspec-support (~> 3.8.0)
58
+ rspec-support (3.8.2)
59
+ rubocop (0.73.0)
60
+ jaro_winkler (~> 1.5.1)
61
+ parallel (~> 1.10)
62
+ parser (>= 2.6)
63
+ rainbow (>= 2.2.2, < 4.0)
64
+ ruby-progressbar (~> 1.7)
65
+ unicode-display_width (>= 1.4.0, < 1.7)
66
+ ruby-progressbar (1.10.1)
67
+ safe_yaml (1.0.5)
68
+ unf (0.1.4)
69
+ unf_ext
70
+ unf_ext (0.0.7.6)
71
+ unicode-display_width (1.6.0)
72
+ vcr (3.0.3)
73
+ webmock (2.3.2)
74
+ addressable (>= 2.3.6)
75
+ crack (>= 0.3.2)
76
+ hashdiff
77
+
78
+ PLATFORMS
79
+ ruby
80
+
81
+ DEPENDENCIES
82
+ brilliant_web_scraper!
83
+ pry (~> 0.12.2)
84
+ rspec (~> 3.5)
85
+ rubocop (~> 0.73.0)
86
+ vcr (~> 3.0, >= 3.0.1)
87
+ webmock (~> 2.1)
88
+
89
+ BUNDLED WITH
90
+ 1.16.6
data/README.md CHANGED
@@ -1,14 +1,11 @@
1
- # WebScraper [![Build Status](https://api.travis-ci.com/bkotu6717/brilliant_web_scraper.svg)](https://travis-ci.com/bkotu6717/brilliant_web_scraper)
1
+ # BrilliantWebScraper [![Build Status](https://api.travis-ci.com/bkotu6717/brilliant_web_scraper.svg)](https://travis-ci.com/bkotu6717/brilliant_web_scraper)[![Maintainability](https://api.codeclimate.com/v1/badges/15a8a6e117f11bd94376/maintainability)](https://codeclimate.com/github/bkotu6717/brilliant_web_scraper/maintainability)
2
2
 
3
- A decent web scraping gem. Scrapes website description, social profiles, contact details, youtube channels.
4
-
5
-
6
- It accepts a URL or Domain as input and gets it's title, descrptios, social profiles, YouTube channels and it's current URL if got redirected.
3
+ A decent web scraping gem. Scrapes a website's title, description, social profiles such as LinkedIn, Facebook, Twitter, Instagram, Vimeo, Pinterest and YouTube channels, and contact details such as emails and phone numbers.
7
4
 
8
5
 
9
6
  ## See it in action!
10
7
 
11
- You can try WebScraper live at this little demo: [https://brilliantweb-scraper-demo.herokuapp.com](https://brilliant-web-scraper-demo.herokuapp.com)
8
+ You can try BrilliantWebScraper live at this little demo: [https://brilliant-web-scraper-demo.herokuapp.com](https://brilliant-web-scraper-demo.herokuapp.com)
12
9
 
13
10
  ## Installation
14
11
 
@@ -21,11 +18,11 @@ gem 'brilliant_web_scraper'
21
18
 
22
19
  ## Usage
23
20
 
24
- Initialize a BrilliantWebScraper instance for an URL, like this:
21
+ Initialize a BrilliantWebScraper instance for a URL like this, optionally passing timeouts (connection_timeout and read_timeout both default to 10 seconds):
25
22
 
26
23
  ```ruby
27
24
  require 'brilliant_web_scraper'
25
+ results = BrilliantWebScraper.new('http://pwc.com', 5, 5)
26
+
28
27
  results = BrilliantWebScraper.new('http://pwc.com')
29
28
  ```
30
-
31
- If you don't include the scheme on the URL, it is fine:
@@ -6,23 +6,28 @@ Gem::Specification.new do |s|
6
6
  s.name = 'brilliant_web_scraper'
7
7
  s.version = WebScraper::VERSION
8
8
  s.licenses = ['Nonstandard']
9
- s.summary = 'A decent web scraping ruby library!'
10
- s.description = 'Scrapes data such as description, social profiles, contact details'
9
+ s.summary = 'A decent web scraping ruby gem!'
10
+ s.description = 'A decent web scraping gem.'\
11
+ 'Scrapes website\'s title, description,'\
12
+ 'social profiles such as linkedin, '\
13
+ 'facebook, twitter, instgram, vimeo,'\
14
+ 'pinterest, youtube channel and'\
15
+ ' contact details such as emails, phone numbers.'
11
16
  s.authors = ['Kotu Bhaskara Rao']
12
17
  s.email = 'bkotu6717@gmail.com'
13
18
  s.require_paths = ['lib']
14
19
  s.homepage = 'https://github.com/bkotu6717/brilliant_web_scraper'
15
20
  s.files = Dir['**/*'].keep_if { |file|
16
- file != "brilliant_web_scraper-#{WebScraper::VERSION}.gem" && File.file?(file)
21
+ file != "brilliant_web_scraper-#{WebScraper::VERSION}.gem" &&
22
+ File.file?(file)
17
23
  }
18
24
  s.required_ruby_version = '>= 2.3.0'
19
25
 
20
- s.add_dependency 'nesty', '~> 1.0', '>= 1.0.1'
21
- s.add_dependency 'rest-client', '~> 2.0', '>= 2.0.2'
26
+ s.add_runtime_dependency 'charlock_holmes', '~> 0.7.6'
27
+ s.add_runtime_dependency 'nesty', '~> 1.0', '>= 1.0.1'
28
+ s.add_runtime_dependency 'rest-client', '~> 2.0', '>= 2.0.2'
22
29
 
23
- s.add_development_dependency 'nesty', '~> 1.0', '>= 1.0.1'
24
30
  s.add_development_dependency 'pry', '~> 0.12.2'
25
- s.add_development_dependency 'rest-client', '~> 2.0', '>= 2.0.2'
26
31
  s.add_development_dependency 'rspec', '~> 3.5'
27
32
  s.add_development_dependency 'rubocop', '~> 0.73.0'
28
33
  s.add_development_dependency 'vcr', '~> 3.0', '>= 3.0.1'
@@ -2,7 +2,8 @@
2
2
 
3
3
  require 'rest-client'
4
4
  require 'cgi'
5
- require 'benchmark'
5
+ require 'charlock_holmes/string'
6
+ require 'timeout'
6
7
 
7
8
  current_directory = File.dirname(__FILE__) + '/scraper'
8
9
  require File.expand_path(File.join(current_directory, 'errors'))
@@ -21,7 +21,7 @@ module DescriptionHelper
21
21
  def parse_description(descriptions)
22
22
  return if descriptions.nil? || descriptions.empty?
23
23
 
24
- descriptions = descriptions.reject { |x| x.nil? || x.empty? }
24
+ descriptions = descriptions.reject { |x| x.nil? || x.empty? || x =~ /^\s*$/}
25
25
  descriptions = descriptions.map { |x| unescape_html(x) }
26
26
  descriptions.find { |x| (x !~ /^\s*[|-]?\s*$/) }
27
27
  end
@@ -10,7 +10,7 @@ module Emails
10
10
  return if response.nil? || response.empty?
11
11
 
12
12
  first_regex = /(?im)mailto:\s*([^\?"',\\<>\s]+)/
13
- second_regex = %r{(?im)["'\s><\/]*([\w._%-]+@(?!(?:example|e?mail|domain|company|your(?:domain|company|email)|address|emailad?dress|yyy|test)\.)[\w._%-]+\.(?!png|jpe?g|tif|svg)[A-Z]{2,3})["'\s><]}
13
+ second_regex = %r{(?im)["'\s><\/]*([\w._%-]+@(?!(?:example|e?mail|domain|company|your(?:domain|company|email)|address|emailad?dress|yyy|test)\.)[\w._%-]+\.(?!png|jpe?g|tif|svg|css|js|ico|gif)[A-Z]{2,3})["'\s><]}
14
14
  first_set = response.scan(first_regex).flatten.compact
15
15
  first_set = get_processed_emails(first_set)
16
16
  second_set = response.scan(second_regex).flatten.compact
@@ -24,7 +24,7 @@ module Emails
24
24
  unescaped_emails = email_set.map { |email| unescape_html(email) }
25
25
  return [] if unescaped_emails.empty?
26
26
 
27
- email_match_regex = /[\w._%-]+@(?!(?:example|e?mail|domain|company|your(?:domain|company|email)|address|emailad?dress|yyy|test)\.)[\w._%-]+\.(?!png|jpe?g|tif|svg)[A-Z]{2,3}/im
27
+ email_match_regex = /[\w._%-]+@(?!(?:example|e?mail|domain|company|your(?:domain|company|email)|address|emailad?dress|yyy|test)\.)[\w._%-]+\.(?!png|jpe?g|tif|svg|css|js|ico|gif)[A-Z]{2,3}/im
28
28
  unescaped_emails.select { |data| data =~ email_match_regex }
29
29
  end
30
30
  end
@@ -5,7 +5,7 @@ module FacebookProfile
5
5
  def grep_facebook_profile(response)
6
6
  return if response.nil? || response.empty?
7
7
 
8
- facebook_url_regex = /(https?:\/\/(?:www\.)?(?:facebook|fb)\.com\/(?!tr\?|(?:[\/\w\d]*(?:photo|sharer?|like(?:box)?|offsite_event|plugins|permalink|home|search))\.php|\d+\/fbml|(?:dialog|hashtag|plugins|sharer|login|recover|security|help|v\d+\.\d+)\/|(?:privacy|#|your-profile|yourfacebookpage)\/?|home\?)[^"'<>\&\s]+)/im
8
+ facebook_url_regex = /(https?:\/\/(?:www\.)?(?:facebook|fb)\.com\/(?!tr\?|(?:[\/\w\d]*(?:photo|sharer?|like(?:box)?|offsite_event|plugins|permalink|home|search))\.php|\d+\/fbml|(?:dialog|hashtag|plugins|sharer|login|recover|security|help|images|v\d+\.\d+)\/|(?:privacy|#|your-profile|yourfacebookpage)\/?|home\?)[^"'<>\&\s]+)/im
9
9
  response.scan(facebook_url_regex).flatten.compact.uniq
10
10
  end
11
11
  end
@@ -2,6 +2,7 @@
2
2
 
3
3
  # Fetch latest url of the given website
4
4
  module RedirectedTo
5
+ include UnescapeHtmlHelper
5
6
 
6
7
  def grep_redirected_to_url(response)
7
8
  return if response.nil? || response.empty?
@@ -18,7 +19,7 @@ module RedirectedTo
18
19
  url = parser(web_urls)
19
20
  break unless url.nil?
20
21
  end
21
- url
22
+ unescape_html(url)
22
23
  end
23
24
 
24
25
  private
@@ -5,7 +5,7 @@ module TwitterProfile
5
5
  def grep_twitter_profile(response)
6
6
  return if response.nil? || response.empty?
7
7
 
8
- twitter_regex = %r{(?im)(https?:\/\/(?:www\.)?twitter\.com\/(?!(?:share|download|search|home|login|privacy)(?:\?|\/|\b)|(?:hashtag|i|javascripts|statuses|#!|intent)\/|(?:#|'|%))[^"'&\?<>\s\\]+)}
8
+ twitter_regex = %r{(?im)(https?:\/\/(?:www\.)?twitter\.com\/(?!\{\{)(?!(?:share|download|search|home|login|privacy)(?:\?|\/|\b)|(?:hashtag|i|javascripts|statuses|#!|intent)\/|(?:#|'|%))[^"'&\?<>\s\\]+)}
9
9
  response.scan(twitter_regex).flatten.compact.uniq
10
10
  end
11
11
  end
@@ -5,7 +5,7 @@ module VimeoProfile
5
5
  def grep_vimeo_profile(response)
6
6
  return if response.nil? || response.empty?
7
7
 
8
- vimeo_regex = %r{(?im)(https?:\/\/(?:www\.)?vimeo\.com\/(?!upgrade|features|enterprise|upload|api)\/?[^"'\&\?<>\s]+)}
8
+ vimeo_regex = %r{(?im)(https?:\/\/(?:www\.)?vimeo\.com\/(?!upgrade|features|enterprise|upload|api)\/?[^"'\\\&\?<>\s]+)}
9
9
  response.scan(vimeo_regex).flatten.compact.uniq
10
10
  end
11
11
  end
@@ -6,38 +6,38 @@
6
6
  # @Social Profiles
7
7
  # @Contact Details
8
8
  module ScrapeHelper
9
- def perform_scrape(url, read_timeout, connection_timeout)
10
- response = nil
11
- request_duration = Benchmark.measure do
12
- response = ScrapeRequest.new(url, read_timeout, connection_timeout)
13
- end.real
14
- retry_count = 0
15
- begin
16
- scrape_data = nil
17
- scrape_duration = Benchmark.measure do
18
- scrape_data = grep_data(response.body)
19
- end.real
20
-
21
- data_hash = {
22
- web_request_duration: request_duration,
23
- response_scrape_duraton: scrape_duration,
24
- scrape_data: scrape_data
25
- }
26
- rescue ArgumentError => e
27
- retry_count += 1
28
- raise WebScraper::ParserError, e.message if retry_count > 1
29
-
30
- response = response.encode('UTF-16be', invalid: :replace, replace: '?')
31
- response = response.encode('UTF-8')
32
- retry
33
- rescue Encoding::CompatibilityError => e
34
- raise WebScraper::ParserError, e.message
9
+ def perform_scrape(url, read_timeout, open_timeout)
10
+ timeout_in_sec = scraper_timeout(read_timeout, open_timeout)
11
+ Timeout::timeout(timeout_in_sec) do
12
+ response = ScrapeRequest.new(url, read_timeout, open_timeout)
13
+ retry_count = 0
14
+ body = response.body
15
+ begin
16
+ body = body.tr("\000", '')
17
+ encoding = body.detect_encoding[:encoding]
18
+ body = body.encode('UTF-8', encoding)
19
+ grep_data(body)
20
+ rescue Encoding::UndefinedConversionError, ArgumentError => e
21
+ retry_count += 1
22
+ raise WebScraper::ParserError, e.message if retry_count > 1
23
+ body = body.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
24
+ retry
25
+ rescue Encoding::CompatibilityError => e
26
+ raise WebScraper::ParserError, e.message
27
+ rescue StandardError => e
28
+ raise WebScraper::RequestError, e.message
29
+ end
35
30
  end
36
- data_hash
31
+ rescue Timeout::Error => e
32
+ raise WebScraper::TimeoutError, e.message
37
33
  end
38
34
 
39
35
  private
40
36
 
37
+ def scraper_timeout(read_timeout, open_timeout)
38
+ ( read_timeout + open_timeout + 1 )
39
+ end
40
+
41
41
  def grep_data(response)
42
42
  {
43
43
  title: grep_title(response),
@@ -1,24 +1,28 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- # @Makes actual scrape request, either raises exception or response
3
+ # @Makes actual scrape request, either raises exception or serves response
4
4
  module ScrapeRequest
5
5
  extend ScrapeExceptions
6
6
  class << self
7
7
  def new(url, read_timeout, connection_timeout)
8
+ params_hash = {
9
+ method: :get,
10
+ url: url,
11
+ read_timeout: read_timeout,
12
+ open_timeout: connection_timeout,
13
+ max_redirects: 10,
14
+ verify_ssl: false
15
+ }
8
16
  begin
9
- params_hash = {
10
- method: :get,
11
- url: url,
12
- read_timeout: read_timeout,
13
- connection_timeout: connection_timeout,
14
- headers: { 'accept-encoding': 'identity' }
15
- }
16
17
  response = RestClient::Request.execute(params_hash)
17
18
  content_type = response.headers[:content_type]
18
19
  return response if content_type =~ %r{(?i)text\s*\/\s*html}
19
20
 
20
21
  exception_message = "Invalid response format received: #{content_type}"
21
22
  raise WebScraper::NonHtmlError, exception_message
23
+ rescue Zlib::DataError
24
+ params_hash[:headers] = { 'accept-encoding': 'identity' }
25
+ retry
22
26
  rescue *TIMEOUT_EXCEPTIONS => e
23
27
  raise WebScraper::TimeoutError, e.message
24
28
  rescue *GENERAL_EXCEPTIONS => e
@@ -2,5 +2,5 @@
2
2
 
3
3
  # Holds current version number
4
4
  module WebScraper
5
- VERSION = '0.1'
5
+ VERSION = '0.2'
6
6
  end
@@ -27,6 +27,10 @@ describe 'Emails' do
27
27
  <a href="mailto:xxx@yyy.zzz">xxx@yyy.zzz</a>
28
28
  <a href="mailto:test@test.com">test@test.com</a>
29
29
  <a href="mailto:@example.com">@example.com"</a>
30
+ <a href="mailto:v@201908240100.css">v@201908240100.css"</a>
31
+ <a href="mailto:v@201908240100.js">v@201908240100.js"</a>
32
+ <a href="mailto:ajax-loader@2x.gif">ajax-loader@2x.gif"</a>
33
+ <a href="mailto:favicon@2x.ico">favicon@2x.ico"</a>
30
34
  HTML
31
35
  expect(dummy_object.grep_emails(html.to_s)).to eq([])
32
36
  end
@@ -14,6 +14,7 @@ describe 'FaceBook Profile' do
14
14
 
15
15
  it 'should not grep any non profile url' do
16
16
  html = <<~HTML
17
+ <a href="https://www.facebook.com/images/fb_icon_325x325.png" target="_blank" class="sqs-svg-icon--wrapper facebook">
17
18
  <a href="http://www.facebook.com/2008/fbml" target="_blank" class="sqs-svg-icon--wrapper facebook">
18
19
  <a href="https://www.facebook.com/v2.0/dialog/share" target="_blank" class="sqs-svg-icon--wrapper facebook">
19
20
  <a href="https://www.facebook.com/plugins/video.php?href=https%3A%2F%2Fwww.facebook.com%2FHFXMooseheads%2Fvideos" target="_blank" class="sqs-svg-icon--wrapper facebook">
@@ -3,205 +3,223 @@ require 'spec_helper'
3
3
  describe 'Website Redirected To' do
4
4
 
5
5
  class DummyTestClass
6
- include RedirectedTo
7
- end
6
+ include RedirectedTo
7
+ end
8
8
  let(:dummy_object) { DummyTestClass.new }
9
9
 
10
10
 
11
11
  it 'should return nil for invalid input' do
12
- expect(dummy_object.grep_redirected_to_url(nil)).to be_nil
13
- expect(dummy_object.grep_redirected_to_url('')).to be_nil
14
- end
12
+ expect(dummy_object.grep_redirected_to_url(nil)).to be_nil
13
+ expect(dummy_object.grep_redirected_to_url('')).to be_nil
14
+ end
15
15
 
16
16
  describe 'Website grep from link tag' do
17
- describe 'rel attribute first ' do
18
-
19
- it 'should return nil when canonical url is empty' do
20
- html = <<~HTML
21
- <link rel="canonical" href="">
22
- <link rel="canonical" href=''>
23
- HTML
24
- website = dummy_object.grep_redirected_to_url(html.to_s)
25
- expect(website).to be_nil
26
- end
27
-
28
- it 'should grep website' do
29
- html = <<~HTML
30
- <link rel="canonical" href="">
31
- <link rel="canonical" href='https://www.apple.com/'>
32
- HTML
33
- website = dummy_object.grep_redirected_to_url(html.to_s)
34
- expect(website).to eq('https://www.apple.com/')
35
- end
36
-
37
- it 'should grep website even with extra attributes' do
38
- html = <<~HTML
39
- <link rel="canonical" href="" itemprop="current_url">
40
- <link rel="canonical" href='https://www.apple.com/'
41
- itemprop="current_url" >
42
- HTML
43
- website = dummy_object.grep_redirected_to_url(html.to_s)
44
- expect(website).to eq('https://www.apple.com/')
45
- end
46
- end
47
- describe 'href attribute first' do
48
- it 'should return nil when canonical url is empty' do
49
- html = <<~HTML
50
- <link href="" rel="canonical" >
51
- <link href='' rel="canonical" >
52
- HTML
53
- website = dummy_object.grep_redirected_to_url(html.to_s)
54
- expect(website).to be_nil
55
- end
56
-
57
- it 'should grep website' do
58
- html = <<~HTML
59
- <link rel="canonical" href="">
60
- <link href='https://www.apple.com/' rel="canonical">
61
- HTML
62
- website = dummy_object.grep_redirected_to_url(html.to_s)
63
- expect(website).to eq('https://www.apple.com/')
64
- end
65
-
66
- it 'should grep website even with extra attributes' do
67
- html = <<~HTML
68
- <link href="" itemprop="current_url" rel="canonical">
69
- <link href='https://www.apple.com/' rel="canonical"
70
- itemprop="current_url" >
71
- HTML
72
- website = dummy_object.grep_redirected_to_url(html.to_s)
73
- expect(website).to eq('https://www.apple.com/')
74
- end
75
- end
17
+ describe 'rel attribute first ' do
18
+
19
+ it 'should return nil when canonical url is empty' do
20
+ html = <<~HTML
21
+ <link rel="canonical" href="">
22
+ <link rel="canonical" href=''>
23
+ HTML
24
+ website = dummy_object.grep_redirected_to_url(html.to_s)
25
+ expect(website).to be_nil
26
+ end
27
+
28
+ it 'should grep website' do
29
+ html = <<~HTML
30
+ <link rel="canonical" href="">
31
+ <link rel="canonical" href='https://www.apple.com/'>
32
+ HTML
33
+ website = dummy_object.grep_redirected_to_url(html.to_s)
34
+ expect(website).to eq('https://www.apple.com/')
35
+ end
36
+
37
+ it 'should grep website even with extra attributes' do
38
+ html = <<~HTML
39
+ <link rel="canonical" href="" itemprop="current_url">
40
+ <link rel="canonical" href='https://www.apple.com/'
41
+ itemprop="current_url" >
42
+ HTML
43
+ website = dummy_object.grep_redirected_to_url(html.to_s)
44
+ expect(website).to eq('https://www.apple.com/')
45
+ end
46
+ end
47
+
48
+ describe 'href attribute first' do
49
+
50
+ it 'should return nil when canonical url is empty' do
51
+ html = <<~HTML
52
+ <link href="" rel="canonical" >
53
+ <link href='' rel="canonical" >
54
+ HTML
55
+ website = dummy_object.grep_redirected_to_url(html.to_s)
56
+ expect(website).to be_nil
57
+ end
58
+
59
+ it 'should grep website' do
60
+ html = <<~HTML
61
+ <link rel="canonical" href="">
62
+ <link href='https://www.apple.com/' rel="canonical">
63
+ HTML
64
+ website = dummy_object.grep_redirected_to_url(html.to_s)
65
+ expect(website).to eq('https://www.apple.com/')
66
+ end
67
+
68
+ it 'should grep website even with extra attributes' do
69
+ html = <<~HTML
70
+ <link href="" itemprop="current_url" rel="canonical">
71
+ <link href='https://www.apple.com/' rel="canonical"
72
+ itemprop="current_url" >
73
+ HTML
74
+ website = dummy_object.grep_redirected_to_url(html.to_s)
75
+ expect(website).to eq('https://www.apple.com/')
76
+ end
77
+ end
76
78
  end
79
+
77
80
  describe 'Website grep from organization URL' do
78
- describe 'property attribute first ' do
79
- it 'should return nil when canonical url is empty' do
80
- html = <<~HTML
81
- <meta property="og:url" content="" />
82
- <meta property="og:url" content='' />
83
- HTML
84
- website = dummy_object.grep_redirected_to_url(html.to_s)
85
- expect(website).to be_nil
86
- end
87
-
88
- it 'should grep website' do
89
- html = <<~HTML
90
- <link property="og:url" content="">
91
- <meta property="og:url" content="https://www.dieppe.ca/fr/index.aspx" />
92
- HTML
93
- website = dummy_object.grep_redirected_to_url(html.to_s)
94
- expect(website).to eq('https://www.dieppe.ca/fr/index.aspx')
95
- end
96
-
97
- it 'should grep website even with extra attributes' do
98
- html = <<~HTML
99
- <link property="og:url" content="" calss="og-url">
100
- <meta property="og:url" content='https://www.dieppe.ca/fr/index.aspx'
101
- class="og-url" />
102
- HTML
103
- website = dummy_object.grep_redirected_to_url(html.to_s)
104
- expect(website).to eq('https://www.dieppe.ca/fr/index.aspx')
105
- end
106
- end
107
- describe 'content attribute first ' do
108
- it 'should return nil when canonical url is empty' do
109
- html = <<~HTML
110
- <meta content="" property="og:url" />
111
- <meta content='' property="og:url"/>
112
- HTML
113
- website = dummy_object.grep_redirected_to_url(html.to_s)
114
- expect(website).to be_nil
115
- end
116
-
117
- it 'should grep website' do
118
- html = <<~HTML
119
- <link content="" property="og:url" >
120
- <meta content="https://www.dieppe.ca/fr/index.aspx" property="og:url" />
121
- HTML
122
- website = dummy_object.grep_redirected_to_url(html.to_s)
123
- expect(website).to eq('https://www.dieppe.ca/fr/index.aspx')
124
- end
125
-
126
- it 'should grep website even with extra attributes' do
127
- html = <<~HTML
128
- <link content="" calss="og-url" property="og:url">
129
- <meta content='https://www.dieppe.ca/fr/index.aspx'
130
- class="og-url" property="og:url" />
131
- HTML
132
- website = dummy_object.grep_redirected_to_url(html.to_s)
133
- expect(website).to eq('https://www.dieppe.ca/fr/index.aspx')
134
- end
135
- end
81
+
82
+ describe 'property attribute first ' do
83
+
84
+ it 'should return nil when canonical url is empty' do
85
+ html = <<~HTML
86
+ <meta property="og:url" content="" />
87
+ <meta property="og:url" content='' />
88
+ HTML
89
+ website = dummy_object.grep_redirected_to_url(html.to_s)
90
+ expect(website).to be_nil
91
+ end
92
+
93
+ it 'should grep website' do
94
+ html = <<~HTML
95
+ <link property="og:url" content="">
96
+ <meta property="og:url" content="https://www.dieppe.ca/fr/index.aspx" />
97
+ HTML
98
+ website = dummy_object.grep_redirected_to_url(html.to_s)
99
+ expect(website).to eq('https://www.dieppe.ca/fr/index.aspx')
100
+ end
101
+
102
+ it 'should grep website even with extra attributes' do
103
+ html = <<~HTML
104
+ <link property="og:url" content="" calss="og-url">
105
+ <meta property="og:url" content='https://www.dieppe.ca/fr/index.aspx'
106
+ class="og-url" />
107
+ HTML
108
+ website = dummy_object.grep_redirected_to_url(html.to_s)
109
+ expect(website).to eq('https://www.dieppe.ca/fr/index.aspx')
110
+ end
111
+ end
112
+
113
+ describe 'content attribute first ' do
114
+
115
+ it 'should return nil when canonical url is empty' do
116
+ html = <<~HTML
117
+ <meta content="" property="og:url" />
118
+ <meta content='' property="og:url"/>
119
+ HTML
120
+ website = dummy_object.grep_redirected_to_url(html.to_s)
121
+ expect(website).to be_nil
122
+ end
123
+
124
+ it 'should grep website' do
125
+ html = <<~HTML
126
+ <link content="" property="og:url" >
127
+ <meta content="https://www.dieppe.ca/fr/index.aspx" property="og:url" />
128
+ HTML
129
+ website = dummy_object.grep_redirected_to_url(html.to_s)
130
+ expect(website).to eq('https://www.dieppe.ca/fr/index.aspx')
131
+ end
132
+
133
+ it 'should grep website even with extra attributes' do
134
+ html = <<~HTML
135
+ <link content="" calss="og-url" property="og:url">
136
+ <meta content='https://www.dieppe.ca/fr/index.aspx'
137
+ class="og-url" property="og:url" />
138
+ HTML
139
+ website = dummy_object.grep_redirected_to_url(html.to_s)
140
+ expect(website).to eq('https://www.dieppe.ca/fr/index.aspx')
141
+ end
142
+ end
136
143
  end
137
144
  describe 'grep website' do
138
- it 'it should return nil when link or og:url is absent' do
139
- html = <<~HTML
140
- <head>
141
- <meta charset="utf-8">
142
- <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
143
- <meta http-equiv="x-ua-compatible" content="ie=edge">
144
- <title>Techmologic | index</title>
145
- <!-- Font Awesome -->
146
- <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.6.0/css/font-awesome.min.css">
147
- <!-- Bootstrap core CSS -->
148
- <link href="css/bootstrap.min.css" rel="stylesheet">
149
- <!-- Material Design Bootstrap -->
150
- <link href="css/mdb.min.css" rel="stylesheet">
151
- <!-- Your custom styles (optional) -->
152
- <link href="css/style.css" rel="stylesheet">
153
- </head>
154
- HTML
155
- website = dummy_object.grep_redirected_to_url(html.to_s)
156
- expect(website).to be_nil
157
- end
158
- it 'should grep one of canonical or og:url' do
159
- html = <<~HTML
160
- <head>
161
- <meta charset="utf-8">
162
- <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
163
- <meta http-equiv="x-ua-compatible" content="ie=edge">
164
- <title>Techmologic | index</title>
165
- <link rel="canonical" href="">
166
- <meta property="og:url" content="" />
167
- <!-- Font Awesome -->
168
- <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.6.0/css/font-awesome.min.css">
169
- <!-- Bootstrap core CSS -->
170
- <link href="css/bootstrap.min.css" rel="stylesheet">
171
- <!-- Material Design Bootstrap -->
172
- <link href="css/mdb.min.css" rel="stylesheet">
173
- <!-- Your custom styles (optional) -->
174
- <link href="css/style.css" rel="stylesheet">
175
- <link rel="canonical" href="http://techmologics.com/">
176
- <meta property="og:url" content="http://techmologics.com/" />
177
- </head>
178
- HTML
179
- website = dummy_object.grep_redirected_to_url(html.to_s)
180
- expect(website).to eq('http://techmologics.com/')
181
- end
182
- it 'should grep one of canonical or og:url whatever it\'s position' do
183
- html = <<~HTML
184
- <head>
185
- <meta charset="utf-8">
186
- <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
187
- <meta http-equiv="x-ua-compatible" content="ie=edge">
188
- <title>Techmologic | index</title>
189
- <link href="" rel="canonical">
190
- <meta content="" property="og:url"/>
191
- <!-- Font Awesome -->
192
- <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.6.0/css/font-awesome.min.css">
193
- <!-- Bootstrap core CSS -->
194
- <link href="css/bootstrap.min.css" rel="stylesheet">
195
- <!-- Material Design Bootstrap -->
196
- <link href="css/mdb.min.css" rel="stylesheet">
197
- <!-- Your custom styles (optional) -->
198
- <link href="css/style.css" rel="stylesheet">
199
- <link href="http://techmologics.com/" rel="canonical" class="canonical">
200
- <meta content="http://techmologics.com/" property="og:url"/>
201
- </head>
202
- HTML
203
- website = dummy_object.grep_redirected_to_url(html.to_s)
204
- expect(website).to eq('http://techmologics.com/')
205
- end
145
+
146
+ it 'it should return nil when link or og:url is absent' do
147
+ html = <<~HTML
148
+ <head>
149
+ <meta charset="utf-8">
150
+ <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
151
+ <meta http-equiv="x-ua-compatible" content="ie=edge">
152
+ <title>Techmologic | index</title>
153
+ <!-- Font Awesome -->
154
+ <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.6.0/css/font-awesome.min.css">
155
+ <!-- Bootstrap core CSS -->
156
+ <link href="css/bootstrap.min.css" rel="stylesheet">
157
+ <!-- Material Design Bootstrap -->
158
+ <link href="css/mdb.min.css" rel="stylesheet">
159
+ <!-- Your custom styles (optional) -->
160
+ <link href="css/style.css" rel="stylesheet">
161
+ </head>
162
+ HTML
163
+ website = dummy_object.grep_redirected_to_url(html.to_s)
164
+ expect(website).to be_nil
165
+ end
166
+
167
+ it 'should grep one of canonical or og:url' do
168
+ html = <<~HTML
169
+ <head>
170
+ <meta charset="utf-8">
171
+ <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
172
+ <meta http-equiv="x-ua-compatible" content="ie=edge">
173
+ <title>Techmologic | index</title>
174
+ <link rel="canonical" href="">
175
+ <meta property="og:url" content="" />
176
+ <!-- Font Awesome -->
177
+ <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.6.0/css/font-awesome.min.css">
178
+ <!-- Bootstrap core CSS -->
179
+ <link href="css/bootstrap.min.css" rel="stylesheet">
180
+ <!-- Material Design Bootstrap -->
181
+ <link href="css/mdb.min.css" rel="stylesheet">
182
+ <!-- Your custom styles (optional) -->
183
+ <link href="css/style.css" rel="stylesheet">
184
+ <link rel="canonical" href="http://techmologics.com/">
185
+ <meta property="og:url" content="http://techmologics.com/" />
186
+ </head>
187
+ HTML
188
+ website = dummy_object.grep_redirected_to_url(html.to_s)
189
+ expect(website).to eq('http://techmologics.com/')
190
+ end
191
+
192
+ it 'should grep one of canonical or og:url whatever it\'s position' do
193
+ html = <<~HTML
194
+ <head>
195
+ <meta charset="utf-8">
196
+ <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
197
+ <meta http-equiv="x-ua-compatible" content="ie=edge">
198
+ <title>Techmologic | index</title>
199
+ <link href="" rel="canonical">
200
+ <meta content="" property="og:url"/>
201
+ <!-- Font Awesome -->
202
+ <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.6.0/css/font-awesome.min.css">
203
+ <!-- Bootstrap core CSS -->
204
+ <link href="css/bootstrap.min.css" rel="stylesheet">
205
+ <!-- Material Design Bootstrap -->
206
+ <link href="css/mdb.min.css" rel="stylesheet">
207
+ <!-- Your custom styles (optional) -->
208
+ <link href="css/style.css" rel="stylesheet">
209
+ <link href="http://techmologics.com/" rel="canonical" class="canonical">
210
+ <meta content="http://techmologics.com/" property="og:url"/>
211
+ </head>
212
+ HTML
213
+ website = dummy_object.grep_redirected_to_url(html.to_s)
214
+ expect(website).to eq('http://techmologics.com/')
215
+ end
216
+
217
+ it 'should decode html entities in the redirected_to url' do
218
+ html = <<~HTML
219
+ <meta content="https&#x3a;&#x2f;&#x2f;www&#x2e;santanderbank&#x2e;com&#x2f;us&#x2f;personal" property="og:url"/>
220
+ HTML
221
+ website = dummy_object.grep_redirected_to_url(html.to_s)
222
+ expect(website).to eq('https://www.santanderbank.com/us/personal')
223
+ end
206
224
  end
207
- end
225
+ end
@@ -31,6 +31,7 @@ describe 'Twitter Profile' do
31
31
  <a href=" http://twitter.com/share/" target="_blank">
32
32
  <a href="https://twitter.com/#!/Farmer_Brothers" target="_blank">
33
33
  <a href="http://twitter.com/javascripts/blogger.js" target="_blank">
34
+ <a href="https://twitter.com/{{../user.screen_name}}/status/{{../id_str}}" target="_blank">
34
35
  HTML
35
36
  expect(dummy_object.grep_twitter_profile(html.to_s)).to eq([])
36
37
  end
@@ -30,13 +30,15 @@ describe 'Vimeo Profile' do
30
30
  <a href="https://vimeo.com/channels/332103" target="_blank">
31
31
  <a href="https://vimeo.com/talech" target="_blank">
32
32
  <a href="https://vimeo.com/292173295/fdb8634a35/" target="_blank">
33
+ <a href="https://vimeo.com/337614648\\" target="_blank">
33
34
  HTML
34
35
  vimeo_profiles = dummy_object.grep_vimeo_profile(html.to_s)
35
36
  expected_profiles = [
36
- 'https://vimeo.com/107578087',
37
- 'https://vimeo.com/channels/332103',
38
- 'https://vimeo.com/talech',
39
- 'https://vimeo.com/292173295/fdb8634a35/',
37
+ 'https://vimeo.com/107578087',
38
+ 'https://vimeo.com/channels/332103',
39
+ 'https://vimeo.com/talech',
40
+ 'https://vimeo.com/292173295/fdb8634a35/',
41
+ 'https://vimeo.com/337614648'
40
42
  ]
41
43
  expect(vimeo_profiles).to eq(expected_profiles)
42
44
  end
metadata CHANGED
@@ -1,75 +1,69 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: brilliant_web_scraper
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.1'
4
+ version: '0.2'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kotu Bhaskara Rao
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-08-11 00:00:00.000000000 Z
11
+ date: 2019-08-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: nesty
14
+ name: charlock_holmes
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '1.0'
20
- - - ">="
21
- - !ruby/object:Gem::Version
22
- version: 1.0.1
19
+ version: 0.7.6
23
20
  type: :runtime
24
21
  prerelease: false
25
22
  version_requirements: !ruby/object:Gem::Requirement
26
23
  requirements:
27
24
  - - "~>"
28
25
  - !ruby/object:Gem::Version
29
- version: '1.0'
30
- - - ">="
31
- - !ruby/object:Gem::Version
32
- version: 1.0.1
26
+ version: 0.7.6
33
27
  - !ruby/object:Gem::Dependency
34
- name: rest-client
28
+ name: nesty
35
29
  requirement: !ruby/object:Gem::Requirement
36
30
  requirements:
37
31
  - - "~>"
38
32
  - !ruby/object:Gem::Version
39
- version: '2.0'
33
+ version: '1.0'
40
34
  - - ">="
41
35
  - !ruby/object:Gem::Version
42
- version: 2.0.2
36
+ version: 1.0.1
43
37
  type: :runtime
44
38
  prerelease: false
45
39
  version_requirements: !ruby/object:Gem::Requirement
46
40
  requirements:
47
41
  - - "~>"
48
42
  - !ruby/object:Gem::Version
49
- version: '2.0'
43
+ version: '1.0'
50
44
  - - ">="
51
45
  - !ruby/object:Gem::Version
52
- version: 2.0.2
46
+ version: 1.0.1
53
47
  - !ruby/object:Gem::Dependency
54
- name: nesty
48
+ name: rest-client
55
49
  requirement: !ruby/object:Gem::Requirement
56
50
  requirements:
57
51
  - - "~>"
58
52
  - !ruby/object:Gem::Version
59
- version: '1.0'
53
+ version: '2.0'
60
54
  - - ">="
61
55
  - !ruby/object:Gem::Version
62
- version: 1.0.1
63
- type: :development
56
+ version: 2.0.2
57
+ type: :runtime
64
58
  prerelease: false
65
59
  version_requirements: !ruby/object:Gem::Requirement
66
60
  requirements:
67
61
  - - "~>"
68
62
  - !ruby/object:Gem::Version
69
- version: '1.0'
63
+ version: '2.0'
70
64
  - - ">="
71
65
  - !ruby/object:Gem::Version
72
- version: 1.0.1
66
+ version: 2.0.2
73
67
  - !ruby/object:Gem::Dependency
74
68
  name: pry
75
69
  requirement: !ruby/object:Gem::Requirement
@@ -84,26 +78,6 @@ dependencies:
84
78
  - - "~>"
85
79
  - !ruby/object:Gem::Version
86
80
  version: 0.12.2
87
- - !ruby/object:Gem::Dependency
88
- name: rest-client
89
- requirement: !ruby/object:Gem::Requirement
90
- requirements:
91
- - - "~>"
92
- - !ruby/object:Gem::Version
93
- version: '2.0'
94
- - - ">="
95
- - !ruby/object:Gem::Version
96
- version: 2.0.2
97
- type: :development
98
- prerelease: false
99
- version_requirements: !ruby/object:Gem::Requirement
100
- requirements:
101
- - - "~>"
102
- - !ruby/object:Gem::Version
103
- version: '2.0'
104
- - - ">="
105
- - !ruby/object:Gem::Version
106
- version: 2.0.2
107
81
  - !ruby/object:Gem::Dependency
108
82
  name: rspec
109
83
  requirement: !ruby/object:Gem::Requirement
@@ -166,16 +140,17 @@ dependencies:
166
140
  - - "~>"
167
141
  - !ruby/object:Gem::Version
168
142
  version: '2.1'
169
- description: Scrapes data such as description, social profiles, contact details
143
+ description: A decent web scraping gem.Scrapes website's title, description,social
144
+ profiles such as linkedin, facebook, twitter, instgram, vimeo,pinterest, youtube
145
+ channel and contact details such as emails, phone numbers.
170
146
  email: bkotu6717@gmail.com
171
147
  executables: []
172
148
  extensions: []
173
149
  extra_rdoc_files: []
174
150
  files:
175
151
  - Gemfile
152
+ - Gemfile.lock
176
153
  - README.md
177
- - brilliant_web_scraper-1.0.0.gem
178
- - brilliant_web_scraper-1.0.gem
179
154
  - brilliant_web_scraper.gemspec
180
155
  - lib/brilliant_web_scraper.rb
181
156
  - lib/parsers/description_helper.rb
@@ -246,5 +221,5 @@ rubyforge_project:
246
221
  rubygems_version: 2.5.1
247
222
  signing_key:
248
223
  specification_version: 4
249
- summary: A decent web scraping ruby library!
224
+ summary: A decent web scraping ruby gem!
250
225
  test_files: []