brilliant_web_scraper 0.1 → 0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: efbe9d1a0688fd10e200d972b56c3e2ec86203f1
4
- data.tar.gz: 20cce1c52197f11dcea73813831bb4172829ddaa
3
+ metadata.gz: 9ad5219a19dcfc311bed756a83d82fd3758bd71a
4
+ data.tar.gz: c085eb2a96b8eb503cd44edc87821823cf0ad965
5
5
  SHA512:
6
- metadata.gz: 638c34f7efbc963613f4bb841abbf183bf134ee3197bebc99f9403ba7864befd44243f53092a9aa3ba7ea58314475b61d6671816e8e3f8ef4deb7f49b6f0ef52
7
- data.tar.gz: f91110f69e8228de408aa0c35050fe6137fac22bdb93ff86be3c70d380e1cf57534f50f78e5d585cb63e421ff0fc51aa04089d38e8a86ab3a6ca305659dc909a
6
+ metadata.gz: 4a5c3c9dd78f3e123b0c279a04c2d0d262c58aef8d0086668c37aea8e52b6ec8da98d6d242debddf600922dcfed3d377bd159e62fc9bb1e7390b1e62e881eb2b
7
+ data.tar.gz: de051e60d7b90bde7984871d1b39e52f46be6366910591f0ec31c434a41037a9cc1274989dd50755cbf6e03d0e9f498a3271aad173b12a8ea3fd6a578eb8fbc9
@@ -0,0 +1,90 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ brilliant_web_scraper (0.2)
5
+ charlock_holmes (~> 0.7.6)
6
+ nesty (~> 1.0, >= 1.0.1)
7
+ rest-client (~> 2.0, >= 2.0.2)
8
+
9
+ GEM
10
+ remote: http://rubygems.org/
11
+ specs:
12
+ addressable (2.6.0)
13
+ public_suffix (>= 2.0.2, < 4.0)
14
+ ast (2.4.0)
15
+ charlock_holmes (0.7.6)
16
+ coderay (1.1.2)
17
+ crack (0.4.3)
18
+ safe_yaml (~> 1.0.0)
19
+ diff-lcs (1.3)
20
+ domain_name (0.5.20190701)
21
+ unf (>= 0.0.5, < 1.0.0)
22
+ hashdiff (1.0.0)
23
+ http-accept (1.7.0)
24
+ http-cookie (1.0.3)
25
+ domain_name (~> 0.5)
26
+ jaro_winkler (1.5.3)
27
+ method_source (0.9.2)
28
+ mime-types (3.2.2)
29
+ mime-types-data (~> 3.2015)
30
+ mime-types-data (3.2019.0331)
31
+ nesty (1.0.2)
32
+ netrc (0.11.0)
33
+ parallel (1.17.0)
34
+ parser (2.6.3.0)
35
+ ast (~> 2.4.0)
36
+ pry (0.12.2)
37
+ coderay (~> 1.1.0)
38
+ method_source (~> 0.9.0)
39
+ public_suffix (3.1.1)
40
+ rainbow (3.0.0)
41
+ rest-client (2.1.0)
42
+ http-accept (>= 1.7.0, < 2.0)
43
+ http-cookie (>= 1.0.2, < 2.0)
44
+ mime-types (>= 1.16, < 4.0)
45
+ netrc (~> 0.8)
46
+ rspec (3.8.0)
47
+ rspec-core (~> 3.8.0)
48
+ rspec-expectations (~> 3.8.0)
49
+ rspec-mocks (~> 3.8.0)
50
+ rspec-core (3.8.2)
51
+ rspec-support (~> 3.8.0)
52
+ rspec-expectations (3.8.4)
53
+ diff-lcs (>= 1.2.0, < 2.0)
54
+ rspec-support (~> 3.8.0)
55
+ rspec-mocks (3.8.1)
56
+ diff-lcs (>= 1.2.0, < 2.0)
57
+ rspec-support (~> 3.8.0)
58
+ rspec-support (3.8.2)
59
+ rubocop (0.73.0)
60
+ jaro_winkler (~> 1.5.1)
61
+ parallel (~> 1.10)
62
+ parser (>= 2.6)
63
+ rainbow (>= 2.2.2, < 4.0)
64
+ ruby-progressbar (~> 1.7)
65
+ unicode-display_width (>= 1.4.0, < 1.7)
66
+ ruby-progressbar (1.10.1)
67
+ safe_yaml (1.0.5)
68
+ unf (0.1.4)
69
+ unf_ext
70
+ unf_ext (0.0.7.6)
71
+ unicode-display_width (1.6.0)
72
+ vcr (3.0.3)
73
+ webmock (2.3.2)
74
+ addressable (>= 2.3.6)
75
+ crack (>= 0.3.2)
76
+ hashdiff
77
+
78
+ PLATFORMS
79
+ ruby
80
+
81
+ DEPENDENCIES
82
+ brilliant_web_scraper!
83
+ pry (~> 0.12.2)
84
+ rspec (~> 3.5)
85
+ rubocop (~> 0.73.0)
86
+ vcr (~> 3.0, >= 3.0.1)
87
+ webmock (~> 2.1)
88
+
89
+ BUNDLED WITH
90
+ 1.16.6
data/README.md CHANGED
@@ -1,14 +1,11 @@
1
- # WebScraper [![Build Status](https://api.travis-ci.com/bkotu6717/brilliant_web_scraper.svg)](https://travis-ci.com/bkotu6717/brilliant_web_scraper)
1
+ # BrilliantWebScraper [![Build Status](https://api.travis-ci.com/bkotu6717/brilliant_web_scraper.svg)](https://travis-ci.com/bkotu6717/brilliant_web_scraper)[![Maintainability](https://api.codeclimate.com/v1/badges/15a8a6e117f11bd94376/maintainability)](https://codeclimate.com/github/bkotu6717/brilliant_web_scraper/maintainability)
2
2
 
3
- A decent web scraping gem. Scrapes website description, social profiles, contact details, youtube channels.
4
-
5
-
6
- It accepts a URL or Domain as input and gets it's title, descrptios, social profiles, YouTube channels and it's current URL if got redirected.
3
+ A decent web scraping gem. Scrapes website title, description, social profiles such as linkedin, facebook, twitter, instagram, vimeo, pinterest, youtube channel and contact details such as emails, phone numbers.
7
4
 
8
5
 
9
6
  ## See it in action!
10
7
 
11
- You can try WebScraper live at this little demo: [https://brilliantweb-scraper-demo.herokuapp.com](https://brilliant-web-scraper-demo.herokuapp.com)
8
+ You can try BrilliantWebScraper live at this little demo: [https://brilliant-web-scraper-demo.herokuapp.com](https://brilliant-web-scraper-demo.herokuapp.com)
12
9
 
13
10
  ## Installation
14
11
 
@@ -21,11 +18,11 @@ gem 'brilliant_web_scraper'
21
18
 
22
19
  ## Usage
23
20
 
24
- Initialize a BrilliantWebScraper instance for an URL, like this:
21
+ Initialize a BrilliantWebScraper instance for a URL, optionally passing timeouts (connection_timeout and read_timeout both default to 10 seconds), like this:
25
22
 
26
23
  ```ruby
27
24
  require 'brilliant_web_scraper'
25
+ results = BrilliantWebScraper.new('http://pwc.com', 5, 5)
26
+
28
27
  results = BrilliantWebScraper.new('http://pwc.com')
29
28
  ```
30
-
31
- If you don't include the scheme on the URL, it is fine:
@@ -6,23 +6,28 @@ Gem::Specification.new do |s|
6
6
  s.name = 'brilliant_web_scraper'
7
7
  s.version = WebScraper::VERSION
8
8
  s.licenses = ['Nonstandard']
9
- s.summary = 'A decent web scraping ruby library!'
10
- s.description = 'Scrapes data such as description, social profiles, contact details'
9
+ s.summary = 'A decent web scraping ruby gem!'
10
+ s.description = 'A decent web scraping gem.'\
11
+ 'Scrapes website\'s title, description,'\
12
+ 'social profiles such as linkedin, '\
13
+ 'facebook, twitter, instgram, vimeo,'\
14
+ 'pinterest, youtube channel and'\
15
+ ' contact details such as emails, phone numbers.'
11
16
  s.authors = ['Kotu Bhaskara Rao']
12
17
  s.email = 'bkotu6717@gmail.com'
13
18
  s.require_paths = ['lib']
14
19
  s.homepage = 'https://github.com/bkotu6717/brilliant_web_scraper'
15
20
  s.files = Dir['**/*'].keep_if { |file|
16
- file != "brilliant_web_scraper-#{WebScraper::VERSION}.gem" && File.file?(file)
21
+ file != "brilliant_web_scraper-#{WebScraper::VERSION}.gem" &&
22
+ File.file?(file)
17
23
  }
18
24
  s.required_ruby_version = '>= 2.3.0'
19
25
 
20
- s.add_dependency 'nesty', '~> 1.0', '>= 1.0.1'
21
- s.add_dependency 'rest-client', '~> 2.0', '>= 2.0.2'
26
+ s.add_runtime_dependency 'charlock_holmes', '~> 0.7.6'
27
+ s.add_runtime_dependency 'nesty', '~> 1.0', '>= 1.0.1'
28
+ s.add_runtime_dependency 'rest-client', '~> 2.0', '>= 2.0.2'
22
29
 
23
- s.add_development_dependency 'nesty', '~> 1.0', '>= 1.0.1'
24
30
  s.add_development_dependency 'pry', '~> 0.12.2'
25
- s.add_development_dependency 'rest-client', '~> 2.0', '>= 2.0.2'
26
31
  s.add_development_dependency 'rspec', '~> 3.5'
27
32
  s.add_development_dependency 'rubocop', '~> 0.73.0'
28
33
  s.add_development_dependency 'vcr', '~> 3.0', '>= 3.0.1'
@@ -2,7 +2,8 @@
2
2
 
3
3
  require 'rest-client'
4
4
  require 'cgi'
5
- require 'benchmark'
5
+ require 'charlock_holmes/string'
6
+ require 'timeout'
6
7
 
7
8
  current_directory = File.dirname(__FILE__) + '/scraper'
8
9
  require File.expand_path(File.join(current_directory, 'errors'))
@@ -21,7 +21,7 @@ module DescriptionHelper
21
21
  def parse_description(descriptions)
22
22
  return if descriptions.nil? || descriptions.empty?
23
23
 
24
- descriptions = descriptions.reject { |x| x.nil? || x.empty? }
24
+ descriptions = descriptions.reject { |x| x.nil? || x.empty? || x =~ /^\s*$/}
25
25
  descriptions = descriptions.map { |x| unescape_html(x) }
26
26
  descriptions.find { |x| (x !~ /^\s*[|-]?\s*$/) }
27
27
  end
@@ -10,7 +10,7 @@ module Emails
10
10
  return if response.nil? || response.empty?
11
11
 
12
12
  first_regex = /(?im)mailto:\s*([^\?"',\\<>\s]+)/
13
- second_regex = %r{(?im)["'\s><\/]*([\w._%-]+@(?!(?:example|e?mail|domain|company|your(?:domain|company|email)|address|emailad?dress|yyy|test)\.)[\w._%-]+\.(?!png|jpe?g|tif|svg)[A-Z]{2,3})["'\s><]}
13
+ second_regex = %r{(?im)["'\s><\/]*([\w._%-]+@(?!(?:example|e?mail|domain|company|your(?:domain|company|email)|address|emailad?dress|yyy|test)\.)[\w._%-]+\.(?!png|jpe?g|tif|svg|css|js|ico|gif)[A-Z]{2,3})["'\s><]}
14
14
  first_set = response.scan(first_regex).flatten.compact
15
15
  first_set = get_processed_emails(first_set)
16
16
  second_set = response.scan(second_regex).flatten.compact
@@ -24,7 +24,7 @@ module Emails
24
24
  unescaped_emails = email_set.map { |email| unescape_html(email) }
25
25
  return [] if unescaped_emails.empty?
26
26
 
27
- email_match_regex = /[\w._%-]+@(?!(?:example|e?mail|domain|company|your(?:domain|company|email)|address|emailad?dress|yyy|test)\.)[\w._%-]+\.(?!png|jpe?g|tif|svg)[A-Z]{2,3}/im
27
+ email_match_regex = /[\w._%-]+@(?!(?:example|e?mail|domain|company|your(?:domain|company|email)|address|emailad?dress|yyy|test)\.)[\w._%-]+\.(?!png|jpe?g|tif|svg|css|js|ico|gif)[A-Z]{2,3}/im
28
28
  unescaped_emails.select { |data| data =~ email_match_regex }
29
29
  end
30
30
  end
@@ -5,7 +5,7 @@ module FacebookProfile
5
5
  def grep_facebook_profile(response)
6
6
  return if response.nil? || response.empty?
7
7
 
8
- facebook_url_regex = /(https?:\/\/(?:www\.)?(?:facebook|fb)\.com\/(?!tr\?|(?:[\/\w\d]*(?:photo|sharer?|like(?:box)?|offsite_event|plugins|permalink|home|search))\.php|\d+\/fbml|(?:dialog|hashtag|plugins|sharer|login|recover|security|help|v\d+\.\d+)\/|(?:privacy|#|your-profile|yourfacebookpage)\/?|home\?)[^"'<>\&\s]+)/im
8
+ facebook_url_regex = /(https?:\/\/(?:www\.)?(?:facebook|fb)\.com\/(?!tr\?|(?:[\/\w\d]*(?:photo|sharer?|like(?:box)?|offsite_event|plugins|permalink|home|search))\.php|\d+\/fbml|(?:dialog|hashtag|plugins|sharer|login|recover|security|help|images|v\d+\.\d+)\/|(?:privacy|#|your-profile|yourfacebookpage)\/?|home\?)[^"'<>\&\s]+)/im
9
9
  response.scan(facebook_url_regex).flatten.compact.uniq
10
10
  end
11
11
  end
@@ -2,6 +2,7 @@
2
2
 
3
3
  # Fetch latest url of the given website
4
4
  module RedirectedTo
5
+ include UnescapeHtmlHelper
5
6
 
6
7
  def grep_redirected_to_url(response)
7
8
  return if response.nil? || response.empty?
@@ -18,7 +19,7 @@ module RedirectedTo
18
19
  url = parser(web_urls)
19
20
  break unless url.nil?
20
21
  end
21
- url
22
+ unescape_html(url)
22
23
  end
23
24
 
24
25
  private
@@ -5,7 +5,7 @@ module TwitterProfile
5
5
  def grep_twitter_profile(response)
6
6
  return if response.nil? || response.empty?
7
7
 
8
- twitter_regex = %r{(?im)(https?:\/\/(?:www\.)?twitter\.com\/(?!(?:share|download|search|home|login|privacy)(?:\?|\/|\b)|(?:hashtag|i|javascripts|statuses|#!|intent)\/|(?:#|'|%))[^"'&\?<>\s\\]+)}
8
+ twitter_regex = %r{(?im)(https?:\/\/(?:www\.)?twitter\.com\/(?!\{\{)(?!(?:share|download|search|home|login|privacy)(?:\?|\/|\b)|(?:hashtag|i|javascripts|statuses|#!|intent)\/|(?:#|'|%))[^"'&\?<>\s\\]+)}
9
9
  response.scan(twitter_regex).flatten.compact.uniq
10
10
  end
11
11
  end
@@ -5,7 +5,7 @@ module VimeoProfile
5
5
  def grep_vimeo_profile(response)
6
6
  return if response.nil? || response.empty?
7
7
 
8
- vimeo_regex = %r{(?im)(https?:\/\/(?:www\.)?vimeo\.com\/(?!upgrade|features|enterprise|upload|api)\/?[^"'\&\?<>\s]+)}
8
+ vimeo_regex = %r{(?im)(https?:\/\/(?:www\.)?vimeo\.com\/(?!upgrade|features|enterprise|upload|api)\/?[^"'\\\&\?<>\s]+)}
9
9
  response.scan(vimeo_regex).flatten.compact.uniq
10
10
  end
11
11
  end
@@ -6,38 +6,38 @@
6
6
  # @Social Profiles
7
7
  # @Contact Details
8
8
  module ScrapeHelper
9
- def perform_scrape(url, read_timeout, connection_timeout)
10
- response = nil
11
- request_duration = Benchmark.measure do
12
- response = ScrapeRequest.new(url, read_timeout, connection_timeout)
13
- end.real
14
- retry_count = 0
15
- begin
16
- scrape_data = nil
17
- scrape_duration = Benchmark.measure do
18
- scrape_data = grep_data(response.body)
19
- end.real
20
-
21
- data_hash = {
22
- web_request_duration: request_duration,
23
- response_scrape_duraton: scrape_duration,
24
- scrape_data: scrape_data
25
- }
26
- rescue ArgumentError => e
27
- retry_count += 1
28
- raise WebScraper::ParserError, e.message if retry_count > 1
29
-
30
- response = response.encode('UTF-16be', invalid: :replace, replace: '?')
31
- response = response.encode('UTF-8')
32
- retry
33
- rescue Encoding::CompatibilityError => e
34
- raise WebScraper::ParserError, e.message
9
+ def perform_scrape(url, read_timeout, open_timeout)
10
+ timeout_in_sec = scraper_timeout(read_timeout, open_timeout)
11
+ Timeout::timeout(timeout_in_sec) do
12
+ response = ScrapeRequest.new(url, read_timeout, open_timeout)
13
+ retry_count = 0
14
+ body = response.body
15
+ begin
16
+ body = body.tr("\000", '')
17
+ encoding = body.detect_encoding[:encoding]
18
+ body = body.encode('UTF-8', encoding)
19
+ grep_data(body)
20
+ rescue Encoding::UndefinedConversionError, ArgumentError => e
21
+ retry_count += 1
22
+ raise WebScraper::ParserError, e.message if retry_count > 1
23
+ body = body.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
24
+ retry
25
+ rescue Encoding::CompatibilityError => e
26
+ raise WebScraper::ParserError, e.message
27
+ rescue StandardError => e
28
+ raise WebScraper::RequestError, e.message
29
+ end
35
30
  end
36
- data_hash
31
+ rescue Timeout::Error => e
32
+ raise WebScraper::TimeoutError, e.message
37
33
  end
38
34
 
39
35
  private
40
36
 
37
+ def scraper_timeout(read_timeout, open_timeout)
38
+ ( read_timeout + open_timeout + 1 )
39
+ end
40
+
41
41
  def grep_data(response)
42
42
  {
43
43
  title: grep_title(response),
@@ -1,24 +1,28 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- # @Makes actual scrape request, either raises exception or response
3
+ # @Makes actual scrape request, either raises exception or serves response
4
4
  module ScrapeRequest
5
5
  extend ScrapeExceptions
6
6
  class << self
7
7
  def new(url, read_timeout, connection_timeout)
8
+ params_hash = {
9
+ method: :get,
10
+ url: url,
11
+ read_timeout: read_timeout,
12
+ open_timeout: connection_timeout,
13
+ max_redirects: 10,
14
+ verify_ssl: false
15
+ }
8
16
  begin
9
- params_hash = {
10
- method: :get,
11
- url: url,
12
- read_timeout: read_timeout,
13
- connection_timeout: connection_timeout,
14
- headers: { 'accept-encoding': 'identity' }
15
- }
16
17
  response = RestClient::Request.execute(params_hash)
17
18
  content_type = response.headers[:content_type]
18
19
  return response if content_type =~ %r{(?i)text\s*\/\s*html}
19
20
 
20
21
  exception_message = "Invalid response format received: #{content_type}"
21
22
  raise WebScraper::NonHtmlError, exception_message
23
+ rescue Zlib::DataError
24
+ params_hash[:headers] = { 'accept-encoding': 'identity' }
25
+ retry
22
26
  rescue *TIMEOUT_EXCEPTIONS => e
23
27
  raise WebScraper::TimeoutError, e.message
24
28
  rescue *GENERAL_EXCEPTIONS => e
@@ -2,5 +2,5 @@
2
2
 
3
3
  # Holds current version number
4
4
  module WebScraper
5
- VERSION = '0.1'
5
+ VERSION = '0.2'
6
6
  end
@@ -27,6 +27,10 @@ describe 'Emails' do
27
27
  <a href="mailto:xxx@yyy.zzz">xxx@yyy.zzz</a>
28
28
  <a href="mailto:test@test.com">test@test.com</a>
29
29
  <a href="mailto:@example.com">@example.com"</a>
30
+ <a href="mailto:v@201908240100.css">v@201908240100.css"</a>
31
+ <a href="mailto:v@201908240100.js">v@201908240100.js"</a>
32
+ <a href="mailto:ajax-loader@2x.gif">ajax-loader@2x.gif"</a>
33
+ <a href="mailto:favicon@2x.ico">favicon@2x.ico"</a>
30
34
  HTML
31
35
  expect(dummy_object.grep_emails(html.to_s)).to eq([])
32
36
  end
@@ -14,6 +14,7 @@ describe 'FaceBook Profile' do
14
14
 
15
15
  it 'should not grep any non profile url' do
16
16
  html = <<~HTML
17
+ <a href="https://www.facebook.com/images/fb_icon_325x325.png" target="_blank" class="sqs-svg-icon--wrapper facebook">
17
18
  <a href="http://www.facebook.com/2008/fbml" target="_blank" class="sqs-svg-icon--wrapper facebook">
18
19
  <a href="https://www.facebook.com/v2.0/dialog/share" target="_blank" class="sqs-svg-icon--wrapper facebook">
19
20
  <a href="https://www.facebook.com/plugins/video.php?href=https%3A%2F%2Fwww.facebook.com%2FHFXMooseheads%2Fvideos" target="_blank" class="sqs-svg-icon--wrapper facebook">
@@ -3,205 +3,223 @@ require 'spec_helper'
3
3
  describe 'Website Redirected To' do
4
4
 
5
5
  class DummyTestClass
6
- include RedirectedTo
7
- end
6
+ include RedirectedTo
7
+ end
8
8
  let(:dummy_object) { DummyTestClass.new }
9
9
 
10
10
 
11
11
  it 'should return nil for invalid input' do
12
- expect(dummy_object.grep_redirected_to_url(nil)).to be_nil
13
- expect(dummy_object.grep_redirected_to_url('')).to be_nil
14
- end
12
+ expect(dummy_object.grep_redirected_to_url(nil)).to be_nil
13
+ expect(dummy_object.grep_redirected_to_url('')).to be_nil
14
+ end
15
15
 
16
16
  describe 'Website grep from link tag' do
17
- describe 'rel attribute first ' do
18
-
19
- it 'should return nil when canonical url is empty' do
20
- html = <<~HTML
21
- <link rel="canonical" href="">
22
- <link rel="canonical" href=''>
23
- HTML
24
- website = dummy_object.grep_redirected_to_url(html.to_s)
25
- expect(website).to be_nil
26
- end
27
-
28
- it 'should grep website' do
29
- html = <<~HTML
30
- <link rel="canonical" href="">
31
- <link rel="canonical" href='https://www.apple.com/'>
32
- HTML
33
- website = dummy_object.grep_redirected_to_url(html.to_s)
34
- expect(website).to eq('https://www.apple.com/')
35
- end
36
-
37
- it 'should grep website even with extra attributes' do
38
- html = <<~HTML
39
- <link rel="canonical" href="" itemprop="current_url">
40
- <link rel="canonical" href='https://www.apple.com/'
41
- itemprop="current_url" >
42
- HTML
43
- website = dummy_object.grep_redirected_to_url(html.to_s)
44
- expect(website).to eq('https://www.apple.com/')
45
- end
46
- end
47
- describe 'href attribute first' do
48
- it 'should return nil when canonical url is empty' do
49
- html = <<~HTML
50
- <link href="" rel="canonical" >
51
- <link href='' rel="canonical" >
52
- HTML
53
- website = dummy_object.grep_redirected_to_url(html.to_s)
54
- expect(website).to be_nil
55
- end
56
-
57
- it 'should grep website' do
58
- html = <<~HTML
59
- <link rel="canonical" href="">
60
- <link href='https://www.apple.com/' rel="canonical">
61
- HTML
62
- website = dummy_object.grep_redirected_to_url(html.to_s)
63
- expect(website).to eq('https://www.apple.com/')
64
- end
65
-
66
- it 'should grep website even with extra attributes' do
67
- html = <<~HTML
68
- <link href="" itemprop="current_url" rel="canonical">
69
- <link href='https://www.apple.com/' rel="canonical"
70
- itemprop="current_url" >
71
- HTML
72
- website = dummy_object.grep_redirected_to_url(html.to_s)
73
- expect(website).to eq('https://www.apple.com/')
74
- end
75
- end
17
+ describe 'rel attribute first ' do
18
+
19
+ it 'should return nil when canonical url is empty' do
20
+ html = <<~HTML
21
+ <link rel="canonical" href="">
22
+ <link rel="canonical" href=''>
23
+ HTML
24
+ website = dummy_object.grep_redirected_to_url(html.to_s)
25
+ expect(website).to be_nil
26
+ end
27
+
28
+ it 'should grep website' do
29
+ html = <<~HTML
30
+ <link rel="canonical" href="">
31
+ <link rel="canonical" href='https://www.apple.com/'>
32
+ HTML
33
+ website = dummy_object.grep_redirected_to_url(html.to_s)
34
+ expect(website).to eq('https://www.apple.com/')
35
+ end
36
+
37
+ it 'should grep website even with extra attributes' do
38
+ html = <<~HTML
39
+ <link rel="canonical" href="" itemprop="current_url">
40
+ <link rel="canonical" href='https://www.apple.com/'
41
+ itemprop="current_url" >
42
+ HTML
43
+ website = dummy_object.grep_redirected_to_url(html.to_s)
44
+ expect(website).to eq('https://www.apple.com/')
45
+ end
46
+ end
47
+
48
+ describe 'href attribute first' do
49
+
50
+ it 'should return nil when canonical url is empty' do
51
+ html = <<~HTML
52
+ <link href="" rel="canonical" >
53
+ <link href='' rel="canonical" >
54
+ HTML
55
+ website = dummy_object.grep_redirected_to_url(html.to_s)
56
+ expect(website).to be_nil
57
+ end
58
+
59
+ it 'should grep website' do
60
+ html = <<~HTML
61
+ <link rel="canonical" href="">
62
+ <link href='https://www.apple.com/' rel="canonical">
63
+ HTML
64
+ website = dummy_object.grep_redirected_to_url(html.to_s)
65
+ expect(website).to eq('https://www.apple.com/')
66
+ end
67
+
68
+ it 'should grep website even with extra attributes' do
69
+ html = <<~HTML
70
+ <link href="" itemprop="current_url" rel="canonical">
71
+ <link href='https://www.apple.com/' rel="canonical"
72
+ itemprop="current_url" >
73
+ HTML
74
+ website = dummy_object.grep_redirected_to_url(html.to_s)
75
+ expect(website).to eq('https://www.apple.com/')
76
+ end
77
+ end
76
78
  end
79
+
77
80
  describe 'Website grep from organization URL' do
78
- describe 'property attribute first ' do
79
- it 'should return nil when canonical url is empty' do
80
- html = <<~HTML
81
- <meta property="og:url" content="" />
82
- <meta property="og:url" content='' />
83
- HTML
84
- website = dummy_object.grep_redirected_to_url(html.to_s)
85
- expect(website).to be_nil
86
- end
87
-
88
- it 'should grep website' do
89
- html = <<~HTML
90
- <link property="og:url" content="">
91
- <meta property="og:url" content="https://www.dieppe.ca/fr/index.aspx" />
92
- HTML
93
- website = dummy_object.grep_redirected_to_url(html.to_s)
94
- expect(website).to eq('https://www.dieppe.ca/fr/index.aspx')
95
- end
96
-
97
- it 'should grep website even with extra attributes' do
98
- html = <<~HTML
99
- <link property="og:url" content="" calss="og-url">
100
- <meta property="og:url" content='https://www.dieppe.ca/fr/index.aspx'
101
- class="og-url" />
102
- HTML
103
- website = dummy_object.grep_redirected_to_url(html.to_s)
104
- expect(website).to eq('https://www.dieppe.ca/fr/index.aspx')
105
- end
106
- end
107
- describe 'content attribute first ' do
108
- it 'should return nil when canonical url is empty' do
109
- html = <<~HTML
110
- <meta content="" property="og:url" />
111
- <meta content='' property="og:url"/>
112
- HTML
113
- website = dummy_object.grep_redirected_to_url(html.to_s)
114
- expect(website).to be_nil
115
- end
116
-
117
- it 'should grep website' do
118
- html = <<~HTML
119
- <link content="" property="og:url" >
120
- <meta content="https://www.dieppe.ca/fr/index.aspx" property="og:url" />
121
- HTML
122
- website = dummy_object.grep_redirected_to_url(html.to_s)
123
- expect(website).to eq('https://www.dieppe.ca/fr/index.aspx')
124
- end
125
-
126
- it 'should grep website even with extra attributes' do
127
- html = <<~HTML
128
- <link content="" calss="og-url" property="og:url">
129
- <meta content='https://www.dieppe.ca/fr/index.aspx'
130
- class="og-url" property="og:url" />
131
- HTML
132
- website = dummy_object.grep_redirected_to_url(html.to_s)
133
- expect(website).to eq('https://www.dieppe.ca/fr/index.aspx')
134
- end
135
- end
81
+
82
+ describe 'property attribute first ' do
83
+
84
+ it 'should return nil when canonical url is empty' do
85
+ html = <<~HTML
86
+ <meta property="og:url" content="" />
87
+ <meta property="og:url" content='' />
88
+ HTML
89
+ website = dummy_object.grep_redirected_to_url(html.to_s)
90
+ expect(website).to be_nil
91
+ end
92
+
93
+ it 'should grep website' do
94
+ html = <<~HTML
95
+ <link property="og:url" content="">
96
+ <meta property="og:url" content="https://www.dieppe.ca/fr/index.aspx" />
97
+ HTML
98
+ website = dummy_object.grep_redirected_to_url(html.to_s)
99
+ expect(website).to eq('https://www.dieppe.ca/fr/index.aspx')
100
+ end
101
+
102
+ it 'should grep website even with extra attributes' do
103
+ html = <<~HTML
104
+ <link property="og:url" content="" calss="og-url">
105
+ <meta property="og:url" content='https://www.dieppe.ca/fr/index.aspx'
106
+ class="og-url" />
107
+ HTML
108
+ website = dummy_object.grep_redirected_to_url(html.to_s)
109
+ expect(website).to eq('https://www.dieppe.ca/fr/index.aspx')
110
+ end
111
+ end
112
+
113
+ describe 'content attribute first ' do
114
+
115
+ it 'should return nil when canonical url is empty' do
116
+ html = <<~HTML
117
+ <meta content="" property="og:url" />
118
+ <meta content='' property="og:url"/>
119
+ HTML
120
+ website = dummy_object.grep_redirected_to_url(html.to_s)
121
+ expect(website).to be_nil
122
+ end
123
+
124
+ it 'should grep website' do
125
+ html = <<~HTML
126
+ <link content="" property="og:url" >
127
+ <meta content="https://www.dieppe.ca/fr/index.aspx" property="og:url" />
128
+ HTML
129
+ website = dummy_object.grep_redirected_to_url(html.to_s)
130
+ expect(website).to eq('https://www.dieppe.ca/fr/index.aspx')
131
+ end
132
+
133
+ it 'should grep website even with extra attributes' do
134
+ html = <<~HTML
135
+ <link content="" calss="og-url" property="og:url">
136
+ <meta content='https://www.dieppe.ca/fr/index.aspx'
137
+ class="og-url" property="og:url" />
138
+ HTML
139
+ website = dummy_object.grep_redirected_to_url(html.to_s)
140
+ expect(website).to eq('https://www.dieppe.ca/fr/index.aspx')
141
+ end
142
+ end
136
143
  end
137
144
  describe 'grep website' do
138
- it 'it should return nil when link or og:url is absent' do
139
- html = <<~HTML
140
- <head>
141
- <meta charset="utf-8">
142
- <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
143
- <meta http-equiv="x-ua-compatible" content="ie=edge">
144
- <title>Techmologic | index</title>
145
- <!-- Font Awesome -->
146
- <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.6.0/css/font-awesome.min.css">
147
- <!-- Bootstrap core CSS -->
148
- <link href="css/bootstrap.min.css" rel="stylesheet">
149
- <!-- Material Design Bootstrap -->
150
- <link href="css/mdb.min.css" rel="stylesheet">
151
- <!-- Your custom styles (optional) -->
152
- <link href="css/style.css" rel="stylesheet">
153
- </head>
154
- HTML
155
- website = dummy_object.grep_redirected_to_url(html.to_s)
156
- expect(website).to be_nil
157
- end
158
- it 'should grep one of canonical or og:url' do
159
- html = <<~HTML
160
- <head>
161
- <meta charset="utf-8">
162
- <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
163
- <meta http-equiv="x-ua-compatible" content="ie=edge">
164
- <title>Techmologic | index</title>
165
- <link rel="canonical" href="">
166
- <meta property="og:url" content="" />
167
- <!-- Font Awesome -->
168
- <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.6.0/css/font-awesome.min.css">
169
- <!-- Bootstrap core CSS -->
170
- <link href="css/bootstrap.min.css" rel="stylesheet">
171
- <!-- Material Design Bootstrap -->
172
- <link href="css/mdb.min.css" rel="stylesheet">
173
- <!-- Your custom styles (optional) -->
174
- <link href="css/style.css" rel="stylesheet">
175
- <link rel="canonical" href="http://techmologics.com/">
176
- <meta property="og:url" content="http://techmologics.com/" />
177
- </head>
178
- HTML
179
- website = dummy_object.grep_redirected_to_url(html.to_s)
180
- expect(website).to eq('http://techmologics.com/')
181
- end
182
- it 'should grep one of canonical or og:url whatever it\'s position' do
183
- html = <<~HTML
184
- <head>
185
- <meta charset="utf-8">
186
- <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
187
- <meta http-equiv="x-ua-compatible" content="ie=edge">
188
- <title>Techmologic | index</title>
189
- <link href="" rel="canonical">
190
- <meta content="" property="og:url"/>
191
- <!-- Font Awesome -->
192
- <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.6.0/css/font-awesome.min.css">
193
- <!-- Bootstrap core CSS -->
194
- <link href="css/bootstrap.min.css" rel="stylesheet">
195
- <!-- Material Design Bootstrap -->
196
- <link href="css/mdb.min.css" rel="stylesheet">
197
- <!-- Your custom styles (optional) -->
198
- <link href="css/style.css" rel="stylesheet">
199
- <link href="http://techmologics.com/" rel="canonical" class="canonical">
200
- <meta content="http://techmologics.com/" property="og:url"/>
201
- </head>
202
- HTML
203
- website = dummy_object.grep_redirected_to_url(html.to_s)
204
- expect(website).to eq('http://techmologics.com/')
205
- end
145
+
146
+ it 'it should return nil when link or og:url is absent' do
147
+ html = <<~HTML
148
+ <head>
149
+ <meta charset="utf-8">
150
+ <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
151
+ <meta http-equiv="x-ua-compatible" content="ie=edge">
152
+ <title>Techmologic | index</title>
153
+ <!-- Font Awesome -->
154
+ <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.6.0/css/font-awesome.min.css">
155
+ <!-- Bootstrap core CSS -->
156
+ <link href="css/bootstrap.min.css" rel="stylesheet">
157
+ <!-- Material Design Bootstrap -->
158
+ <link href="css/mdb.min.css" rel="stylesheet">
159
+ <!-- Your custom styles (optional) -->
160
+ <link href="css/style.css" rel="stylesheet">
161
+ </head>
162
+ HTML
163
+ website = dummy_object.grep_redirected_to_url(html.to_s)
164
+ expect(website).to be_nil
165
+ end
166
+
167
+ it 'should grep one of canonical or og:url' do
168
+ html = <<~HTML
169
+ <head>
170
+ <meta charset="utf-8">
171
+ <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
172
+ <meta http-equiv="x-ua-compatible" content="ie=edge">
173
+ <title>Techmologic | index</title>
174
+ <link rel="canonical" href="">
175
+ <meta property="og:url" content="" />
176
+ <!-- Font Awesome -->
177
+ <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.6.0/css/font-awesome.min.css">
178
+ <!-- Bootstrap core CSS -->
179
+ <link href="css/bootstrap.min.css" rel="stylesheet">
180
+ <!-- Material Design Bootstrap -->
181
+ <link href="css/mdb.min.css" rel="stylesheet">
182
+ <!-- Your custom styles (optional) -->
183
+ <link href="css/style.css" rel="stylesheet">
184
+ <link rel="canonical" href="http://techmologics.com/">
185
+ <meta property="og:url" content="http://techmologics.com/" />
186
+ </head>
187
+ HTML
188
+ website = dummy_object.grep_redirected_to_url(html.to_s)
189
+ expect(website).to eq('http://techmologics.com/')
190
+ end
191
+
192
+ it 'should grep one of canonical or og:url whatever it\'s position' do
193
+ html = <<~HTML
194
+ <head>
195
+ <meta charset="utf-8">
196
+ <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
197
+ <meta http-equiv="x-ua-compatible" content="ie=edge">
198
+ <title>Techmologic | index</title>
199
+ <link href="" rel="canonical">
200
+ <meta content="" property="og:url"/>
201
+ <!-- Font Awesome -->
202
+ <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.6.0/css/font-awesome.min.css">
203
+ <!-- Bootstrap core CSS -->
204
+ <link href="css/bootstrap.min.css" rel="stylesheet">
205
+ <!-- Material Design Bootstrap -->
206
+ <link href="css/mdb.min.css" rel="stylesheet">
207
+ <!-- Your custom styles (optional) -->
208
+ <link href="css/style.css" rel="stylesheet">
209
+ <link href="http://techmologics.com/" rel="canonical" class="canonical">
210
+ <meta content="http://techmologics.com/" property="og:url"/>
211
+ </head>
212
+ HTML
213
+ website = dummy_object.grep_redirected_to_url(html.to_s)
214
+ expect(website).to eq('http://techmologics.com/')
215
+ end
216
+
217
+ it 'should decode html entities in the redirected_to url' do
218
+ html = <<~HTML
219
+ <meta content="https&#x3a;&#x2f;&#x2f;www&#x2e;santanderbank&#x2e;com&#x2f;us&#x2f;personal" property="og:url"/>
220
+ HTML
221
+ website = dummy_object.grep_redirected_to_url(html.to_s)
222
+ expect(website).to eq('https://www.santanderbank.com/us/personal')
223
+ end
206
224
  end
207
- end
225
+ end
@@ -31,6 +31,7 @@ describe 'Twitter Profile' do
31
31
  <a href=" http://twitter.com/share/" target="_blank">
32
32
  <a href="https://twitter.com/#!/Farmer_Brothers" target="_blank">
33
33
  <a href="http://twitter.com/javascripts/blogger.js" target="_blank">
34
+ <a href="https://twitter.com/{{../user.screen_name}}/status/{{../id_str}}" target="_blank">
34
35
  HTML
35
36
  expect(dummy_object.grep_twitter_profile(html.to_s)).to eq([])
36
37
  end
@@ -30,13 +30,15 @@ describe 'Vimeo Profile' do
30
30
  <a href="https://vimeo.com/channels/332103" target="_blank">
31
31
  <a href="https://vimeo.com/talech" target="_blank">
32
32
  <a href="https://vimeo.com/292173295/fdb8634a35/" target="_blank">
33
+ <a href="https://vimeo.com/337614648\\" target="_blank">
33
34
  HTML
34
35
  vimeo_profiles = dummy_object.grep_vimeo_profile(html.to_s)
35
36
  expected_profiles = [
36
- 'https://vimeo.com/107578087',
37
- 'https://vimeo.com/channels/332103',
38
- 'https://vimeo.com/talech',
39
- 'https://vimeo.com/292173295/fdb8634a35/',
37
+ 'https://vimeo.com/107578087',
38
+ 'https://vimeo.com/channels/332103',
39
+ 'https://vimeo.com/talech',
40
+ 'https://vimeo.com/292173295/fdb8634a35/',
41
+ 'https://vimeo.com/337614648'
40
42
  ]
41
43
  expect(vimeo_profiles).to eq(expected_profiles)
42
44
  end
metadata CHANGED
@@ -1,75 +1,69 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: brilliant_web_scraper
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.1'
4
+ version: '0.2'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kotu Bhaskara Rao
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-08-11 00:00:00.000000000 Z
11
+ date: 2019-08-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: nesty
14
+ name: charlock_holmes
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '1.0'
20
- - - ">="
21
- - !ruby/object:Gem::Version
22
- version: 1.0.1
19
+ version: 0.7.6
23
20
  type: :runtime
24
21
  prerelease: false
25
22
  version_requirements: !ruby/object:Gem::Requirement
26
23
  requirements:
27
24
  - - "~>"
28
25
  - !ruby/object:Gem::Version
29
- version: '1.0'
30
- - - ">="
31
- - !ruby/object:Gem::Version
32
- version: 1.0.1
26
+ version: 0.7.6
33
27
  - !ruby/object:Gem::Dependency
34
- name: rest-client
28
+ name: nesty
35
29
  requirement: !ruby/object:Gem::Requirement
36
30
  requirements:
37
31
  - - "~>"
38
32
  - !ruby/object:Gem::Version
39
- version: '2.0'
33
+ version: '1.0'
40
34
  - - ">="
41
35
  - !ruby/object:Gem::Version
42
- version: 2.0.2
36
+ version: 1.0.1
43
37
  type: :runtime
44
38
  prerelease: false
45
39
  version_requirements: !ruby/object:Gem::Requirement
46
40
  requirements:
47
41
  - - "~>"
48
42
  - !ruby/object:Gem::Version
49
- version: '2.0'
43
+ version: '1.0'
50
44
  - - ">="
51
45
  - !ruby/object:Gem::Version
52
- version: 2.0.2
46
+ version: 1.0.1
53
47
  - !ruby/object:Gem::Dependency
54
- name: nesty
48
+ name: rest-client
55
49
  requirement: !ruby/object:Gem::Requirement
56
50
  requirements:
57
51
  - - "~>"
58
52
  - !ruby/object:Gem::Version
59
- version: '1.0'
53
+ version: '2.0'
60
54
  - - ">="
61
55
  - !ruby/object:Gem::Version
62
- version: 1.0.1
63
- type: :development
56
+ version: 2.0.2
57
+ type: :runtime
64
58
  prerelease: false
65
59
  version_requirements: !ruby/object:Gem::Requirement
66
60
  requirements:
67
61
  - - "~>"
68
62
  - !ruby/object:Gem::Version
69
- version: '1.0'
63
+ version: '2.0'
70
64
  - - ">="
71
65
  - !ruby/object:Gem::Version
72
- version: 1.0.1
66
+ version: 2.0.2
73
67
  - !ruby/object:Gem::Dependency
74
68
  name: pry
75
69
  requirement: !ruby/object:Gem::Requirement
@@ -84,26 +78,6 @@ dependencies:
84
78
  - - "~>"
85
79
  - !ruby/object:Gem::Version
86
80
  version: 0.12.2
87
- - !ruby/object:Gem::Dependency
88
- name: rest-client
89
- requirement: !ruby/object:Gem::Requirement
90
- requirements:
91
- - - "~>"
92
- - !ruby/object:Gem::Version
93
- version: '2.0'
94
- - - ">="
95
- - !ruby/object:Gem::Version
96
- version: 2.0.2
97
- type: :development
98
- prerelease: false
99
- version_requirements: !ruby/object:Gem::Requirement
100
- requirements:
101
- - - "~>"
102
- - !ruby/object:Gem::Version
103
- version: '2.0'
104
- - - ">="
105
- - !ruby/object:Gem::Version
106
- version: 2.0.2
107
81
  - !ruby/object:Gem::Dependency
108
82
  name: rspec
109
83
  requirement: !ruby/object:Gem::Requirement
@@ -166,16 +140,17 @@ dependencies:
166
140
  - - "~>"
167
141
  - !ruby/object:Gem::Version
168
142
  version: '2.1'
169
- description: Scrapes data such as description, social profiles, contact details
143
+ description: A decent web scraping gem. Scrapes a website's title, description, social
144
+ profiles such as linkedin, facebook, twitter, instagram, vimeo, pinterest, youtube
145
+ channel and contact details such as emails, phone numbers.
170
146
  email: bkotu6717@gmail.com
171
147
  executables: []
172
148
  extensions: []
173
149
  extra_rdoc_files: []
174
150
  files:
175
151
  - Gemfile
152
+ - Gemfile.lock
176
153
  - README.md
177
- - brilliant_web_scraper-1.0.0.gem
178
- - brilliant_web_scraper-1.0.gem
179
154
  - brilliant_web_scraper.gemspec
180
155
  - lib/brilliant_web_scraper.rb
181
156
  - lib/parsers/description_helper.rb
@@ -246,5 +221,5 @@ rubyforge_project:
246
221
  rubygems_version: 2.5.1
247
222
  signing_key:
248
223
  specification_version: 4
249
- summary: A decent web scraping ruby library!
224
+ summary: A decent web scraping ruby gem!
250
225
  test_files: []