brilliant_web_scraper 0.1 → 0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +90 -0
- data/README.md +6 -9
- data/brilliant_web_scraper.gemspec +12 -7
- data/lib/brilliant_web_scraper.rb +2 -1
- data/lib/parsers/description_helper.rb +1 -1
- data/lib/parsers/emails.rb +2 -2
- data/lib/parsers/facebook_profile.rb +1 -1
- data/lib/parsers/redirected_to.rb +2 -1
- data/lib/parsers/twitter_profile.rb +1 -1
- data/lib/parsers/vimeo_profile.rb +1 -1
- data/lib/scraper/scrape_helper.rb +27 -27
- data/lib/scraper/scrape_request.rb +12 -8
- data/lib/version.rb +1 -1
- data/spec/lib/parsers/emails_spec.rb +4 -0
- data/spec/lib/parsers/facebook_profile_spec.rb +1 -0
- data/spec/lib/parsers/redirected_to_spec.rb +209 -191
- data/spec/lib/parsers/twitter_profile_spec.rb +1 -0
- data/spec/lib/parsers/vimeo_profile_spec.rb +6 -4
- metadata +21 -46
- data/brilliant_web_scraper-1.0.0.gem +0 -0
- data/brilliant_web_scraper-1.0.gem +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9ad5219a19dcfc311bed756a83d82fd3758bd71a
|
4
|
+
data.tar.gz: c085eb2a96b8eb503cd44edc87821823cf0ad965
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4a5c3c9dd78f3e123b0c279a04c2d0d262c58aef8d0086668c37aea8e52b6ec8da98d6d242debddf600922dcfed3d377bd159e62fc9bb1e7390b1e62e881eb2b
|
7
|
+
data.tar.gz: de051e60d7b90bde7984871d1b39e52f46be6366910591f0ec31c434a41037a9cc1274989dd50755cbf6e03d0e9f498a3271aad173b12a8ea3fd6a578eb8fbc9
|
data/Gemfile.lock
ADDED
@@ -0,0 +1,90 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
brilliant_web_scraper (0.2)
|
5
|
+
charlock_holmes (~> 0.7.6)
|
6
|
+
nesty (~> 1.0, >= 1.0.1)
|
7
|
+
rest-client (~> 2.0, >= 2.0.2)
|
8
|
+
|
9
|
+
GEM
|
10
|
+
remote: http://rubygems.org/
|
11
|
+
specs:
|
12
|
+
addressable (2.6.0)
|
13
|
+
public_suffix (>= 2.0.2, < 4.0)
|
14
|
+
ast (2.4.0)
|
15
|
+
charlock_holmes (0.7.6)
|
16
|
+
coderay (1.1.2)
|
17
|
+
crack (0.4.3)
|
18
|
+
safe_yaml (~> 1.0.0)
|
19
|
+
diff-lcs (1.3)
|
20
|
+
domain_name (0.5.20190701)
|
21
|
+
unf (>= 0.0.5, < 1.0.0)
|
22
|
+
hashdiff (1.0.0)
|
23
|
+
http-accept (1.7.0)
|
24
|
+
http-cookie (1.0.3)
|
25
|
+
domain_name (~> 0.5)
|
26
|
+
jaro_winkler (1.5.3)
|
27
|
+
method_source (0.9.2)
|
28
|
+
mime-types (3.2.2)
|
29
|
+
mime-types-data (~> 3.2015)
|
30
|
+
mime-types-data (3.2019.0331)
|
31
|
+
nesty (1.0.2)
|
32
|
+
netrc (0.11.0)
|
33
|
+
parallel (1.17.0)
|
34
|
+
parser (2.6.3.0)
|
35
|
+
ast (~> 2.4.0)
|
36
|
+
pry (0.12.2)
|
37
|
+
coderay (~> 1.1.0)
|
38
|
+
method_source (~> 0.9.0)
|
39
|
+
public_suffix (3.1.1)
|
40
|
+
rainbow (3.0.0)
|
41
|
+
rest-client (2.1.0)
|
42
|
+
http-accept (>= 1.7.0, < 2.0)
|
43
|
+
http-cookie (>= 1.0.2, < 2.0)
|
44
|
+
mime-types (>= 1.16, < 4.0)
|
45
|
+
netrc (~> 0.8)
|
46
|
+
rspec (3.8.0)
|
47
|
+
rspec-core (~> 3.8.0)
|
48
|
+
rspec-expectations (~> 3.8.0)
|
49
|
+
rspec-mocks (~> 3.8.0)
|
50
|
+
rspec-core (3.8.2)
|
51
|
+
rspec-support (~> 3.8.0)
|
52
|
+
rspec-expectations (3.8.4)
|
53
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
54
|
+
rspec-support (~> 3.8.0)
|
55
|
+
rspec-mocks (3.8.1)
|
56
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
57
|
+
rspec-support (~> 3.8.0)
|
58
|
+
rspec-support (3.8.2)
|
59
|
+
rubocop (0.73.0)
|
60
|
+
jaro_winkler (~> 1.5.1)
|
61
|
+
parallel (~> 1.10)
|
62
|
+
parser (>= 2.6)
|
63
|
+
rainbow (>= 2.2.2, < 4.0)
|
64
|
+
ruby-progressbar (~> 1.7)
|
65
|
+
unicode-display_width (>= 1.4.0, < 1.7)
|
66
|
+
ruby-progressbar (1.10.1)
|
67
|
+
safe_yaml (1.0.5)
|
68
|
+
unf (0.1.4)
|
69
|
+
unf_ext
|
70
|
+
unf_ext (0.0.7.6)
|
71
|
+
unicode-display_width (1.6.0)
|
72
|
+
vcr (3.0.3)
|
73
|
+
webmock (2.3.2)
|
74
|
+
addressable (>= 2.3.6)
|
75
|
+
crack (>= 0.3.2)
|
76
|
+
hashdiff
|
77
|
+
|
78
|
+
PLATFORMS
|
79
|
+
ruby
|
80
|
+
|
81
|
+
DEPENDENCIES
|
82
|
+
brilliant_web_scraper!
|
83
|
+
pry (~> 0.12.2)
|
84
|
+
rspec (~> 3.5)
|
85
|
+
rubocop (~> 0.73.0)
|
86
|
+
vcr (~> 3.0, >= 3.0.1)
|
87
|
+
webmock (~> 2.1)
|
88
|
+
|
89
|
+
BUNDLED WITH
|
90
|
+
1.16.6
|
data/README.md
CHANGED
@@ -1,14 +1,11 @@
|
|
1
|
-
#
|
1
|
+
# BrilliantWebScraper [![Build Status](https://api.travis-ci.com/bkotu6717/brilliant_web_scraper.svg)](https://travis-ci.com/bkotu6717/brilliant_web_scraper)[![Maintainability](https://api.codeclimate.com/v1/badges/15a8a6e117f11bd94376/maintainability)](https://codeclimate.com/github/bkotu6717/brilliant_web_scraper/maintainability)
|
2
2
|
|
3
|
-
A decent web scraping gem. Scrapes website description, social profiles, contact details,
|
4
|
-
|
5
|
-
|
6
|
-
It accepts a URL or Domain as input and gets it's title, descrptios, social profiles, YouTube channels and it's current URL if got redirected.
|
3
|
+
A decent web scraping gem. It scrapes a website's title, description, social profiles (LinkedIn, Facebook, Twitter, Instagram, Vimeo, Pinterest, YouTube channel) and contact details such as emails and phone numbers.
|
7
4
|
|
8
5
|
|
9
6
|
## See it in action!
|
10
7
|
|
11
|
-
You can try
|
8
|
+
You can try BrilliantWebScraper live at this little demo: [https://brilliant-web-scraper-demo.herokuapp.com](https://brilliant-web-scraper-demo.herokuapp.com)
|
12
9
|
|
13
10
|
## Installation
|
14
11
|
|
@@ -21,11 +18,11 @@ gem 'brilliant_web_scraper'
|
|
21
18
|
|
22
19
|
## Usage
|
23
20
|
|
24
|
-
Initialize a BrilliantWebScraper instance for an URL, like this:
|
21
|
+
Initialize a BrilliantWebScraper instance for a URL as shown below. The connection and read timeouts are optional; both default to 10 seconds:
|
25
22
|
|
26
23
|
```ruby
|
27
24
|
require 'brilliant_web_scraper'
|
25
|
+
results = BrilliantWebScraper.new('http://pwc.com', 5, 5)
|
26
|
+
|
28
27
|
results = BrilliantWebScraper.new('http://pwc.com')
|
29
28
|
```
|
30
|
-
|
31
|
-
If you don't include the scheme on the URL, it is fine:
|
@@ -6,23 +6,28 @@ Gem::Specification.new do |s|
|
|
6
6
|
s.name = 'brilliant_web_scraper'
|
7
7
|
s.version = WebScraper::VERSION
|
8
8
|
s.licenses = ['Nonstandard']
|
9
|
-
s.summary = 'A decent web scraping ruby
|
10
|
-
s.description = '
|
9
|
+
s.summary = 'A decent web scraping ruby gem!'
|
10
|
+
s.description = 'A decent web scraping gem.'\
|
11
|
+
'Scrapes website\'s title, description,'\
|
12
|
+
'social profiles such as linkedin, '\
|
13
|
+
'facebook, twitter, instgram, vimeo,'\
|
14
|
+
'pinterest, youtube channel and'\
|
15
|
+
' contact details such as emails, phone numbers.'
|
11
16
|
s.authors = ['Kotu Bhaskara Rao']
|
12
17
|
s.email = 'bkotu6717@gmail.com'
|
13
18
|
s.require_paths = ['lib']
|
14
19
|
s.homepage = 'https://github.com/bkotu6717/brilliant_web_scraper'
|
15
20
|
s.files = Dir['**/*'].keep_if { |file|
|
16
|
-
file != "brilliant_web_scraper-#{WebScraper::VERSION}.gem" &&
|
21
|
+
file != "brilliant_web_scraper-#{WebScraper::VERSION}.gem" &&
|
22
|
+
File.file?(file)
|
17
23
|
}
|
18
24
|
s.required_ruby_version = '>= 2.3.0'
|
19
25
|
|
20
|
-
s.
|
21
|
-
s.
|
26
|
+
s.add_runtime_dependency 'charlock_holmes', '~> 0.7.6'
|
27
|
+
s.add_runtime_dependency 'nesty', '~> 1.0', '>= 1.0.1'
|
28
|
+
s.add_runtime_dependency 'rest-client', '~> 2.0', '>= 2.0.2'
|
22
29
|
|
23
|
-
s.add_development_dependency 'nesty', '~> 1.0', '>= 1.0.1'
|
24
30
|
s.add_development_dependency 'pry', '~> 0.12.2'
|
25
|
-
s.add_development_dependency 'rest-client', '~> 2.0', '>= 2.0.2'
|
26
31
|
s.add_development_dependency 'rspec', '~> 3.5'
|
27
32
|
s.add_development_dependency 'rubocop', '~> 0.73.0'
|
28
33
|
s.add_development_dependency 'vcr', '~> 3.0', '>= 3.0.1'
|
@@ -21,7 +21,7 @@ module DescriptionHelper
|
|
21
21
|
def parse_description(descriptions)
|
22
22
|
return if descriptions.nil? || descriptions.empty?
|
23
23
|
|
24
|
-
descriptions = descriptions.reject { |x| x.nil? || x.empty? }
|
24
|
+
descriptions = descriptions.reject { |x| x.nil? || x.empty? || x =~ /^\s*$/}
|
25
25
|
descriptions = descriptions.map { |x| unescape_html(x) }
|
26
26
|
descriptions.find { |x| (x !~ /^\s*[|-]?\s*$/) }
|
27
27
|
end
|
data/lib/parsers/emails.rb
CHANGED
@@ -10,7 +10,7 @@ module Emails
|
|
10
10
|
return if response.nil? || response.empty?
|
11
11
|
|
12
12
|
first_regex = /(?im)mailto:\s*([^\?"',\\<>\s]+)/
|
13
|
-
second_regex = %r{(?im)["'\s><\/]*([\w._%-]+@(?!(?:example|e?mail|domain|company|your(?:domain|company|email)|address|emailad?dress|yyy|test)\.)[\w._%-]+\.(?!png|jpe?g|tif|svg)[A-Z]{2,3})["'\s><]}
|
13
|
+
second_regex = %r{(?im)["'\s><\/]*([\w._%-]+@(?!(?:example|e?mail|domain|company|your(?:domain|company|email)|address|emailad?dress|yyy|test)\.)[\w._%-]+\.(?!png|jpe?g|tif|svg|css|js|ico|gif)[A-Z]{2,3})["'\s><]}
|
14
14
|
first_set = response.scan(first_regex).flatten.compact
|
15
15
|
first_set = get_processed_emails(first_set)
|
16
16
|
second_set = response.scan(second_regex).flatten.compact
|
@@ -24,7 +24,7 @@ module Emails
|
|
24
24
|
unescaped_emails = email_set.map { |email| unescape_html(email) }
|
25
25
|
return [] if unescaped_emails.empty?
|
26
26
|
|
27
|
-
email_match_regex = /[\w._%-]+@(?!(?:example|e?mail|domain|company|your(?:domain|company|email)|address|emailad?dress|yyy|test)\.)[\w._%-]+\.(?!png|jpe?g|tif|svg)[A-Z]{2,3}/im
|
27
|
+
email_match_regex = /[\w._%-]+@(?!(?:example|e?mail|domain|company|your(?:domain|company|email)|address|emailad?dress|yyy|test)\.)[\w._%-]+\.(?!png|jpe?g|tif|svg|css|js|ico|gif)[A-Z]{2,3}/im
|
28
28
|
unescaped_emails.select { |data| data =~ email_match_regex }
|
29
29
|
end
|
30
30
|
end
|
@@ -5,7 +5,7 @@ module FacebookProfile
|
|
5
5
|
def grep_facebook_profile(response)
|
6
6
|
return if response.nil? || response.empty?
|
7
7
|
|
8
|
-
facebook_url_regex = /(https?:\/\/(?:www\.)?(?:facebook|fb)\.com\/(?!tr\?|(?:[\/\w\d]*(?:photo|sharer?|like(?:box)?|offsite_event|plugins|permalink|home|search))\.php|\d+\/fbml|(?:dialog|hashtag|plugins|sharer|login|recover|security|help|v\d+\.\d+)\/|(?:privacy|#|your-profile|yourfacebookpage)\/?|home\?)[^"'<>\&\s]+)/im
|
8
|
+
facebook_url_regex = /(https?:\/\/(?:www\.)?(?:facebook|fb)\.com\/(?!tr\?|(?:[\/\w\d]*(?:photo|sharer?|like(?:box)?|offsite_event|plugins|permalink|home|search))\.php|\d+\/fbml|(?:dialog|hashtag|plugins|sharer|login|recover|security|help|images|v\d+\.\d+)\/|(?:privacy|#|your-profile|yourfacebookpage)\/?|home\?)[^"'<>\&\s]+)/im
|
9
9
|
response.scan(facebook_url_regex).flatten.compact.uniq
|
10
10
|
end
|
11
11
|
end
|
@@ -2,6 +2,7 @@
|
|
2
2
|
|
3
3
|
# Fetch latest url of the given website
|
4
4
|
module RedirectedTo
|
5
|
+
include UnescapeHtmlHelper
|
5
6
|
|
6
7
|
def grep_redirected_to_url(response)
|
7
8
|
return if response.nil? || response.empty?
|
@@ -18,7 +19,7 @@ module RedirectedTo
|
|
18
19
|
url = parser(web_urls)
|
19
20
|
break unless url.nil?
|
20
21
|
end
|
21
|
-
url
|
22
|
+
unescape_html(url)
|
22
23
|
end
|
23
24
|
|
24
25
|
private
|
@@ -5,7 +5,7 @@ module TwitterProfile
|
|
5
5
|
def grep_twitter_profile(response)
|
6
6
|
return if response.nil? || response.empty?
|
7
7
|
|
8
|
-
twitter_regex = %r{(?im)(https?:\/\/(?:www\.)?twitter\.com\/(?!(?:share|download|search|home|login|privacy)(?:\?|\/|\b)|(?:hashtag|i|javascripts|statuses|#!|intent)\/|(?:#|'|%))[^"'&\?<>\s\\]+)}
|
8
|
+
twitter_regex = %r{(?im)(https?:\/\/(?:www\.)?twitter\.com\/(?!\{\{)(?!(?:share|download|search|home|login|privacy)(?:\?|\/|\b)|(?:hashtag|i|javascripts|statuses|#!|intent)\/|(?:#|'|%))[^"'&\?<>\s\\]+)}
|
9
9
|
response.scan(twitter_regex).flatten.compact.uniq
|
10
10
|
end
|
11
11
|
end
|
@@ -5,7 +5,7 @@ module VimeoProfile
|
|
5
5
|
def grep_vimeo_profile(response)
|
6
6
|
return if response.nil? || response.empty?
|
7
7
|
|
8
|
-
vimeo_regex = %r{(?im)(https?:\/\/(?:www\.)?vimeo\.com\/(?!upgrade|features|enterprise|upload|api)\/?[^"'
|
8
|
+
vimeo_regex = %r{(?im)(https?:\/\/(?:www\.)?vimeo\.com\/(?!upgrade|features|enterprise|upload|api)\/?[^"'\\\&\?<>\s]+)}
|
9
9
|
response.scan(vimeo_regex).flatten.compact.uniq
|
10
10
|
end
|
11
11
|
end
|
@@ -6,38 +6,38 @@
|
|
6
6
|
# @Social Profiles
|
7
7
|
# @Contact Details
|
8
8
|
module ScrapeHelper
|
9
|
-
def perform_scrape(url, read_timeout,
|
10
|
-
|
11
|
-
|
12
|
-
response = ScrapeRequest.new(url, read_timeout,
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
response = response.encode('UTF-16be', invalid: :replace, replace: '?')
|
31
|
-
response = response.encode('UTF-8')
|
32
|
-
retry
|
33
|
-
rescue Encoding::CompatibilityError => e
|
34
|
-
raise WebScraper::ParserError, e.message
|
9
|
+
def perform_scrape(url, read_timeout, open_timeout)
|
10
|
+
timeout_in_sec = scraper_timeout(read_timeout, open_timeout)
|
11
|
+
Timeout::timeout(timeout_in_sec) do
|
12
|
+
response = ScrapeRequest.new(url, read_timeout, open_timeout)
|
13
|
+
retry_count = 0
|
14
|
+
body = response.body
|
15
|
+
begin
|
16
|
+
body = body.tr("\000", '')
|
17
|
+
encoding = body.detect_encoding[:encoding]
|
18
|
+
body = body.encode('UTF-8', encoding)
|
19
|
+
grep_data(body)
|
20
|
+
rescue Encoding::UndefinedConversionError, ArgumentError => e
|
21
|
+
retry_count += 1
|
22
|
+
raise WebScraper::ParserError, e.message if retry_count > 1
|
23
|
+
body = body.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
|
24
|
+
retry
|
25
|
+
rescue Encoding::CompatibilityError => e
|
26
|
+
raise WebScraper::ParserError, e.message
|
27
|
+
rescue StandardError => e
|
28
|
+
raise WebScraper::RequestError, e.message
|
29
|
+
end
|
35
30
|
end
|
36
|
-
|
31
|
+
rescue Timeout::Error => e
|
32
|
+
raise WebScraper::TimeoutError, e.message
|
37
33
|
end
|
38
34
|
|
39
35
|
private
|
40
36
|
|
37
|
+
def scraper_timeout(read_timeout, open_timeout)
|
38
|
+
( read_timeout + open_timeout + 1 )
|
39
|
+
end
|
40
|
+
|
41
41
|
def grep_data(response)
|
42
42
|
{
|
43
43
|
title: grep_title(response),
|
@@ -1,24 +1,28 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
# @Makes actual scrape request, either raises exception or response
|
3
|
+
# @Makes actual scrape request, either raises exception or serves response
|
4
4
|
module ScrapeRequest
|
5
5
|
extend ScrapeExceptions
|
6
6
|
class << self
|
7
7
|
def new(url, read_timeout, connection_timeout)
|
8
|
+
params_hash = {
|
9
|
+
method: :get,
|
10
|
+
url: url,
|
11
|
+
read_timeout: read_timeout,
|
12
|
+
open_timeout: connection_timeout,
|
13
|
+
max_redirects: 10,
|
14
|
+
verify_ssl: false
|
15
|
+
}
|
8
16
|
begin
|
9
|
-
params_hash = {
|
10
|
-
method: :get,
|
11
|
-
url: url,
|
12
|
-
read_timeout: read_timeout,
|
13
|
-
connection_timeout: connection_timeout,
|
14
|
-
headers: { 'accept-encoding': 'identity' }
|
15
|
-
}
|
16
17
|
response = RestClient::Request.execute(params_hash)
|
17
18
|
content_type = response.headers[:content_type]
|
18
19
|
return response if content_type =~ %r{(?i)text\s*\/\s*html}
|
19
20
|
|
20
21
|
exception_message = "Invalid response format received: #{content_type}"
|
21
22
|
raise WebScraper::NonHtmlError, exception_message
|
23
|
+
rescue Zlib::DataError
|
24
|
+
params_hash[:headers] = { 'accept-encoding': 'identity' }
|
25
|
+
retry
|
22
26
|
rescue *TIMEOUT_EXCEPTIONS => e
|
23
27
|
raise WebScraper::TimeoutError, e.message
|
24
28
|
rescue *GENERAL_EXCEPTIONS => e
|
data/lib/version.rb
CHANGED
@@ -27,6 +27,10 @@ describe 'Emails' do
|
|
27
27
|
<a href="mailto:xxx@yyy.zzz">xxx@yyy.zzz</a>
|
28
28
|
<a href="mailto:test@test.com">test@test.com</a>
|
29
29
|
<a href="mailto:@example.com">@example.com"</a>
|
30
|
+
<a href="mailto:v@201908240100.css">v@201908240100.css"</a>
|
31
|
+
<a href="mailto:v@201908240100.js">v@201908240100.js"</a>
|
32
|
+
<a href="mailto:ajax-loader@2x.gif">ajax-loader@2x.gif"</a>
|
33
|
+
<a href="mailto:favicon@2x.ico">favicon@2x.ico"</a>
|
30
34
|
HTML
|
31
35
|
expect(dummy_object.grep_emails(html.to_s)).to eq([])
|
32
36
|
end
|
@@ -14,6 +14,7 @@ describe 'FaceBook Profile' do
|
|
14
14
|
|
15
15
|
it 'should not grep any non profile url' do
|
16
16
|
html = <<~HTML
|
17
|
+
<a href="https://www.facebook.com/images/fb_icon_325x325.png" target="_blank" class="sqs-svg-icon--wrapper facebook">
|
17
18
|
<a href="http://www.facebook.com/2008/fbml" target="_blank" class="sqs-svg-icon--wrapper facebook">
|
18
19
|
<a href="https://www.facebook.com/v2.0/dialog/share" target="_blank" class="sqs-svg-icon--wrapper facebook">
|
19
20
|
<a href="https://www.facebook.com/plugins/video.php?href=https%3A%2F%2Fwww.facebook.com%2FHFXMooseheads%2Fvideos" target="_blank" class="sqs-svg-icon--wrapper facebook">
|
@@ -3,205 +3,223 @@ require 'spec_helper'
|
|
3
3
|
describe 'Website Redirected To' do
|
4
4
|
|
5
5
|
class DummyTestClass
|
6
|
-
|
7
|
-
|
6
|
+
include RedirectedTo
|
7
|
+
end
|
8
8
|
let(:dummy_object) { DummyTestClass.new }
|
9
9
|
|
10
10
|
|
11
11
|
it 'should return nil for invalid input' do
|
12
|
-
|
13
|
-
|
14
|
-
|
12
|
+
expect(dummy_object.grep_redirected_to_url(nil)).to be_nil
|
13
|
+
expect(dummy_object.grep_redirected_to_url('')).to be_nil
|
14
|
+
end
|
15
15
|
|
16
16
|
describe 'Website grep from link tag' do
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
17
|
+
describe 'rel attribute first ' do
|
18
|
+
|
19
|
+
it 'should return nil when canonical url is empty' do
|
20
|
+
html = <<~HTML
|
21
|
+
<link rel="canonical" href="">
|
22
|
+
<link rel="canonical" href=''>
|
23
|
+
HTML
|
24
|
+
website = dummy_object.grep_redirected_to_url(html.to_s)
|
25
|
+
expect(website).to be_nil
|
26
|
+
end
|
27
|
+
|
28
|
+
it 'should grep website' do
|
29
|
+
html = <<~HTML
|
30
|
+
<link rel="canonical" href="">
|
31
|
+
<link rel="canonical" href='https://www.apple.com/'>
|
32
|
+
HTML
|
33
|
+
website = dummy_object.grep_redirected_to_url(html.to_s)
|
34
|
+
expect(website).to eq('https://www.apple.com/')
|
35
|
+
end
|
36
|
+
|
37
|
+
it 'should grep website even with extra attributes' do
|
38
|
+
html = <<~HTML
|
39
|
+
<link rel="canonical" href="" itemprop="current_url">
|
40
|
+
<link rel="canonical" href='https://www.apple.com/'
|
41
|
+
itemprop="current_url" >
|
42
|
+
HTML
|
43
|
+
website = dummy_object.grep_redirected_to_url(html.to_s)
|
44
|
+
expect(website).to eq('https://www.apple.com/')
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
describe 'href attribute first' do
|
49
|
+
|
50
|
+
it 'should return nil when canonical url is empty' do
|
51
|
+
html = <<~HTML
|
52
|
+
<link href="" rel="canonical" >
|
53
|
+
<link href='' rel="canonical" >
|
54
|
+
HTML
|
55
|
+
website = dummy_object.grep_redirected_to_url(html.to_s)
|
56
|
+
expect(website).to be_nil
|
57
|
+
end
|
58
|
+
|
59
|
+
it 'should grep website' do
|
60
|
+
html = <<~HTML
|
61
|
+
<link rel="canonical" href="">
|
62
|
+
<link href='https://www.apple.com/' rel="canonical">
|
63
|
+
HTML
|
64
|
+
website = dummy_object.grep_redirected_to_url(html.to_s)
|
65
|
+
expect(website).to eq('https://www.apple.com/')
|
66
|
+
end
|
67
|
+
|
68
|
+
it 'should grep website even with extra attributes' do
|
69
|
+
html = <<~HTML
|
70
|
+
<link href="" itemprop="current_url" rel="canonical">
|
71
|
+
<link href='https://www.apple.com/' rel="canonical"
|
72
|
+
itemprop="current_url" >
|
73
|
+
HTML
|
74
|
+
website = dummy_object.grep_redirected_to_url(html.to_s)
|
75
|
+
expect(website).to eq('https://www.apple.com/')
|
76
|
+
end
|
77
|
+
end
|
76
78
|
end
|
79
|
+
|
77
80
|
describe 'Website grep from organization URL' do
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
81
|
+
|
82
|
+
describe 'property attribute first ' do
|
83
|
+
|
84
|
+
it 'should return nil when canonical url is empty' do
|
85
|
+
html = <<~HTML
|
86
|
+
<meta property="og:url" content="" />
|
87
|
+
<meta property="og:url" content='' />
|
88
|
+
HTML
|
89
|
+
website = dummy_object.grep_redirected_to_url(html.to_s)
|
90
|
+
expect(website).to be_nil
|
91
|
+
end
|
92
|
+
|
93
|
+
it 'should grep website' do
|
94
|
+
html = <<~HTML
|
95
|
+
<link property="og:url" content="">
|
96
|
+
<meta property="og:url" content="https://www.dieppe.ca/fr/index.aspx" />
|
97
|
+
HTML
|
98
|
+
website = dummy_object.grep_redirected_to_url(html.to_s)
|
99
|
+
expect(website).to eq('https://www.dieppe.ca/fr/index.aspx')
|
100
|
+
end
|
101
|
+
|
102
|
+
it 'should grep website even with extra attributes' do
|
103
|
+
html = <<~HTML
|
104
|
+
<link property="og:url" content="" calss="og-url">
|
105
|
+
<meta property="og:url" content='https://www.dieppe.ca/fr/index.aspx'
|
106
|
+
class="og-url" />
|
107
|
+
HTML
|
108
|
+
website = dummy_object.grep_redirected_to_url(html.to_s)
|
109
|
+
expect(website).to eq('https://www.dieppe.ca/fr/index.aspx')
|
110
|
+
end
|
111
|
+
end
|
112
|
+
|
113
|
+
describe 'content attribute first ' do
|
114
|
+
|
115
|
+
it 'should return nil when canonical url is empty' do
|
116
|
+
html = <<~HTML
|
117
|
+
<meta content="" property="og:url" />
|
118
|
+
<meta content='' property="og:url"/>
|
119
|
+
HTML
|
120
|
+
website = dummy_object.grep_redirected_to_url(html.to_s)
|
121
|
+
expect(website).to be_nil
|
122
|
+
end
|
123
|
+
|
124
|
+
it 'should grep website' do
|
125
|
+
html = <<~HTML
|
126
|
+
<link content="" property="og:url" >
|
127
|
+
<meta content="https://www.dieppe.ca/fr/index.aspx" property="og:url" />
|
128
|
+
HTML
|
129
|
+
website = dummy_object.grep_redirected_to_url(html.to_s)
|
130
|
+
expect(website).to eq('https://www.dieppe.ca/fr/index.aspx')
|
131
|
+
end
|
132
|
+
|
133
|
+
it 'should grep website even with extra attributes' do
|
134
|
+
html = <<~HTML
|
135
|
+
<link content="" calss="og-url" property="og:url">
|
136
|
+
<meta content='https://www.dieppe.ca/fr/index.aspx'
|
137
|
+
class="og-url" property="og:url" />
|
138
|
+
HTML
|
139
|
+
website = dummy_object.grep_redirected_to_url(html.to_s)
|
140
|
+
expect(website).to eq('https://www.dieppe.ca/fr/index.aspx')
|
141
|
+
end
|
142
|
+
end
|
136
143
|
end
|
137
144
|
describe 'grep website' do
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
145
|
+
|
146
|
+
it 'it should return nil when link or og:url is absent' do
|
147
|
+
html = <<~HTML
|
148
|
+
<head>
|
149
|
+
<meta charset="utf-8">
|
150
|
+
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
|
151
|
+
<meta http-equiv="x-ua-compatible" content="ie=edge">
|
152
|
+
<title>Techmologic | index</title>
|
153
|
+
<!-- Font Awesome -->
|
154
|
+
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.6.0/css/font-awesome.min.css">
|
155
|
+
<!-- Bootstrap core CSS -->
|
156
|
+
<link href="css/bootstrap.min.css" rel="stylesheet">
|
157
|
+
<!-- Material Design Bootstrap -->
|
158
|
+
<link href="css/mdb.min.css" rel="stylesheet">
|
159
|
+
<!-- Your custom styles (optional) -->
|
160
|
+
<link href="css/style.css" rel="stylesheet">
|
161
|
+
</head>
|
162
|
+
HTML
|
163
|
+
website = dummy_object.grep_redirected_to_url(html.to_s)
|
164
|
+
expect(website).to be_nil
|
165
|
+
end
|
166
|
+
|
167
|
+
it 'should grep one of canonical or og:url' do
|
168
|
+
html = <<~HTML
|
169
|
+
<head>
|
170
|
+
<meta charset="utf-8">
|
171
|
+
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
|
172
|
+
<meta http-equiv="x-ua-compatible" content="ie=edge">
|
173
|
+
<title>Techmologic | index</title>
|
174
|
+
<link rel="canonical" href="">
|
175
|
+
<meta property="og:url" content="" />
|
176
|
+
<!-- Font Awesome -->
|
177
|
+
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.6.0/css/font-awesome.min.css">
|
178
|
+
<!-- Bootstrap core CSS -->
|
179
|
+
<link href="css/bootstrap.min.css" rel="stylesheet">
|
180
|
+
<!-- Material Design Bootstrap -->
|
181
|
+
<link href="css/mdb.min.css" rel="stylesheet">
|
182
|
+
<!-- Your custom styles (optional) -->
|
183
|
+
<link href="css/style.css" rel="stylesheet">
|
184
|
+
<link rel="canonical" href="http://techmologics.com/">
|
185
|
+
<meta property="og:url" content="http://techmologics.com/" />
|
186
|
+
</head>
|
187
|
+
HTML
|
188
|
+
website = dummy_object.grep_redirected_to_url(html.to_s)
|
189
|
+
expect(website).to eq('http://techmologics.com/')
|
190
|
+
end
|
191
|
+
|
192
|
+
it 'should grep one of canonical or og:url whatever it\'s position' do
|
193
|
+
html = <<~HTML
|
194
|
+
<head>
|
195
|
+
<meta charset="utf-8">
|
196
|
+
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
|
197
|
+
<meta http-equiv="x-ua-compatible" content="ie=edge">
|
198
|
+
<title>Techmologic | index</title>
|
199
|
+
<link href="" rel="canonical">
|
200
|
+
<meta content="" property="og:url"/>
|
201
|
+
<!-- Font Awesome -->
|
202
|
+
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.6.0/css/font-awesome.min.css">
|
203
|
+
<!-- Bootstrap core CSS -->
|
204
|
+
<link href="css/bootstrap.min.css" rel="stylesheet">
|
205
|
+
<!-- Material Design Bootstrap -->
|
206
|
+
<link href="css/mdb.min.css" rel="stylesheet">
|
207
|
+
<!-- Your custom styles (optional) -->
|
208
|
+
<link href="css/style.css" rel="stylesheet">
|
209
|
+
<link href="http://techmologics.com/" rel="canonical" class="canonical">
|
210
|
+
<meta content="http://techmologics.com/" property="og:url"/>
|
211
|
+
</head>
|
212
|
+
HTML
|
213
|
+
website = dummy_object.grep_redirected_to_url(html.to_s)
|
214
|
+
expect(website).to eq('http://techmologics.com/')
|
215
|
+
end
|
216
|
+
|
217
|
+
it 'should decode html entities in the redirected_to url' do
|
218
|
+
html = <<~HTML
|
219
|
+
<meta content="https://www.santanderbank.com/us/personal" property="og:url"/>
|
220
|
+
HTML
|
221
|
+
website = dummy_object.grep_redirected_to_url(html.to_s)
|
222
|
+
expect(website).to eq('https://www.santanderbank.com/us/personal')
|
223
|
+
end
|
206
224
|
end
|
207
|
-
end
|
225
|
+
end
|
@@ -31,6 +31,7 @@ describe 'Twitter Profile' do
|
|
31
31
|
<a href=" http://twitter.com/share/" target="_blank">
|
32
32
|
<a href="https://twitter.com/#!/Farmer_Brothers" target="_blank">
|
33
33
|
<a href="http://twitter.com/javascripts/blogger.js" target="_blank">
|
34
|
+
<a href="https://twitter.com/{{../user.screen_name}}/status/{{../id_str}}" target="_blank">
|
34
35
|
HTML
|
35
36
|
expect(dummy_object.grep_twitter_profile(html.to_s)).to eq([])
|
36
37
|
end
|
@@ -30,13 +30,15 @@ describe 'Vimeo Profile' do
|
|
30
30
|
<a href="https://vimeo.com/channels/332103" target="_blank">
|
31
31
|
<a href="https://vimeo.com/talech" target="_blank">
|
32
32
|
<a href="https://vimeo.com/292173295/fdb8634a35/" target="_blank">
|
33
|
+
<a href="https://vimeo.com/337614648\\" target="_blank">
|
33
34
|
HTML
|
34
35
|
vimeo_profiles = dummy_object.grep_vimeo_profile(html.to_s)
|
35
36
|
expected_profiles = [
|
36
|
-
'https://vimeo.com/107578087',
|
37
|
-
'https://vimeo.com/channels/332103',
|
38
|
-
'https://vimeo.com/talech',
|
39
|
-
'https://vimeo.com/292173295/fdb8634a35/',
|
37
|
+
'https://vimeo.com/107578087',
|
38
|
+
'https://vimeo.com/channels/332103',
|
39
|
+
'https://vimeo.com/talech',
|
40
|
+
'https://vimeo.com/292173295/fdb8634a35/',
|
41
|
+
'https://vimeo.com/337614648'
|
40
42
|
]
|
41
43
|
expect(vimeo_profiles).to eq(expected_profiles)
|
42
44
|
end
|
metadata
CHANGED
@@ -1,75 +1,69 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: brilliant_web_scraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '0.
|
4
|
+
version: '0.2'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kotu Bhaskara Rao
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-08-
|
11
|
+
date: 2019-08-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name:
|
14
|
+
name: charlock_holmes
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
20
|
-
- - ">="
|
21
|
-
- !ruby/object:Gem::Version
|
22
|
-
version: 1.0.1
|
19
|
+
version: 0.7.6
|
23
20
|
type: :runtime
|
24
21
|
prerelease: false
|
25
22
|
version_requirements: !ruby/object:Gem::Requirement
|
26
23
|
requirements:
|
27
24
|
- - "~>"
|
28
25
|
- !ruby/object:Gem::Version
|
29
|
-
version:
|
30
|
-
- - ">="
|
31
|
-
- !ruby/object:Gem::Version
|
32
|
-
version: 1.0.1
|
26
|
+
version: 0.7.6
|
33
27
|
- !ruby/object:Gem::Dependency
|
34
|
-
name:
|
28
|
+
name: nesty
|
35
29
|
requirement: !ruby/object:Gem::Requirement
|
36
30
|
requirements:
|
37
31
|
- - "~>"
|
38
32
|
- !ruby/object:Gem::Version
|
39
|
-
version: '
|
33
|
+
version: '1.0'
|
40
34
|
- - ">="
|
41
35
|
- !ruby/object:Gem::Version
|
42
|
-
version:
|
36
|
+
version: 1.0.1
|
43
37
|
type: :runtime
|
44
38
|
prerelease: false
|
45
39
|
version_requirements: !ruby/object:Gem::Requirement
|
46
40
|
requirements:
|
47
41
|
- - "~>"
|
48
42
|
- !ruby/object:Gem::Version
|
49
|
-
version: '
|
43
|
+
version: '1.0'
|
50
44
|
- - ">="
|
51
45
|
- !ruby/object:Gem::Version
|
52
|
-
version:
|
46
|
+
version: 1.0.1
|
53
47
|
- !ruby/object:Gem::Dependency
|
54
|
-
name:
|
48
|
+
name: rest-client
|
55
49
|
requirement: !ruby/object:Gem::Requirement
|
56
50
|
requirements:
|
57
51
|
- - "~>"
|
58
52
|
- !ruby/object:Gem::Version
|
59
|
-
version: '
|
53
|
+
version: '2.0'
|
60
54
|
- - ">="
|
61
55
|
- !ruby/object:Gem::Version
|
62
|
-
version:
|
63
|
-
type: :
|
56
|
+
version: 2.0.2
|
57
|
+
type: :runtime
|
64
58
|
prerelease: false
|
65
59
|
version_requirements: !ruby/object:Gem::Requirement
|
66
60
|
requirements:
|
67
61
|
- - "~>"
|
68
62
|
- !ruby/object:Gem::Version
|
69
|
-
version: '
|
63
|
+
version: '2.0'
|
70
64
|
- - ">="
|
71
65
|
- !ruby/object:Gem::Version
|
72
|
-
version:
|
66
|
+
version: 2.0.2
|
73
67
|
- !ruby/object:Gem::Dependency
|
74
68
|
name: pry
|
75
69
|
requirement: !ruby/object:Gem::Requirement
|
@@ -84,26 +78,6 @@ dependencies:
|
|
84
78
|
- - "~>"
|
85
79
|
- !ruby/object:Gem::Version
|
86
80
|
version: 0.12.2
|
87
|
-
- !ruby/object:Gem::Dependency
|
88
|
-
name: rest-client
|
89
|
-
requirement: !ruby/object:Gem::Requirement
|
90
|
-
requirements:
|
91
|
-
- - "~>"
|
92
|
-
- !ruby/object:Gem::Version
|
93
|
-
version: '2.0'
|
94
|
-
- - ">="
|
95
|
-
- !ruby/object:Gem::Version
|
96
|
-
version: 2.0.2
|
97
|
-
type: :development
|
98
|
-
prerelease: false
|
99
|
-
version_requirements: !ruby/object:Gem::Requirement
|
100
|
-
requirements:
|
101
|
-
- - "~>"
|
102
|
-
- !ruby/object:Gem::Version
|
103
|
-
version: '2.0'
|
104
|
-
- - ">="
|
105
|
-
- !ruby/object:Gem::Version
|
106
|
-
version: 2.0.2
|
107
81
|
- !ruby/object:Gem::Dependency
|
108
82
|
name: rspec
|
109
83
|
requirement: !ruby/object:Gem::Requirement
|
@@ -166,16 +140,17 @@ dependencies:
|
|
166
140
|
- - "~>"
|
167
141
|
- !ruby/object:Gem::Version
|
168
142
|
version: '2.1'
|
169
|
-
description:
|
143
|
+
description: A decent web scraping gem.Scrapes website's title, description,social
|
144
|
+
profiles such as linkedin, facebook, twitter, instgram, vimeo,pinterest, youtube
|
145
|
+
channel and contact details such as emails, phone numbers.
|
170
146
|
email: bkotu6717@gmail.com
|
171
147
|
executables: []
|
172
148
|
extensions: []
|
173
149
|
extra_rdoc_files: []
|
174
150
|
files:
|
175
151
|
- Gemfile
|
152
|
+
- Gemfile.lock
|
176
153
|
- README.md
|
177
|
-
- brilliant_web_scraper-1.0.0.gem
|
178
|
-
- brilliant_web_scraper-1.0.gem
|
179
154
|
- brilliant_web_scraper.gemspec
|
180
155
|
- lib/brilliant_web_scraper.rb
|
181
156
|
- lib/parsers/description_helper.rb
|
@@ -246,5 +221,5 @@ rubyforge_project:
|
|
246
221
|
rubygems_version: 2.5.1
|
247
222
|
signing_key:
|
248
223
|
specification_version: 4
|
249
|
-
summary: A decent web scraping ruby
|
224
|
+
summary: A decent web scraping ruby gem!
|
250
225
|
test_files: []
|
Binary file
|
Binary file
|