brilliant_web_scraper 0.1 → 0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +90 -0
- data/README.md +6 -9
- data/brilliant_web_scraper.gemspec +12 -7
- data/lib/brilliant_web_scraper.rb +2 -1
- data/lib/parsers/description_helper.rb +1 -1
- data/lib/parsers/emails.rb +2 -2
- data/lib/parsers/facebook_profile.rb +1 -1
- data/lib/parsers/redirected_to.rb +2 -1
- data/lib/parsers/twitter_profile.rb +1 -1
- data/lib/parsers/vimeo_profile.rb +1 -1
- data/lib/scraper/scrape_helper.rb +27 -27
- data/lib/scraper/scrape_request.rb +12 -8
- data/lib/version.rb +1 -1
- data/spec/lib/parsers/emails_spec.rb +4 -0
- data/spec/lib/parsers/facebook_profile_spec.rb +1 -0
- data/spec/lib/parsers/redirected_to_spec.rb +209 -191
- data/spec/lib/parsers/twitter_profile_spec.rb +1 -0
- data/spec/lib/parsers/vimeo_profile_spec.rb +6 -4
- metadata +21 -46
- data/brilliant_web_scraper-1.0.0.gem +0 -0
- data/brilliant_web_scraper-1.0.gem +0 -0
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 9ad5219a19dcfc311bed756a83d82fd3758bd71a
|
|
4
|
+
data.tar.gz: c085eb2a96b8eb503cd44edc87821823cf0ad965
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 4a5c3c9dd78f3e123b0c279a04c2d0d262c58aef8d0086668c37aea8e52b6ec8da98d6d242debddf600922dcfed3d377bd159e62fc9bb1e7390b1e62e881eb2b
|
|
7
|
+
data.tar.gz: de051e60d7b90bde7984871d1b39e52f46be6366910591f0ec31c434a41037a9cc1274989dd50755cbf6e03d0e9f498a3271aad173b12a8ea3fd6a578eb8fbc9
|
data/Gemfile.lock
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
PATH
|
|
2
|
+
remote: .
|
|
3
|
+
specs:
|
|
4
|
+
brilliant_web_scraper (0.2)
|
|
5
|
+
charlock_holmes (~> 0.7.6)
|
|
6
|
+
nesty (~> 1.0, >= 1.0.1)
|
|
7
|
+
rest-client (~> 2.0, >= 2.0.2)
|
|
8
|
+
|
|
9
|
+
GEM
|
|
10
|
+
remote: http://rubygems.org/
|
|
11
|
+
specs:
|
|
12
|
+
addressable (2.6.0)
|
|
13
|
+
public_suffix (>= 2.0.2, < 4.0)
|
|
14
|
+
ast (2.4.0)
|
|
15
|
+
charlock_holmes (0.7.6)
|
|
16
|
+
coderay (1.1.2)
|
|
17
|
+
crack (0.4.3)
|
|
18
|
+
safe_yaml (~> 1.0.0)
|
|
19
|
+
diff-lcs (1.3)
|
|
20
|
+
domain_name (0.5.20190701)
|
|
21
|
+
unf (>= 0.0.5, < 1.0.0)
|
|
22
|
+
hashdiff (1.0.0)
|
|
23
|
+
http-accept (1.7.0)
|
|
24
|
+
http-cookie (1.0.3)
|
|
25
|
+
domain_name (~> 0.5)
|
|
26
|
+
jaro_winkler (1.5.3)
|
|
27
|
+
method_source (0.9.2)
|
|
28
|
+
mime-types (3.2.2)
|
|
29
|
+
mime-types-data (~> 3.2015)
|
|
30
|
+
mime-types-data (3.2019.0331)
|
|
31
|
+
nesty (1.0.2)
|
|
32
|
+
netrc (0.11.0)
|
|
33
|
+
parallel (1.17.0)
|
|
34
|
+
parser (2.6.3.0)
|
|
35
|
+
ast (~> 2.4.0)
|
|
36
|
+
pry (0.12.2)
|
|
37
|
+
coderay (~> 1.1.0)
|
|
38
|
+
method_source (~> 0.9.0)
|
|
39
|
+
public_suffix (3.1.1)
|
|
40
|
+
rainbow (3.0.0)
|
|
41
|
+
rest-client (2.1.0)
|
|
42
|
+
http-accept (>= 1.7.0, < 2.0)
|
|
43
|
+
http-cookie (>= 1.0.2, < 2.0)
|
|
44
|
+
mime-types (>= 1.16, < 4.0)
|
|
45
|
+
netrc (~> 0.8)
|
|
46
|
+
rspec (3.8.0)
|
|
47
|
+
rspec-core (~> 3.8.0)
|
|
48
|
+
rspec-expectations (~> 3.8.0)
|
|
49
|
+
rspec-mocks (~> 3.8.0)
|
|
50
|
+
rspec-core (3.8.2)
|
|
51
|
+
rspec-support (~> 3.8.0)
|
|
52
|
+
rspec-expectations (3.8.4)
|
|
53
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
|
54
|
+
rspec-support (~> 3.8.0)
|
|
55
|
+
rspec-mocks (3.8.1)
|
|
56
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
|
57
|
+
rspec-support (~> 3.8.0)
|
|
58
|
+
rspec-support (3.8.2)
|
|
59
|
+
rubocop (0.73.0)
|
|
60
|
+
jaro_winkler (~> 1.5.1)
|
|
61
|
+
parallel (~> 1.10)
|
|
62
|
+
parser (>= 2.6)
|
|
63
|
+
rainbow (>= 2.2.2, < 4.0)
|
|
64
|
+
ruby-progressbar (~> 1.7)
|
|
65
|
+
unicode-display_width (>= 1.4.0, < 1.7)
|
|
66
|
+
ruby-progressbar (1.10.1)
|
|
67
|
+
safe_yaml (1.0.5)
|
|
68
|
+
unf (0.1.4)
|
|
69
|
+
unf_ext
|
|
70
|
+
unf_ext (0.0.7.6)
|
|
71
|
+
unicode-display_width (1.6.0)
|
|
72
|
+
vcr (3.0.3)
|
|
73
|
+
webmock (2.3.2)
|
|
74
|
+
addressable (>= 2.3.6)
|
|
75
|
+
crack (>= 0.3.2)
|
|
76
|
+
hashdiff
|
|
77
|
+
|
|
78
|
+
PLATFORMS
|
|
79
|
+
ruby
|
|
80
|
+
|
|
81
|
+
DEPENDENCIES
|
|
82
|
+
brilliant_web_scraper!
|
|
83
|
+
pry (~> 0.12.2)
|
|
84
|
+
rspec (~> 3.5)
|
|
85
|
+
rubocop (~> 0.73.0)
|
|
86
|
+
vcr (~> 3.0, >= 3.0.1)
|
|
87
|
+
webmock (~> 2.1)
|
|
88
|
+
|
|
89
|
+
BUNDLED WITH
|
|
90
|
+
1.16.6
|
data/README.md
CHANGED
|
@@ -1,14 +1,11 @@
|
|
|
1
|
-
#
|
|
1
|
+
# BrilliantWebScraper [](https://travis-ci.com/bkotu6717/brilliant_web_scraper)[](https://codeclimate.com/github/bkotu6717/brilliant_web_scraper/maintainability)
|
|
2
2
|
|
|
3
|
-
A decent web scraping gem. Scrapes website description, social profiles, contact details,
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
It accepts a URL or Domain as input and gets it's title, descrptios, social profiles, YouTube channels and it's current URL if got redirected.
|
|
3
|
+
A decent web scraping gem. Scrapes a website's title, description, social profiles such as LinkedIn, Facebook, Twitter, Instagram, Vimeo, Pinterest and YouTube channel, and contact details such as emails and phone numbers.
|
|
7
4
|
|
|
8
5
|
|
|
9
6
|
## See it in action!
|
|
10
7
|
|
|
11
|
-
You can try
|
|
8
|
+
You can try BrilliantWebScraper live at this little demo: [https://brilliant-web-scraper-demo.herokuapp.com](https://brilliant-web-scraper-demo.herokuapp.com)
|
|
12
9
|
|
|
13
10
|
## Installation
|
|
14
11
|
|
|
@@ -21,11 +18,11 @@ gem 'brilliant_web_scraper'
|
|
|
21
18
|
|
|
22
19
|
## Usage
|
|
23
20
|
|
|
24
|
-
Initialize a BrilliantWebScraper instance for an URL, like this:
|
|
21
|
+
Initialize a BrilliantWebScraper instance for a URL like this, optionally passing timeouts (connection_timeout and read_timeout both default to 10s):
|
|
25
22
|
|
|
26
23
|
```ruby
|
|
27
24
|
require 'brilliant_web_scraper'
|
|
25
|
+
results = BrilliantWebScraper.new('http://pwc.com', 5, 5)
|
|
26
|
+
|
|
28
27
|
results = BrilliantWebScraper.new('http://pwc.com')
|
|
29
28
|
```
|
|
30
|
-
|
|
31
|
-
If you don't include the scheme on the URL, it is fine:
|
|
@@ -6,23 +6,28 @@ Gem::Specification.new do |s|
|
|
|
6
6
|
s.name = 'brilliant_web_scraper'
|
|
7
7
|
s.version = WebScraper::VERSION
|
|
8
8
|
s.licenses = ['Nonstandard']
|
|
9
|
-
s.summary = 'A decent web scraping ruby
|
|
10
|
-
s.description = '
|
|
9
|
+
s.summary = 'A decent web scraping ruby gem!'
|
|
10
|
+
s.description = 'A decent web scraping gem.'\
|
|
11
|
+
'Scrapes website\'s title, description,'\
|
|
12
|
+
'social profiles such as linkedin, '\
|
|
13
|
+
'facebook, twitter, instgram, vimeo,'\
|
|
14
|
+
'pinterest, youtube channel and'\
|
|
15
|
+
' contact details such as emails, phone numbers.'
|
|
11
16
|
s.authors = ['Kotu Bhaskara Rao']
|
|
12
17
|
s.email = 'bkotu6717@gmail.com'
|
|
13
18
|
s.require_paths = ['lib']
|
|
14
19
|
s.homepage = 'https://github.com/bkotu6717/brilliant_web_scraper'
|
|
15
20
|
s.files = Dir['**/*'].keep_if { |file|
|
|
16
|
-
file != "brilliant_web_scraper-#{WebScraper::VERSION}.gem" &&
|
|
21
|
+
file != "brilliant_web_scraper-#{WebScraper::VERSION}.gem" &&
|
|
22
|
+
File.file?(file)
|
|
17
23
|
}
|
|
18
24
|
s.required_ruby_version = '>= 2.3.0'
|
|
19
25
|
|
|
20
|
-
s.
|
|
21
|
-
s.
|
|
26
|
+
s.add_runtime_dependency 'charlock_holmes', '~> 0.7.6'
|
|
27
|
+
s.add_runtime_dependency 'nesty', '~> 1.0', '>= 1.0.1'
|
|
28
|
+
s.add_runtime_dependency 'rest-client', '~> 2.0', '>= 2.0.2'
|
|
22
29
|
|
|
23
|
-
s.add_development_dependency 'nesty', '~> 1.0', '>= 1.0.1'
|
|
24
30
|
s.add_development_dependency 'pry', '~> 0.12.2'
|
|
25
|
-
s.add_development_dependency 'rest-client', '~> 2.0', '>= 2.0.2'
|
|
26
31
|
s.add_development_dependency 'rspec', '~> 3.5'
|
|
27
32
|
s.add_development_dependency 'rubocop', '~> 0.73.0'
|
|
28
33
|
s.add_development_dependency 'vcr', '~> 3.0', '>= 3.0.1'
|
|
@@ -21,7 +21,7 @@ module DescriptionHelper
|
|
|
21
21
|
def parse_description(descriptions)
|
|
22
22
|
return if descriptions.nil? || descriptions.empty?
|
|
23
23
|
|
|
24
|
-
descriptions = descriptions.reject { |x| x.nil? || x.empty? }
|
|
24
|
+
descriptions = descriptions.reject { |x| x.nil? || x.empty? || x =~ /^\s*$/}
|
|
25
25
|
descriptions = descriptions.map { |x| unescape_html(x) }
|
|
26
26
|
descriptions.find { |x| (x !~ /^\s*[|-]?\s*$/) }
|
|
27
27
|
end
|
data/lib/parsers/emails.rb
CHANGED
|
@@ -10,7 +10,7 @@ module Emails
|
|
|
10
10
|
return if response.nil? || response.empty?
|
|
11
11
|
|
|
12
12
|
first_regex = /(?im)mailto:\s*([^\?"',\\<>\s]+)/
|
|
13
|
-
second_regex = %r{(?im)["'\s><\/]*([\w._%-]+@(?!(?:example|e?mail|domain|company|your(?:domain|company|email)|address|emailad?dress|yyy|test)\.)[\w._%-]+\.(?!png|jpe?g|tif|svg)[A-Z]{2,3})["'\s><]}
|
|
13
|
+
second_regex = %r{(?im)["'\s><\/]*([\w._%-]+@(?!(?:example|e?mail|domain|company|your(?:domain|company|email)|address|emailad?dress|yyy|test)\.)[\w._%-]+\.(?!png|jpe?g|tif|svg|css|js|ico|gif)[A-Z]{2,3})["'\s><]}
|
|
14
14
|
first_set = response.scan(first_regex).flatten.compact
|
|
15
15
|
first_set = get_processed_emails(first_set)
|
|
16
16
|
second_set = response.scan(second_regex).flatten.compact
|
|
@@ -24,7 +24,7 @@ module Emails
|
|
|
24
24
|
unescaped_emails = email_set.map { |email| unescape_html(email) }
|
|
25
25
|
return [] if unescaped_emails.empty?
|
|
26
26
|
|
|
27
|
-
email_match_regex = /[\w._%-]+@(?!(?:example|e?mail|domain|company|your(?:domain|company|email)|address|emailad?dress|yyy|test)\.)[\w._%-]+\.(?!png|jpe?g|tif|svg)[A-Z]{2,3}/im
|
|
27
|
+
email_match_regex = /[\w._%-]+@(?!(?:example|e?mail|domain|company|your(?:domain|company|email)|address|emailad?dress|yyy|test)\.)[\w._%-]+\.(?!png|jpe?g|tif|svg|css|js|ico|gif)[A-Z]{2,3}/im
|
|
28
28
|
unescaped_emails.select { |data| data =~ email_match_regex }
|
|
29
29
|
end
|
|
30
30
|
end
|
|
@@ -5,7 +5,7 @@ module FacebookProfile
|
|
|
5
5
|
def grep_facebook_profile(response)
|
|
6
6
|
return if response.nil? || response.empty?
|
|
7
7
|
|
|
8
|
-
facebook_url_regex = /(https?:\/\/(?:www\.)?(?:facebook|fb)\.com\/(?!tr\?|(?:[\/\w\d]*(?:photo|sharer?|like(?:box)?|offsite_event|plugins|permalink|home|search))\.php|\d+\/fbml|(?:dialog|hashtag|plugins|sharer|login|recover|security|help|v\d+\.\d+)\/|(?:privacy|#|your-profile|yourfacebookpage)\/?|home\?)[^"'<>\&\s]+)/im
|
|
8
|
+
facebook_url_regex = /(https?:\/\/(?:www\.)?(?:facebook|fb)\.com\/(?!tr\?|(?:[\/\w\d]*(?:photo|sharer?|like(?:box)?|offsite_event|plugins|permalink|home|search))\.php|\d+\/fbml|(?:dialog|hashtag|plugins|sharer|login|recover|security|help|images|v\d+\.\d+)\/|(?:privacy|#|your-profile|yourfacebookpage)\/?|home\?)[^"'<>\&\s]+)/im
|
|
9
9
|
response.scan(facebook_url_regex).flatten.compact.uniq
|
|
10
10
|
end
|
|
11
11
|
end
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
# Fetch latest url of the given website
|
|
4
4
|
module RedirectedTo
|
|
5
|
+
include UnescapeHtmlHelper
|
|
5
6
|
|
|
6
7
|
def grep_redirected_to_url(response)
|
|
7
8
|
return if response.nil? || response.empty?
|
|
@@ -18,7 +19,7 @@ module RedirectedTo
|
|
|
18
19
|
url = parser(web_urls)
|
|
19
20
|
break unless url.nil?
|
|
20
21
|
end
|
|
21
|
-
url
|
|
22
|
+
unescape_html(url)
|
|
22
23
|
end
|
|
23
24
|
|
|
24
25
|
private
|
|
@@ -5,7 +5,7 @@ module TwitterProfile
|
|
|
5
5
|
def grep_twitter_profile(response)
|
|
6
6
|
return if response.nil? || response.empty?
|
|
7
7
|
|
|
8
|
-
twitter_regex = %r{(?im)(https?:\/\/(?:www\.)?twitter\.com\/(?!(?:share|download|search|home|login|privacy)(?:\?|\/|\b)|(?:hashtag|i|javascripts|statuses|#!|intent)\/|(?:#|'|%))[^"'&\?<>\s\\]+)}
|
|
8
|
+
twitter_regex = %r{(?im)(https?:\/\/(?:www\.)?twitter\.com\/(?!\{\{)(?!(?:share|download|search|home|login|privacy)(?:\?|\/|\b)|(?:hashtag|i|javascripts|statuses|#!|intent)\/|(?:#|'|%))[^"'&\?<>\s\\]+)}
|
|
9
9
|
response.scan(twitter_regex).flatten.compact.uniq
|
|
10
10
|
end
|
|
11
11
|
end
|
|
@@ -5,7 +5,7 @@ module VimeoProfile
|
|
|
5
5
|
def grep_vimeo_profile(response)
|
|
6
6
|
return if response.nil? || response.empty?
|
|
7
7
|
|
|
8
|
-
vimeo_regex = %r{(?im)(https?:\/\/(?:www\.)?vimeo\.com\/(?!upgrade|features|enterprise|upload|api)\/?[^"'
|
|
8
|
+
vimeo_regex = %r{(?im)(https?:\/\/(?:www\.)?vimeo\.com\/(?!upgrade|features|enterprise|upload|api)\/?[^"'\\\&\?<>\s]+)}
|
|
9
9
|
response.scan(vimeo_regex).flatten.compact.uniq
|
|
10
10
|
end
|
|
11
11
|
end
|
|
@@ -6,38 +6,38 @@
|
|
|
6
6
|
# @Social Profiles
|
|
7
7
|
# @Contact Details
|
|
8
8
|
module ScrapeHelper
|
|
9
|
-
def perform_scrape(url, read_timeout,
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
response = ScrapeRequest.new(url, read_timeout,
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
response = response.encode('UTF-16be', invalid: :replace, replace: '?')
|
|
31
|
-
response = response.encode('UTF-8')
|
|
32
|
-
retry
|
|
33
|
-
rescue Encoding::CompatibilityError => e
|
|
34
|
-
raise WebScraper::ParserError, e.message
|
|
9
|
+
def perform_scrape(url, read_timeout, open_timeout)
|
|
10
|
+
timeout_in_sec = scraper_timeout(read_timeout, open_timeout)
|
|
11
|
+
Timeout::timeout(timeout_in_sec) do
|
|
12
|
+
response = ScrapeRequest.new(url, read_timeout, open_timeout)
|
|
13
|
+
retry_count = 0
|
|
14
|
+
body = response.body
|
|
15
|
+
begin
|
|
16
|
+
body = body.tr("\000", '')
|
|
17
|
+
encoding = body.detect_encoding[:encoding]
|
|
18
|
+
body = body.encode('UTF-8', encoding)
|
|
19
|
+
grep_data(body)
|
|
20
|
+
rescue Encoding::UndefinedConversionError, ArgumentError => e
|
|
21
|
+
retry_count += 1
|
|
22
|
+
raise WebScraper::ParserError, e.message if retry_count > 1
|
|
23
|
+
body = body.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
|
|
24
|
+
retry
|
|
25
|
+
rescue Encoding::CompatibilityError => e
|
|
26
|
+
raise WebScraper::ParserError, e.message
|
|
27
|
+
rescue StandardError => e
|
|
28
|
+
raise WebScraper::RequestError, e.message
|
|
29
|
+
end
|
|
35
30
|
end
|
|
36
|
-
|
|
31
|
+
rescue Timeout::Error => e
|
|
32
|
+
raise WebScraper::TimeoutError, e.message
|
|
37
33
|
end
|
|
38
34
|
|
|
39
35
|
private
|
|
40
36
|
|
|
37
|
+
def scraper_timeout(read_timeout, open_timeout)
|
|
38
|
+
( read_timeout + open_timeout + 1 )
|
|
39
|
+
end
|
|
40
|
+
|
|
41
41
|
def grep_data(response)
|
|
42
42
|
{
|
|
43
43
|
title: grep_title(response),
|
|
@@ -1,24 +1,28 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
# @Makes actual scrape request, either raises exception or response
|
|
3
|
+
# @Makes actual scrape request, either raises exception or serves response
|
|
4
4
|
module ScrapeRequest
|
|
5
5
|
extend ScrapeExceptions
|
|
6
6
|
class << self
|
|
7
7
|
def new(url, read_timeout, connection_timeout)
|
|
8
|
+
params_hash = {
|
|
9
|
+
method: :get,
|
|
10
|
+
url: url,
|
|
11
|
+
read_timeout: read_timeout,
|
|
12
|
+
open_timeout: connection_timeout,
|
|
13
|
+
max_redirects: 10,
|
|
14
|
+
verify_ssl: false
|
|
15
|
+
}
|
|
8
16
|
begin
|
|
9
|
-
params_hash = {
|
|
10
|
-
method: :get,
|
|
11
|
-
url: url,
|
|
12
|
-
read_timeout: read_timeout,
|
|
13
|
-
connection_timeout: connection_timeout,
|
|
14
|
-
headers: { 'accept-encoding': 'identity' }
|
|
15
|
-
}
|
|
16
17
|
response = RestClient::Request.execute(params_hash)
|
|
17
18
|
content_type = response.headers[:content_type]
|
|
18
19
|
return response if content_type =~ %r{(?i)text\s*\/\s*html}
|
|
19
20
|
|
|
20
21
|
exception_message = "Invalid response format received: #{content_type}"
|
|
21
22
|
raise WebScraper::NonHtmlError, exception_message
|
|
23
|
+
rescue Zlib::DataError
|
|
24
|
+
params_hash[:headers] = { 'accept-encoding': 'identity' }
|
|
25
|
+
retry
|
|
22
26
|
rescue *TIMEOUT_EXCEPTIONS => e
|
|
23
27
|
raise WebScraper::TimeoutError, e.message
|
|
24
28
|
rescue *GENERAL_EXCEPTIONS => e
|
data/lib/version.rb
CHANGED
|
@@ -27,6 +27,10 @@ describe 'Emails' do
|
|
|
27
27
|
<a href="mailto:xxx@yyy.zzz">xxx@yyy.zzz</a>
|
|
28
28
|
<a href="mailto:test@test.com">test@test.com</a>
|
|
29
29
|
<a href="mailto:@example.com">@example.com"</a>
|
|
30
|
+
<a href="mailto:v@201908240100.css">v@201908240100.css"</a>
|
|
31
|
+
<a href="mailto:v@201908240100.js">v@201908240100.js"</a>
|
|
32
|
+
<a href="mailto:ajax-loader@2x.gif">ajax-loader@2x.gif"</a>
|
|
33
|
+
<a href="mailto:favicon@2x.ico">favicon@2x.ico"</a>
|
|
30
34
|
HTML
|
|
31
35
|
expect(dummy_object.grep_emails(html.to_s)).to eq([])
|
|
32
36
|
end
|
|
@@ -14,6 +14,7 @@ describe 'FaceBook Profile' do
|
|
|
14
14
|
|
|
15
15
|
it 'should not grep any non profile url' do
|
|
16
16
|
html = <<~HTML
|
|
17
|
+
<a href="https://www.facebook.com/images/fb_icon_325x325.png" target="_blank" class="sqs-svg-icon--wrapper facebook">
|
|
17
18
|
<a href="http://www.facebook.com/2008/fbml" target="_blank" class="sqs-svg-icon--wrapper facebook">
|
|
18
19
|
<a href="https://www.facebook.com/v2.0/dialog/share" target="_blank" class="sqs-svg-icon--wrapper facebook">
|
|
19
20
|
<a href="https://www.facebook.com/plugins/video.php?href=https%3A%2F%2Fwww.facebook.com%2FHFXMooseheads%2Fvideos" target="_blank" class="sqs-svg-icon--wrapper facebook">
|
|
@@ -3,205 +3,223 @@ require 'spec_helper'
|
|
|
3
3
|
describe 'Website Redirected To' do
|
|
4
4
|
|
|
5
5
|
class DummyTestClass
|
|
6
|
-
|
|
7
|
-
|
|
6
|
+
include RedirectedTo
|
|
7
|
+
end
|
|
8
8
|
let(:dummy_object) { DummyTestClass.new }
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
it 'should return nil for invalid input' do
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
12
|
+
expect(dummy_object.grep_redirected_to_url(nil)).to be_nil
|
|
13
|
+
expect(dummy_object.grep_redirected_to_url('')).to be_nil
|
|
14
|
+
end
|
|
15
15
|
|
|
16
16
|
describe 'Website grep from link tag' do
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
17
|
+
describe 'rel attribute first ' do
|
|
18
|
+
|
|
19
|
+
it 'should return nil when canonical url is empty' do
|
|
20
|
+
html = <<~HTML
|
|
21
|
+
<link rel="canonical" href="">
|
|
22
|
+
<link rel="canonical" href=''>
|
|
23
|
+
HTML
|
|
24
|
+
website = dummy_object.grep_redirected_to_url(html.to_s)
|
|
25
|
+
expect(website).to be_nil
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
it 'should grep website' do
|
|
29
|
+
html = <<~HTML
|
|
30
|
+
<link rel="canonical" href="">
|
|
31
|
+
<link rel="canonical" href='https://www.apple.com/'>
|
|
32
|
+
HTML
|
|
33
|
+
website = dummy_object.grep_redirected_to_url(html.to_s)
|
|
34
|
+
expect(website).to eq('https://www.apple.com/')
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
it 'should grep website even with extra attributes' do
|
|
38
|
+
html = <<~HTML
|
|
39
|
+
<link rel="canonical" href="" itemprop="current_url">
|
|
40
|
+
<link rel="canonical" href='https://www.apple.com/'
|
|
41
|
+
itemprop="current_url" >
|
|
42
|
+
HTML
|
|
43
|
+
website = dummy_object.grep_redirected_to_url(html.to_s)
|
|
44
|
+
expect(website).to eq('https://www.apple.com/')
|
|
45
|
+
end
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
describe 'href attribute first' do
|
|
49
|
+
|
|
50
|
+
it 'should return nil when canonical url is empty' do
|
|
51
|
+
html = <<~HTML
|
|
52
|
+
<link href="" rel="canonical" >
|
|
53
|
+
<link href='' rel="canonical" >
|
|
54
|
+
HTML
|
|
55
|
+
website = dummy_object.grep_redirected_to_url(html.to_s)
|
|
56
|
+
expect(website).to be_nil
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
it 'should grep website' do
|
|
60
|
+
html = <<~HTML
|
|
61
|
+
<link rel="canonical" href="">
|
|
62
|
+
<link href='https://www.apple.com/' rel="canonical">
|
|
63
|
+
HTML
|
|
64
|
+
website = dummy_object.grep_redirected_to_url(html.to_s)
|
|
65
|
+
expect(website).to eq('https://www.apple.com/')
|
|
66
|
+
end
|
|
67
|
+
|
|
68
|
+
it 'should grep website even with extra attributes' do
|
|
69
|
+
html = <<~HTML
|
|
70
|
+
<link href="" itemprop="current_url" rel="canonical">
|
|
71
|
+
<link href='https://www.apple.com/' rel="canonical"
|
|
72
|
+
itemprop="current_url" >
|
|
73
|
+
HTML
|
|
74
|
+
website = dummy_object.grep_redirected_to_url(html.to_s)
|
|
75
|
+
expect(website).to eq('https://www.apple.com/')
|
|
76
|
+
end
|
|
77
|
+
end
|
|
76
78
|
end
|
|
79
|
+
|
|
77
80
|
describe 'Website grep from organization URL' do
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
81
|
+
|
|
82
|
+
describe 'property attribute first ' do
|
|
83
|
+
|
|
84
|
+
it 'should return nil when canonical url is empty' do
|
|
85
|
+
html = <<~HTML
|
|
86
|
+
<meta property="og:url" content="" />
|
|
87
|
+
<meta property="og:url" content='' />
|
|
88
|
+
HTML
|
|
89
|
+
website = dummy_object.grep_redirected_to_url(html.to_s)
|
|
90
|
+
expect(website).to be_nil
|
|
91
|
+
end
|
|
92
|
+
|
|
93
|
+
it 'should grep website' do
|
|
94
|
+
html = <<~HTML
|
|
95
|
+
<link property="og:url" content="">
|
|
96
|
+
<meta property="og:url" content="https://www.dieppe.ca/fr/index.aspx" />
|
|
97
|
+
HTML
|
|
98
|
+
website = dummy_object.grep_redirected_to_url(html.to_s)
|
|
99
|
+
expect(website).to eq('https://www.dieppe.ca/fr/index.aspx')
|
|
100
|
+
end
|
|
101
|
+
|
|
102
|
+
it 'should grep website even with extra attributes' do
|
|
103
|
+
html = <<~HTML
|
|
104
|
+
<link property="og:url" content="" calss="og-url">
|
|
105
|
+
<meta property="og:url" content='https://www.dieppe.ca/fr/index.aspx'
|
|
106
|
+
class="og-url" />
|
|
107
|
+
HTML
|
|
108
|
+
website = dummy_object.grep_redirected_to_url(html.to_s)
|
|
109
|
+
expect(website).to eq('https://www.dieppe.ca/fr/index.aspx')
|
|
110
|
+
end
|
|
111
|
+
end
|
|
112
|
+
|
|
113
|
+
describe 'content attribute first ' do
|
|
114
|
+
|
|
115
|
+
it 'should return nil when canonical url is empty' do
|
|
116
|
+
html = <<~HTML
|
|
117
|
+
<meta content="" property="og:url" />
|
|
118
|
+
<meta content='' property="og:url"/>
|
|
119
|
+
HTML
|
|
120
|
+
website = dummy_object.grep_redirected_to_url(html.to_s)
|
|
121
|
+
expect(website).to be_nil
|
|
122
|
+
end
|
|
123
|
+
|
|
124
|
+
it 'should grep website' do
|
|
125
|
+
html = <<~HTML
|
|
126
|
+
<link content="" property="og:url" >
|
|
127
|
+
<meta content="https://www.dieppe.ca/fr/index.aspx" property="og:url" />
|
|
128
|
+
HTML
|
|
129
|
+
website = dummy_object.grep_redirected_to_url(html.to_s)
|
|
130
|
+
expect(website).to eq('https://www.dieppe.ca/fr/index.aspx')
|
|
131
|
+
end
|
|
132
|
+
|
|
133
|
+
it 'should grep website even with extra attributes' do
|
|
134
|
+
html = <<~HTML
|
|
135
|
+
<link content="" calss="og-url" property="og:url">
|
|
136
|
+
<meta content='https://www.dieppe.ca/fr/index.aspx'
|
|
137
|
+
class="og-url" property="og:url" />
|
|
138
|
+
HTML
|
|
139
|
+
website = dummy_object.grep_redirected_to_url(html.to_s)
|
|
140
|
+
expect(website).to eq('https://www.dieppe.ca/fr/index.aspx')
|
|
141
|
+
end
|
|
142
|
+
end
|
|
136
143
|
end
|
|
137
144
|
describe 'grep website' do
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
145
|
+
|
|
146
|
+
it 'it should return nil when link or og:url is absent' do
|
|
147
|
+
html = <<~HTML
|
|
148
|
+
<head>
|
|
149
|
+
<meta charset="utf-8">
|
|
150
|
+
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
|
|
151
|
+
<meta http-equiv="x-ua-compatible" content="ie=edge">
|
|
152
|
+
<title>Techmologic | index</title>
|
|
153
|
+
<!-- Font Awesome -->
|
|
154
|
+
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.6.0/css/font-awesome.min.css">
|
|
155
|
+
<!-- Bootstrap core CSS -->
|
|
156
|
+
<link href="css/bootstrap.min.css" rel="stylesheet">
|
|
157
|
+
<!-- Material Design Bootstrap -->
|
|
158
|
+
<link href="css/mdb.min.css" rel="stylesheet">
|
|
159
|
+
<!-- Your custom styles (optional) -->
|
|
160
|
+
<link href="css/style.css" rel="stylesheet">
|
|
161
|
+
</head>
|
|
162
|
+
HTML
|
|
163
|
+
website = dummy_object.grep_redirected_to_url(html.to_s)
|
|
164
|
+
expect(website).to be_nil
|
|
165
|
+
end
|
|
166
|
+
|
|
167
|
+
it 'should grep one of canonical or og:url' do
|
|
168
|
+
html = <<~HTML
|
|
169
|
+
<head>
|
|
170
|
+
<meta charset="utf-8">
|
|
171
|
+
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
|
|
172
|
+
<meta http-equiv="x-ua-compatible" content="ie=edge">
|
|
173
|
+
<title>Techmologic | index</title>
|
|
174
|
+
<link rel="canonical" href="">
|
|
175
|
+
<meta property="og:url" content="" />
|
|
176
|
+
<!-- Font Awesome -->
|
|
177
|
+
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.6.0/css/font-awesome.min.css">
|
|
178
|
+
<!-- Bootstrap core CSS -->
|
|
179
|
+
<link href="css/bootstrap.min.css" rel="stylesheet">
|
|
180
|
+
<!-- Material Design Bootstrap -->
|
|
181
|
+
<link href="css/mdb.min.css" rel="stylesheet">
|
|
182
|
+
<!-- Your custom styles (optional) -->
|
|
183
|
+
<link href="css/style.css" rel="stylesheet">
|
|
184
|
+
<link rel="canonical" href="http://techmologics.com/">
|
|
185
|
+
<meta property="og:url" content="http://techmologics.com/" />
|
|
186
|
+
</head>
|
|
187
|
+
HTML
|
|
188
|
+
website = dummy_object.grep_redirected_to_url(html.to_s)
|
|
189
|
+
expect(website).to eq('http://techmologics.com/')
|
|
190
|
+
end
|
|
191
|
+
|
|
192
|
+
it 'should grep one of canonical or og:url whatever it\'s position' do
|
|
193
|
+
html = <<~HTML
|
|
194
|
+
<head>
|
|
195
|
+
<meta charset="utf-8">
|
|
196
|
+
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
|
|
197
|
+
<meta http-equiv="x-ua-compatible" content="ie=edge">
|
|
198
|
+
<title>Techmologic | index</title>
|
|
199
|
+
<link href="" rel="canonical">
|
|
200
|
+
<meta content="" property="og:url"/>
|
|
201
|
+
<!-- Font Awesome -->
|
|
202
|
+
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.6.0/css/font-awesome.min.css">
|
|
203
|
+
<!-- Bootstrap core CSS -->
|
|
204
|
+
<link href="css/bootstrap.min.css" rel="stylesheet">
|
|
205
|
+
<!-- Material Design Bootstrap -->
|
|
206
|
+
<link href="css/mdb.min.css" rel="stylesheet">
|
|
207
|
+
<!-- Your custom styles (optional) -->
|
|
208
|
+
<link href="css/style.css" rel="stylesheet">
|
|
209
|
+
<link href="http://techmologics.com/" rel="canonical" class="canonical">
|
|
210
|
+
<meta content="http://techmologics.com/" property="og:url"/>
|
|
211
|
+
</head>
|
|
212
|
+
HTML
|
|
213
|
+
website = dummy_object.grep_redirected_to_url(html.to_s)
|
|
214
|
+
expect(website).to eq('http://techmologics.com/')
|
|
215
|
+
end
|
|
216
|
+
|
|
217
|
+
it 'should decode html entities in the redirected_to url' do
|
|
218
|
+
html = <<~HTML
|
|
219
|
+
<meta content="https://www.santanderbank.com/us/personal" property="og:url"/>
|
|
220
|
+
HTML
|
|
221
|
+
website = dummy_object.grep_redirected_to_url(html.to_s)
|
|
222
|
+
expect(website).to eq('https://www.santanderbank.com/us/personal')
|
|
223
|
+
end
|
|
206
224
|
end
|
|
207
|
-
end
|
|
225
|
+
end
|
|
@@ -31,6 +31,7 @@ describe 'Twitter Profile' do
|
|
|
31
31
|
<a href=" http://twitter.com/share/" target="_blank">
|
|
32
32
|
<a href="https://twitter.com/#!/Farmer_Brothers" target="_blank">
|
|
33
33
|
<a href="http://twitter.com/javascripts/blogger.js" target="_blank">
|
|
34
|
+
<a href="https://twitter.com/{{../user.screen_name}}/status/{{../id_str}}" target="_blank">
|
|
34
35
|
HTML
|
|
35
36
|
expect(dummy_object.grep_twitter_profile(html.to_s)).to eq([])
|
|
36
37
|
end
|
|
@@ -30,13 +30,15 @@ describe 'Vimeo Profile' do
|
|
|
30
30
|
<a href="https://vimeo.com/channels/332103" target="_blank">
|
|
31
31
|
<a href="https://vimeo.com/talech" target="_blank">
|
|
32
32
|
<a href="https://vimeo.com/292173295/fdb8634a35/" target="_blank">
|
|
33
|
+
<a href="https://vimeo.com/337614648\\" target="_blank">
|
|
33
34
|
HTML
|
|
34
35
|
vimeo_profiles = dummy_object.grep_vimeo_profile(html.to_s)
|
|
35
36
|
expected_profiles = [
|
|
36
|
-
'https://vimeo.com/107578087',
|
|
37
|
-
'https://vimeo.com/channels/332103',
|
|
38
|
-
'https://vimeo.com/talech',
|
|
39
|
-
'https://vimeo.com/292173295/fdb8634a35/',
|
|
37
|
+
'https://vimeo.com/107578087',
|
|
38
|
+
'https://vimeo.com/channels/332103',
|
|
39
|
+
'https://vimeo.com/talech',
|
|
40
|
+
'https://vimeo.com/292173295/fdb8634a35/',
|
|
41
|
+
'https://vimeo.com/337614648'
|
|
40
42
|
]
|
|
41
43
|
expect(vimeo_profiles).to eq(expected_profiles)
|
|
42
44
|
end
|
metadata
CHANGED
|
@@ -1,75 +1,69 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: brilliant_web_scraper
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: '0.
|
|
4
|
+
version: '0.2'
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Kotu Bhaskara Rao
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2019-08-
|
|
11
|
+
date: 2019-08-31 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
|
-
name:
|
|
14
|
+
name: charlock_holmes
|
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
|
16
16
|
requirements:
|
|
17
17
|
- - "~>"
|
|
18
18
|
- !ruby/object:Gem::Version
|
|
19
|
-
version:
|
|
20
|
-
- - ">="
|
|
21
|
-
- !ruby/object:Gem::Version
|
|
22
|
-
version: 1.0.1
|
|
19
|
+
version: 0.7.6
|
|
23
20
|
type: :runtime
|
|
24
21
|
prerelease: false
|
|
25
22
|
version_requirements: !ruby/object:Gem::Requirement
|
|
26
23
|
requirements:
|
|
27
24
|
- - "~>"
|
|
28
25
|
- !ruby/object:Gem::Version
|
|
29
|
-
version:
|
|
30
|
-
- - ">="
|
|
31
|
-
- !ruby/object:Gem::Version
|
|
32
|
-
version: 1.0.1
|
|
26
|
+
version: 0.7.6
|
|
33
27
|
- !ruby/object:Gem::Dependency
|
|
34
|
-
name:
|
|
28
|
+
name: nesty
|
|
35
29
|
requirement: !ruby/object:Gem::Requirement
|
|
36
30
|
requirements:
|
|
37
31
|
- - "~>"
|
|
38
32
|
- !ruby/object:Gem::Version
|
|
39
|
-
version: '
|
|
33
|
+
version: '1.0'
|
|
40
34
|
- - ">="
|
|
41
35
|
- !ruby/object:Gem::Version
|
|
42
|
-
version:
|
|
36
|
+
version: 1.0.1
|
|
43
37
|
type: :runtime
|
|
44
38
|
prerelease: false
|
|
45
39
|
version_requirements: !ruby/object:Gem::Requirement
|
|
46
40
|
requirements:
|
|
47
41
|
- - "~>"
|
|
48
42
|
- !ruby/object:Gem::Version
|
|
49
|
-
version: '
|
|
43
|
+
version: '1.0'
|
|
50
44
|
- - ">="
|
|
51
45
|
- !ruby/object:Gem::Version
|
|
52
|
-
version:
|
|
46
|
+
version: 1.0.1
|
|
53
47
|
- !ruby/object:Gem::Dependency
|
|
54
|
-
name:
|
|
48
|
+
name: rest-client
|
|
55
49
|
requirement: !ruby/object:Gem::Requirement
|
|
56
50
|
requirements:
|
|
57
51
|
- - "~>"
|
|
58
52
|
- !ruby/object:Gem::Version
|
|
59
|
-
version: '
|
|
53
|
+
version: '2.0'
|
|
60
54
|
- - ">="
|
|
61
55
|
- !ruby/object:Gem::Version
|
|
62
|
-
version:
|
|
63
|
-
type: :
|
|
56
|
+
version: 2.0.2
|
|
57
|
+
type: :runtime
|
|
64
58
|
prerelease: false
|
|
65
59
|
version_requirements: !ruby/object:Gem::Requirement
|
|
66
60
|
requirements:
|
|
67
61
|
- - "~>"
|
|
68
62
|
- !ruby/object:Gem::Version
|
|
69
|
-
version: '
|
|
63
|
+
version: '2.0'
|
|
70
64
|
- - ">="
|
|
71
65
|
- !ruby/object:Gem::Version
|
|
72
|
-
version:
|
|
66
|
+
version: 2.0.2
|
|
73
67
|
- !ruby/object:Gem::Dependency
|
|
74
68
|
name: pry
|
|
75
69
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -84,26 +78,6 @@ dependencies:
|
|
|
84
78
|
- - "~>"
|
|
85
79
|
- !ruby/object:Gem::Version
|
|
86
80
|
version: 0.12.2
|
|
87
|
-
- !ruby/object:Gem::Dependency
|
|
88
|
-
name: rest-client
|
|
89
|
-
requirement: !ruby/object:Gem::Requirement
|
|
90
|
-
requirements:
|
|
91
|
-
- - "~>"
|
|
92
|
-
- !ruby/object:Gem::Version
|
|
93
|
-
version: '2.0'
|
|
94
|
-
- - ">="
|
|
95
|
-
- !ruby/object:Gem::Version
|
|
96
|
-
version: 2.0.2
|
|
97
|
-
type: :development
|
|
98
|
-
prerelease: false
|
|
99
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
100
|
-
requirements:
|
|
101
|
-
- - "~>"
|
|
102
|
-
- !ruby/object:Gem::Version
|
|
103
|
-
version: '2.0'
|
|
104
|
-
- - ">="
|
|
105
|
-
- !ruby/object:Gem::Version
|
|
106
|
-
version: 2.0.2
|
|
107
81
|
- !ruby/object:Gem::Dependency
|
|
108
82
|
name: rspec
|
|
109
83
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -166,16 +140,17 @@ dependencies:
|
|
|
166
140
|
- - "~>"
|
|
167
141
|
- !ruby/object:Gem::Version
|
|
168
142
|
version: '2.1'
|
|
169
|
-
description:
|
|
143
|
+
description: A decent web scraping gem.Scrapes website's title, description,social
|
|
144
|
+
profiles such as linkedin, facebook, twitter, instgram, vimeo,pinterest, youtube
|
|
145
|
+
channel and contact details such as emails, phone numbers.
|
|
170
146
|
email: bkotu6717@gmail.com
|
|
171
147
|
executables: []
|
|
172
148
|
extensions: []
|
|
173
149
|
extra_rdoc_files: []
|
|
174
150
|
files:
|
|
175
151
|
- Gemfile
|
|
152
|
+
- Gemfile.lock
|
|
176
153
|
- README.md
|
|
177
|
-
- brilliant_web_scraper-1.0.0.gem
|
|
178
|
-
- brilliant_web_scraper-1.0.gem
|
|
179
154
|
- brilliant_web_scraper.gemspec
|
|
180
155
|
- lib/brilliant_web_scraper.rb
|
|
181
156
|
- lib/parsers/description_helper.rb
|
|
@@ -246,5 +221,5 @@ rubyforge_project:
|
|
|
246
221
|
rubygems_version: 2.5.1
|
|
247
222
|
signing_key:
|
|
248
223
|
specification_version: 4
|
|
249
|
-
summary: A decent web scraping ruby
|
|
224
|
+
summary: A decent web scraping ruby gem!
|
|
250
225
|
test_files: []
|
|
Binary file
|
|
Binary file
|