brilliant_web_scraper 0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Gemfile +4 -0
- data/README.md +31 -0
- data/brilliant_web_scraper-1.0.0.gem +0 -0
- data/brilliant_web_scraper-1.0.gem +0 -0
- data/brilliant_web_scraper.gemspec +30 -0
- data/lib/brilliant_web_scraper.rb +55 -0
- data/lib/parsers/description_helper.rb +28 -0
- data/lib/parsers/emails.rb +30 -0
- data/lib/parsers/facebook_profile.rb +11 -0
- data/lib/parsers/instagram_profile.rb +11 -0
- data/lib/parsers/linkedin_profile.rb +11 -0
- data/lib/parsers/meta_description.rb +13 -0
- data/lib/parsers/org_description.rb +13 -0
- data/lib/parsers/phone_numbers.rb +34 -0
- data/lib/parsers/pinterest_profile.rb +11 -0
- data/lib/parsers/redirected_to.rb +29 -0
- data/lib/parsers/title.rb +13 -0
- data/lib/parsers/twitter_description.rb +13 -0
- data/lib/parsers/twitter_profile.rb +11 -0
- data/lib/parsers/unescape_html_helper.rb +17 -0
- data/lib/parsers/vimeo_profile.rb +11 -0
- data/lib/parsers/youtube_channel.rb +29 -0
- data/lib/scraper/errors.rb +19 -0
- data/lib/scraper/scrape_exceptions.rb +49 -0
- data/lib/scraper/scrape_helper.rb +59 -0
- data/lib/scraper/scrape_request.rb +29 -0
- data/lib/version.rb +6 -0
- data/spec/lib/parsers/description_helper_spec.rb +24 -0
- data/spec/lib/parsers/emails_spec.rb +60 -0
- data/spec/lib/parsers/facebook_profile_spec.rb +77 -0
- data/spec/lib/parsers/instagram_profile_spec.rb +45 -0
- data/spec/lib/parsers/linkedin_profile_spec.rb +43 -0
- data/spec/lib/parsers/meta_description_spec.rb +321 -0
- data/spec/lib/parsers/org_description_spec.rb +316 -0
- data/spec/lib/parsers/phone_numbers_spec.rb +69 -0
- data/spec/lib/parsers/pinterest_profile_spec.rb +44 -0
- data/spec/lib/parsers/redirected_to_spec.rb +207 -0
- data/spec/lib/parsers/title_spec.rb +87 -0
- data/spec/lib/parsers/twitter_description_spec.rb +314 -0
- data/spec/lib/parsers/twitter_profile_spec.rb +59 -0
- data/spec/lib/parsers/unescape_html_helper_spec.rb +0 -0
- data/spec/lib/parsers/vimeo_profile_spec.rb +43 -0
- data/spec/lib/parsers/youtube_profile_spec.rb +82 -0
- data/spec/lib/scraper/brilliant_web_scrape_test.rb +66 -0
- data/spec/lib/scraper/scrape_request_test.rb +34 -0
- data/spec/spec_helper.rb +111 -0
- data/spec/vcr/encoding_compatibility_error.yml +316 -0
- data/spec/vcr/invalid_byte_sequence_utf_8.yml +2383 -0
- data/spec/vcr/no_valid_data_to_scrape.yml +109 -0
- data/spec/vcr/non_html_scrape.yml +163 -0
- data/spec/vcr/valid_scrape_response.yml +696 -0
- metadata +250 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: efbe9d1a0688fd10e200d972b56c3e2ec86203f1
|
4
|
+
data.tar.gz: 20cce1c52197f11dcea73813831bb4172829ddaa
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 638c34f7efbc963613f4bb841abbf183bf134ee3197bebc99f9403ba7864befd44243f53092a9aa3ba7ea58314475b61d6671816e8e3f8ef4deb7f49b6f0ef52
|
7
|
+
data.tar.gz: f91110f69e8228de408aa0c35050fe6137fac22bdb93ff86be3c70d380e1cf57534f50f78e5d585cb63e421ff0fc51aa04089d38e8a86ab3a6ca305659dc909a
|
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
# WebScraper [![Build Status](https://api.travis-ci.com/bkotu6717/brilliant_web_scraper.svg)](https://travis-ci.com/bkotu6717/brilliant_web_scraper)
|
2
|
+
|
3
|
+
A decent web scraping gem. Scrapes website description, social profiles, contact details, youtube channels.
|
4
|
+
|
5
|
+
|
6
|
+
It accepts a URL or domain as input and gets its title, descriptions, social profiles, YouTube channels, and its current URL if it got redirected.
|
7
|
+
|
8
|
+
|
9
|
+
## See it in action!
|
10
|
+
|
11
|
+
You can try WebScraper live at this little demo: [https://brilliant-web-scraper-demo.herokuapp.com](https://brilliant-web-scraper-demo.herokuapp.com)
|
12
|
+
|
13
|
+
## Installation
|
14
|
+
|
15
|
+
|
16
|
+
If you're using it on a Rails application, just add it to your Gemfile and run `bundle install`
|
17
|
+
|
18
|
+
```ruby
|
19
|
+
gem 'brilliant_web_scraper'
|
20
|
+
```
|
21
|
+
|
22
|
+
## Usage
|
23
|
+
|
24
|
+
Initialize a BrilliantWebScraper instance for a URL, like this:
|
25
|
+
|
26
|
+
```ruby
|
27
|
+
require 'brilliant_web_scraper'
|
28
|
+
results = BrilliantWebScraper.new('http://pwc.com')
|
29
|
+
```
|
30
|
+
|
31
|
+
If you don't include the scheme on the URL, it is fine:
|
Binary file
|
Binary file
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# frozen_string_literal: true

# Resolve version.rb relative to this gemspec's own directory rather than
# the process working directory, so `gem build` succeeds from any cwd.
require File.expand_path('lib/version', __dir__)

Gem::Specification.new do |s|
  s.name = 'brilliant_web_scraper'
  s.version = WebScraper::VERSION
  s.licenses = ['Nonstandard']
  s.summary = 'A decent web scraping ruby library!'
  s.description = 'Scrapes data such as description, social profiles, contact details'
  s.authors = ['Kotu Bhaskara Rao']
  s.email = 'bkotu6717@gmail.com'
  s.require_paths = ['lib']
  s.homepage = 'https://github.com/bkotu6717/brilliant_web_scraper'
  # Ship every regular file except a previously built gem artifact.
  s.files = Dir['**/*'].keep_if { |file|
    file != "brilliant_web_scraper-#{WebScraper::VERSION}.gem" && File.file?(file)
  }
  s.required_ruby_version = '>= 2.3.0'

  s.add_dependency 'nesty', '~> 1.0', '>= 1.0.1'
  s.add_dependency 'rest-client', '~> 2.0', '>= 2.0.2'

  # NOTE(review): nesty and rest-client are already runtime dependencies;
  # listing them again as development dependencies is redundant.
  s.add_development_dependency 'nesty', '~> 1.0', '>= 1.0.1'
  s.add_development_dependency 'pry', '~> 0.12.2'
  s.add_development_dependency 'rest-client', '~> 2.0', '>= 2.0.2'
  s.add_development_dependency 'rspec', '~> 3.5'
  s.add_development_dependency 'rubocop', '~> 0.73.0'
  s.add_development_dependency 'vcr', '~> 3.0', '>= 3.0.1'
  s.add_development_dependency 'webmock', '~> 2.1'
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
# frozen_string_literal: true

require 'rest-client'
require 'cgi'
require 'benchmark'

# Core request/error machinery.
current_directory = File.dirname(__FILE__) + '/scraper'
require File.expand_path(File.join(current_directory, 'errors'))
require File.expand_path(File.join(current_directory, 'scrape_exceptions'))
require File.expand_path(File.join(current_directory, 'scrape_helper'))
require File.expand_path(File.join(current_directory, 'scrape_request'))

# One parser module per datum scraped off the page.
# unescape_html_helper and description_helper must load before the
# modules that include them.
current_directory = File.dirname(__FILE__) + '/parsers'
require File.expand_path(File.join(current_directory, 'unescape_html_helper'))
require File.expand_path(File.join(current_directory, 'description_helper'))
require File.expand_path(File.join(current_directory, 'title'))
require File.expand_path(File.join(current_directory, 'meta_description'))
require File.expand_path(File.join(current_directory, 'org_description'))
require File.expand_path(File.join(current_directory, 'twitter_description'))
require File.expand_path(File.join(current_directory, 'twitter_profile'))
require File.expand_path(File.join(current_directory, 'linkedin_profile'))
require File.expand_path(File.join(current_directory, 'facebook_profile'))
require File.expand_path(File.join(current_directory, 'youtube_channel'))
require File.expand_path(File.join(current_directory, 'instagram_profile'))
require File.expand_path(File.join(current_directory, 'vimeo_profile'))
require File.expand_path(File.join(current_directory, 'pinterest_profile'))
require File.expand_path(File.join(current_directory, 'emails'))
require File.expand_path(File.join(current_directory, 'phone_numbers'))
require File.expand_path(File.join(current_directory, 'redirected_to'))

# Main scraping class.
# All parser modules are `extend`ed, so their instance methods become
# singleton (class-level) methods of BrilliantWebScraper.
class BrilliantWebScraper
  extend ScrapeHelper
  extend ScrapeRequest
  extend Title
  extend MetaDescription
  extend OrgDescription
  extend TwitterDescription
  extend TwitterProfile
  extend LinkedinProfile
  extend FacebookProfile
  extend YoutubeChannel
  extend InstagramProfile
  extend VimeoProfile
  extend PinterestProfile
  extend Emails
  extend PhoneNumbers
  extend RedirectedTo

  class << self
    # NOTE: `new` is overridden to kick off a scrape and return whatever
    # perform_scrape (from ScrapeHelper, defined elsewhere) returns — it
    # does NOT return a BrilliantWebScraper instance.
    #
    # url                - website URL or bare domain to scrape
    # connection_timeout - seconds to wait for the connection (default 10)
    # read_timeout       - seconds to wait for the response body (default 10)
    def new(url, connection_timeout = 10, read_timeout = 10)
      perform_scrape(url, connection_timeout, read_timeout)
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# frozen_string_literal: true

# Shared helper for the meta/og/twitter description parsers: scans HTML
# with a prioritized list of regexes and returns the first usable
# (non-blank) description, HTML-unescaped.
module DescriptionHelper
  include UnescapeHtmlHelper

  private

  # Tries each regex in order and stops at the first one that yields a
  # usable description.
  #
  # response - raw HTML String (anything whose to_s is empty is rejected)
  # regexes  - Array<Regexp> whose capture groups hold the candidate text
  #
  # Returns the description String, or nil when nothing matched.
  def scrape_description(response, regexes)
    return if response.to_s.empty? || regexes.empty?

    description = nil
    regexes.each do |regex|
      description = response.scan(regex).flatten.compact
      description = parse_description(description)
      # Later (lower-priority) regexes are only consulted when this one
      # produced nothing usable.
      break unless description.nil?
    end
    # NOTE(review): parse_description already unescaped the winning value,
    # so this is a second unescape pass; harmless for ordinary entities.
    unescape_html(description)
  end

  # Picks the first candidate that, after HTML-unescaping, is not blank
  # and not just "|" or "-" filler.
  def parse_description(descriptions)
    return if descriptions.nil? || descriptions.empty?

    descriptions = descriptions.reject { |x| x.nil? || x.empty? }
    descriptions = descriptions.map { |x| unescape_html(x) }
    descriptions.find { |x| (x !~ /^\s*[|-]?\s*$/) }
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# frozen_string_literal: true

current_directory = File.dirname(__FILE__)
require File.expand_path(File.join(current_directory, 'unescape_html_helper'))

# Parses emails from html string
module Emails
  include UnescapeHtmlHelper
  # Extracts e-mail addresses from raw HTML: first from mailto: links,
  # then from bare text. Placeholder domains (example.*, test.*,
  # yourcompany.*, ...) and image-extension lookalike "TLDs" (.png,
  # .jpg, ...) are rejected by negative lookaheads.
  #
  # response - raw HTML String
  #
  # Returns nil for nil/empty input, otherwise a deduplicated,
  # lower-cased Array of address Strings.
  def grep_emails(response)
    return if response.nil? || response.empty?

    # Anything following "mailto:" up to a quote/metachar delimiter.
    first_regex = /(?im)mailto:\s*([^\?"',\\<>\s]+)/
    # Bare user@host.tld tokens in markup or text.
    # NOTE(review): the [A-Z]{2,3} tail rejects TLDs longer than three
    # characters (.info, .online) — confirm this is intentional.
    second_regex = %r{(?im)["'\s><\/]*([\w._%-]+@(?!(?:example|e?mail|domain|company|your(?:domain|company|email)|address|emailad?dress|yyy|test)\.)[\w._%-]+\.(?!png|jpe?g|tif|svg)[A-Z]{2,3})["'\s><]}
    first_set = response.scan(first_regex).flatten.compact
    first_set = get_processed_emails(first_set)
    second_set = response.scan(second_regex).flatten.compact
    second_set = get_processed_emails(second_set)
    # Set-union the two sources, then normalise case.
    (first_set | second_set).compact.map(&:downcase).uniq
  end

  # HTML-unescapes each candidate and keeps only those that still look
  # like an acceptable address afterwards. Returns [] for blank input.
  def get_processed_emails(email_set)
    return [] if email_set.nil? || email_set.empty?

    unescaped_emails = email_set.map { |email| unescape_html(email) }
    return [] if unescaped_emails.empty?

    email_match_regex = /[\w._%-]+@(?!(?:example|e?mail|domain|company|your(?:domain|company|email)|address|emailad?dress|yyy|test)\.)[\w._%-]+\.(?!png|jpe?g|tif|svg)[A-Z]{2,3}/im
    unescaped_emails.select { |data| data =~ email_match_regex }
  end
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
# frozen_string_literal: true

# Grep facebook profiles
module FacebookProfile
  # Scans raw HTML for Facebook (facebook.com / fb.com) profile and page
  # URLs. The negative lookahead filters out share widgets, plugin
  # endpoints, dialogs, tracking pixels and similar non-profile paths.
  #
  # response - raw HTML String
  #
  # Returns nil for blank input, otherwise a unique Array of URL Strings.
  def grep_facebook_profile(response)
    return if response.nil? || response.empty?

    profile_pattern = /(https?:\/\/(?:www\.)?(?:facebook|fb)\.com\/(?!tr\?|(?:[\/\w\d]*(?:photo|sharer?|like(?:box)?|offsite_event|plugins|permalink|home|search))\.php|\d+\/fbml|(?:dialog|hashtag|plugins|sharer|login|recover|security|help|v\d+\.\d+)\/|(?:privacy|#|your-profile|yourfacebookpage)\/?|home\?)[^"'<>\&\s]+)/im
    candidates = response.scan(profile_pattern)
    candidates.flatten.compact.uniq
  end
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
# frozen_string_literal: true

# Grep Instagram profiles
module InstagramProfile
  # Scans raw HTML for Instagram profile URLs, skipping explore/ and p/
  # (post) paths as well as fragment/encoded junk.
  #
  # response - raw HTML String
  #
  # Returns nil for blank input, otherwise a unique Array of URL Strings.
  def grep_instagram_profile(response)
    return if response.nil? || response.empty?

    profile_pattern = %r{(?im)(https?:\/\/(?:www\.)?+instagram\.com\/(?!#|%|"|'|(?:explore|p)\/).+?[^"'<>\s?&\/]+)}
    candidates = response.scan(profile_pattern)
    candidates.flatten.compact.uniq
  end
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
# frozen_string_literal: true

# Grep linkedin profile
module LinkedinProfile
  # Scans raw HTML for LinkedIn company-page URLs
  # (https://www.linkedin.com/company/<slug>).
  #
  # response - raw HTML String
  #
  # Returns nil for blank input, otherwise a unique Array of URL Strings.
  def grep_linkedin_profile(response)
    return if response.nil? || response.empty?

    company_pattern = %r{(?im)(https:\/\/www\.linkedin\.com\/company\/[^"'\?<>\s\/]+)}
    candidates = response.scan(company_pattern)
    candidates.flatten.compact.uniq
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# frozen_string_literal: true

# Grep description in meta tag with attribute name='description'
module MetaDescription
  include DescriptionHelper
  # Extracts the page description from <meta name="description"> or
  # <meta itemprop="description"> tags, handling either attribute order.
  # Delegates matching/cleanup to DescriptionHelper#scrape_description.
  #
  # response - raw HTML String
  #
  # Returns the unescaped description String, or nil when absent.
  def grep_meta_description(response)
    return if response.nil? || response.empty?

    # content="..." appearing AFTER the name/itemprop attribute.
    # (Fixed: the original had a duplicated `(?im)(?im)` inline flag; the
    # sibling description modules all use a single `(?im)`.)
    first_regex = %r{(?im)<meta\s+[\w\s"'=-]*(?:name|itemprop)\s*=\s*(?:'|")?\s*description\s*(?:'|")?[\w\s"'=-]*content\s*=\s*(?:(?:"([^"]*)")|(?:'([^']*)'))[\w\s"'=-]*[\/>]}
    # content="..." appearing BEFORE the name/itemprop attribute.
    second_regex = %r{(?im)<meta\s+[\w\s"'=-]*content\s*=\s*(?:(?:"([^"]*)")|(?:'([^']*)'))[\w\s"'=-]*(?:name|itemprop)\s*=\s*(?:'|")?\s*description\s*(?:'|")?[\w\s"'=-]*[\/>]}
    scrape_description(response, [first_regex, second_regex])
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# frozen_string_literal: true

# Greps description from meta tags with attribute og:description
# (Open Graph).
module OrgDescription
  include DescriptionHelper
  # Extracts the Open Graph description from <meta property="og:description">
  # or <meta itemprop="og:description"> tags, handling either attribute
  # order. Delegates matching/cleanup to DescriptionHelper#scrape_description.
  #
  # response - raw HTML String
  #
  # Returns the unescaped description String, or nil when absent.
  def grep_org_description(response)
    return if response.nil? || response.empty?

    # content="..." appearing AFTER the property/itemprop attribute.
    first_regex = %r{(?im)<meta\s+[\w\s"'=-]*(?:property|itemprop)\s*=\s*(?:'|")?\s*og:description\s*(?:'|")?[\w\s"'=-]*content\s*=\s*(?:(?:"([^"]*)")|(?:'([^']*)'))[\w\s"'=-]*[\/>]}
    # content="..." appearing BEFORE the property/itemprop attribute.
    second_regex = %r{(?im)<meta\s+[\w\s"'=-]*content\s*=\s*(?:(?:"([^"]*)")|(?:'([^']*)'))[\w\s"'=-]*(?:property|itemprop)\s*=\s*(?:'|")?\s*og:description\s*(?:'|")?[\w\s"'=-]*[\/>]}
    scrape_description(response, [first_regex, second_regex])
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
# frozen_string_literal: true

# Grep phonenumbers from 'href=tel:' attributes
module PhoneNumbers
  include UnescapeHtmlHelper
  # Collects phone-number candidates from href="tel:..." attributes.
  #
  # response - raw HTML String
  #
  # Returns nil for blank input, otherwise a unique Array of cleaned
  # number Strings.
  def grep_phone_numbers(response)
    return if response.nil? || response.empty?

    tel_href_pattern = %r{(?im)href\s*=\s*(?:"|')?\s*tel:\s*(?:https?:)?\/*(?!#(?:"|'))([^"'\/<>\{\[]+)}
    candidates = response.scan(tel_href_pattern).flatten.uniq
    get_processed_phone_numbers(candidates)
  end

  private

  # HTML-unescapes each candidate, drops entries containing no digit at
  # all, and strips trailing "key=value" query fragments.
  def get_processed_phone_numbers(phone_numbers)
    return [] if phone_numbers.nil? || phone_numbers.empty?

    unescaped = phone_numbers.map { |number| unescape_html(number) }
    kept = unescaped.each_with_object([]) do |candidate, acc|
      next if candidate !~ /\d+/

      acc << (candidate =~ /\w+=/ ? candidate.gsub(/\w+=.*/, '') : candidate)
    end
    kept.uniq
  end
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
# frozen_string_literal: true

# Grep pinterest profile
module PinterestProfile
  # Scans raw HTML for Pinterest profile URLs, skipping pin/, v3/, js/,
  # feed/ paths, cookie links and fragment/query junk.
  #
  # response - raw HTML String
  #
  # Returns nil for blank input, otherwise a unique Array of URL Strings.
  def grep_pinterest_profile(response)
    return if response.nil? || response.empty?

    profile_pattern = %r{(?im)(https?:\/\/[\w\.]*pinterest\.com\/(?!"|'|\?|#|cookies(?:"|'')|(?:pin|v3|js|feed)\/)[^"'<>?&\s\/]+)}
    candidates = response.scan(profile_pattern)
    candidates.flatten.compact.uniq
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# frozen_string_literal: true

# Fetch latest url of the given website
module RedirectedTo
  # Determines the page's canonical address from <link rel="canonical">
  # or <meta property="og:url"> tags (checked in that priority order,
  # each with both attribute orderings).
  #
  # response - raw HTML String
  #
  # Returns the first plausible URL String, or nil when none is found.
  def grep_redirected_to_url(response)
    return if response.nil? || response.empty?

    canonical_patterns.each do |pattern|
      candidate = parser(response.scan(pattern).flatten)
      return candidate unless candidate.nil?
    end
    nil
  end

  private

  # Tag patterns in priority order: canonical link (href after/before
  # rel), then og:url meta (content after/before property).
  def canonical_patterns
    [
      %r{(?im)<link\s+[\s\w="'-]*rel\s*=\s*(?:"|')canonical(?:"|')[\s\w='"-]*?\s+href\s*=\s*(?:"|')([^"']*)(?:"|')[\s\w='"-]*?(?:>|\/>)},
      %r{(?im)<link\s+[\s\w='"-]*href\s*=\s*(?:"|')([^'"]*)(?:"|')[\s\w='"-]*?rel\s*=\s*(?:"|')\s*canonical\s*(?:"|')[\s\w='"-]*(?:>|\/>)},
      %r{(?im)<meta\s+[\s\w="'-]*property=\s*(?:'|")\s*og:url\s*(?:'|")[\s\w="'-]*content=\s*(?:'|")([^'"]*)(?:'|")[\s\w="'-]*(?:>|\/>)},
      %r{(?im)<meta\s+[\s\w"'=-]*content\s*=\s*(?:'|")([^'"]*)(?:'|")[\s\w"'=-]*property\s*=\s*(?:'|")\s*og:url\s*(?:'|")[\s\w"'=-]*(?:>|\/>)}
    ]
  end

  # Keeps only values that look like a URL (scheme and/or host-ish token).
  def parser(urls)
    urls.find { |x| x =~ %r{(?im)^\s*(?:https*)?:?(?:\/\/)?\w+[.&%-]} }
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# frozen_string_literal: true

# Grep title from the very first <title> tag
module Title
  include UnescapeHtmlHelper
  # Extracts and unescapes the text of the first <title> element.
  #
  # response - raw HTML String (non-String input is rejected)
  #
  # Returns the stripped, HTML-unescaped title String, or nil when the
  # page has no title or it is blank.
  def grep_title(response)
    return if !response.is_a?(String) || response.empty?

    title_regex = /<\s*title.*?>(.*?)<?\s*\/?title\s*?>/im
    # Explicit no-match handling instead of the old `... rescue nil`
    # modifier, which silently swallowed every StandardError — not just
    # the NoMethodError raised when the page has no <title> tag.
    match = response.match(title_regex)
    title = match && match[1].strip
    unescape_html(title) unless title.nil? || title.empty?
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# frozen_string_literal: true

# Grep twitter description from attribute `twitter:description`
module TwitterDescription
  include DescriptionHelper
  # Extracts the Twitter-card description from
  # <meta name="twitter:description"> or itemprop variants, handling
  # either attribute order. Delegates matching/cleanup to
  # DescriptionHelper#scrape_description.
  #
  # response - raw HTML String
  #
  # Returns the unescaped description String, or nil when absent.
  def grep_twitter_description(response)
    return if response.nil? || response.empty?

    # content="..." appearing AFTER the name/itemprop attribute.
    first_regex = %r{(?im)<meta\s+[\w\s"'=-]*(?:name|itemprop)\s*=\s*(?:'|")?\s*twitter:description\s*(?:'|")?[\w\s"'=-]*content\s*=\s*(?:(?:"([^"]*)")|(?:'([^']*)'))[\w\s"'=-]*[\/>]}
    # content="..." appearing BEFORE the name/itemprop attribute.
    second_regex = %r{(?im)<meta\s+[\w\s"'=-]*content\s*=\s*(?:(?:"([^"]*)")|(?:'([^']*)'))[\w\s"'=-]*(?:name|itemprop)\s*=\s*(?:'|")?\s*twitter:description\s*(?:'|")?[\w\s"'=-]*[\/>]}
    scrape_description(response, [first_regex, second_regex])
  end
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
# frozen_string_literal: true

# Grep twitter profile
module TwitterProfile
  # Scans raw HTML for Twitter profile URLs, filtering out share/intent
  # links, hashtag pages, status permalinks and other non-profile paths.
  #
  # response - raw HTML String
  #
  # Returns nil for blank input, otherwise a unique Array of URL Strings.
  def grep_twitter_profile(response)
    return if response.nil? || response.empty?

    profile_pattern = %r{(?im)(https?:\/\/(?:www\.)?twitter\.com\/(?!(?:share|download|search|home|login|privacy)(?:\?|\/|\b)|(?:hashtag|i|javascripts|statuses|#!|intent)\/|(?:#|'|%))[^"'&\?<>\s\\]+)}
    candidates = response.scan(profile_pattern)
    candidates.flatten.compact.uniq
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# frozen_string_literal: true

# Decode HTML & URL encodings
module UnescapeHtmlHelper
  private

  # HTML-unescapes text and, when percent-encoding remains, URL-decodes
  # it as well (preserving literal "+" signs through the second pass).
  #
  # text - the String to decode
  #
  # Returns the decoded, stripped String; nil for nil, non-String, or
  # empty input.
  def unescape_html(text)
    # Fixed guard: the original `text.nil? && !text.is_a?(String) || text.empty?`
    # parsed as `(nil && non-string) || empty?`, so any non-String,
    # non-nil argument fell through to `.empty?` and raised NoMethodError.
    return if !text.is_a?(String) || text.empty?

    unescaped_html_text = CGI.unescapeHTML(text)
    if unescaped_html_text =~ /%[a-z0-9]{2}/i
      # Protect literal "+" from being decoded to a space by CGI.unescape.
      plus_sign_preserved_text = unescaped_html_text.gsub(/\+/, '%2B')
      unescaped_html_text = CGI.unescape(plus_sign_preserved_text)
    end
    unescaped_html_text.strip
  end
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
# frozen_string_literal: true

# Grep Vimeo social profile
module VimeoProfile
  # Scans raw HTML for Vimeo profile/channel URLs, skipping product pages
  # (upgrade, features, enterprise, upload) and API links.
  #
  # response - raw HTML String
  #
  # Returns nil for blank input, otherwise a unique Array of URL Strings.
  def grep_vimeo_profile(response)
    return if response.nil? || response.empty?

    profile_pattern = %r{(?im)(https?:\/\/(?:www\.)?vimeo\.com\/(?!upgrade|features|enterprise|upload|api)\/?[^"'\&\?<>\s]+)}
    candidates = response.scan(profile_pattern)
    candidates.flatten.compact.uniq
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# frozen_string_literal: true

# Grep youtube channels
module YoutubeChannel
  # Extracts YouTube channel/video URLs from HTML. Patterns are tried in
  # priority order; only the first pattern that matches anything is used.
  #
  # response - raw HTML String
  #
  # Returns nil when the page contains no youtube.com link at all,
  # otherwise a unique Array of URL Strings.
  def grep_youtube_channel(response)
    return if response !~ %r{(?im)https?:\/\/(?:www\.)?youtube\.com\/}

    channel_patterns = [
      # Channel-ish paths (excludes embeds, feeds, player APIs, watch pages).
      %r{(?im)(https?:\/\/(?:www\.)?youtube\.com\/(?!\?gl=\w{2}|(?:embed|feeds)\/|(?:player_api|iframe_api)(?:"|'|\/|\?)|watch\?|user\/#)[^"'\&<>\s]+)},
      # watch?v= video links.
      %r{(?im)(https?:\/\/(?:www\.)?youtube\.com\/watch?\S*v=[^<>&'"]+)},
      # Embedded players with a concrete video id (not a template token).
      %r{(?im)(https?:\/\/(?:www\.)?youtube\.com\/embed\/(?!id|{|}|\[|\]|\$|\?|\\|%|\+)[^"'\?<>\s]+)}
    ]
    scrape_profile(response, channel_patterns).compact.uniq
  end

  private

  # Returns the match list of the first pattern that finds anything,
  # [] when none do, or nil for blank input / no patterns.
  def scrape_profile(response, regexes)
    return if response.to_s.empty? || regexes.empty?

    hits = regexes.lazy
                  .map { |regex| response.scan(regex).flatten.compact }
                  .find { |matches| !matches.empty? }
    hits || []
  end
end
|