brilliant_web_scraper 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +4 -0
- data/README.md +31 -0
- data/brilliant_web_scraper-1.0.0.gem +0 -0
- data/brilliant_web_scraper-1.0.gem +0 -0
- data/brilliant_web_scraper.gemspec +30 -0
- data/lib/brilliant_web_scraper.rb +55 -0
- data/lib/parsers/description_helper.rb +28 -0
- data/lib/parsers/emails.rb +30 -0
- data/lib/parsers/facebook_profile.rb +11 -0
- data/lib/parsers/instagram_profile.rb +11 -0
- data/lib/parsers/linkedin_profile.rb +11 -0
- data/lib/parsers/meta_description.rb +13 -0
- data/lib/parsers/org_description.rb +13 -0
- data/lib/parsers/phone_numbers.rb +34 -0
- data/lib/parsers/pinterest_profile.rb +11 -0
- data/lib/parsers/redirected_to.rb +29 -0
- data/lib/parsers/title.rb +13 -0
- data/lib/parsers/twitter_description.rb +13 -0
- data/lib/parsers/twitter_profile.rb +11 -0
- data/lib/parsers/unescape_html_helper.rb +17 -0
- data/lib/parsers/vimeo_profile.rb +11 -0
- data/lib/parsers/youtube_channel.rb +29 -0
- data/lib/scraper/errors.rb +19 -0
- data/lib/scraper/scrape_exceptions.rb +49 -0
- data/lib/scraper/scrape_helper.rb +59 -0
- data/lib/scraper/scrape_request.rb +29 -0
- data/lib/version.rb +6 -0
- data/spec/lib/parsers/description_helper_spec.rb +24 -0
- data/spec/lib/parsers/emails_spec.rb +60 -0
- data/spec/lib/parsers/facebook_profile_spec.rb +77 -0
- data/spec/lib/parsers/instagram_profile_spec.rb +45 -0
- data/spec/lib/parsers/linkedin_profile_spec.rb +43 -0
- data/spec/lib/parsers/meta_description_spec.rb +321 -0
- data/spec/lib/parsers/org_description_spec.rb +316 -0
- data/spec/lib/parsers/phone_numbers_spec.rb +69 -0
- data/spec/lib/parsers/pinterest_profile_spec.rb +44 -0
- data/spec/lib/parsers/redirected_to_spec.rb +207 -0
- data/spec/lib/parsers/title_spec.rb +87 -0
- data/spec/lib/parsers/twitter_description_spec.rb +314 -0
- data/spec/lib/parsers/twitter_profile_spec.rb +59 -0
- data/spec/lib/parsers/unescape_html_helper_spec.rb +0 -0
- data/spec/lib/parsers/vimeo_profile_spec.rb +43 -0
- data/spec/lib/parsers/youtube_profile_spec.rb +82 -0
- data/spec/lib/scraper/brilliant_web_scrape_test.rb +66 -0
- data/spec/lib/scraper/scrape_request_test.rb +34 -0
- data/spec/spec_helper.rb +111 -0
- data/spec/vcr/encoding_compatibility_error.yml +316 -0
- data/spec/vcr/invalid_byte_sequence_utf_8.yml +2383 -0
- data/spec/vcr/no_valid_data_to_scrape.yml +109 -0
- data/spec/vcr/non_html_scrape.yml +163 -0
- data/spec/vcr/valid_scrape_response.yml +696 -0
- metadata +250 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: efbe9d1a0688fd10e200d972b56c3e2ec86203f1
+  data.tar.gz: 20cce1c52197f11dcea73813831bb4172829ddaa
+SHA512:
+  metadata.gz: 638c34f7efbc963613f4bb841abbf183bf134ee3197bebc99f9403ba7864befd44243f53092a9aa3ba7ea58314475b61d6671816e8e3f8ef4deb7f49b6f0ef52
+  data.tar.gz: f91110f69e8228de408aa0c35050fe6137fac22bdb93ff86be3c70d380e1cf57534f50f78e5d585cb63e421ff0fc51aa04089d38e8a86ab3a6ca305659dc909a
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,31 @@
+# WebScraper [Build Status](https://travis-ci.com/bkotu6717/brilliant_web_scraper)
+
+A decent web scraping gem. Scrapes a website's description, social profiles, contact details, and YouTube channels.
+
+
+It accepts a URL or domain as input and fetches the site's title, descriptions, social profiles, YouTube channels, and its current URL if the request was redirected.
+
+
+## See it in action!
+
+You can try WebScraper live at this little demo: [https://brilliant-web-scraper-demo.herokuapp.com](https://brilliant-web-scraper-demo.herokuapp.com)
+
+## Installation
+
+
+If you're using it in a Rails application, just add it to your Gemfile and run `bundle install`:
+
+```ruby
+gem 'brilliant_web_scraper'
+```
+
+## Usage
+
+Initialize a BrilliantWebScraper instance for a URL, like this:
+
+```ruby
+require 'brilliant_web_scraper'
+results = BrilliantWebScraper.new('http://pwc.com')
+```
+
+If you don't include the scheme on the URL, that's fine:
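The hunk ends on that sentence, so the scheme-less example itself is not part of the recorded diff. A minimal sketch of the presumable continuation, assuming bare domains are accepted as the README claims:

```ruby
# Hypothetical continuation of the README example, not in the recorded diff.
require 'brilliant_web_scraper'

results = BrilliantWebScraper.new('pwc.com')
```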
data/brilliant_web_scraper-1.0.0.gem
ADDED
Binary file
data/brilliant_web_scraper-1.0.gem
ADDED
Binary file
data/brilliant_web_scraper.gemspec
ADDED
@@ -0,0 +1,30 @@
+# frozen_string_literal: true
+
+require File.expand_path('./lib/version')
+
+Gem::Specification.new do |s|
+  s.name = 'brilliant_web_scraper'
+  s.version = WebScraper::VERSION
+  s.licenses = ['Nonstandard']
+  s.summary = 'A decent web scraping ruby library!'
+  s.description = 'Scrapes data such as description, social profiles, contact details'
+  s.authors = ['Kotu Bhaskara Rao']
+  s.email = 'bkotu6717@gmail.com'
+  s.require_paths = ['lib']
+  s.homepage = 'https://github.com/bkotu6717/brilliant_web_scraper'
+  s.files = Dir['**/*'].keep_if { |file|
+    file != "brilliant_web_scraper-#{WebScraper::VERSION}.gem" && File.file?(file)
+  }
+  s.required_ruby_version = '>= 2.3.0'
+
+  s.add_dependency 'nesty', '~> 1.0', '>= 1.0.1'
+  s.add_dependency 'rest-client', '~> 2.0', '>= 2.0.2'
+
+  s.add_development_dependency 'nesty', '~> 1.0', '>= 1.0.1'
+  s.add_development_dependency 'pry', '~> 0.12.2'
+  s.add_development_dependency 'rest-client', '~> 2.0', '>= 2.0.2'
+  s.add_development_dependency 'rspec', '~> 3.5'
+  s.add_development_dependency 'rubocop', '~> 0.73.0'
+  s.add_development_dependency 'vcr', '~> 3.0', '>= 3.0.1'
+  s.add_development_dependency 'webmock', '~> 2.1'
+end
data/lib/brilliant_web_scraper.rb
ADDED
@@ -0,0 +1,55 @@
+# frozen_string_literal: true
+
+require 'rest-client'
+require 'cgi'
+require 'benchmark'
+
+current_directory = File.dirname(__FILE__) + '/scraper'
+require File.expand_path(File.join(current_directory, 'errors'))
+require File.expand_path(File.join(current_directory, 'scrape_exceptions'))
+require File.expand_path(File.join(current_directory, 'scrape_helper'))
+require File.expand_path(File.join(current_directory, 'scrape_request'))
+
+current_directory = File.dirname(__FILE__) + '/parsers'
+require File.expand_path(File.join(current_directory, 'unescape_html_helper'))
+require File.expand_path(File.join(current_directory, 'description_helper'))
+require File.expand_path(File.join(current_directory, 'title'))
+require File.expand_path(File.join(current_directory, 'meta_description'))
+require File.expand_path(File.join(current_directory, 'org_description'))
+require File.expand_path(File.join(current_directory, 'twitter_description'))
+require File.expand_path(File.join(current_directory, 'twitter_profile'))
+require File.expand_path(File.join(current_directory, 'linkedin_profile'))
+require File.expand_path(File.join(current_directory, 'facebook_profile'))
+require File.expand_path(File.join(current_directory, 'youtube_channel'))
+require File.expand_path(File.join(current_directory, 'instagram_profile'))
+require File.expand_path(File.join(current_directory, 'vimeo_profile'))
+require File.expand_path(File.join(current_directory, 'pinterest_profile'))
+require File.expand_path(File.join(current_directory, 'emails'))
+require File.expand_path(File.join(current_directory, 'phone_numbers'))
+require File.expand_path(File.join(current_directory, 'redirected_to'))
+
+# Main scraping class
+class BrilliantWebScraper
+  extend ScrapeHelper
+  extend ScrapeRequest
+  extend Title
+  extend MetaDescription
+  extend OrgDescription
+  extend TwitterDescription
+  extend TwitterProfile
+  extend LinkedinProfile
+  extend FacebookProfile
+  extend YoutubeChannel
+  extend InstagramProfile
+  extend VimeoProfile
+  extend PinterestProfile
+  extend Emails
+  extend PhoneNumbers
+  extend RedirectedTo
+
+  class << self
+    def new(url, connection_timeout = 10, read_timeout = 10)
+      perform_scrape(url, connection_timeout, read_timeout)
+    end
+  end
+end
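Note that `new` is overridden to return whatever `perform_scrape` yields (presumably defined in `data/lib/scraper/scrape_helper.rb`, whose body is not shown in this section) rather than a `BrilliantWebScraper` instance. Because the parser modules are `extend`ed, each `grep_*` helper is also callable as a class method on raw HTML. A minimal sketch; the shape of `results` is an assumption, since `perform_scrape` is not visible here:

```ruby
require 'brilliant_web_scraper'

# Entry point: URL plus optional connection/read timeouts in seconds.
# This performs a live HTTP request via rest-client.
results = BrilliantWebScraper.new('http://pwc.com', 10, 10)

# The extended parser modules double as class-level helpers on raw HTML:
BrilliantWebScraper.grep_title('<html><title>PwC</title></html>')
# => "PwC"
```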
data/lib/parsers/description_helper.rb
ADDED
@@ -0,0 +1,28 @@
+# frozen_string_literal: true
+
+# Parses meta, twitter, org description tags
+module DescriptionHelper
+  include UnescapeHtmlHelper
+
+  private
+
+  def scrape_description(response, regexes)
+    return if response.to_s.empty? || regexes.empty?
+
+    description = nil
+    regexes.each do |regex|
+      description = response.scan(regex).flatten.compact
+      description = parse_description(description)
+      break unless description.nil?
+    end
+    unescape_html(description)
+  end
+
+  def parse_description(descriptions)
+    return if descriptions.nil? || descriptions.empty?
+
+    descriptions = descriptions.reject { |x| x.nil? || x.empty? }
+    descriptions = descriptions.map { |x| unescape_html(x) }
+    descriptions.find { |x| (x !~ /^\s*[|-]?\s*$/) }
+  end
+end
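Both helpers are private, so the walk-the-regexes-until-one-hits behavior is easiest to see through a probe. A hypothetical sketch (the anonymous probe class and the `send` call are for demonstration only, not part of the gem):

```ruby
require 'brilliant_web_scraper'

html  = %(<meta name="description" content=" | "><meta name="description" content="Real copy">)
regex = /<meta name="description" content="([^"]*)">/im

# Anonymous probe class so the private helper can be exercised directly.
probe = Class.new { include DescriptionHelper }.new
probe.send(:scrape_description, html, [regex])
# => "Real copy" -- parse_description filters out the " | " placeholder junk
```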
data/lib/parsers/emails.rb
ADDED
@@ -0,0 +1,30 @@
+# frozen_string_literal: true
+
+current_directory = File.dirname(__FILE__)
+require File.expand_path(File.join(current_directory, 'unescape_html_helper'))
+
+# Parses emails from html string
+module Emails
+  include UnescapeHtmlHelper
+  def grep_emails(response)
+    return if response.nil? || response.empty?
+
+    first_regex = /(?im)mailto:\s*([^\?"',\\<>\s]+)/
+    second_regex = %r{(?im)["'\s><\/]*([\w._%-]+@(?!(?:example|e?mail|domain|company|your(?:domain|company|email)|address|emailad?dress|yyy|test)\.)[\w._%-]+\.(?!png|jpe?g|tif|svg)[A-Z]{2,3})["'\s><]}
+    first_set = response.scan(first_regex).flatten.compact
+    first_set = get_processed_emails(first_set)
+    second_set = response.scan(second_regex).flatten.compact
+    second_set = get_processed_emails(second_set)
+    (first_set | second_set).compact.map(&:downcase).uniq
+  end
+
+  def get_processed_emails(email_set)
+    return [] if email_set.nil? || email_set.empty?
+
+    unescaped_emails = email_set.map { |email| unescape_html(email) }
+    return [] if unescaped_emails.empty?
+
+    email_match_regex = /[\w._%-]+@(?!(?:example|e?mail|domain|company|your(?:domain|company|email)|address|emailad?dress|yyy|test)\.)[\w._%-]+\.(?!png|jpe?g|tif|svg)[A-Z]{2,3}/im
+    unescaped_emails.select { |data| data =~ email_match_regex }
+  end
+end
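Both scans funnel through `get_processed_emails`, which unescapes and re-validates against the placeholder blocklist before the union. A short sketch of the expected behavior (acme.com is a made-up domain):

```ruby
require 'brilliant_web_scraper'

html = <<~HTML
  <a href="mailto:Info@acme.com">Write us</a>
  <p>Sales: sales@acme.com</p>
  <p>Placeholder: you@example.com</p>
HTML

BrilliantWebScraper.grep_emails(html)
# => ["info@acme.com", "sales@acme.com"]
# example.com is rejected by the negative lookahead; results are
# downcased and de-duplicated.
```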
data/lib/parsers/facebook_profile.rb
ADDED
@@ -0,0 +1,11 @@
+# frozen_string_literal: true
+
+# Grep facebook profiles
+module FacebookProfile
+  def grep_facebook_profile(response)
+    return if response.nil? || response.empty?
+
+    facebook_url_regex = /(https?:\/\/(?:www\.)?(?:facebook|fb)\.com\/(?!tr\?|(?:[\/\w\d]*(?:photo|sharer?|like(?:box)?|offsite_event|plugins|permalink|home|search))\.php|\d+\/fbml|(?:dialog|hashtag|plugins|sharer|login|recover|security|help|v\d+\.\d+)\/|(?:privacy|#|your-profile|yourfacebookpage)\/?|home\?)[^"'<>\&\s]+)/im
+    response.scan(facebook_url_regex).flatten.compact.uniq
+  end
+end
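Each profile module (Facebook here; the Instagram, LinkedIn, Pinterest, Twitter, and Vimeo modules below follow the same shape) is a single scan whose negative lookahead weeds out share widgets and utility paths. For instance:

```ruby
require 'brilliant_web_scraper'

html = %(<a href="https://www.facebook.com/pwc">Like us</a>
         <a href="https://www.facebook.com/sharer.php?u=x">Share</a>)

BrilliantWebScraper.grep_facebook_profile(html)
# => ["https://www.facebook.com/pwc"]
# sharer.php is excluded by the lookahead.
```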
data/lib/parsers/instagram_profile.rb
ADDED
@@ -0,0 +1,11 @@
+# frozen_string_literal: true
+
+# Grep instagram profiles
+module InstagramProfile
+  def grep_instagram_profile(response)
+    return if response.nil? || response.empty?
+
+    instagram_regex = %r{(?im)(https?:\/\/(?:www\.)?+instagram\.com\/(?!#|%|"|'|(?:explore|p)\/).+?[^"'<>\s?&\/]+)}
+    response.scan(instagram_regex).flatten.compact.uniq
+  end
+end
data/lib/parsers/linkedin_profile.rb
ADDED
@@ -0,0 +1,11 @@
+# frozen_string_literal: true
+
+# Grep linkedin profile
+module LinkedinProfile
+  def grep_linkedin_profile(response)
+    return if response.nil? || response.empty?
+
+    linkedin_profile_regex = %r{(?im)(https:\/\/www\.linkedin\.com\/company\/[^"'\?<>\s\/]+)}
+    response.scan(linkedin_profile_regex).flatten.compact.uniq
+  end
+end
data/lib/parsers/meta_description.rb
ADDED
@@ -0,0 +1,13 @@
+# frozen_string_literal: true
+
+# Grep description in meta tag with attribute name='description'
+module MetaDescription
+  include DescriptionHelper
+  def grep_meta_description(response)
+    return if response.nil? || response.empty?
+
+    first_regex = %r{(?im)<meta\s+[\w\s"'=-]*(?:name|itemprop)\s*=\s*(?:'|")?\s*description\s*(?:'|")?[\w\s"'=-]*content\s*=\s*(?:(?:"([^"]*)")|(?:'([^']*)'))[\w\s"'=-]*[\/>]}
+    second_regex = %r{(?im)<meta\s+[\w\s"'=-]*content\s*=\s*(?:(?:"([^"]*)")|(?:'([^']*)'))[\w\s"'=-]*(?:name|itemprop)\s*=\s*(?:'|")?\s*description\s*(?:'|")?[\w\s"'=-]*[\/>]}
+    scrape_description(response, [first_regex, second_regex])
+  end
+end
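MetaDescription (and likewise OrgDescription and TwitterDescription below) hands `scrape_description` a name-before-content regex plus its reversed twin, so attribute order inside the tag does not matter. For example:

```ruby
require 'brilliant_web_scraper'

BrilliantWebScraper.grep_meta_description(
  %(<meta name="description" content="Audit and assurance services">)
)
# => "Audit and assurance services"

# The reversed regex catches content-first markup as well:
BrilliantWebScraper.grep_meta_description(
  %(<meta content="Audit and assurance services" name="description">)
)
# => "Audit and assurance services"
```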
data/lib/parsers/org_description.rb
ADDED
@@ -0,0 +1,13 @@
+# frozen_string_literal: true
+
+# Greps description from meta tag with attribute og:description
+module OrgDescription
+  include DescriptionHelper
+  def grep_org_description(response)
+    return if response.nil? || response.empty?
+
+    first_regex = %r{(?im)<meta\s+[\w\s"'=-]*(?:property|itemprop)\s*=\s*(?:'|")?\s*og:description\s*(?:'|")?[\w\s"'=-]*content\s*=\s*(?:(?:"([^"]*)")|(?:'([^']*)'))[\w\s"'=-]*[\/>]}
+    second_regex = %r{(?im)<meta\s+[\w\s"'=-]*content\s*=\s*(?:(?:"([^"]*)")|(?:'([^']*)'))[\w\s"'=-]*(?:property|itemprop)\s*=\s*(?:'|")?\s*og:description\s*(?:'|")?[\w\s"'=-]*[\/>]}
+    scrape_description(response, [first_regex, second_regex])
+  end
+end
data/lib/parsers/phone_numbers.rb
ADDED
@@ -0,0 +1,34 @@
+# frozen_string_literal: true
+
+# Grep phone numbers from 'href=tel:' attributes
+module PhoneNumbers
+  include UnescapeHtmlHelper
+  def grep_phone_numbers(response)
+    return if response.nil? || response.empty?
+
+    phone_number_regex = %r{(?im)href\s*=\s*(?:"|')?\s*tel:\s*(?:https?:)?\/*(?!#(?:"|'))([^"'\/<>\{\[]+)}
+    phone_numbers = response.scan(phone_number_regex).flatten.uniq
+    get_processed_phone_numbers(phone_numbers)
+  end
+
+  private
+
+  def get_processed_phone_numbers(phone_numbers)
+    return [] if phone_numbers.nil? || phone_numbers.empty?
+
+    unescaped_contacts = phone_numbers.map { |phone_number| unescape_html(phone_number) }
+    good_phone_numbers = []
+    unescaped_contacts.each do |x|
+      next if x !~ /\d+/
+
+      if x =~ /\w+=/
+        good_phone_numbers << x.gsub(/\w+=.*/, '')
+        next
+
+      else
+        good_phone_numbers << x
+      end
+    end
+    good_phone_numbers.uniq
+  end
+end
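Only `tel:` hrefs are trusted; anything without a digit is dropped, and query-style `key=value` tails are stripped. A short sketch:

```ruby
require 'brilliant_web_scraper'

html = %(<a href="tel:+1-415-555-0100">Call us</a> <a href="tel:#">noop</a>)

BrilliantWebScraper.grep_phone_numbers(html)
# => ["+1-415-555-0100"]
# The bare "tel:#" anchor is rejected by the (?!#...) lookahead.
```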
data/lib/parsers/pinterest_profile.rb
ADDED
@@ -0,0 +1,11 @@
+# frozen_string_literal: true
+
+# Grep pinterest profile
+module PinterestProfile
+  def grep_pinterest_profile(response)
+    return if response.nil? || response.empty?
+
+    pinterest_regex = %r{(?im)(https?:\/\/[\w\.]*pinterest\.com\/(?!"|'|\?|#|cookies(?:"|'')|(?:pin|v3|js|feed)\/)[^"'<>?&\s\/]+)}
+    response.scan(pinterest_regex).flatten.compact.uniq
+  end
+end
data/lib/parsers/redirected_to.rb
ADDED
@@ -0,0 +1,29 @@
+# frozen_string_literal: true
+
+# Fetch latest url of the given website
+module RedirectedTo
+
+  def grep_redirected_to_url(response)
+    return if response.nil? || response.empty?
+
+    patterns = [
+      %r{(?im)<link\s+[\s\w="'-]*rel\s*=\s*(?:"|')canonical(?:"|')[\s\w='"-]*?\s+href\s*=\s*(?:"|')([^"']*)(?:"|')[\s\w='"-]*?(?:>|\/>)},
+      %r{(?im)<link\s+[\s\w='"-]*href\s*=\s*(?:"|')([^'"]*)(?:"|')[\s\w='"-]*?rel\s*=\s*(?:"|')\s*canonical\s*(?:"|')[\s\w='"-]*(?:>|\/>)},
+      %r{(?im)<meta\s+[\s\w="'-]*property=\s*(?:'|")\s*og:url\s*(?:'|")[\s\w="'-]*content=\s*(?:'|")([^'"]*)(?:'|")[\s\w="'-]*(?:>|\/>)},
+      %r{(?im)<meta\s+[\s\w"'=-]*content\s*=\s*(?:'|")([^'"]*)(?:'|")[\s\w"'=-]*property\s*=\s*(?:'|")\s*og:url\s*(?:'|")[\s\w"'=-]*(?:>|\/>)}
+    ]
+    url = nil
+    patterns.each do |pattern|
+      web_urls = response.scan(pattern).flatten
+      url = parser(web_urls)
+      break unless url.nil?
+    end
+    url
+  end
+
+  private
+
+  def parser(urls)
+    urls.find { |x| x =~ %r{(?im)^\s*(?:https*)?:?(?:\/\/)?\w+[.&%-]} }
+  end
+end
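The canonical `<link>` patterns are tried before the `og:url` ones, stopping at the first capture that `parser` deems URL-shaped:

```ruby
require 'brilliant_web_scraper'

html = %(<link rel="canonical" href="https://www.pwc.com/gx/en.html">)

BrilliantWebScraper.grep_redirected_to_url(html)
# => "https://www.pwc.com/gx/en.html"
```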
data/lib/parsers/title.rb
ADDED
@@ -0,0 +1,13 @@
+# frozen_string_literal: true
+
+# Grep title from the very first title tag
+module Title
+  include UnescapeHtmlHelper
+  def grep_title(response)
+    return if !response.is_a?(String) || response.empty?
+
+    title_regex = /<\s*title.*?>(.*?)<?\s*\/?title\s*?>/im
+    title = response.match(title_regex).captures[0].strip rescue nil
+    unescape_html(title) unless title.nil? || title.empty?
+  end
+end
data/lib/parsers/twitter_description.rb
ADDED
@@ -0,0 +1,13 @@
+# frozen_string_literal: true
+
+# Grep twitter description from attribute `twitter:description`
+module TwitterDescription
+  include DescriptionHelper
+  def grep_twitter_description(response)
+    return if response.nil? || response.empty?
+
+    first_regex = %r{(?im)<meta\s+[\w\s"'=-]*(?:name|itemprop)\s*=\s*(?:'|")?\s*twitter:description\s*(?:'|")?[\w\s"'=-]*content\s*=\s*(?:(?:"([^"]*)")|(?:'([^']*)'))[\w\s"'=-]*[\/>]}
+    second_regex = %r{(?im)<meta\s+[\w\s"'=-]*content\s*=\s*(?:(?:"([^"]*)")|(?:'([^']*)'))[\w\s"'=-]*(?:name|itemprop)\s*=\s*(?:'|")?\s*twitter:description\s*(?:'|")?[\w\s"'=-]*[\/>]}
+    scrape_description(response, [first_regex, second_regex])
+  end
+end
data/lib/parsers/twitter_profile.rb
ADDED
@@ -0,0 +1,11 @@
+# frozen_string_literal: true
+
+# Grep twitter profile
+module TwitterProfile
+  def grep_twitter_profile(response)
+    return if response.nil? || response.empty?
+
+    twitter_regex = %r{(?im)(https?:\/\/(?:www\.)?twitter\.com\/(?!(?:share|download|search|home|login|privacy)(?:\?|\/|\b)|(?:hashtag|i|javascripts|statuses|#!|intent)\/|(?:#|'|%))[^"'&\?<>\s\\]+)}
+    response.scan(twitter_regex).flatten.compact.uniq
+  end
+end
data/lib/parsers/unescape_html_helper.rb
ADDED
@@ -0,0 +1,17 @@
+# frozen_string_literal: true
+
+# Decode HTML & URL encodings
+module UnescapeHtmlHelper
+  private
+
+  def unescape_html(text)
+    return if text.nil? && !text.is_a?(String) || text.empty?
+
+    unescaped_html_text = CGI.unescapeHTML(text)
+    if unescaped_html_text =~ /%[a-z0-9]{2}/i
+      plus_sign_preserved_text = unescaped_html_text.gsub(/\+/, '%2B')
+      unescaped_html_text = CGI.unescape(plus_sign_preserved_text)
+    end
+    unescaped_html_text.strip
+  end
+end
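Decoding is two-stage: HTML entities first, then percent-decoding only if `%xx` sequences remain, with `+` pre-escaped to `%2B` so `CGI.unescape` does not turn literal plus signs into spaces. The method is private (and, once `extend`ed, a private singleton method), hence the `send` in this probe, which is for demonstration only:

```ruby
require 'brilliant_web_scraper'

BrilliantWebScraper.send(:unescape_html, 'R%26D &amp; friends')
# => "R&D & friends" -- entity decode, then percent decode

BrilliantWebScraper.send(:unescape_html, 'A+B %20ok')
# => "A+B  ok" -- "+" survives because it is swapped to %2B first
```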
data/lib/parsers/vimeo_profile.rb
ADDED
@@ -0,0 +1,11 @@
+# frozen_string_literal: true
+
+# Grep Vimeo social profile
+module VimeoProfile
+  def grep_vimeo_profile(response)
+    return if response.nil? || response.empty?
+
+    vimeo_regex = %r{(?im)(https?:\/\/(?:www\.)?vimeo\.com\/(?!upgrade|features|enterprise|upload|api)\/?[^"'\&\?<>\s]+)}
+    response.scan(vimeo_regex).flatten.compact.uniq
+  end
+end
data/lib/parsers/youtube_channel.rb
ADDED
@@ -0,0 +1,29 @@
+# frozen_string_literal: true
+
+# Grep youtube channels
+module YoutubeChannel
+  def grep_youtube_channel(response)
+    return if response !~ %r{(?im)https?:\/\/(?:www\.)?youtube\.com\/}
+
+    first_regex = %r{(?im)(https?:\/\/(?:www\.)?youtube\.com\/(?!\?gl=\w{2}|(?:embed|feeds)\/|(?:player_api|iframe_api)(?:"|'|\/|\?)|watch\?|user\/#)[^"'\&<>\s]+)}
+    second_regex = %r{(?im)(https?:\/\/(?:www\.)?youtube\.com\/watch?\S*v=[^<>&'"]+)}
+    third_regex = %r{(?im)(https?:\/\/(?:www\.)?youtube\.com\/embed\/(?!id|{|}|\[|\]|\$|\?|\\|%|\+)[^"'\?<>\s]+)}
+    youtube_channels = scrape_profile(response, [first_regex, second_regex, third_regex])
+    youtube_channels.compact.uniq
+  end
+
+  private
+
+  def scrape_profile(response, regexes)
+    return if response.to_s.empty? || regexes.empty?
+
+    profiles = []
+    regexes.each do |regex|
+      profiles = response.scan(regex).flatten.compact
+      break unless profiles.empty?
+    end
+    return [] if profiles.none?
+
+    profiles
+  end
+end
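Channel-style URLs take priority: `scrape_profile` stops at the first regex that matches anything, so `watch?v=` and `embed/` links act only as fallbacks:

```ruby
require 'brilliant_web_scraper'

html = %(<a href="https://www.youtube.com/user/pwc">Our channel</a>
         <a href="https://www.youtube.com/watch?v=abc123">One video</a>)

BrilliantWebScraper.grep_youtube_channel(html)
# => ["https://www.youtube.com/user/pwc"]
# first_regex matched, so the watch-URL fallback was never consulted.
```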