brilliant_web_scraper 0.1

Files changed (53)
  1. checksums.yaml +7 -0
  2. data/Gemfile +4 -0
  3. data/README.md +31 -0
  4. data/brilliant_web_scraper-1.0.0.gem +0 -0
  5. data/brilliant_web_scraper-1.0.gem +0 -0
  6. data/brilliant_web_scraper.gemspec +30 -0
  7. data/lib/brilliant_web_scraper.rb +55 -0
  8. data/lib/parsers/description_helper.rb +28 -0
  9. data/lib/parsers/emails.rb +30 -0
  10. data/lib/parsers/facebook_profile.rb +11 -0
  11. data/lib/parsers/instagram_profile.rb +11 -0
  12. data/lib/parsers/linkedin_profile.rb +11 -0
  13. data/lib/parsers/meta_description.rb +13 -0
  14. data/lib/parsers/org_description.rb +13 -0
  15. data/lib/parsers/phone_numbers.rb +34 -0
  16. data/lib/parsers/pinterest_profile.rb +11 -0
  17. data/lib/parsers/redirected_to.rb +29 -0
  18. data/lib/parsers/title.rb +13 -0
  19. data/lib/parsers/twitter_description.rb +13 -0
  20. data/lib/parsers/twitter_profile.rb +11 -0
  21. data/lib/parsers/unescape_html_helper.rb +17 -0
  22. data/lib/parsers/vimeo_profile.rb +11 -0
  23. data/lib/parsers/youtube_channel.rb +29 -0
  24. data/lib/scraper/errors.rb +19 -0
  25. data/lib/scraper/scrape_exceptions.rb +49 -0
  26. data/lib/scraper/scrape_helper.rb +59 -0
  27. data/lib/scraper/scrape_request.rb +29 -0
  28. data/lib/version.rb +6 -0
  29. data/spec/lib/parsers/description_helper_spec.rb +24 -0
  30. data/spec/lib/parsers/emails_spec.rb +60 -0
  31. data/spec/lib/parsers/facebook_profile_spec.rb +77 -0
  32. data/spec/lib/parsers/instagram_profile_spec.rb +45 -0
  33. data/spec/lib/parsers/linkedin_profile_spec.rb +43 -0
  34. data/spec/lib/parsers/meta_description_spec.rb +321 -0
  35. data/spec/lib/parsers/org_description_spec.rb +316 -0
  36. data/spec/lib/parsers/phone_numbers_spec.rb +69 -0
  37. data/spec/lib/parsers/pinterest_profile_spec.rb +44 -0
  38. data/spec/lib/parsers/redirected_to_spec.rb +207 -0
  39. data/spec/lib/parsers/title_spec.rb +87 -0
  40. data/spec/lib/parsers/twitter_description_spec.rb +314 -0
  41. data/spec/lib/parsers/twitter_profile_spec.rb +59 -0
  42. data/spec/lib/parsers/unescape_html_helper_spec.rb +0 -0
  43. data/spec/lib/parsers/vimeo_profile_spec.rb +43 -0
  44. data/spec/lib/parsers/youtube_profile_spec.rb +82 -0
  45. data/spec/lib/scraper/brilliant_web_scrape_test.rb +66 -0
  46. data/spec/lib/scraper/scrape_request_test.rb +34 -0
  47. data/spec/spec_helper.rb +111 -0
  48. data/spec/vcr/encoding_compatibility_error.yml +316 -0
  49. data/spec/vcr/invalid_byte_sequence_utf_8.yml +2383 -0
  50. data/spec/vcr/no_valid_data_to_scrape.yml +109 -0
  51. data/spec/vcr/non_html_scrape.yml +163 -0
  52. data/spec/vcr/valid_scrape_response.yml +696 -0
  53. metadata +250 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: efbe9d1a0688fd10e200d972b56c3e2ec86203f1
+   data.tar.gz: 20cce1c52197f11dcea73813831bb4172829ddaa
+ SHA512:
+   metadata.gz: 638c34f7efbc963613f4bb841abbf183bf134ee3197bebc99f9403ba7864befd44243f53092a9aa3ba7ea58314475b61d6671816e8e3f8ef4deb7f49b6f0ef52
+   data.tar.gz: f91110f69e8228de408aa0c35050fe6137fac22bdb93ff86be3c70d380e1cf57534f50f78e5d585cb63e421ff0fc51aa04089d38e8a86ab3a6ca305659dc909a
data/Gemfile ADDED
@@ -0,0 +1,4 @@
+ source "https://rubygems.org"
+
+ # Specify your gem's dependencies in brilliant_web_scraper.gemspec
+ gemspec
data/README.md ADDED
@@ -0,0 +1,31 @@
+ # WebScraper [![Build Status](https://api.travis-ci.com/bkotu6717/brilliant_web_scraper.svg)](https://travis-ci.com/bkotu6717/brilliant_web_scraper)
+
+ A decent web scraping gem. Scrapes a website's description, social profiles, contact details, and YouTube channels.
+
+
+ It accepts a URL or domain as input and returns the site's title, descriptions, social profiles, YouTube channels, and its current URL if it was redirected.
+
+
+ ## See it in action!
+
+ You can try WebScraper live at this little demo: [https://brilliant-web-scraper-demo.herokuapp.com](https://brilliant-web-scraper-demo.herokuapp.com)
+
+ ## Installation
+
+
+ If you're using it in a Rails application, just add it to your Gemfile and run `bundle install`:
+
+ ```ruby
+ gem 'brilliant_web_scraper'
+ ```
+
+ ## Usage
+
+ Initialize a BrilliantWebScraper instance for a URL, like this:
+
+ ```ruby
+ require 'brilliant_web_scraper'
+ results = BrilliantWebScraper.new('http://pwc.com')
+ ```
+
+ If you don't include the scheme on the URL, that's fine:
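A minimal sketch of such a scheme-less call (assuming the request layer in scrape_request.rb, added below, fills in the scheme):

```ruby
results = BrilliantWebScraper.new('pwc.com')
```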
data/brilliant_web_scraper-1.0.0.gem ADDED
Binary file
data/brilliant_web_scraper-1.0.gem ADDED
Binary file
data/brilliant_web_scraper.gemspec ADDED
@@ -0,0 +1,30 @@
+ # frozen_string_literal: true
+
+ require File.expand_path('./lib/version')
+
+ Gem::Specification.new do |s|
+   s.name = 'brilliant_web_scraper'
+   s.version = WebScraper::VERSION
+   s.licenses = ['Nonstandard']
+   s.summary = 'A decent web scraping ruby library!'
+   s.description = 'Scrapes data such as description, social profiles, contact details'
+   s.authors = ['Kotu Bhaskara Rao']
+   s.email = 'bkotu6717@gmail.com'
+   s.require_paths = ['lib']
+   s.homepage = 'https://github.com/bkotu6717/brilliant_web_scraper'
+   s.files = Dir['**/*'].keep_if { |file|
+     file != "brilliant_web_scraper-#{WebScraper::VERSION}.gem" && File.file?(file)
+   }
+   s.required_ruby_version = '>= 2.3.0'
+
+   s.add_dependency 'nesty', '~> 1.0', '>= 1.0.1'
+   s.add_dependency 'rest-client', '~> 2.0', '>= 2.0.2'
+
+   s.add_development_dependency 'nesty', '~> 1.0', '>= 1.0.1'
+   s.add_development_dependency 'pry', '~> 0.12.2'
+   s.add_development_dependency 'rest-client', '~> 2.0', '>= 2.0.2'
+   s.add_development_dependency 'rspec', '~> 3.5'
+   s.add_development_dependency 'rubocop', '~> 0.73.0'
+   s.add_development_dependency 'vcr', '~> 3.0', '>= 3.0.1'
+   s.add_development_dependency 'webmock', '~> 2.1'
+ end
data/lib/brilliant_web_scraper.rb ADDED
@@ -0,0 +1,55 @@
+ # frozen_string_literal: true
+
+ require 'rest-client'
+ require 'cgi'
+ require 'benchmark'
+
+ current_directory = File.dirname(__FILE__) + '/scraper'
+ require File.expand_path(File.join(current_directory, 'errors'))
+ require File.expand_path(File.join(current_directory, 'scrape_exceptions'))
+ require File.expand_path(File.join(current_directory, 'scrape_helper'))
+ require File.expand_path(File.join(current_directory, 'scrape_request'))
+
+ current_directory = File.dirname(__FILE__) + '/parsers'
+ require File.expand_path(File.join(current_directory, 'unescape_html_helper'))
+ require File.expand_path(File.join(current_directory, 'description_helper'))
+ require File.expand_path(File.join(current_directory, 'title'))
+ require File.expand_path(File.join(current_directory, 'meta_description'))
+ require File.expand_path(File.join(current_directory, 'org_description'))
+ require File.expand_path(File.join(current_directory, 'twitter_description'))
+ require File.expand_path(File.join(current_directory, 'twitter_profile'))
+ require File.expand_path(File.join(current_directory, 'linkedin_profile'))
+ require File.expand_path(File.join(current_directory, 'facebook_profile'))
+ require File.expand_path(File.join(current_directory, 'youtube_channel'))
+ require File.expand_path(File.join(current_directory, 'instagram_profile'))
+ require File.expand_path(File.join(current_directory, 'vimeo_profile'))
+ require File.expand_path(File.join(current_directory, 'pinterest_profile'))
+ require File.expand_path(File.join(current_directory, 'emails'))
+ require File.expand_path(File.join(current_directory, 'phone_numbers'))
+ require File.expand_path(File.join(current_directory, 'redirected_to'))
+
+ # Main scraping class
+ class BrilliantWebScraper
+   extend ScrapeHelper
+   extend ScrapeRequest
+   extend Title
+   extend MetaDescription
+   extend OrgDescription
+   extend TwitterDescription
+   extend TwitterProfile
+   extend LinkedinProfile
+   extend FacebookProfile
+   extend YoutubeChannel
+   extend InstagramProfile
+   extend VimeoProfile
+   extend PinterestProfile
+   extend Emails
+   extend PhoneNumbers
+   extend RedirectedTo
+
+   class << self
+     def new(url, connection_timeout = 10, read_timeout = 10)
+       perform_scrape(url, connection_timeout, read_timeout)
+     end
+   end
+ end
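A usage sketch (assuming the gem is installed): `new` is overridden to return the result of `perform_scrape` rather than a class instance, and both timeouts default to 10 (presumably seconds; the response shape is defined in scrape_helper.rb):

```ruby
require 'brilliant_web_scraper'

# Slow third-party site: tighten the connect timeout, relax the read timeout.
results = BrilliantWebScraper.new('http://pwc.com', 5, 15)
```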
data/lib/parsers/description_helper.rb ADDED
@@ -0,0 +1,28 @@
+ # frozen_string_literal: true
+
+ # Parses meta, twitter, and og description tags
+ module DescriptionHelper
+   include UnescapeHtmlHelper
+
+   private
+
+   def scrape_description(response, regexes)
+     return if response.to_s.empty? || regexes.empty?
+
+     description = nil
+     regexes.each do |regex|
+       description = response.scan(regex).flatten.compact
+       description = parse_description(description)
+       break unless description.nil?
+     end
+     unescape_html(description)
+   end
+
+   def parse_description(descriptions)
+     return if descriptions.nil? || descriptions.empty?
+
+     descriptions = descriptions.reject { |x| x.nil? || x.empty? }
+     descriptions = descriptions.map { |x| unescape_html(x) }
+     descriptions.find { |x| x !~ /^\s*[|-]?\s*$/ }
+   end
+ end
data/lib/parsers/emails.rb ADDED
@@ -0,0 +1,30 @@
+ # frozen_string_literal: true
+
+ current_directory = File.dirname(__FILE__)
+ require File.expand_path(File.join(current_directory, 'unescape_html_helper'))
+
+ # Parses emails from html string
+ module Emails
+   include UnescapeHtmlHelper
+   def grep_emails(response)
+     return if response.nil? || response.empty?
+
+     first_regex = /(?im)mailto:\s*([^\?"',\\<>\s]+)/
+     second_regex = %r{(?im)["'\s><\/]*([\w._%-]+@(?!(?:example|e?mail|domain|company|your(?:domain|company|email)|address|emailad?dress|yyy|test)\.)[\w._%-]+\.(?!png|jpe?g|tif|svg)[A-Z]{2,3})["'\s><]}
+     first_set = response.scan(first_regex).flatten.compact
+     first_set = get_processed_emails(first_set)
+     second_set = response.scan(second_regex).flatten.compact
+     second_set = get_processed_emails(second_set)
+     (first_set | second_set).compact.map(&:downcase).uniq
+   end
+
+   def get_processed_emails(email_set)
+     return [] if email_set.nil? || email_set.empty?
+
+     unescaped_emails = email_set.map { |email| unescape_html(email) }
+     return [] if unescaped_emails.empty?
+
+     email_match_regex = /[\w._%-]+@(?!(?:example|e?mail|domain|company|your(?:domain|company|email)|address|emailad?dress|yyy|test)\.)[\w._%-]+\.(?!png|jpe?g|tif|svg)[A-Z]{2,3}/im
+     unescaped_emails.select { |data| data =~ email_match_regex }
+   end
+ end
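A quick sketch of `grep_emails` (callable on `BrilliantWebScraper`, which extends this module; the page snippet and addresses are made up):

```ruby
require 'brilliant_web_scraper'

html = %(<a href="mailto:Info@Acme.com">Mail us</a> or try contact@example.com )
BrilliantWebScraper.grep_emails(html)
# => ["info@acme.com"]
# Results are downcased; placeholder domains like example.com are filtered out.
```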
data/lib/parsers/facebook_profile.rb ADDED
@@ -0,0 +1,11 @@
+ # frozen_string_literal: true
+
+ # Grep facebook profiles
+ module FacebookProfile
+   def grep_facebook_profile(response)
+     return if response.nil? || response.empty?
+
+     facebook_url_regex = /(https?:\/\/(?:www\.)?(?:facebook|fb)\.com\/(?!tr\?|(?:[\/\w\d]*(?:photo|sharer?|like(?:box)?|offsite_event|plugins|permalink|home|search))\.php|\d+\/fbml|(?:dialog|hashtag|plugins|sharer|login|recover|security|help|v\d+\.\d+)\/|(?:privacy|#|your-profile|yourfacebookpage)\/?|home\?)[^"'<>\&\s]+)/im
+     response.scan(facebook_url_regex).flatten.compact.uniq
+   end
+ end
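The social-profile modules (Facebook, Twitter, LinkedIn, Instagram, Vimeo, Pinterest) all follow this same scan/flatten/uniq pattern; a sketch against a made-up page:

```ruby
require 'brilliant_web_scraper'

html = %(<a href="https://www.facebook.com/pwc">Like us</a>)
BrilliantWebScraper.grep_facebook_profile(html)
# => ["https://www.facebook.com/pwc"]
```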
data/lib/parsers/instagram_profile.rb ADDED
@@ -0,0 +1,11 @@
+ # frozen_string_literal: true
+
+ # Grep Instagram profiles
+ module InstagramProfile
+   def grep_instagram_profile(response)
+     return if response.nil? || response.empty?
+
+     instagram_regex = %r{(?im)(https?:\/\/(?:www\.)?+instagram\.com\/(?!#|%|"|'|(?:explore|p)\/).+?[^"'<>\s?&\/]+)}
+     response.scan(instagram_regex).flatten.compact.uniq
+   end
+ end
data/lib/parsers/linkedin_profile.rb ADDED
@@ -0,0 +1,11 @@
+ # frozen_string_literal: true
+
+ # Grep linkedin profile
+ module LinkedinProfile
+   def grep_linkedin_profile(response)
+     return if response.nil? || response.empty?
+
+     linkedin_profile_regex = %r{(?im)(https:\/\/www\.linkedin\.com\/company\/[^"'\?<>\s\/]+)}
+     response.scan(linkedin_profile_regex).flatten.compact.uniq
+   end
+ end
data/lib/parsers/meta_description.rb ADDED
@@ -0,0 +1,13 @@
+ # frozen_string_literal: true
+
+ # Grep description in meta tag with attribute name='description'
+ module MetaDescription
+   include DescriptionHelper
+   def grep_meta_description(response)
+     return if response.nil? || response.empty?
+
+     first_regex = %r{(?im)<meta\s+[\w\s"'=-]*(?:name|itemprop)\s*=\s*(?:'|")?\s*description\s*(?:'|")?[\w\s"'=-]*content\s*=\s*(?:(?:"([^"]*)")|(?:'([^']*)'))[\w\s"'=-]*[\/>]}
+     second_regex = %r{(?im)<meta\s+[\w\s"'=-]*content\s*=\s*(?:(?:"([^"]*)")|(?:'([^']*)'))[\w\s"'=-]*(?:name|itemprop)\s*=\s*(?:'|")?\s*description\s*(?:'|")?[\w\s"'=-]*[\/>]}
+     scrape_description(response, [first_regex, second_regex])
+   end
+ end
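A sketch of `grep_meta_description` on a minimal, made-up document; the two regexes differ only in whether `content` comes before or after the `name`/`itemprop` attribute:

```ruby
require 'brilliant_web_scraper'

html = %(<meta name="description" content="Global consulting firm." />)
BrilliantWebScraper.grep_meta_description(html)
# => "Global consulting firm."
```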
data/lib/parsers/org_description.rb ADDED
@@ -0,0 +1,13 @@
+ # frozen_string_literal: true
+
+ # Greps description from meta tag with attribute og:description
+ module OrgDescription
+   include DescriptionHelper
+   def grep_org_description(response)
+     return if response.nil? || response.empty?
+
+     first_regex = %r{(?im)<meta\s+[\w\s"'=-]*(?:property|itemprop)\s*=\s*(?:'|")?\s*og:description\s*(?:'|")?[\w\s"'=-]*content\s*=\s*(?:(?:"([^"]*)")|(?:'([^']*)'))[\w\s"'=-]*[\/>]}
+     second_regex = %r{(?im)<meta\s+[\w\s"'=-]*content\s*=\s*(?:(?:"([^"]*)")|(?:'([^']*)'))[\w\s"'=-]*(?:property|itemprop)\s*=\s*(?:'|")?\s*og:description\s*(?:'|")?[\w\s"'=-]*[\/>]}
+     scrape_description(response, [first_regex, second_regex])
+   end
+ end
data/lib/parsers/phone_numbers.rb ADDED
@@ -0,0 +1,34 @@
+ # frozen_string_literal: true
+
+ # Grep phone numbers from 'href=tel:' attributes
+ module PhoneNumbers
+   include UnescapeHtmlHelper
+   def grep_phone_numbers(response)
+     return if response.nil? || response.empty?
+
+     phone_number_regex = %r{(?im)href\s*=\s*(?:"|')?\s*tel:\s*(?:https?:)?\/*(?!#(?:"|'))([^"'\/<>\{\[]+)}
+     phone_numbers = response.scan(phone_number_regex).flatten.uniq
+     get_processed_phone_numbers(phone_numbers)
+   end
+
+   private
+
+   def get_processed_phone_numbers(phone_numbers)
+     return [] if phone_numbers.nil? || phone_numbers.empty?
+
+     unescaped_contacts = phone_numbers.map { |phone_number| unescape_html(phone_number) }
+     good_phone_numbers = []
+     unescaped_contacts.each do |x|
+       next if x !~ /\d+/
+
+       if x =~ /\w+=/
+         # Drop querystring-style debris (e.g. "&body=...") trailing the number
+         good_phone_numbers << x.gsub(/\w+=.*/, '')
+       else
+         good_phone_numbers << x
+       end
+     end
+
+     good_phone_numbers.uniq
+   end
+ end
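A sketch with a made-up `tel:` link:

```ruby
require 'brilliant_web_scraper'

html = %(<a href="tel:+1-555-010-0200">Call sales</a>)
BrilliantWebScraper.grep_phone_numbers(html)
# => ["+1-555-010-0200"]
```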
data/lib/parsers/pinterest_profile.rb ADDED
@@ -0,0 +1,11 @@
+ # frozen_string_literal: true
+
+ # Grep pinterest profile
+ module PinterestProfile
+   def grep_pinterest_profile(response)
+     return if response.nil? || response.empty?
+
+     pinterest_regex = %r{(?im)(https?:\/\/[\w\.]*pinterest\.com\/(?!"|'|\?|#|cookies(?:"|')|(?:pin|v3|js|feed)\/)[^"'<>?&\s\/]+)}
+     response.scan(pinterest_regex).flatten.compact.uniq
+   end
+ end
data/lib/parsers/redirected_to.rb ADDED
@@ -0,0 +1,29 @@
+ # frozen_string_literal: true
+
+ # Fetch latest url of the given website
+ module RedirectedTo
+
+   def grep_redirected_to_url(response)
+     return if response.nil? || response.empty?
+
+     patterns = [
+       %r{(?im)<link\s+[\s\w="'-]*rel\s*=\s*(?:"|')canonical(?:"|')[\s\w='"-]*?\s+href\s*=\s*(?:"|')([^"']*)(?:"|')[\s\w='"-]*?(?:>|\/>)},
+       %r{(?im)<link\s+[\s\w='"-]*href\s*=\s*(?:"|')([^'"]*)(?:"|')[\s\w='"-]*?rel\s*=\s*(?:"|')\s*canonical\s*(?:"|')[\s\w='"-]*(?:>|\/>)},
+       %r{(?im)<meta\s+[\s\w="'-]*property=\s*(?:'|")\s*og:url\s*(?:'|")[\s\w="'-]*content=\s*(?:'|")([^'"]*)(?:'|")[\s\w="'-]*(?:>|\/>)},
+       %r{(?im)<meta\s+[\s\w"'=-]*content\s*=\s*(?:'|")([^'"]*)(?:'|")[\s\w"'=-]*property\s*=\s*(?:'|")\s*og:url\s*(?:'|")[\s\w"'=-]*(?:>|\/>)}
+     ]
+     url = nil
+     patterns.each do |pattern|
+       web_urls = response.scan(pattern).flatten
+       url = parser(web_urls)
+       break unless url.nil?
+     end
+     url
+   end
+
+   private
+
+   def parser(urls)
+     urls.find { |x| x =~ %r{(?im)^\s*(?:https?)?:?(?:\/\/)?\w+[.&%-]} }
+   end
+ end
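A sketch of `grep_redirected_to_url`, which prefers a canonical link and falls back to `og:url` (made-up markup):

```ruby
require 'brilliant_web_scraper'

html = %(<link rel="canonical" href="https://www.pwc.com/gx/en.html"/>)
BrilliantWebScraper.grep_redirected_to_url(html)
# => "https://www.pwc.com/gx/en.html"
```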
data/lib/parsers/title.rb ADDED
@@ -0,0 +1,13 @@
+ # frozen_string_literal: true
+
+ # Grep title from the very first title tag
+ module Title
+   include UnescapeHtmlHelper
+   def grep_title(response)
+     return if !response.is_a?(String) || response.empty?
+
+     title_regex = %r{<\s*title[^>]*>(.*?)<\s*\/\s*title\s*>}im
+     title = response.match(title_regex)&.captures&.first&.strip
+     unescape_html(title) unless title.nil? || title.empty?
+   end
+ end
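A sketch of `grep_title`, which also HTML-decodes the captured text:

```ruby
require 'brilliant_web_scraper'

BrilliantWebScraper.grep_title('<html><title> ACME &amp; Co. </title></html>')
# => "ACME & Co."
```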
data/lib/parsers/twitter_description.rb ADDED
@@ -0,0 +1,13 @@
+ # frozen_string_literal: true
+
+ # Grep twitter description from attribute `twitter:description`
+ module TwitterDescription
+   include DescriptionHelper
+   def grep_twitter_description(response)
+     return if response.nil? || response.empty?
+
+     first_regex = %r{(?im)<meta\s+[\w\s"'=-]*(?:name|itemprop)\s*=\s*(?:'|")?\s*twitter:description\s*(?:'|")?[\w\s"'=-]*content\s*=\s*(?:(?:"([^"]*)")|(?:'([^']*)'))[\w\s"'=-]*[\/>]}
+     second_regex = %r{(?im)<meta\s+[\w\s"'=-]*content\s*=\s*(?:(?:"([^"]*)")|(?:'([^']*)'))[\w\s"'=-]*(?:name|itemprop)\s*=\s*(?:'|")?\s*twitter:description\s*(?:'|")?[\w\s"'=-]*[\/>]}
+     scrape_description(response, [first_regex, second_regex])
+   end
+ end
data/lib/parsers/twitter_profile.rb ADDED
@@ -0,0 +1,11 @@
+ # frozen_string_literal: true
+
+ # Grep twitter profile
+ module TwitterProfile
+   def grep_twitter_profile(response)
+     return if response.nil? || response.empty?
+
+     twitter_regex = %r{(?im)(https?:\/\/(?:www\.)?twitter\.com\/(?!(?:share|download|search|home|login|privacy)(?:\?|\/|\b)|(?:hashtag|i|javascripts|statuses|#!|intent)\/|(?:#|'|%))[^"'&\?<>\s\\]+)}
+     response.scan(twitter_regex).flatten.compact.uniq
+   end
+ end
data/lib/parsers/unescape_html_helper.rb ADDED
@@ -0,0 +1,17 @@
+ # frozen_string_literal: true
+
+ # Decode HTML & URL encodings
+ module UnescapeHtmlHelper
+   private
+
+   def unescape_html(text)
+     return if !text.is_a?(String) || text.empty?
+
+     unescaped_html_text = CGI.unescapeHTML(text)
+     if unescaped_html_text =~ /%[a-z0-9]{2}/i
+       plus_sign_preserved_text = unescaped_html_text.gsub(/\+/, '%2B')
+       unescaped_html_text = CGI.unescape(plus_sign_preserved_text)
+     end
+     unescaped_html_text.strip
+   end
+ end
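`unescape_html` is private (it reaches `BrilliantWebScraper` through the parser modules that include it), so a sketch has to go through `send`; it HTML-decodes first and URL-decodes only if a percent escape survives:

```ruby
require 'brilliant_web_scraper'

BrilliantWebScraper.send(:unescape_html, 'Hello%20World &amp; Friends')
# => "Hello World & Friends"
```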
data/lib/parsers/vimeo_profile.rb ADDED
@@ -0,0 +1,11 @@
+ # frozen_string_literal: true
+
+ # Grep Vimeo social profile
+ module VimeoProfile
+   def grep_vimeo_profile(response)
+     return if response.nil? || response.empty?
+
+     vimeo_regex = %r{(?im)(https?:\/\/(?:www\.)?vimeo\.com\/(?!upgrade|features|enterprise|upload|api)\/?[^"'\&\?<>\s]+)}
+     response.scan(vimeo_regex).flatten.compact.uniq
+   end
+ end
data/lib/parsers/youtube_channel.rb ADDED
@@ -0,0 +1,29 @@
+ # frozen_string_literal: true
+
+ # Grep youtube channels
+ module YoutubeChannel
+   def grep_youtube_channel(response)
+     return if response !~ %r{(?im)https?:\/\/(?:www\.)?youtube\.com\/}
+
+     first_regex = %r{(?im)(https?:\/\/(?:www\.)?youtube\.com\/(?!\?gl=\w{2}|(?:embed|feeds)\/|(?:player_api|iframe_api)(?:"|'|\/|\?)|watch\?|user\/#)[^"'\&<>\s]+)}
+     second_regex = %r{(?im)(https?:\/\/(?:www\.)?youtube\.com\/watch\?\S*v=[^<>&'"]+)}
+     third_regex = %r{(?im)(https?:\/\/(?:www\.)?youtube\.com\/embed\/(?!id|{|}|\[|\]|\$|\?|\\|%|\+)[^"'\?<>\s]+)}
+     youtube_channels = scrape_profile(response, [first_regex, second_regex, third_regex])
+     youtube_channels.compact.uniq
+   end
+
+   private
+
+   def scrape_profile(response, regexes)
+     return if response.to_s.empty? || regexes.empty?
+
+     profiles = []
+     regexes.each do |regex|
+       profiles = response.scan(regex).flatten.compact
+       break unless profiles.empty?
+     end
+     return [] if profiles.none?
+
+     profiles
+   end
+ end
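A sketch of `grep_youtube_channel` (made-up markup); the three regexes try channel-style URLs first, then watch URLs, then embeds:

```ruby
require 'brilliant_web_scraper'

html = %(<a href="https://www.youtube.com/user/pwc">Our channel</a>)
BrilliantWebScraper.grep_youtube_channel(html)
# => ["https://www.youtube.com/user/pwc"]
```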