brilliant_web_scraper 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. checksums.yaml +7 -0
  2. data/Gemfile +4 -0
  3. data/README.md +31 -0
  4. data/brilliant_web_scraper-1.0.0.gem +0 -0
  5. data/brilliant_web_scraper-1.0.gem +0 -0
  6. data/brilliant_web_scraper.gemspec +30 -0
  7. data/lib/brilliant_web_scraper.rb +55 -0
  8. data/lib/parsers/description_helper.rb +28 -0
  9. data/lib/parsers/emails.rb +30 -0
  10. data/lib/parsers/facebook_profile.rb +11 -0
  11. data/lib/parsers/instagram_profile.rb +11 -0
  12. data/lib/parsers/linkedin_profile.rb +11 -0
  13. data/lib/parsers/meta_description.rb +13 -0
  14. data/lib/parsers/org_description.rb +13 -0
  15. data/lib/parsers/phone_numbers.rb +34 -0
  16. data/lib/parsers/pinterest_profile.rb +11 -0
  17. data/lib/parsers/redirected_to.rb +29 -0
  18. data/lib/parsers/title.rb +13 -0
  19. data/lib/parsers/twitter_description.rb +13 -0
  20. data/lib/parsers/twitter_profile.rb +11 -0
  21. data/lib/parsers/unescape_html_helper.rb +17 -0
  22. data/lib/parsers/vimeo_profile.rb +11 -0
  23. data/lib/parsers/youtube_channel.rb +29 -0
  24. data/lib/scraper/errors.rb +19 -0
  25. data/lib/scraper/scrape_exceptions.rb +49 -0
  26. data/lib/scraper/scrape_helper.rb +59 -0
  27. data/lib/scraper/scrape_request.rb +29 -0
  28. data/lib/version.rb +6 -0
  29. data/spec/lib/parsers/description_helper_spec.rb +24 -0
  30. data/spec/lib/parsers/emails_spec.rb +60 -0
  31. data/spec/lib/parsers/facebook_profile_spec.rb +77 -0
  32. data/spec/lib/parsers/instagram_profile_spec.rb +45 -0
  33. data/spec/lib/parsers/linkedin_profile_spec.rb +43 -0
  34. data/spec/lib/parsers/meta_description_spec.rb +321 -0
  35. data/spec/lib/parsers/org_description_spec.rb +316 -0
  36. data/spec/lib/parsers/phone_numbers_spec.rb +69 -0
  37. data/spec/lib/parsers/pinterest_profile_spec.rb +44 -0
  38. data/spec/lib/parsers/redirected_to_spec.rb +207 -0
  39. data/spec/lib/parsers/title_spec.rb +87 -0
  40. data/spec/lib/parsers/twitter_description_spec.rb +314 -0
  41. data/spec/lib/parsers/twitter_profile_spec.rb +59 -0
  42. data/spec/lib/parsers/unescape_html_helper_spec.rb +0 -0
  43. data/spec/lib/parsers/vimeo_profile_spec.rb +43 -0
  44. data/spec/lib/parsers/youtube_profile_spec.rb +82 -0
  45. data/spec/lib/scraper/brilliant_web_scrape_test.rb +66 -0
  46. data/spec/lib/scraper/scrape_request_test.rb +34 -0
  47. data/spec/spec_helper.rb +111 -0
  48. data/spec/vcr/encoding_compatibility_error.yml +316 -0
  49. data/spec/vcr/invalid_byte_sequence_utf_8.yml +2383 -0
  50. data/spec/vcr/no_valid_data_to_scrape.yml +109 -0
  51. data/spec/vcr/non_html_scrape.yml +163 -0
  52. data/spec/vcr/valid_scrape_response.yml +696 -0
  53. metadata +250 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: efbe9d1a0688fd10e200d972b56c3e2ec86203f1
+   data.tar.gz: 20cce1c52197f11dcea73813831bb4172829ddaa
+ SHA512:
+   metadata.gz: 638c34f7efbc963613f4bb841abbf183bf134ee3197bebc99f9403ba7864befd44243f53092a9aa3ba7ea58314475b61d6671816e8e3f8ef4deb7f49b6f0ef52
+   data.tar.gz: f91110f69e8228de408aa0c35050fe6137fac22bdb93ff86be3c70d380e1cf57534f50f78e5d585cb63e421ff0fc51aa04089d38e8a86ab3a6ca305659dc909a
data/Gemfile ADDED
@@ -0,0 +1,4 @@
+ source "https://rubygems.org"
+
+ # Specify your gem's dependencies in brilliant_web_scraper.gemspec
+ gemspec
data/README.md ADDED
@@ -0,0 +1,31 @@
+ # WebScraper [![Build Status](https://api.travis-ci.com/bkotu6717/brilliant_web_scraper.svg)](https://travis-ci.com/bkotu6717/brilliant_web_scraper)
+
+ A decent web scraping gem. Scrapes a website's description, social profiles, contact details, and YouTube channels.
+
+ It accepts a URL or domain as input and fetches its title, descriptions, social profiles, YouTube channels, and its current URL if it was redirected.
+
+ ## See it in action!
+
+ You can try WebScraper live at this little demo: [https://brilliant-web-scraper-demo.herokuapp.com](https://brilliant-web-scraper-demo.herokuapp.com)
+
+ ## Installation
+
+ If you're using it in a Rails application, just add it to your Gemfile and run `bundle install`:
+
+ ```ruby
+ gem 'brilliant_web_scraper'
+ ```
+
+ ## Usage
+
+ Initialize a BrilliantWebScraper instance for a URL, like this:
+
+ ```ruby
+ require 'brilliant_web_scraper'
+ results = BrilliantWebScraper.new('http://pwc.com')
+ ```
+
+ If you don't include the scheme in the URL, that's fine:
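The released README stops at the colon above (the file is 31 lines long), so the promised scheme-less example never appears. Going only by the sentence above and the earlier snippet, the intended call would presumably look like this (an illustration, not part of the released README):

```ruby
require 'brilliant_web_scraper'

# Hypothetical continuation of the README: same domain, no scheme.
results = BrilliantWebScraper.new('pwc.com')
```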
data/brilliant_web_scraper-1.0.0.gem ADDED
Binary file
data/brilliant_web_scraper-1.0.gem ADDED
Binary file
data/brilliant_web_scraper.gemspec ADDED
@@ -0,0 +1,30 @@
+ # frozen_string_literal: true
+
+ require File.expand_path('./lib/version')
+
+ Gem::Specification.new do |s|
+   s.name = 'brilliant_web_scraper'
+   s.version = WebScraper::VERSION
+   s.licenses = ['Nonstandard']
+   s.summary = 'A decent web scraping ruby library!'
+   s.description = 'Scrapes data such as description, social profiles, contact details'
+   s.authors = ['Kotu Bhaskara Rao']
+   s.email = 'bkotu6717@gmail.com'
+   s.require_paths = ['lib']
+   s.homepage = 'https://github.com/bkotu6717/brilliant_web_scraper'
+   s.files = Dir['**/*'].keep_if { |file|
+     file != "brilliant_web_scraper-#{WebScraper::VERSION}.gem" && File.file?(file)
+   }
+   s.required_ruby_version = '>= 2.3.0'
+
+   s.add_dependency 'nesty', '~> 1.0', '>= 1.0.1'
+   s.add_dependency 'rest-client', '~> 2.0', '>= 2.0.2'
+
+   s.add_development_dependency 'nesty', '~> 1.0', '>= 1.0.1'
+   s.add_development_dependency 'pry', '~> 0.12.2'
+   s.add_development_dependency 'rest-client', '~> 2.0', '>= 2.0.2'
+   s.add_development_dependency 'rspec', '~> 3.5'
+   s.add_development_dependency 'rubocop', '~> 0.73.0'
+   s.add_development_dependency 'vcr', '~> 3.0', '>= 3.0.1'
+   s.add_development_dependency 'webmock', '~> 2.1'
+ end
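The `s.files` block above packages every regular file in the gem directory except a previously built `.gem` artifact. A quick sketch to preview that list, assuming it runs from the gem root where `lib/version.rb` defines `WebScraper::VERSION`:

```ruby
require File.expand_path('./lib/version')

# Same filter as the gemspec: keep regular files, drop the built gem.
files = Dir['**/*'].keep_if do |file|
  file != "brilliant_web_scraper-#{WebScraper::VERSION}.gem" && File.file?(file)
end
puts files
```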
data/lib/brilliant_web_scraper.rb ADDED
@@ -0,0 +1,55 @@
+ # frozen_string_literal: true
+
+ require 'rest-client'
+ require 'cgi'
+ require 'benchmark'
+
+ current_directory = File.dirname(__FILE__) + '/scraper'
+ require File.expand_path(File.join(current_directory, 'errors'))
+ require File.expand_path(File.join(current_directory, 'scrape_exceptions'))
+ require File.expand_path(File.join(current_directory, 'scrape_helper'))
+ require File.expand_path(File.join(current_directory, 'scrape_request'))
+
+ current_directory = File.dirname(__FILE__) + '/parsers'
+ require File.expand_path(File.join(current_directory, 'unescape_html_helper'))
+ require File.expand_path(File.join(current_directory, 'description_helper'))
+ require File.expand_path(File.join(current_directory, 'title'))
+ require File.expand_path(File.join(current_directory, 'meta_description'))
+ require File.expand_path(File.join(current_directory, 'org_description'))
+ require File.expand_path(File.join(current_directory, 'twitter_description'))
+ require File.expand_path(File.join(current_directory, 'twitter_profile'))
+ require File.expand_path(File.join(current_directory, 'linkedin_profile'))
+ require File.expand_path(File.join(current_directory, 'facebook_profile'))
+ require File.expand_path(File.join(current_directory, 'youtube_channel'))
+ require File.expand_path(File.join(current_directory, 'instagram_profile'))
+ require File.expand_path(File.join(current_directory, 'vimeo_profile'))
+ require File.expand_path(File.join(current_directory, 'pinterest_profile'))
+ require File.expand_path(File.join(current_directory, 'emails'))
+ require File.expand_path(File.join(current_directory, 'phone_numbers'))
+ require File.expand_path(File.join(current_directory, 'redirected_to'))
+
+ # Main scraping class
+ class BrilliantWebScraper
+   extend ScrapeHelper
+   extend ScrapeRequest
+   extend Title
+   extend MetaDescription
+   extend OrgDescription
+   extend TwitterDescription
+   extend TwitterProfile
+   extend LinkedinProfile
+   extend FacebookProfile
+   extend YoutubeChannel
+   extend InstagramProfile
+   extend VimeoProfile
+   extend PinterestProfile
+   extend Emails
+   extend PhoneNumbers
+   extend RedirectedTo
+
+   class << self
+     def new(url, connection_timeout = 10, read_timeout = 10)
+       perform_scrape(url, connection_timeout, read_timeout)
+     end
+   end
+ end
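Note that `new` is overridden: it returns whatever `perform_scrape` produces rather than a `BrilliantWebScraper` instance. A minimal call with the two timeouts (presumably seconds, given rest-client's conventions) spelled out, using a placeholder URL:

```ruby
require 'brilliant_web_scraper'

# Second and third arguments default to 10; shown explicitly here.
results = BrilliantWebScraper.new('https://example.com', 10, 10)
```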
data/lib/parsers/description_helper.rb ADDED
@@ -0,0 +1,28 @@
+ # frozen_string_literal: true
+
+ # Parses meta, twitter, and og description tags
+ module DescriptionHelper
+   include UnescapeHtmlHelper
+
+   private
+
+   def scrape_description(response, regexes)
+     return if response.to_s.empty? || regexes.empty?
+
+     description = nil
+     regexes.each do |regex|
+       description = response.scan(regex).flatten.compact
+       description = parse_description(description)
+       break unless description.nil?
+     end
+     unescape_html(description)
+   end
+
+   def parse_description(descriptions)
+     return if descriptions.nil? || descriptions.empty?
+
+     descriptions = descriptions.reject { |x| x.nil? || x.empty? }
+     descriptions = descriptions.map { |x| unescape_html(x) }
+     descriptions.find { |x| (x !~ /^\s*[|-]?\s*$/) }
+   end
+ end
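`scrape_description` walks the regexes in order and stops at the first one that yields a usable (non-blank) description. A hypothetical consumer of the helper, with a made-up module name and regex, assuming the gem's parser files are loaded:

```ruby
require 'brilliant_web_scraper' # loads DescriptionHelper and UnescapeHtmlHelper

# Illustrative only: not a module shipped with the gem.
module SampleDescription
  include DescriptionHelper

  def grep_sample_description(response)
    return if response.nil? || response.empty?

    # First (and here, only) regex that yields a non-blank match wins.
    scrape_description(response, [/<meta\s+name="sample"\s+content="([^"]*)"/im])
  end
end
```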
data/lib/parsers/emails.rb ADDED
@@ -0,0 +1,30 @@
+ # frozen_string_literal: true
+
+ current_directory = File.dirname(__FILE__)
+ require File.expand_path(File.join(current_directory, 'unescape_html_helper'))
+
+ # Parses emails from html string
+ module Emails
+   include UnescapeHtmlHelper
+   def grep_emails(response)
+     return if response.nil? || response.empty?
+
+     first_regex = /(?im)mailto:\s*([^\?"',\\<>\s]+)/
+     second_regex = %r{(?im)["'\s><\/]*([\w._%-]+@(?!(?:example|e?mail|domain|company|your(?:domain|company|email)|address|emailad?dress|yyy|test)\.)[\w._%-]+\.(?!png|jpe?g|tif|svg)[A-Z]{2,3})["'\s><]}
+     first_set = response.scan(first_regex).flatten.compact
+     first_set = get_processed_emails(first_set)
+     second_set = response.scan(second_regex).flatten.compact
+     second_set = get_processed_emails(second_set)
+     (first_set | second_set).compact.map(&:downcase).uniq
+   end
+
+   def get_processed_emails(email_set)
+     return [] if email_set.nil? || email_set.empty?
+
+     unescaped_emails = email_set.map { |email| unescape_html(email) }
+     return [] if unescaped_emails.empty?
+
+     email_match_regex = /[\w._%-]+@(?!(?:example|e?mail|domain|company|your(?:domain|company|email)|address|emailad?dress|yyy|test)\.)[\w._%-]+\.(?!png|jpe?g|tif|svg)[A-Z]{2,3}/im
+     unescaped_emails.select { |data| data =~ email_match_regex }
+   end
+ end
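Because `grep_emails` is an ordinary module method, it can be tried directly by extending an object with the module. A sketch with a made-up snippet of HTML:

```ruby
require 'brilliant_web_scraper'

scraper = Object.new.extend(Emails)
html = %(<a href="mailto:info@acme.com">Email us</a>)
# mailto: capture and the bare-email regex both find the same address.
scraper.grep_emails(html) # => ["info@acme.com"]
```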
data/lib/parsers/facebook_profile.rb ADDED
@@ -0,0 +1,11 @@
+ # frozen_string_literal: true
+
+ # Grep Facebook profiles
+ module FacebookProfile
+   def grep_facebook_profile(response)
+     return if response.nil? || response.empty?
+
+     facebook_url_regex = /(https?:\/\/(?:www\.)?(?:facebook|fb)\.com\/(?!tr\?|(?:[\/\w\d]*(?:photo|sharer?|like(?:box)?|offsite_event|plugins|permalink|home|search))\.php|\d+\/fbml|(?:dialog|hashtag|plugins|sharer|login|recover|security|help|v\d+\.\d+)\/|(?:privacy|#|your-profile|yourfacebookpage)\/?|home\?)[^"'<>\&\s]+)/im
+     response.scan(facebook_url_regex).flatten.compact.uniq
+   end
+ end
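The profile parsers that follow all share this shape: one guard, one regex with negative lookaheads for non-profile paths, one `scan`. The same extend trick demonstrates them; here with a made-up Facebook URL:

```ruby
require 'brilliant_web_scraper'

scraper = Object.new.extend(FacebookProfile)
html = %(<a href="https://www.facebook.com/acmeinc">Follow us</a>)
scraper.grep_facebook_profile(html) # => ["https://www.facebook.com/acmeinc"]
```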
data/lib/parsers/instagram_profile.rb ADDED
@@ -0,0 +1,11 @@
+ # frozen_string_literal: true
+
+ # Grep Instagram profiles
+ module InstagramProfile
+   def grep_instagram_profile(response)
+     return if response.nil? || response.empty?
+
+     instagram_regex = %r{(?im)(https?:\/\/(?:www\.)?+instagram\.com\/(?!#|%|"|'|(?:explore|p)\/).+?[^"'<>\s?&\/]+)}
+     response.scan(instagram_regex).flatten.compact.uniq
+   end
+ end
data/lib/parsers/linkedin_profile.rb ADDED
@@ -0,0 +1,11 @@
+ # frozen_string_literal: true
+
+ # Grep LinkedIn profile
+ module LinkedinProfile
+   def grep_linkedin_profile(response)
+     return if response.nil? || response.empty?
+
+     linkedin_profile_regex = %r{(?im)(https:\/\/www\.linkedin\.com\/company\/[^"'\?<>\s\/]+)}
+     response.scan(linkedin_profile_regex).flatten.compact.uniq
+   end
+ end
data/lib/parsers/meta_description.rb ADDED
@@ -0,0 +1,13 @@
+ # frozen_string_literal: true
+
+ # Grep description in meta tag with attribute name='description'
+ module MetaDescription
+   include DescriptionHelper
+   def grep_meta_description(response)
+     return if response.nil? || response.empty?
+
+     first_regex = %r{(?im)<meta\s+[\w\s"'=-]*(?:name|itemprop)\s*=\s*(?:'|")?\s*description\s*(?:'|")?[\w\s"'=-]*content\s*=\s*(?:(?:"([^"]*)")|(?:'([^']*)'))[\w\s"'=-]*[\/>]}
+     second_regex = %r{(?im)<meta\s+[\w\s"'=-]*content\s*=\s*(?:(?:"([^"]*)")|(?:'([^']*)'))[\w\s"'=-]*(?:name|itemprop)\s*=\s*(?:'|")?\s*description\s*(?:'|")?[\w\s"'=-]*[\/>]}
+     scrape_description(response, [first_regex, second_regex])
+   end
+ end
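The two regexes cover both attribute orders: `name`/`itemprop` before `content`, and after it. A quick check against a minimal made-up tag:

```ruby
require 'brilliant_web_scraper'

scraper = Object.new.extend(MetaDescription)
html = %(<meta name="description" content="We build rockets." />)
scraper.grep_meta_description(html) # => "We build rockets."
```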
data/lib/parsers/org_description.rb ADDED
@@ -0,0 +1,13 @@
+ # frozen_string_literal: true
+
+ # Greps description from meta tag with attribute og:description
+ module OrgDescription
+   include DescriptionHelper
+   def grep_org_description(response)
+     return if response.nil? || response.empty?
+
+     first_regex = %r{(?im)<meta\s+[\w\s"'=-]*(?:property|itemprop)\s*=\s*(?:'|")?\s*og:description\s*(?:'|")?[\w\s"'=-]*content\s*=\s*(?:(?:"([^"]*)")|(?:'([^']*)'))[\w\s"'=-]*[\/>]}
+     second_regex = %r{(?im)<meta\s+[\w\s"'=-]*content\s*=\s*(?:(?:"([^"]*)")|(?:'([^']*)'))[\w\s"'=-]*(?:property|itemprop)\s*=\s*(?:'|")?\s*og:description\s*(?:'|")?[\w\s"'=-]*[\/>]}
+     scrape_description(response, [first_regex, second_regex])
+   end
+ end
data/lib/parsers/phone_numbers.rb ADDED
@@ -0,0 +1,34 @@
+ # frozen_string_literal: true
+
+ # Grep phone numbers from 'href=tel:' attributes
+ module PhoneNumbers
+   include UnescapeHtmlHelper
+   def grep_phone_numbers(response)
+     return if response.nil? || response.empty?
+
+     phone_number_regex = %r{(?im)href\s*=\s*(?:"|')?\s*tel:\s*(?:https?:)?\/*(?!#(?:"|'))([^"'\/<>\{\[]+)}
+     phone_numbers = response.scan(phone_number_regex).flatten.uniq
+     get_processed_phone_numbers(phone_numbers)
+   end
+
+   private
+
+   def get_processed_phone_numbers(phone_numbers)
+     return [] if phone_numbers.nil? || phone_numbers.empty?
+
+     unescaped_contacts = phone_numbers.map { |phone_number| unescape_html(phone_number) }
+     good_phone_numbers = []
+     unescaped_contacts.each do |x|
+       next if x !~ /\d+/
+
+       if x =~ /\w+=/
+         good_phone_numbers << x.gsub(/\w+=.*/, '')
+       else
+         good_phone_numbers << x
+       end
+     end
+     good_phone_numbers.uniq
+   end
+ end
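Only `tel:` hrefs are considered; entries without digits are dropped, and anything from a `key=` pair onward is cut off. An illustrative run with a fictional number:

```ruby
require 'brilliant_web_scraper'

scraper = Object.new.extend(PhoneNumbers)
html = %(<a href="tel:+1-555-0100">Call us</a>)
scraper.grep_phone_numbers(html) # => ["+1-555-0100"]
```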
data/lib/parsers/pinterest_profile.rb ADDED
@@ -0,0 +1,11 @@
+ # frozen_string_literal: true
+
+ # Grep Pinterest profile
+ module PinterestProfile
+   def grep_pinterest_profile(response)
+     return if response.nil? || response.empty?
+
+     pinterest_regex = %r{(?im)(https?:\/\/[\w\.]*pinterest\.com\/(?!"|'|\?|#|cookies(?:"|')|(?:pin|v3|js|feed)\/)[^"'<>?&\s\/]+)}
+     response.scan(pinterest_regex).flatten.compact.uniq
+   end
+ end
data/lib/parsers/redirected_to.rb ADDED
@@ -0,0 +1,29 @@
+ # frozen_string_literal: true
+
+ # Fetch the latest URL of the given website
+ module RedirectedTo
+   def grep_redirected_to_url(response)
+     return if response.nil? || response.empty?
+
+     patterns = [
+       %r{(?im)<link\s+[\s\w="'-]*rel\s*=\s*(?:"|')canonical(?:"|')[\s\w='"-]*?\s+href\s*=\s*(?:"|')([^"']*)(?:"|')[\s\w='"-]*?(?:>|\/>)},
+       %r{(?im)<link\s+[\s\w='"-]*href\s*=\s*(?:"|')([^'"]*)(?:"|')[\s\w='"-]*?rel\s*=\s*(?:"|')\s*canonical\s*(?:"|')[\s\w='"-]*(?:>|\/>)},
+       %r{(?im)<meta\s+[\s\w="'-]*property=\s*(?:'|")\s*og:url\s*(?:'|")[\s\w="'-]*content=\s*(?:'|")([^'"]*)(?:'|")[\s\w="'-]*(?:>|\/>)},
+       %r{(?im)<meta\s+[\s\w"'=-]*content\s*=\s*(?:'|")([^'"]*)(?:'|")[\s\w"'=-]*property\s*=\s*(?:'|")\s*og:url\s*(?:'|")[\s\w"'=-]*(?:>|\/>)}
+     ]
+     url = nil
+     patterns.each do |pattern|
+       web_urls = response.scan(pattern).flatten
+       url = parser(web_urls)
+       break unless url.nil?
+     end
+     url
+   end
+
+   private
+
+   def parser(urls)
+     urls.find { |x| x =~ %r{(?im)^\s*(?:https*)?:?(?:\/\/)?\w+[.&%-]} }
+   end
+ end
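Canonical `<link>` tags are tried before `og:url` metas, so a page carrying both reports its canonical URL. A sketch with made-up markup:

```ruby
require 'brilliant_web_scraper'

scraper = Object.new.extend(RedirectedTo)
html = %(<link rel="canonical" href="https://example.com/about" />)
scraper.grep_redirected_to_url(html) # => "https://example.com/about"
```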
data/lib/parsers/title.rb ADDED
@@ -0,0 +1,13 @@
+ # frozen_string_literal: true
+
+ # Grep title from the very first title tag
+ module Title
+   include UnescapeHtmlHelper
+   def grep_title(response)
+     return if !response.is_a?(String) || response.empty?
+
+     title_regex = /<\s*title.*?>(.*?)<?\s*\/?title\s*?>/im
+     title = response[title_regex, 1]&.strip
+     unescape_html(title) unless title.nil? || title.empty?
+   end
+ end
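Only the first `<title>` is taken; surrounding whitespace is stripped and HTML entities are decoded. For example:

```ruby
require 'brilliant_web_scraper'

scraper = Object.new.extend(Title)
scraper.grep_title('<html><title> Acme &amp; Co </title></html>') # => "Acme & Co"
```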
data/lib/parsers/twitter_description.rb ADDED
@@ -0,0 +1,13 @@
+ # frozen_string_literal: true
+
+ # Grep Twitter description from attribute `twitter:description`
+ module TwitterDescription
+   include DescriptionHelper
+   def grep_twitter_description(response)
+     return if response.nil? || response.empty?
+
+     first_regex = %r{(?im)<meta\s+[\w\s"'=-]*(?:name|itemprop)\s*=\s*(?:'|")?\s*twitter:description\s*(?:'|")?[\w\s"'=-]*content\s*=\s*(?:(?:"([^"]*)")|(?:'([^']*)'))[\w\s"'=-]*[\/>]}
+     second_regex = %r{(?im)<meta\s+[\w\s"'=-]*content\s*=\s*(?:(?:"([^"]*)")|(?:'([^']*)'))[\w\s"'=-]*(?:name|itemprop)\s*=\s*(?:'|")?\s*twitter:description\s*(?:'|")?[\w\s"'=-]*[\/>]}
+     scrape_description(response, [first_regex, second_regex])
+   end
+ end
data/lib/parsers/twitter_profile.rb ADDED
@@ -0,0 +1,11 @@
+ # frozen_string_literal: true
+
+ # Grep Twitter profile
+ module TwitterProfile
+   def grep_twitter_profile(response)
+     return if response.nil? || response.empty?
+
+     twitter_regex = %r{(?im)(https?:\/\/(?:www\.)?twitter\.com\/(?!(?:share|download|search|home|login|privacy)(?:\?|\/|\b)|(?:hashtag|i|javascripts|statuses|#!|intent)\/|(?:#|'|%))[^"'&\?<>\s\\]+)}
+     response.scan(twitter_regex).flatten.compact.uniq
+   end
+ end
data/lib/parsers/unescape_html_helper.rb ADDED
@@ -0,0 +1,17 @@
+ # frozen_string_literal: true
+
+ # Decode HTML & URL encodings
+ module UnescapeHtmlHelper
+   private
+
+   def unescape_html(text)
+     return if !text.is_a?(String) || text.empty?
+
+     unescaped_html_text = CGI.unescapeHTML(text)
+     if unescaped_html_text =~ /%[a-z0-9]{2}/i
+       plus_sign_preserved_text = unescaped_html_text.gsub(/\+/, '%2B')
+       unescaped_html_text = CGI.unescape(plus_sign_preserved_text)
+     end
+     unescaped_html_text.strip
+   end
+ end
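The `%2B` round-trip exists because `CGI.unescape` would otherwise turn literal plus signs into spaces. The method is private, so poking at it needs `send`; input made up:

```ruby
require 'brilliant_web_scraper'

helper = Object.new.extend(UnescapeHtmlHelper)
# '&amp;' is HTML-decoded; '%20' triggers URL-decoding; '+' survives intact.
helper.send(:unescape_html, 'A+B &amp; C%20Co') # => "A+B & C Co"
```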
data/lib/parsers/vimeo_profile.rb ADDED
@@ -0,0 +1,11 @@
+ # frozen_string_literal: true
+
+ # Grep Vimeo social profile
+ module VimeoProfile
+   def grep_vimeo_profile(response)
+     return if response.nil? || response.empty?
+
+     vimeo_regex = %r{(?im)(https?:\/\/(?:www\.)?vimeo\.com\/(?!upgrade|features|enterprise|upload|api)\/?[^"'\&\?<>\s]+)}
+     response.scan(vimeo_regex).flatten.compact.uniq
+   end
+ end
data/lib/parsers/youtube_channel.rb ADDED
@@ -0,0 +1,29 @@
+ # frozen_string_literal: true
+
+ # Grep YouTube channels
+ module YoutubeChannel
+   def grep_youtube_channel(response)
+     return if response !~ %r{(?im)https?:\/\/(?:www\.)?youtube\.com\/}
+
+     first_regex = %r{(?im)(https?:\/\/(?:www\.)?youtube\.com\/(?!\?gl=\w{2}|(?:embed|feeds)\/|(?:player_api|iframe_api)(?:"|'|\/|\?)|watch\?|user\/#)[^"'\&<>\s]+)}
+     second_regex = %r{(?im)(https?:\/\/(?:www\.)?youtube\.com\/watch\?\S*v=[^<>&'"]+)}
+     third_regex = %r{(?im)(https?:\/\/(?:www\.)?youtube\.com\/embed\/(?!id|{|}|\[|\]|\$|\?|\\|%|\+)[^"'\?<>\s]+)}
+     youtube_channels = scrape_profile(response, [first_regex, second_regex, third_regex])
+     youtube_channels.compact.uniq
+   end
+
+   private
+
+   def scrape_profile(response, regexes)
+     return if response.to_s.empty? || regexes.empty?
+
+     profiles = []
+     regexes.each do |regex|
+       profiles = response.scan(regex).flatten.compact
+       break unless profiles.empty?
+     end
+     return [] if profiles.none?
+
+     profiles
+   end
+ end
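Channel-style URLs take priority; `watch?v=` and `embed/` links are fallbacks tried only when the first pattern finds nothing. A sketch with a made-up channel ID:

```ruby
require 'brilliant_web_scraper'

scraper = Object.new.extend(YoutubeChannel)
html = %(<a href="https://www.youtube.com/channel/UC0123456789">Our channel</a>)
scraper.grep_youtube_channel(html)
# => ["https://www.youtube.com/channel/UC0123456789"]
```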