bots 1.0.9 → 1.0.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/browser.rb +6 -3
- data/lib/scraper.rb +96 -21
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 3d4d967dc18df73987b5f5d812fc91695b171b4ed5af8f274ca1adc2c3b522d4
+  data.tar.gz: dd12b36f5e74842b985f433130d6d5c1fb6dbc0c5d6c8f973bb7c243d72f7891
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a9c99e15e50f3ea6b78f7be2011a44f927b3f9f8c3ac9aeea5d54232559b6af4b72581aeb17b5cf23afc7701ab9f4ef2eaffd4e6283187f7d1f682b22dc1fd55
+  data.tar.gz: d1122c000ee843f027a6b0ab788b44fc6e2859e99420f77496764e874b084eb78507fc45355ee86b390c832b0faacb55185c02331c64565a8c7437f2721eaace
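The digests above are what the registry records for the release artifacts. As a hedged aside, they can be recomputed locally with Ruby's standard Digest library; this minimal sketch assumes the extracted metadata.gz and data.tar.gz files sit in the current directory (hypothetical paths, not part of the diff):

require 'digest'

# recompute the checksums the registry publishes for a release
puts "metadata.gz SHA256: #{Digest::SHA256.file('metadata.gz').hexdigest}"
puts "data.tar.gz SHA512: #{Digest::SHA512.file('data.tar.gz').hexdigest}"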
data/lib/browser.rb
CHANGED
@@ -7,15 +7,13 @@ module BlackStack
     def initialize()
       self.lockfile = File.open(LOCKFILENAME, 'w+')
 
-      n =
+      n = 20 # timeout in seconds
 
       # wait the lock file /tmp/blackstack.bots.browser.lock
       self.lockfile.flock(File::LOCK_EX)
       begin
         # get list of PID of all opened chrome browsers, before launching this one
         pids_before = `pgrep -f chrome`.split("\n")
-        # track # of chrome processes
-        #print "(#{pids_before.size})"
         # setup driver
         client = Selenium::WebDriver::Remote::Http::Default.new
         begin
@@ -25,6 +23,11 @@ module BlackStack
         end
         options = Selenium::WebDriver::Chrome::Options.new
         options.add_argument('--headless')
+        # setup user agent with-out the keyword "headless"
+        # otherwise, our scraper may be detected as a bot and blocked
+        options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64)")
+        #+"AppleWebKit/537.36 (KHTML, like Gecko)"
+        #+"Chrome/87.0.4280.141 Safari/537.36")
 
         # Add this parameter to run Chrome from a root user.
         # https://stackoverflow.com/questions/50642308/webdriverexception-unknown-error-devtoolsactiveport-file-doesnt-exist-while-t
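The user-agent override above exists because headless Chrome advertises itself as "HeadlessChrome" in its default user agent, which many sites use to detect and block bots. A minimal standalone sketch of the same setup, assuming a recent selenium-webdriver gem (4.x); the gem's full driver wiring is not shown in this diff:

require 'selenium-webdriver'

options = Selenium::WebDriver::Chrome::Options.new
options.add_argument('--headless')
# override the default UA so "HeadlessChrome" never reaches the server
options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64)')

driver = Selenium::WebDriver.for(:chrome, options: options)
driver.get 'https://example.com'
puts driver.execute_script('return navigator.userAgent') # no "HeadlessChrome"
driver.quit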
data/lib/scraper.rb
CHANGED
@@ -1,7 +1,7 @@
 module BlackStack
   module Bots
     class Scraper
-      attr_accessor :domain, :links, :timeout, :load_wait_time, :stop_scraping_at_page_number, :stop_scraping_at_match_number
+      attr_accessor :browser, :domain, :links, :timeout, :load_wait_time, :stop_scraping_at_page_number, :stop_scraping_at_match_number
       # auxiliar array of links that I have extracted links from
       attr_accessor :links_processed
 
@@ -14,24 +14,94 @@ module BlackStack
         #self.agent.agent.http.verify_mode = OpenSSL::SSL::VERIFY_NONE
         self.links = []
         self.links_processed = []
+        self.browser = BlackStack::Bots::Browser.new()
       end # def initialize
 
-      def get_links_from_sitemap(l=nil)
-
+      def get_links_from_sitemap(stop_at=100, l=nil)
+        max_allowed_timeout_errors = 3
+        timeout_errors = 0
+        max_links = self.links.size + stop_at
+
         l.logs "Scrape sitemaps... "
         begin
+          l.logs "get_sitemaps from #{self.domain}... "
+
           # download the robots.txt
           url = "http://#{domain}/robots.txt"
+
           # get the content of robots.txt from url
-
+          browser.goto url
+          s = browser.text
+
           # get the sitemap
           sitemaps = s.split("\n").select { |line| line =~ /^sitemap:/i }.map { |a| a.downcase.split('sitemap:').last.strip }.uniq
-
-
-
-
-
+          processed = []
+          to_process = sitemaps - processed
+          l.logf sitemaps.size == 0 ? 'no sitemap found'.yellow : "#{sitemaps.size} sitemaps found".green # get_links
+
+          # while there are sitemaps to process
+          while to_process.size > 0 && timeout_errors < max_allowed_timeout_errors && max_links >= self.links.size
+            to_process.each { |b|
+              l.logs "go to #{b}... "
+              begin
+                browser.goto b
+                l.done
+
+                l.logs "parsing #{b}... "
+                s = browser.text
+                # extract all URLs
+                doc = Nokogiri::HTML(s)
+                l.done
+
+                # get the value of all <loc> tags with .xml extension
+                l.logs "get_sitemaps from #{b}... "
+                sitemaps += doc.xpath('//loc').map(&:text).select { |s| s =~ /\.xml$/ }.map { |s| s.downcase }
+                sitemaps.uniq!
+                l.logf sitemaps.size == 0 ? 'no sitemap found'.yellow : "#{sitemaps.size} sitemaps found".green # get_links
+
+                # get the value of all <loc> tags without .xml extension
+                l.logs "get_links from #{b}..."
+                self.links += doc.xpath('//loc').map(&:text).select { |s| s !~ /\.xml$/ }.map { |s| s.downcase }
+                self.links.uniq!
+                l.logf self.links.size == 0 ? 'no links found'.yellow : "#{self.links.size} links found".green # get_links
+
+                # add the sitemap to the list of processed sitemaps
+                processed << b
+
+                # reset timeout errors
+                timeout_errors = 0
+
+                # break if I exceeded the limit of links
+                break if max_links <= self.links.size
+
+              rescue Net::ReadTimeout => e
+                l.logf "Timeout Error: #{e.message}".red
+
+                l.logs "Restarting browser..."
+                browser.close if browser
+                self.browser = BlackStack::Bots::Browser.new()
+                l.done
+
+                timeout_errors += 1
+                break if timeout_errors >= max_allowed_timeout_errors
+
+              rescue => e
+                l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
+              end
+            }
+            # update the list of sitemaps to process
+            processed.uniq!
+            to_process = sitemaps - processed
+          end
          l.logf sitemaps.size == 0 ? 'no sitemap found'.yellow : "#{sitemaps.size} sitemaps found".green # get_links
+
+        rescue Net::ReadTimeout => e
+          l.logf "Timeout Error: #{e.message}".red
+
+          l.logs "Restarting browser..."
+          browser.close if browser
+          self.browser = BlackStack::Bots::Browser.new()
+          l.done
        rescue => e
          l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
        end
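The new get_links_from_sitemap drives the shared browser so robots.txt and the sitemaps are fetched with the stealth user agent, and it keeps growing the sitemap list as nested sitemap indexes are discovered. A stripped-down sketch of the same walk without the browser, logging, or timeout bookkeeping, assuming open-uri and Nokogiri and a hypothetical domain (note: the gem parses the browser's rendered text with Nokogiri::HTML, while this sketch fetches the XML directly):

require 'open-uri'
require 'nokogiri'

domain = 'example.com' # hypothetical
robots = URI.open("http://#{domain}/robots.txt").read

# "Sitemap:" entries in robots.txt seed the crawl
sitemaps = robots.split("\n")
                 .select { |line| line =~ /^sitemap:/i }
                 .map { |line| line.split(/sitemap:/i).last.strip }

links, processed = [], []
until (to_process = sitemaps - processed).empty?
  to_process.each do |sm|
    doc = Nokogiri::XML(URI.open(sm).read)
    doc.remove_namespaces! # sitemap XML declares a default namespace
    locs = doc.xpath('//loc').map { |n| n.text.strip.downcase }
    sitemaps |= locs.select { |u| u.end_with?('.xml') } # nested sitemap indexes
    links    |= locs.reject { |u| u.end_with?('.xml') } # page URLs
    processed << sm
  end
end

puts "#{links.size} links found"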
@@ -42,14 +112,12 @@ module BlackStack
         l = BlackStack::DummyLogger.new(nil) if l.nil?
         l.logs "get_links (#{url})... "
         aux = []
-        browser = nil
         begin
           # trim url
           url = url.strip
           # get domain of the url using open-uri
           domain = URI.parse(url).host
           # visit the main page of the website
-          browser = BlackStack::Bots::Browser.new()
           browser.goto url
           sleep(self.load_wait_time) # wait 10 seconds for javascript content to load
           # get the self.links to the pages of the website
@@ -76,16 +144,23 @@ module BlackStack
         l.logf "done".green + " (#{a} links found, #{b} new, #{self.links.size} total)" # get_links
       rescue Net::ReadTimeout => e
         l.logf "Timeout Error: #{e.message}".red
+
+        l.logs "Restarting browser..."
+        browser.close if browser
+        self.browser = BlackStack::Bots::Browser.new()
+        l.done
       rescue => e
         l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
-      ensure
-        browser.close if browser
       end
       self.links += aux
     end # def get_links_from_url
 
-    def get_links(stop_at=
+    def get_links(stop_at=100, l=nil)
       l = BlackStack::DummyLogger.new(nil) if l.nil?
+
+      # get links from the sitemap
+      self.get_links_from_sitemap(stop_at, l)
+=begin
       # working with root url
       url = "http://#{self.domain}/"
       self.links << url if self.links.select { |link| link == url }.empty?
@@ -99,13 +174,11 @@ module BlackStack
           self.links_processed << link
         }
       end # while
-
-      self.get_links_from_sitemap(l)
+=end
     end # def get_links
 
     def find_keywords(a, stop_at=25, stop_on_first_link_found=false, l=nil)
       pages = []
-      browser = nil
       l = BlackStack::DummyLogger.new(nil) if l.nil?
       # iterate the links
       j = 0
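Two patterns recur across these hunks: the old page-by-page crawl in get_links is fenced off between =begin/=end so only the sitemap walk runs, and every rescue for Net::ReadTimeout now repeats the same recovery block (log, close the wedged browser, recreate it) instead of the old ensure that closed the browser after each call. That recovery block factors out naturally; a hypothetical helper that could live on Scraper (with_browser_restart is not part of the gem):

require 'net/protocol' # defines Net::ReadTimeout

# hypothetical helper: run a block and, on a read timeout, swap in a
# fresh browser and retry, mirroring the recovery block the diff
# repeats in each rescue clause
def with_browser_restart(max_retries = 3)
  retries = 0
  begin
    yield
  rescue Net::ReadTimeout
    self.browser.close if self.browser
    self.browser = BlackStack::Bots::Browser.new()
    retries += 1
    retry if retries < max_retries
    raise
  end
end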
@@ -115,7 +188,6 @@ module BlackStack
         l.logs "#{j.to_s}. find_keywords (#{link})... "
         begin
           # get the page
-          browser = BlackStack::Bots::Browser.new()
           browser.goto link
           sleep(self.load_wait_time) # wait 10 seconds for javascript content to load
           # get page body content in plain text
@@ -142,11 +214,14 @@ module BlackStack
           l.logf i == 0 ? 'no keywords found'.yellow : "#{i} keywords found".green # find_keywords
 
         rescue Net::ReadTimeout => e
-          l.logf "Timeout Error: #{e.message}".red
+          l.logf "Timeout Error: #{e.message}".red
+
+          l.logs "Restarting browser..."
+          browser.close if browser
+          self.browser = BlackStack::Bots::Browser.new()
+          l.done
         rescue => e
           l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
-        ensure
-          browser.close if browser
         end
       } # each
       # return
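In find_keywords only the error handling changed; the context lines show the surrounding loop (visit each collected link with the shared browser, wait for JavaScript, scan the body text, count matches). The matching logic itself is not shown in this diff, so the following is a simplified, hypothetical sketch of a per-page keyword scan, not the gem's actual implementation:

# hypothetical per-page scan: `body` is rendered page text,
# `keywords` is the array passed as `a` to find_keywords
def count_keywords(body, keywords)
  text = body.downcase
  keywords.count { |k| text.include?(k.downcase) }
end

count_keywords('We are hiring! See our pricing page.', ['pricing', 'careers']) # => 1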