bots 1.0.5 → 1.0.7
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
- checksums.yaml +4 -4
- data/lib/bots.rb +3 -1
- data/lib/scraper.rb +37 -32
- metadata +22 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 46e3e095564f34c1f9a0375dd4fefb669d4920b23a8c8d925b5a598250cc9ee2
+  data.tar.gz: 65075cef5bebe85cfbbd5098363ece05a19673144c3ba487e6c844523dc5195f
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: d319b1aee567eb3bad89a2611965409af49a3e7d00997c0c3c6b0060ea68fe60c82c4d4b4c4e51bcb4efb7b5393c0fe5438ec0b02bd1ee25b42d1354265dddf3
+  data.tar.gz: abe09849e8951f87a7012096be40bade937d658a1ff64852ea4db96941a90ae0cf5b82e14967dd6010403fc40ecd5bfdaa9ebf70e7e93f2d084d0a3c71f38948
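These checksums cover the metadata.gz and data.tar.gz archives packed inside bots-1.0.7.gem. A minimal Ruby sketch for checking one of them locally (the file path is an assumption; a .gem file is a plain tar archive, so extracting it yields data.tar.gz):

require 'digest'

# Assumed local path: data.tar.gz extracted from bots-1.0.7.gem.
path     = 'data.tar.gz'
expected = '65075cef5bebe85cfbbd5098363ece05a19673144c3ba487e6c844523dc5195f' # SHA256 value published above

actual = Digest::SHA256.file(path).hexdigest
puts actual == expected ? 'checksum OK' : "checksum mismatch: #{actual}"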
data/lib/bots.rb
CHANGED
@@ -7,8 +7,10 @@ require 'csv'
 require 'pry'
 require 'sitemap-parser'
 require 'timeout'
+require 'watir'
 
 require_relative './base'
 require_relative './google'
 require_relative './scraper'
-require_relative './indeed'
+require_relative './indeed'
+require_relative './browser'
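Version 1.0.7 adds a dependency on watir and a new local file, ./browser, whose contents are not part of this diff. Judging only from how scraper.rb calls it below (goto, links, title, body, close), a thin Watir-backed wrapper along these lines would fit; this is an illustrative sketch, not the gem's actual browser.rb:

require 'watir'

module BlackStack
    module Bots
        # Illustrative sketch only: the real data/lib/browser.rb is not shown in this diff.
        class Browser
            attr_accessor :driver

            def initialize
                # Watir drives a real browser (Chrome here) through Selenium WebDriver,
                # so JavaScript-rendered content loads before scraping.
                self.driver = Watir::Browser.new(:chrome)
            end

            def goto(url); driver.goto(url); end   # navigate to a page
            def links;     driver.links;     end   # collection of <a> elements (each responds to #href)
            def title;     driver.title;     end   # page title
            def body;      driver.body;      end   # <body> element (responds to #text and #html)
            def close;     driver.close;     end   # quit the underlying browser
        end
    end
end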
data/lib/scraper.rb
CHANGED
@@ -1,31 +1,21 @@
 module BlackStack
     module Bots
-        class Scraper
-            attr_accessor :domain, :links
+        class Scraper
+            attr_accessor :domain, :links, :timeout, :load_wait_time, :stop_scraping_at_page_number, :stop_scraping_at_match_number
             # auxiliar array of links that I have extracted links from
             attr_accessor :links_processed
 
-            def initialize(init_domain, h)
-                super(h)
+            def initialize(init_domain, timeout, h)
                 self.domain = init_domain
+                self.timeout = timeout || 10
+                self.load_wait_time = 3
+                self.stop_scraping_at_page_number = 25
+                self.stop_scraping_at_match_number = 1
                 #self.agent.agent.http.verify_mode = OpenSSL::SSL::VERIFY_NONE
                 self.links = []
                 self.links_processed = []
             end # def initialize
 
-            def get(url)
-                # initialize mechanize agent
-                self.agent = Mechanize.new
-                # set a proxy with user and password
-                self.port_index += 1
-                self.port_index = 0 if self.port_index >= self.ports.length
-                self.agent.set_proxy(self.ip, self.ports[self.port_index], self.user, self.password) if self.proxy?
-                self.agent.open_timeout = 5
-                self.agent.read_timeout = 5
-                # return
-                return Timeout::timeout(5) { self.agent.get(url) }
-            end
-
             def get_links_from_sitemap(l=nil)
                 i = 0
                 l.logs "Scrape sitemaps... "
@@ -33,17 +23,17 @@ module BlackStack
                     # download the robots.txt
                     url = "http://#{domain}/robots.txt"
                     # get the content of robots.txt from url
-                    s = Timeout::timeout(
+                    s = Timeout::timeout(self.timeout) { URI.open(url).read }
                     # get the sitemap
                     sitemaps = s.split("\n").select { |line| line =~ /^sitemap:/i }.map { |a| a.downcase.split('sitemap:').last.strip }.uniq
                     sitemaps.each { |b|
-                        parser = Timeout::timeout(
-                        self.links += Timeout::timeout(
+                        parser = Timeout::timeout(self.timeout) { SitemapParser.new b }
+                        self.links += Timeout::timeout(self.timeout) { parser.to_a }
                         self.links.uniq!
                     }
                     l.logf sitemaps.size == 0 ? 'no sitemap found'.yellow : "#{sitemaps.size} sitemaps found".green # get_links
                 rescue => e
-                    l.logf "Error: #{e.message.split("\n").first[0..100]}
+                    l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
                 end
             end
 
@@ -51,16 +41,19 @@ module BlackStack
             def get_links_from_url(url, l=nil)
                 l = BlackStack::DummyLogger.new(nil) if l.nil?
                 l.logs "get_links (#{url})... "
+                aux = []
+                browser = nil
                 begin
-                    aux = []
                     # trim url
                     url = url.strip
                     # get domain of the url using open-uri
                     domain = URI.parse(url).host
                     # visit the main page of the website
-
+                    browser = BlackStack::Bots::Browser.new()
+                    browser.goto url
+                    sleep(self.load_wait_time) # wait 10 seconds for javascript content to load
                     # get the self.links to the pages of the website
-                    aux =
+                    aux = browser.links.map(&:href)
                     # remove non-string elements
                     aux = aux.select { |link| link.is_a?(String) }
                     # remove # from the self.links
@@ -80,11 +73,15 @@ module BlackStack
                     aux = aux.select { |link| !self.links.include?(link) }
                     b = aux.size
                     # add new links to self.links
-                    self.links += aux
                     l.logf "done".green + " (#{a} links found, #{b} new, #{self.links.size} total)" # get_links
+                rescue Net::ReadTimeout => e
+                    l.logf "Timeout Error: #{e.message}".red
                 rescue => e
-                    l.logf "Error: #{e.message.split("\n").first[0..100]}
+                    l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
+                ensure
+                    browser.close if browser
                 end
+                self.links += aux
             end # def get_links_from_url
 
             def get_links(stop_at=10, l=nil)
@@ -106,8 +103,9 @@ module BlackStack
                 self.get_links_from_sitemap(l)
             end # def get_links
 
-            def find_keywords(a, stop_at=
+            def find_keywords(a, stop_at=25, stop_on_first_link_found=false, l=nil)
                 pages = []
+                browser = nil
                 l = BlackStack::DummyLogger.new(nil) if l.nil?
                 # iterate the links
                 j = 0
@@ -117,12 +115,14 @@ module BlackStack
                     l.logs "#{j.to_s}. find_keywords (#{link})... "
                     begin
                         # get the page
-
+                        browser = BlackStack::Bots::Browser.new()
+                        browser.goto link
+                        sleep(self.load_wait_time) # wait 10 seconds for javascript content to load
                         # get page body content in plain text
-                        title =
-                        s =
+                        title = browser.title
+                        s = browser.body.text
                         # add the link to the results of no-keyword
-                        hpage = { 'page_url' => link.downcase, 'page_title' => title, 'page_html' =>
+                        hpage = { 'page_url' => link.downcase, 'page_title' => title, 'page_html' => browser.body.html, 'keywords' => [] }
                         pages << hpage
                         # iterate the keywords
                         i = 0
@@ -140,9 +140,14 @@ module BlackStack
                         } # each
                         break if match && stop_on_first_link_found
                         l.logf i == 0 ? 'no keywords found'.yellow : "#{i} keywords found".green # find_keywords
+
+                    rescue Net::ReadTimeout => e
+                        l.logf "Timeout Error: #{e.message}".red
                     rescue => e
                         l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
-
+                    ensure
+                        browser.close if browser
+                    end
                 } # each
                 # return
                 pages
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: bots
 version: !ruby/object:Gem::Version
-  version: 1.0.5
+  version: 1.0.7
 platform: ruby
 authors:
 - Leandro Daniel Sardi
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2023-08-
+date: 2023-08-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: simple_cloud_logging
@@ -190,6 +190,26 @@ dependencies:
   - - ">="
     - !ruby/object:Gem::Version
       version: 0.4.0
+- !ruby/object:Gem::Dependency
+  name: watir
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 7.3.0
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 7.3.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 7.3.0
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 7.3.0
 description: Ruby gem for scraping information from the public web.
 email: leandro@connectionsphere.com
 executables: []
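The only dependency change is the new runtime requirement on watir. A sketch of the .gemspec declaration that would produce the requirement shown above (the gem's actual .gemspec is not included in this diff, and fields such as summary and files are omitted here):

Gem::Specification.new do |s|
    s.name        = 'bots'
    s.version     = '1.0.7'
    s.authors     = ['Leandro Daniel Sardi']
    s.email       = 'leandro@connectionsphere.com'
    s.description = 'Ruby gem for scraping information from the public web.'
    # new in 1.0.7: Watir for driving a real browser
    s.add_runtime_dependency 'watir', '~> 7.3.0', '>= 7.3.0'
end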