bots 1.0.4 → 1.0.6

Files changed (4)
  1. checksums.yaml +4 -4
  2. data/lib/bots.rb +3 -1
  3. data/lib/scraper.rb +38 -33
  4. metadata +22 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: ef6ff9fa026dd9e1f1c750b5963c957a4deacf5413f92a3dc8fd1f0584305898
- data.tar.gz: e652c088971c2fd9c97c12902d09ce7ed4933da36da6e010f2f3ffae1abdb292
+ metadata.gz: c75697e4698d3a14ac4feeced1a6011febf800b798806ff624d24ee9604e468f
+ data.tar.gz: 842f91d7870719ed98d6cb445de1bb33e4069e1dc3b86969c9a05c7c5cfc0814
  SHA512:
- metadata.gz: 1c8fce3accf4fc84701706355880bd59cc1cd5a16dfbc4a05bbdb30faa492c7f212f4a22ef917b81e7b8ba5f0c6caaa911f23bef1df4f4f23868ab225673adff
- data.tar.gz: 3942240b8cff2bdbfdb003912af8da5c26f6b117a3744becbff6a781f77f9eb32d2c5e0aef1439edc6b288ec0ff359f5e45fec68e21db8d6cca78c8d36cf04dc
+ metadata.gz: c64ba194a4f6bc68662eef99324a56b5036796d2e43576f2d44c308809cc4c567c72df4c5ae4474250cf98d3b366cb6d82c5c0df6b5cb13985a3cf96d63f85bd
+ data.tar.gz: 10625bee7634f62a3c45741d92789e6f599ce13de15d951d772900179abec0392c8a190706853b0077ed9bb9de0fbf522e658eecfae2b8777f0288d98fd637fb
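These digests can be reproduced locally if you want to verify a downloaded package. A minimal sketch in Ruby, assuming the gem has already been fetched (for example with gem fetch bots -v 1.0.6); a .gem file is a tar archive whose metadata.gz and data.tar.gz entries are exactly what checksums.yaml describes:

  # Hypothetical verification script; not part of the gem itself.
  require 'digest'
  require 'rubygems/package'

  File.open('bots-1.0.6.gem', 'rb') do |io|
    Gem::Package::TarReader.new(io).each do |entry|
      next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)
      # Compare these hex digests against the SHA256 section above.
      puts "#{entry.full_name}: #{Digest::SHA256.hexdigest(entry.read)}"
    end
  end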
data/lib/bots.rb CHANGED
@@ -7,8 +7,10 @@ require 'csv'
  require 'pry'
  require 'sitemap-parser'
  require 'timeout'
+ require 'watir'
 
  require_relative './base'
  require_relative './google'
  require_relative './scraper'
- require_relative './indeed'
+ require_relative './indeed'
+ require_relative './browser'
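The new require_relative './browser' points at a Watir-backed wrapper that is not included in this diff. A minimal sketch of what BlackStack::Bots::Browser could look like, assuming it simply delegates the handful of calls the scraper uses (goto, links, title, body, close) to Watir::Browser; the class body and driver options here are assumptions, not the gem's actual file:

  # Hypothetical sketch of data/lib/browser.rb; the real file is not shown in this diff.
  require 'watir'

  module BlackStack
    module Bots
      class Browser
        attr_accessor :driver

        def initialize
          # Headless Chrome is an assumption; any Watir-supported driver would do.
          self.driver = Watir::Browser.new(:chrome, options: { args: ['--headless', '--disable-gpu'] })
        end

        def goto(url); driver.goto(url); end
        def links;     driver.links;     end
        def title;     driver.title;     end
        def body;      driver.body;      end
        def close;     driver.close;     end
      end
    end
  end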
data/lib/scraper.rb CHANGED
@@ -1,31 +1,21 @@
  module BlackStack
  module Bots
- class Scraper < BlackStack::Bots::MechanizeBot
- attr_accessor :domain, :links
+ class Scraper
+ attr_accessor :domain, :links, :timeout, :load_wait_time, :stop_scraping_at_page_number, :stop_scraping_at_match_number
  # auxiliar array of links that I have extracted links from
  attr_accessor :links_processed
 
- def initialize(init_domain, h)
- super(h)
+ def initialize(init_domain, timeout, h)
  self.domain = init_domain
+ self.timeout = timeout || 10
+ self.load_wait_time = 3
+ self.stop_scraping_at_page_number = 25
+ self.stop_scraping_at_match_number = 1
  #self.agent.agent.http.verify_mode = OpenSSL::SSL::VERIFY_NONE
  self.links = []
  self.links_processed = []
  end # def initialize
 
- def get(url)
- # initialize mechanize agent
- self.agent = Mechanize.new
- # set a proxy with user and password
- self.port_index += 1
- self.port_index = 0 if self.port_index >= self.ports.length
- self.agent.set_proxy(self.ip, self.ports[self.port_index], self.user, self.password) if self.proxy?
- self.agent.open_timeout = 5
- self.agent.read_timeout = 5
- # return
- return Timeout::timeout(5) { self.agent.get(url) }
- end
-
  def get_links_from_sitemap(l=nil)
  i = 0
  l.logs "Scrape sitemaps... "
@@ -33,17 +23,17 @@ module BlackStack
  # download the robots.txt
  url = "http://#{domain}/robots.txt"
  # get the content of robots.txt from url
- s = Timeout::timeout(5) { URI.open(url).read }
+ s = Timeout::timeout(self.timeout) { URI.open(url).read }
  # get the sitemap
  sitemaps = s.split("\n").select { |line| line =~ /^sitemap:/i }.map { |a| a.downcase.split('sitemap:').last.strip }.uniq
  sitemaps.each { |b|
- parser = Timeout::timeout(5) { SitemapParser.new b }
- self.links += Timeout::timeout(5) { parser.to_a }
+ parser = Timeout::timeout(self.timeout) { SitemapParser.new b }
+ self.links += Timeout::timeout(self.timeout) { parser.to_a }
  self.links.uniq!
  }
  l.logf sitemaps.size == 0 ? 'no sitemap found'.yellow : "#{sitemaps.size} sitemaps found".green # get_links
  rescue => e
- l.logf "Error: #{e.message.split("\n").first[0..100]})".red # get_links
+ l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
  end
  end
 
@@ -51,16 +41,19 @@ module BlackStack
  def get_links_from_url(url, l=nil)
  l = BlackStack::DummyLogger.new(nil) if l.nil?
  l.logs "get_links (#{url})... "
+ aux = []
+ browser = nil
  begin
- aux = []
  # trim url
  url = url.strip
  # get domain of the url using open-uri
  domain = URI.parse(url).host
  # visit the main page of the website
- page = self.get(url)
+ browser = BlackStack::Bots::Browser.new()
+ browser.goto url
+ sleep(self.load_wait_time) # wait 10 seconds for javascript content to load
  # get the self.links to the pages of the website
- aux = page.links.map(&:href)
+ aux = browser.links.map(&:href)
  # remove non-string elements
  aux = aux.select { |link| link.is_a?(String) }
  # remove # from the self.links
@@ -80,11 +73,15 @@ module BlackStack
  aux = aux.select { |link| !self.links.include?(link) }
  b = aux.size
  # add new links to self.links
- self.links += aux
  l.logf "done".green + " (#{a} links found, #{b} new, #{self.links.size} total)" # get_links
+ rescue Net::ReadTimeout => e
+ l.logf "Timeout Error: #{e.message}".red
  rescue => e
- l.logf "Error: #{e.message.split("\n").first[0..100]})".red # get_links
+ l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
+ ensure
+ browser.close if browser
  end
+ self.links += aux
  end # def get_links_from_url
 
  def get_links(stop_at=10, l=nil)
@@ -106,8 +103,9 @@ module BlackStack
  self.get_links_from_sitemap(l)
  end # def get_links
 
- def find_keywords(a, stop_at=50, stop_on_first_link_found=false, l=nil)
+ def find_keywords(a, stop_at=25, stop_on_first_link_found=false, l=nil)
  pages = []
+ browser = nil
  l = BlackStack::DummyLogger.new(nil) if l.nil?
  # iterate the links
  j = 0
@@ -117,12 +115,14 @@ module BlackStack
  l.logs "#{j.to_s}. find_keywords (#{link})... "
  begin
  # get the page
- page = self.get(link)
+ browser = BlackStack::Bots::Browser.new()
+ browser.goto link
+ sleep(self.load_wait_time) # wait 10 seconds for javascript content to load
  # get page body content in plain text
- title = page.title
- s = Timeout::timeout(5) { page.search('body').text }
+ title = browser.title
+ s = browser.body.text
  # add the link to the results of no-keyword
- hpage = { :url => link, :title => title, :html => page.body, :keywords => [] }
+ hpage = { 'page_url' => link.downcase, 'page_title' => title, 'page_html' => browser.body.html, 'keywords' => [] }
  pages << hpage
  # iterate the keywords
  i = 0
@@ -130,7 +130,7 @@ module BlackStack
  a.each { |k|
  # find the keyword
  match = ( s =~ /#{Regexp.escape(k)}/i )
- hpage[:keywords] << k if match
+ hpage['keywords'] << k if match
  # count the number of links with match
  # break if only 1 link is needed
  if match
@@ -140,9 +140,14 @@ module BlackStack
  } # each
  break if match && stop_on_first_link_found
  l.logf i == 0 ? 'no keywords found'.yellow : "#{i} keywords found".green # find_keywords
+
+ rescue Net::ReadTimeout => e
+ l.logf "Timeout Error: #{e.message}".red
  rescue => e
  l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
- end # begin
+ ensure
+ browser.close if browser
+ end
  } # each
  # return
  pages
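Taken together, these scraper.rb changes drop the Mechanize-based get helper in favor of a BlackStack::Bots::Browser that is opened and closed around every page visit, turn the request timeout and crawl limits into instance attributes, and move self.links += aux after the begin/rescue/ensure block so links collected before a failure are still kept. A rough usage sketch against the new constructor signature (the domain, timeout, and keyword values are illustrative, not documented defaults):

  # Illustrative only; assumes the gem is loaded with require 'bots'.
  require 'bots'

  scraper = BlackStack::Bots::Scraper.new('example.com', 10, {})  # domain, timeout in seconds, options hash
  scraper.get_links(10)                                           # crawl pages and sitemaps, collecting links
  pages = scraper.find_keywords(['hiring', 'careers'])            # scan the collected links for keywords
  pages.each { |p| puts "#{p['page_url']} -> #{p['keywords'].join(', ')}" }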
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: bots
  version: !ruby/object:Gem::Version
- version: 1.0.4
+ version: 1.0.6
  platform: ruby
  authors:
  - Leandro Daniel Sardi
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2023-08-18 00:00:00.000000000 Z
+ date: 2023-08-24 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: simple_cloud_logging
@@ -190,6 +190,26 @@ dependencies:
  - - ">="
  - !ruby/object:Gem::Version
  version: 0.4.0
+ - !ruby/object:Gem::Dependency
+ name: watir
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - "~>"
+ - !ruby/object:Gem::Version
+ version: 7.3.0
+ - - ">="
+ - !ruby/object:Gem::Version
+ version: 7.3.0
+ type: :runtime
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - "~>"
+ - !ruby/object:Gem::Version
+ version: 7.3.0
+ - - ">="
+ - !ruby/object:Gem::Version
+ version: 7.3.0
  description: Ruby gem for scraping information from the public web.
  email: leandro@connectionsphere.com
  executables: []
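The dependency block added above is what RubyGems generates for a gemspec declaration along these lines; the gemspec itself is not part of this diff, so this is a reconstruction rather than the author's file:

  # Hypothetical excerpt of bots.gemspec, reconstructed from the metadata diff.
  Gem::Specification.new do |s|
    s.name    = 'bots'
    s.version = '1.0.6'
    s.add_runtime_dependency 'watir', '~> 7.3.0', '>= 7.3.0'
  end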