bots 1.0.1 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (5)
  1. checksums.yaml +4 -4
  2. data/lib/base.rb +13 -5
  3. data/lib/bots.rb +5 -0
  4. data/lib/scraper.rb +145 -0
  5. metadata +83 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: be4c3337063af2d514fd1016c94d66385fa84dbde27dbfa5174d5b0908c9efb1
-  data.tar.gz: 13ef3dccd336269fdb1960f243c8b7dd08abf782ba89cfbf935c4e802a29ca45
+  metadata.gz: 3a8018e0d8575a415699c41dcba236e3c4f400e8132111093e421ac02e792548
+  data.tar.gz: 4b876044081e94743d1b719c53424331d44bb200a8bdcfddd7d78562209eeed3
 SHA512:
-  metadata.gz: 354704a255a80def04f1993d3a73d5b358fedfeeda387278f47ab995ed10d2afced94fe175a15bc8dd7c55b1209798598b55a5f2d2071424f11613091cd87813
-  data.tar.gz: c1fae508d2b55269f0039524ee4e87714d7cc7244922faa83ac5b6c374abc29cdaab34355059ff92e5d6a0469be44900ac628fcfe55da80def3993549a2bcf27
+  metadata.gz: 326e82a582132f2d267e906df73aad0d812f9fc4fe00c8af2ba9ef6cd93a174ae2004719ec76f5cc6b018da0eda1b2cd891dd51f7df7adaad6044da773f207ec
+  data.tar.gz: 2a4944c21854faee39f81b81004fb63baf3f270b2374f081b98ca25b4f7695af80184b1449dff38207d4c21b7f34b524d59c997931f2db2743df42c56a714875
data/lib/base.rb CHANGED
@@ -9,11 +9,13 @@ module BlackStack
 
       def initialize(h)
         # array of numbers from 4000 to 4249
-        unless h[:proxy].nil?
-          self.ip = h[:proxy][:ip]
-          self.user = h[:proxy][:user]
-          self.password = h[:proxy][:password]
-          self.ports = (h[:proxy][:port_from]..h[:proxy][:port_to]).to_a
+        if h
+          self.ip = h[:ip]
+          self.user = h[:user]
+          self.password = h[:password]
+          self.ports = (h[:port_from]..h[:port_to]).to_a
+        else
+          self.ports = []
         end
         self.port_index = -1
       end # initialize
@@ -26,10 +28,16 @@ module BlackStack
 
     class MechanizeBot < BlackStack::Bots::Bot
       attr_accessor :agent # mechanize agent
+      def initialize(h)
+        super(h)
+      end
     end # MechanizeBot
 
     class SeleniumBot < BlackStack::Bots::Bot
       attr_accessor :driver # selenium driver
+      def initialize(h)
+        super(h)
+      end
     end # MechanizeBot
 
   end # Bots
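
For reference, a minimal sketch of how this constructor change affects callers (the proxy values below are hypothetical): 1.0.1 expected the settings nested under :proxy, while 1.0.3 reads the same keys from the top-level hash and accepts nil to run without a proxy.

# 1.0.1: proxy settings nested under :proxy
bot = BlackStack::Bots::MechanizeBot.new(
  proxy: { ip: '127.0.0.1', user: 'user', password: 'pass', port_from: 4000, port_to: 4249 }
)

# 1.0.3: same keys at the top level of the hash ...
bot = BlackStack::Bots::MechanizeBot.new(
  ip: '127.0.0.1', user: 'user', password: 'pass', port_from: 4000, port_to: 4249
)

# ... or nil, in which case ports is initialized to []
bot = BlackStack::Bots::MechanizeBot.new(nil)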
data/lib/bots.rb CHANGED
@@ -1,9 +1,14 @@
+require 'open-uri'
 require 'mechanize'
 require 'selenium-webdriver'
 require 'simple_cloud_logging'
 require 'colorize'
 require 'csv'
+require 'pry'
+require 'sitemap-parser'
+require 'timeout'
 
 require_relative './base'
 require_relative './google'
+require_relative './scraper'
 require_relative './indeed'
data/lib/scraper.rb ADDED
@@ -0,0 +1,145 @@
+module BlackStack
+  module Bots
+    class Scraper < BlackStack::Bots::MechanizeBot
+      attr_accessor :domain, :links
+      # auxiliar array of links that I have extracted links from
+      attr_accessor :links_processed
+
+      def initialize(init_domain, h)
+        super(h)
+        self.domain = init_domain
+        #self.agent.agent.http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+        self.links = []
+        self.links_processed = []
+      end # def initialize
+
+      def get(url)
+        # initialize mechanize agent
+        self.agent = Mechanize.new
+        # set a proxy with user and password
+        self.port_index += 1
+        self.port_index = 0 if self.port_index >= self.ports.length
+        self.agent.set_proxy(self.ip, self.ports[self.port_index], self.user, self.password) if self.proxy?
+        self.agent.open_timeout = 5
+        self.agent.read_timeout = 5
+        # return
+        return Timeout::timeout(5) { self.agent.get(url) }
+      end
+
+      def get_links_from_sitemap(l=nil)
+        i = 0
+        l.logs "Scrape sitemaps... "
+        begin
+          # download the robots.txt
+          url = "http://#{domain}/robots.txt"
+          # get the content of robots.txt from url
+          s = Timeout::timeout(5) { URI.open(url).read }
+          # get the sitemap
+          sitemaps = s.split("\n").select { |line| line =~ /^sitemap:/i }.map { |a| a.downcase.split('sitemap:').last.strip }.uniq
+          sitemaps.each { |b|
+            parser = Timeout::timeout(5) { SitemapParser.new b }
+            self.links += Timeout::timeout(5) { parser.to_a }
+            self.links.uniq!
+          }
+          l.logf sitemaps.size == 0 ? 'no sitemap found'.yellow : "#{sitemaps.size} sitemaps found".green # get_links
+        rescue => e
+          l.logf "Error: #{e.message.split("\n").first[0..100]})".red # get_links
+        end
+      end
+
+      # internal use only
+      def get_links_from_url(url, l=nil)
+        l = BlackStack::DummyLogger.new(nil) if l.nil?
+        l.logs "get_links (#{url})... "
+        begin
+          aux = []
+          # trim url
+          url = url.strip
+          # get domain of the url using open-uri
+          domain = URI.parse(url).host
+          # visit the main page of the website
+          page = self.get(url)
+          # get the self.links to the pages of the website
+          aux = page.links.map(&:href)
+          # remove non-string elements
+          aux = aux.select { |link| link.is_a?(String) }
+          # remove # from the self.links
+          aux = aux.map { |link| !link.nil? && link.split('#').first }
+          # remove querystring from the self.links
+          aux = aux.map { |link| !link.nil? && link.split('?').first }
+          # remove the self.links that are not http:// or https://
+          aux = aux.select { |link| !link.nil? && link =~ /^https?:\/\// }
+          # remove the self.links that are not from the same domain
+          aux = aux.select { |link| !link.nil? && link =~ /#{domain}/ }
+          # remove nil values
+          aux = aux.compact
+          # remove duplications
+          aux = aux.uniq
+          # filter links who already are in the list
+          a = aux.size
+          aux = aux.select { |link| !self.links.include?(link) }
+          b = aux.size
+          # add new links to self.links
+          self.links += aux
+          l.logf "done".green + " (#{a} links found, #{b} new, #{self.links.size} total)" # get_links
+        rescue => e
+          l.logf "Error: #{e.message.split("\n").first[0..100]})".red # get_links
+        end
+      end # def get_links_from_url
+
+      def get_links(stop_at=10, l=nil)
+        l = BlackStack::DummyLogger.new(nil) if l.nil?
+        # working with root url
+        url = "http://#{self.domain}/"
+        self.links << url if self.links.select { |link| link == url }.empty?
+        # iterate until I have discovered all the links
+        while self.links.size != self.links_processed.size && stop_at >= self.links.size
+          # iterate the links who are not in links_processed
+          self.links.select { |link| !self.links_processed.include?(link) }.each { |link|
+            # get the links from the url
+            self.get_links_from_url(link, l)
+            # add the link to the list of processed links
+            self.links_processed << link
+          }
+        end # while
+        # get links from the sitemap
+        self.get_links_from_sitemap(l)
+      end # def get_links
+
+      def find_keywords(a, stop_at=50, l=nil)
+        ret = []
+        l = BlackStack::DummyLogger.new(nil) if l.nil?
+        # iterate the links
+        j = 0
+        self.links.reject { |link| link =~ /\.pdf$/i || link =~ /\.jpg$/i || link =~ /\.jpeg$/i || link =~ /\.gif$/i }.each { |link|
+          j += 1
+          break if j > stop_at
+          l.logs "#{j.to_s}. find_keywords (#{link})... "
+          begin
+            # get the page
+            page = self.get(link)
+            # get page body content in plain text
+            s = Timeout::timeout(5) { Nokogiri::HTML(page.body).text }
+            # iterate the keywords
+            i = 0
+            a.each { |k|
+              # find the keyword
+              if s =~ /#{Regexp.escape(k)}/i
+                i += 1
+                ret << link if ret.select { |link| link == link }.empty?
+                break
+              end # if
+            } # each
+            break if ret.size > 0
+            l.logf i == 0 ? 'no keywords found'.yellow : "#{i} keywords found".green # find_keywords
+          rescue => e
+            l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
+          end # begin
+        } # each
+        # return
+        ret
+      end
+
+    end # class Scraper
+  end # module Bots
+end # module BlackStack
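
A minimal usage sketch of the new Scraper class, inferred from the code above (the domain, proxy settings, and keyword list are hypothetical; passing nil as the logger makes get_links and find_keywords fall back to BlackStack::DummyLogger):

require 'bots'

# crawl a site through a rotating proxy and search its pages for keywords
scraper = BlackStack::Bots::Scraper.new(
  'example.com',
  { ip: '127.0.0.1', user: 'user', password: 'pass', port_from: 4000, port_to: 4249 }
)
scraper.get_links(10)   # discover internal links (capped at ~10) plus sitemap entries
hits = scraper.find_keywords(['ruby', 'scraping'], 50)
puts hits               # at most one link, since the loop stops at the first match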
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: bots
 version: !ruby/object:Gem::Version
-  version: 1.0.1
+  version: 1.0.3
 platform: ruby
 authors:
 - Leandro Daniel Sardi
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2023-08-08 00:00:00.000000000 Z
+date: 2023-08-16 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: simple_cloud_logging
@@ -110,6 +110,86 @@ dependencies:
   - - ">="
     - !ruby/object:Gem::Version
       version: 0.8.1
+- !ruby/object:Gem::Dependency
+  name: pry
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.14.2
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.14.2
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.14.2
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.14.2
+- !ruby/object:Gem::Dependency
+  name: open-uri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.2.0
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.2.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.2.0
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.2.0
+- !ruby/object:Gem::Dependency
+  name: sitemap-parser
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.5.6
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.5.6
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.5.6
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.5.6
+- !ruby/object:Gem::Dependency
+  name: timeout
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.4.0
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.4.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.4.0
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.4.0
 description: Ruby gem for scraping information from the public web.
 email: leandro@connectionsphere.com
 executables: []
@@ -120,6 +200,7 @@ files:
 - lib/bots.rb
 - lib/google.rb
 - lib/indeed.rb
+- lib/scraper.rb
 homepage: https://rubygems.org/gems/bots
 licenses:
 - MIT
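
The four new runtime dependencies above correspond to gemspec declarations along these lines (the bots.gemspec file itself is not part of this diff, so this is a sketch reconstructed from the metadata):

spec.add_runtime_dependency 'pry', '~> 0.14.2', '>= 0.14.2'
spec.add_runtime_dependency 'open-uri', '~> 0.2.0', '>= 0.2.0'
spec.add_runtime_dependency 'sitemap-parser', '~> 0.5.6', '>= 0.5.6'
spec.add_runtime_dependency 'timeout', '~> 0.4.0', '>= 0.4.0'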