bots 1.0.5 → 1.0.7

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (4)
  1. checksums.yaml +4 -4
  2. data/lib/bots.rb +3 -1
  3. data/lib/scraper.rb +37 -32
  4. metadata +22 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: d58802b035984822506024bb8745293a9c6edf666c9ed4b69311282f226b079a
-  data.tar.gz: 250faa62467ee442ed198b36247f4d5a271dfc03f9614e0021357873bdf749a4
+  metadata.gz: 46e3e095564f34c1f9a0375dd4fefb669d4920b23a8c8d925b5a598250cc9ee2
+  data.tar.gz: 65075cef5bebe85cfbbd5098363ece05a19673144c3ba487e6c844523dc5195f
 SHA512:
-  metadata.gz: ddb39e422226e1b490e18d364ae7ab15b6bb8c20425091f41168b32c59ef59809c769e7cf2d8628998676799a21b2cb7957aeb807a9bd1995e1658217822be65
-  data.tar.gz: f8431a7d719c30326d3c1dd69e8b80dd4f916b004c7a44b464c22d83fa5acce26cd59f0b94709b1955ff1dda6450ccd8bc7121b33368b91f92380e88a1f01e54
+  metadata.gz: d319b1aee567eb3bad89a2611965409af49a3e7d00997c0c3c6b0060ea68fe60c82c4d4b4c4e51bcb4efb7b5393c0fe5438ec0b02bd1ee25b42d1354265dddf3
+  data.tar.gz: abe09849e8951f87a7012096be40bade937d658a1ff64852ea4db96941a90ae0cf5b82e14967dd6010403fc40ecd5bfdaa9ebf70e7e93f2d084d0a3c71f38948
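
These are the digests RubyGems stores for the two archives packed inside the .gem file (metadata.gz and data.tar.gz). A minimal sketch of how they could be checked against a downloaded bots-1.0.7.gem follows; the file path is a hypothetical local path, not something taken from this diff.

# Sketch: verify the SHA256 entries of checksums.yaml against a local .gem file.
require 'digest'
require 'rubygems/package'

gem_path = 'bots-1.0.7.gem' # hypothetical path to the downloaded gem

digests = {}
Gem::Package::TarReader.new(File.open(gem_path, 'rb')) do |tar|
  tar.each do |entry|
    next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)
    # a .gem is a plain tar archive; hash each inner archive as-is
    digests[entry.full_name] = Digest::SHA256.hexdigest(entry.read)
  end
end
puts digests # compare with the SHA256 section above
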
data/lib/bots.rb CHANGED
@@ -7,8 +7,10 @@ require 'csv'
 require 'pry'
 require 'sitemap-parser'
 require 'timeout'
+require 'watir'
 
 require_relative './base'
 require_relative './google'
 require_relative './scraper'
-require_relative './indeed'
+require_relative './indeed'
+require_relative './browser'
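
The newly required data/lib/browser.rb is not part of this diff, so its contents are an assumption. A minimal sketch consistent with how scraper.rb uses it (Browser.new(), goto, links, title, body, close, all of which Watir::Browser already provides) could be a thin headless-Chrome subclass:

# Sketch only: the real BlackStack::Bots::Browser may differ.
require 'watir'

module BlackStack
  module Bots
    class Browser < Watir::Browser
      def initialize(*)
        # hypothetical default: headless Chrome via Selenium options
        opts = Selenium::WebDriver::Chrome::Options.new
        opts.add_argument('--headless')
        super(:chrome, options: opts)
      end
    end
  end
end
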
data/lib/scraper.rb CHANGED
@@ -1,31 +1,21 @@
 module BlackStack
   module Bots
-    class Scraper < BlackStack::Bots::MechanizeBot
-      attr_accessor :domain, :links
+    class Scraper
+      attr_accessor :domain, :links, :timeout, :load_wait_time, :stop_scraping_at_page_number, :stop_scraping_at_match_number
       # auxiliar array of links that I have extracted links from
       attr_accessor :links_processed
 
-      def initialize(init_domain, h)
-        super(h)
+      def initialize(init_domain, timeout, h)
         self.domain = init_domain
+        self.timeout = timeout || 10
+        self.load_wait_time = 3
+        self.stop_scraping_at_page_number = 25
+        self.stop_scraping_at_match_number = 1
         #self.agent.agent.http.verify_mode = OpenSSL::SSL::VERIFY_NONE
         self.links = []
         self.links_processed = []
       end # def initialize
 
-      def get(url)
-        # initialize mechanize agent
-        self.agent = Mechanize.new
-        # set a proxy with user and password
-        self.port_index += 1
-        self.port_index = 0 if self.port_index >= self.ports.length
-        self.agent.set_proxy(self.ip, self.ports[self.port_index], self.user, self.password) if self.proxy?
-        self.agent.open_timeout = 5
-        self.agent.read_timeout = 5
-        # return
-        return Timeout::timeout(5) { self.agent.get(url) }
-      end
-
       def get_links_from_sitemap(l=nil)
         i = 0
         l.logs "Scrape sitemaps... "
@@ -33,17 +23,17 @@ module BlackStack
           # download the robots.txt
           url = "http://#{domain}/robots.txt"
           # get the content of robots.txt from url
-          s = Timeout::timeout(5) { URI.open(url).read }
+          s = Timeout::timeout(self.timeout) { URI.open(url).read }
          # get the sitemap
           sitemaps = s.split("\n").select { |line| line =~ /^sitemap:/i }.map { |a| a.downcase.split('sitemap:').last.strip }.uniq
           sitemaps.each { |b|
-            parser = Timeout::timeout(5) { SitemapParser.new b }
-            self.links += Timeout::timeout(5) { parser.to_a }
+            parser = Timeout::timeout(self.timeout) { SitemapParser.new b }
+            self.links += Timeout::timeout(self.timeout) { parser.to_a }
             self.links.uniq!
           }
           l.logf sitemaps.size == 0 ? 'no sitemap found'.yellow : "#{sitemaps.size} sitemaps found".green # get_links
         rescue => e
-          l.logf "Error: #{e.message.split("\n").first[0..100]})".red # get_links
+          l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
         end
       end
 
@@ -51,16 +41,19 @@ module BlackStack
       def get_links_from_url(url, l=nil)
         l = BlackStack::DummyLogger.new(nil) if l.nil?
         l.logs "get_links (#{url})... "
+        aux = []
+        browser = nil
         begin
-          aux = []
           # trim url
           url = url.strip
           # get domain of the url using open-uri
           domain = URI.parse(url).host
           # visit the main page of the website
-          page = self.get(url)
+          browser = BlackStack::Bots::Browser.new()
+          browser.goto url
+          sleep(self.load_wait_time) # wait 10 seconds for javascript content to load
           # get the self.links to the pages of the website
-          aux = page.links.map(&:href)
+          aux = browser.links.map(&:href)
           # remove non-string elements
           aux = aux.select { |link| link.is_a?(String) }
           # remove # from the self.links
@@ -80,11 +73,15 @@ module BlackStack
           aux = aux.select { |link| !self.links.include?(link) }
           b = aux.size
           # add new links to self.links
-          self.links += aux
           l.logf "done".green + " (#{a} links found, #{b} new, #{self.links.size} total)" # get_links
+        rescue Net::ReadTimeout => e
+          l.logf "Timeout Error: #{e.message}".red
         rescue => e
-          l.logf "Error: #{e.message.split("\n").first[0..100]})".red # get_links
+          l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
+        ensure
+          browser.close if browser
         end
+        self.links += aux
       end # def get_links_from_url
 
       def get_links(stop_at=10, l=nil)
@@ -106,8 +103,9 @@ module BlackStack
         self.get_links_from_sitemap(l)
       end # def get_links
 
-      def find_keywords(a, stop_at=50, stop_on_first_link_found=false, l=nil)
+      def find_keywords(a, stop_at=25, stop_on_first_link_found=false, l=nil)
         pages = []
+        browser = nil
         l = BlackStack::DummyLogger.new(nil) if l.nil?
         # iterate the links
         j = 0
@@ -117,12 +115,14 @@ module BlackStack
           l.logs "#{j.to_s}. find_keywords (#{link})... "
           begin
             # get the page
-            page = self.get(link)
+            browser = BlackStack::Bots::Browser.new()
+            browser.goto link
+            sleep(self.load_wait_time) # wait 10 seconds for javascript content to load
             # get page body content in plain text
-            title = page.title
-            s = Timeout::timeout(5) { page.search('body').text }
+            title = browser.title
+            s = browser.body.text
             # add the link to the results of no-keyword
-            hpage = { 'page_url' => link.downcase, 'page_title' => title, 'page_html' => page.body, 'keywords' => [] }
+            hpage = { 'page_url' => link.downcase, 'page_title' => title, 'page_html' => browser.body.html, 'keywords' => [] }
             pages << hpage
             # iterate the keywords
             i = 0
@@ -140,9 +140,14 @@ module BlackStack
             } # each
             break if match && stop_on_first_link_found
             l.logf i == 0 ? 'no keywords found'.yellow : "#{i} keywords found".green # find_keywords
+
+          rescue Net::ReadTimeout => e
+            l.logf "Timeout Error: #{e.message}".red
           rescue => e
             l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
-          end # begin
+          ensure
+            browser.close if browser
+          end
         } # each
         # return
         pages
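
Taken together, 1.0.7 replaces the Mechanize-based get with a per-request Watir browser, makes the request timeout configurable, and adds rescue/ensure handling so each browser is closed. A usage sketch follows, under assumptions: the class and method names come from the diff above, while the hash argument and keyword list are illustrative only.

# Sketch: driving the 1.0.7 Scraper (the 1.0.5 constructor took only (domain, h)).
require 'bots'

scraper = BlackStack::Bots::Scraper.new('example.com', 10, {}) # domain, timeout, h

scraper.get_links(25)                        # crawl internal links, then sitemaps
pages = scraper.find_keywords(['pricing'])   # default stop_at is now 25 (was 50)
pages.each { |p| puts p['page_url'] }
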
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: bots
 version: !ruby/object:Gem::Version
-  version: 1.0.5
+  version: 1.0.7
 platform: ruby
 authors:
 - Leandro Daniel Sardi
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2023-08-18 00:00:00.000000000 Z
+date: 2023-08-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: simple_cloud_logging
@@ -190,6 +190,26 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: 0.4.0
+- !ruby/object:Gem::Dependency
+  name: watir
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 7.3.0
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 7.3.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 7.3.0
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 7.3.0
 description: Ruby gem for scraping information from the public web.
 email: leandro@connectionsphere.com
 executables: []
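
The only metadata changes besides version and date are the new runtime dependency on watir, pinned to "~> 7.3.0", ">= 7.3.0". The gemspec itself is not included in this diff; a declaration like the following would generate that entry, shown here only as a sketch:

# Sketch of the corresponding gemspec declaration (bots.gemspec is not in this diff).
Gem::Specification.new do |s|
  s.name    = 'bots'
  s.version = '1.0.7'
  # ...
  s.add_runtime_dependency 'watir', '~> 7.3.0', '>= 7.3.0'
end
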