bots 1.0.5 → 1.0.7

Files changed (4)
  1. checksums.yaml +4 -4
  2. data/lib/bots.rb +3 -1
  3. data/lib/scraper.rb +37 -32
  4. metadata +22 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: d58802b035984822506024bb8745293a9c6edf666c9ed4b69311282f226b079a
-  data.tar.gz: 250faa62467ee442ed198b36247f4d5a271dfc03f9614e0021357873bdf749a4
+  metadata.gz: 46e3e095564f34c1f9a0375dd4fefb669d4920b23a8c8d925b5a598250cc9ee2
+  data.tar.gz: 65075cef5bebe85cfbbd5098363ece05a19673144c3ba487e6c844523dc5195f
 SHA512:
-  metadata.gz: ddb39e422226e1b490e18d364ae7ab15b6bb8c20425091f41168b32c59ef59809c769e7cf2d8628998676799a21b2cb7957aeb807a9bd1995e1658217822be65
-  data.tar.gz: f8431a7d719c30326d3c1dd69e8b80dd4f916b004c7a44b464c22d83fa5acce26cd59f0b94709b1955ff1dda6450ccd8bc7121b33368b91f92380e88a1f01e54
+  metadata.gz: d319b1aee567eb3bad89a2611965409af49a3e7d00997c0c3c6b0060ea68fe60c82c4d4b4c4e51bcb4efb7b5393c0fe5438ec0b02bd1ee25b42d1354265dddf3
+  data.tar.gz: abe09849e8951f87a7012096be40bade937d658a1ff64852ea4db96941a90ae0cf5b82e14967dd6010403fc40ecd5bfdaa9ebf70e7e93f2d084d0a3c71f38948
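
These are the SHA256 and SHA512 digests of the metadata.gz and data.tar.gz archives packaged in the .gem file. As a quick illustration (not part of the gem), one way to recompute and compare a digest, assuming the two archives and checksums.yaml are available in the working directory:

  # Hypothetical verification of the digests listed above.
  require 'digest'
  require 'yaml'

  sums = YAML.load_file('checksums.yaml')                 # => { "SHA256" => {...}, "SHA512" => {...} }
  actual = Digest::SHA256.file('data.tar.gz').hexdigest   # recompute the digest locally
  puts actual == sums['SHA256']['data.tar.gz'] ? 'data.tar.gz matches' : 'data.tar.gz DOES NOT match'
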
data/lib/bots.rb CHANGED
@@ -7,8 +7,10 @@ require 'csv'
 require 'pry'
 require 'sitemap-parser'
 require 'timeout'
+require 'watir'

 require_relative './base'
 require_relative './google'
 require_relative './scraper'
-require_relative './indeed'
+require_relative './indeed'
+require_relative './browser'
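
The newly required data/lib/browser.rb is not shown in this changeset, so its contents are unknown here. Judging from how scraper.rb calls it below (goto, links, title, body, close), a minimal Watir-backed wrapper along these lines would be compatible; the subclassing and the headless Chrome setup are assumptions, not the gem's actual code:

  # data/lib/browser.rb -- hypothetical sketch only, not the file shipped in the gem.
  require 'watir'

  module BlackStack
      module Bots
          # Thin wrapper over Watir so the scraper can render JavaScript-heavy pages.
          # Subclassing Watir::Browser keeps goto, links, title, body and close available.
          class Browser < Watir::Browser
              def initialize
                  super(:chrome, headless: true) # assumption: headless Chrome by default
              end
          end
      end
  end
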
data/lib/scraper.rb CHANGED
@@ -1,31 +1,21 @@
 module BlackStack
     module Bots
-        class Scraper < BlackStack::Bots::MechanizeBot
-            attr_accessor :domain, :links
+        class Scraper
+            attr_accessor :domain, :links, :timeout, :load_wait_time, :stop_scraping_at_page_number, :stop_scraping_at_match_number
             # auxiliar array of links that I have extracted links from
             attr_accessor :links_processed

-            def initialize(init_domain, h)
-                super(h)
+            def initialize(init_domain, timeout, h)
                 self.domain = init_domain
+                self.timeout = timeout || 10
+                self.load_wait_time = 3
+                self.stop_scraping_at_page_number = 25
+                self.stop_scraping_at_match_number = 1
                 #self.agent.agent.http.verify_mode = OpenSSL::SSL::VERIFY_NONE
                 self.links = []
                 self.links_processed = []
             end # def initialize

-            def get(url)
-                # initialize mechanize agent
-                self.agent = Mechanize.new
-                # set a proxy with user and password
-                self.port_index += 1
-                self.port_index = 0 if self.port_index >= self.ports.length
-                self.agent.set_proxy(self.ip, self.ports[self.port_index], self.user, self.password) if self.proxy?
-                self.agent.open_timeout = 5
-                self.agent.read_timeout = 5
-                # return
-                return Timeout::timeout(5) { self.agent.get(url) }
-            end
-
             def get_links_from_sitemap(l=nil)
                 i = 0
                 l.logs "Scrape sitemaps... "
@@ -33,17 +23,17 @@ module BlackStack
                     # download the robots.txt
                     url = "http://#{domain}/robots.txt"
                     # get the content of robots.txt from url
-                    s = Timeout::timeout(5) { URI.open(url).read }
+                    s = Timeout::timeout(self.timeout) { URI.open(url).read }
                     # get the sitemap
                     sitemaps = s.split("\n").select { |line| line =~ /^sitemap:/i }.map { |a| a.downcase.split('sitemap:').last.strip }.uniq
                     sitemaps.each { |b|
-                        parser = Timeout::timeout(5) { SitemapParser.new b }
-                        self.links += Timeout::timeout(5) { parser.to_a }
+                        parser = Timeout::timeout(self.timeout) { SitemapParser.new b }
+                        self.links += Timeout::timeout(self.timeout) { parser.to_a }
                         self.links.uniq!
                     }
                     l.logf sitemaps.size == 0 ? 'no sitemap found'.yellow : "#{sitemaps.size} sitemaps found".green # get_links
                 rescue => e
-                    l.logf "Error: #{e.message.split("\n").first[0..100]})".red # get_links
+                    l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
                 end
             end

@@ -51,16 +41,19 @@ module BlackStack
             def get_links_from_url(url, l=nil)
                 l = BlackStack::DummyLogger.new(nil) if l.nil?
                 l.logs "get_links (#{url})... "
+                aux = []
+                browser = nil
                 begin
-                    aux = []
                     # trim url
                     url = url.strip
                     # get domain of the url using open-uri
                     domain = URI.parse(url).host
                     # visit the main page of the website
-                    page = self.get(url)
+                    browser = BlackStack::Bots::Browser.new()
+                    browser.goto url
+                    sleep(self.load_wait_time) # wait 10 seconds for javascript content to load
                     # get the self.links to the pages of the website
-                    aux = page.links.map(&:href)
+                    aux = browser.links.map(&:href)
                     # remove non-string elements
                     aux = aux.select { |link| link.is_a?(String) }
                     # remove # from the self.links
@@ -80,11 +73,15 @@ module BlackStack
                     aux = aux.select { |link| !self.links.include?(link) }
                     b = aux.size
                     # add new links to self.links
-                    self.links += aux
                     l.logf "done".green + " (#{a} links found, #{b} new, #{self.links.size} total)" # get_links
+                rescue Net::ReadTimeout => e
+                    l.logf "Timeout Error: #{e.message}".red
                 rescue => e
-                    l.logf "Error: #{e.message.split("\n").first[0..100]})".red # get_links
+                    l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
+                ensure
+                    browser.close if browser
                 end
+                self.links += aux
             end # def get_links_from_url

             def get_links(stop_at=10, l=nil)
@@ -106,8 +103,9 @@ module BlackStack
                 self.get_links_from_sitemap(l)
             end # def get_links

-            def find_keywords(a, stop_at=50, stop_on_first_link_found=false, l=nil)
+            def find_keywords(a, stop_at=25, stop_on_first_link_found=false, l=nil)
                 pages = []
+                browser = nil
                 l = BlackStack::DummyLogger.new(nil) if l.nil?
                 # iterate the links
                 j = 0
@@ -117,12 +115,14 @@ module BlackStack
                     l.logs "#{j.to_s}. find_keywords (#{link})... "
                     begin
                         # get the page
-                        page = self.get(link)
+                        browser = BlackStack::Bots::Browser.new()
+                        browser.goto link
+                        sleep(self.load_wait_time) # wait 10 seconds for javascript content to load
                         # get page body content in plain text
-                        title = page.title
-                        s = Timeout::timeout(5) { page.search('body').text }
+                        title = browser.title
+                        s = browser.body.text
                         # add the link to the results of no-keyword
-                        hpage = { 'page_url' => link.downcase, 'page_title' => title, 'page_html' => page.body, 'keywords' => [] }
+                        hpage = { 'page_url' => link.downcase, 'page_title' => title, 'page_html' => browser.body.html, 'keywords' => [] }
                         pages << hpage
                         # iterate the keywords
                         i = 0
@@ -140,9 +140,14 @@ module BlackStack
                         } # each
                         break if match && stop_on_first_link_found
                         l.logf i == 0 ? 'no keywords found'.yellow : "#{i} keywords found".green # find_keywords
+
+                    rescue Net::ReadTimeout => e
+                        l.logf "Timeout Error: #{e.message}".red
                     rescue => e
                         l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
-                    end # begin
+                    ensure
+                        browser.close if browser
+                    end
                 } # each
                 # return
                 pages
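
Taken together, the scraper now drives a real browser session per request instead of Mechanize, and the constructor gains an explicit timeout argument. A hedged usage sketch of the new call shape (the domain, timeout value, keyword list and empty options hash are all illustrative):

  # Hypothetical usage of the reworked Scraper; argument values are examples only.
  require 'bots'

  scraper = BlackStack::Bots::Scraper.new('example.com', 10, {}) # domain, timeout in seconds, options hash
  scraper.get_links(25)                                          # collect internal links, plus sitemap links
  pages = scraper.find_keywords(['hiring', 'careers'])           # scan the collected links for keywords
  puts "#{pages.size} pages scanned"
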
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: bots
 version: !ruby/object:Gem::Version
-  version: 1.0.5
+  version: 1.0.7
 platform: ruby
 authors:
 - Leandro Daniel Sardi
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2023-08-18 00:00:00.000000000 Z
+date: 2023-08-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: simple_cloud_logging
@@ -190,6 +190,26 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: 0.4.0
+- !ruby/object:Gem::Dependency
+  name: watir
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 7.3.0
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 7.3.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 7.3.0
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 7.3.0
 description: Ruby gem for scraping information from the public web.
 email: leandro@connectionsphere.com
 executables: []
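
Besides the version and date bump, the only substantive metadata change is the new watir runtime dependency. In gemspec terms (the .gemspec itself is not shown in this diff), that entry corresponds to a declaration along these lines:

  # Hypothetical gemspec line matching the watir entry added to the metadata above.
  spec.add_runtime_dependency 'watir', '~> 7.3.0', '>= 7.3.0'
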