bots 1.0.9 → 1.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4)
  1. checksums.yaml +4 -4
  2. data/lib/browser.rb +6 -3
  3. data/lib/scraper.rb +96 -21
  4. metadata +1 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 63a5cef7be688d0bd777c7fde8fe8f52dd9a6789106b7691c2b6c3bfc8300144
-  data.tar.gz: b6e1e2df2d171bc4bae8fe50652221551d27b76b6945b76634c612fb63e37ba3
+  metadata.gz: 3d4d967dc18df73987b5f5d812fc91695b171b4ed5af8f274ca1adc2c3b522d4
+  data.tar.gz: dd12b36f5e74842b985f433130d6d5c1fb6dbc0c5d6c8f973bb7c243d72f7891
 SHA512:
-  metadata.gz: d0288d8e1195c903dabe6560ac1ab2482898c46ed91326ed89dbf878663c459f3fbeb043b2b03eaf2aec71dfaa387c5007fec656dc2310b93f1cd5a0eb582560
-  data.tar.gz: c1db09ced8386a9c8f341182f62c672110c211973fa6cd409a49de7a62d6cc53278cdccd42dc0497569fd5c5dc83ca2086f0a56e7ad43e91122ccf6af86f19b2
+  metadata.gz: a9c99e15e50f3ea6b78f7be2011a44f927b3f9f8c3ac9aeea5d54232559b6af4b72581aeb17b5cf23afc7701ab9f4ef2eaffd4e6283187f7d1f682b22dc1fd55
+  data.tar.gz: d1122c000ee843f027a6b0ab788b44fc6e2859e99420f77496764e874b084eb78507fc45355ee86b390c832b0faacb55185c02331c64565a8c7437f2721eaace
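
The checksum changes are mechanical: every release re-records the SHA256/SHA512 digests of the two archives packed inside the .gem file, which is itself a plain tar archive. A minimal sketch of recomputing the SHA256 values locally, assuming a downloaded copy of the gem (the file name is illustrative):

require 'digest'
require 'rubygems/package'

# Recompute the SHA256 digests that checksums.yaml records for the
# two archives packed inside a .gem file (a plain tar archive).
File.open('bots-1.0.11.gem', 'rb') do |f|
  Gem::Package::TarReader.new(f).each do |entry|
    next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)
    puts "#{entry.full_name}: #{Digest::SHA256.hexdigest(entry.read)}"
  end
end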
data/lib/browser.rb CHANGED
@@ -7,15 +7,13 @@ module BlackStack
     def initialize()
       self.lockfile = File.open(LOCKFILENAME, 'w+')
 
-      n = 30 # timeout in seconds
+      n = 20 # timeout in seconds
 
       # wait for the lock file /tmp/blackstack.bots.browser.lock
       self.lockfile.flock(File::LOCK_EX)
       begin
         # get the list of PIDs of all opened Chrome browsers, before launching this one
         pids_before = `pgrep -f chrome`.split("\n")
-        # track # of chrome processes
-        #print "(#{pids_before.size})"
         # setup driver
         client = Selenium::WebDriver::Remote::Http::Default.new
         begin
@@ -25,6 +23,11 @@ module BlackStack
         end
         options = Selenium::WebDriver::Chrome::Options.new
         options.add_argument('--headless')
+        # set up the user agent without the keyword "headless";
+        # otherwise, our scraper may be detected as a bot and blocked
+        options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64)")
+        #+"AppleWebKit/537.36 (KHTML, like Gecko)"
+        #+"Chrome/87.0.4280.141 Safari/537.36")
 
         # Add this parameter to run Chrome from a root user.
         # https://stackoverflow.com/questions/50642308/webdriverexception-unknown-error-devtoolsactiveport-file-doesnt-exist-while-t
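
The substantive change in browser.rb, besides the shorter timeout, is the custom user agent: headless Chrome's default UA string contains "HeadlessChrome", a common bot-detection signal, so the gem overrides it (with the AppleWebKit/Chrome segments left commented out, the shipped UA is just the Mozilla/Windows prefix). A standalone sketch of the same technique, assuming the selenium-webdriver gem and a chromedriver on the PATH; the UA value is illustrative:

require 'selenium-webdriver'

options = Selenium::WebDriver::Chrome::Options.new
options.add_argument('--headless')
# Replace the default UA (which advertises "HeadlessChrome") with a
# regular desktop Chrome UA so the browser is harder to fingerprint.
options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' \
                     'AppleWebKit/537.36 (KHTML, like Gecko) ' \
                     'Chrome/87.0.4280.141 Safari/537.36')

driver = Selenium::WebDriver.for(:chrome, options: options)
driver.navigate.to 'https://example.com'
puts driver.execute_script('return navigator.userAgent') # no "Headless" substring
driver.quit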
data/lib/scraper.rb CHANGED
@@ -1,7 +1,7 @@
 module BlackStack
   module Bots
     class Scraper
-      attr_accessor :domain, :links, :timeout, :load_wait_time, :stop_scraping_at_page_number, :stop_scraping_at_match_number
+      attr_accessor :browser, :domain, :links, :timeout, :load_wait_time, :stop_scraping_at_page_number, :stop_scraping_at_match_number
       # auxiliary array of links that I have extracted links from
       attr_accessor :links_processed
 
@@ -14,24 +14,94 @@ module BlackStack
         #self.agent.agent.http.verify_mode = OpenSSL::SSL::VERIFY_NONE
         self.links = []
         self.links_processed = []
+        self.browser = BlackStack::Bots::Browser.new()
       end # def initialize
 
-      def get_links_from_sitemap(l=nil)
-        i = 0
+      def get_links_from_sitemap(stop_at=100, l=nil)
+        max_allowed_timeout_errors = 3
+        timeout_errors = 0
+        max_links = self.links.size + stop_at
+
        l.logs "Scrape sitemaps... "
        begin
+          l.logs "get_sitemaps from #{self.domain}... "
+
          # download the robots.txt
          url = "http://#{domain}/robots.txt"
+
          # get the content of robots.txt from url
-          s = Timeout::timeout(self.timeout) { URI.open(url).read }
+          browser.goto url
+          s = browser.text
+
          # get the sitemap
          sitemaps = s.split("\n").select { |line| line =~ /^sitemap:/i }.map { |a| a.downcase.split('sitemap:').last.strip }.uniq
-          sitemaps.each { |b|
-            parser = Timeout::timeout(self.timeout) { SitemapParser.new b }
-            self.links += Timeout::timeout(self.timeout) { parser.to_a }
-            self.links.uniq!
-          }
+          processed = []
+          to_process = sitemaps - processed
+          l.logf sitemaps.size == 0 ? 'no sitemap found'.yellow : "#{sitemaps.size} sitemaps found".green # get_links
+
+          # while there are sitemaps to process
+          while to_process.size > 0 && timeout_errors < max_allowed_timeout_errors && max_links >= self.links.size
+            to_process.each { |b|
+              l.logs "go to #{b}... "
+              begin
+                browser.goto b
+                l.done
+
+                l.logs "parsing #{b}... "
+                s = browser.text
+                # extract all URLs
+                doc = Nokogiri::HTML(s)
+                l.done
+
+                # get the value of all <loc> tags with .xml extension
+                l.logs "get_sitemaps from #{b}... "
+                sitemaps += doc.xpath('//loc').map(&:text).select { |s| s =~ /\.xml$/ }.map { |s| s.downcase }
+                sitemaps.uniq!
+                l.logf sitemaps.size == 0 ? 'no sitemap found'.yellow : "#{sitemaps.size} sitemaps found".green # get_links
+
+                # get the value of all <loc> tags without .xml extension
+                l.logs "get_links from #{b}..."
+                self.links += doc.xpath('//loc').map(&:text).select { |s| s !~ /\.xml$/ }.map { |s| s.downcase }
+                self.links.uniq!
+                l.logf self.links.size == 0 ? 'no links found'.yellow : "#{self.links.size} links found".green # get_links
+
+                # add the sitemap to the list of processed sitemaps
+                processed << b
+
+                # reset timeout errors
+                timeout_errors = 0
+
+                # break if I exceeded the limit of links
+                break if max_links <= self.links.size
+
+              rescue Net::ReadTimeout => e
+                l.logf "Timeout Error: #{e.message}".red
+
+                l.logs "Restarting browser..."
+                browser.close if browser
+                self.browser = BlackStack::Bots::Browser.new()
+                l.done
+
+                timeout_errors += 1
+                break if timeout_errors >= max_allowed_timeout_errors
+
+              rescue => e
+                l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
+              end
+            }
+            # update the list of sitemaps to process
+            processed.uniq!
+            to_process = sitemaps - processed
+          end
          l.logf sitemaps.size == 0 ? 'no sitemap found'.yellow : "#{sitemaps.size} sitemaps found".green # get_links
+
+        rescue Net::ReadTimeout => e
+          l.logf "Timeout Error: #{e.message}".red
+
+          l.logs "Restarting browser..."
+          browser.close if browser
+          self.browser = BlackStack::Bots::Browser.new()
+          l.done
        rescue => e
          l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
        end
@@ -42,14 +112,12 @@ module BlackStack
        l = BlackStack::DummyLogger.new(nil) if l.nil?
        l.logs "get_links (#{url})... "
        aux = []
-        browser = nil
        begin
          # trim url
          url = url.strip
          # get domain of the url using open-uri
          domain = URI.parse(url).host
          # visit the main page of the website
-          browser = BlackStack::Bots::Browser.new()
          browser.goto url
          sleep(self.load_wait_time) # wait 10 seconds for javascript content to load
          # get the self.links to the pages of the website
@@ -76,16 +144,23 @@ module BlackStack
          l.logf "done".green + " (#{a} links found, #{b} new, #{self.links.size} total)" # get_links
        rescue Net::ReadTimeout => e
          l.logf "Timeout Error: #{e.message}".red
+
+          l.logs "Restarting browser..."
+          browser.close if browser
+          self.browser = BlackStack::Bots::Browser.new()
+          l.done
        rescue => e
          l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
-        ensure
-          browser.close if browser
        end
        self.links += aux
      end # def get_links_from_url
 
-      def get_links(stop_at=10, l=nil)
+      def get_links(stop_at=100, l=nil)
        l = BlackStack::DummyLogger.new(nil) if l.nil?
+
+        # get links from the sitemap
+        self.get_links_from_sitemap(stop_at, l)
+=begin
        # working with root url
        url = "http://#{self.domain}/"
        self.links << url if self.links.select { |link| link == url }.empty?
@@ -99,13 +174,11 @@ module BlackStack
            self.links_processed << link
          }
        end # while
-        # get links from the sitemap
-        self.get_links_from_sitemap(l)
+=end
      end # def get_links
 
      def find_keywords(a, stop_at=25, stop_on_first_link_found=false, l=nil)
        pages = []
-        browser = nil
        l = BlackStack::DummyLogger.new(nil) if l.nil?
        # iterate the links
        j = 0
@@ -115,7 +188,6 @@ module BlackStack
          l.logs "#{j.to_s}. find_keywords (#{link})... "
          begin
            # get the page
-            browser = BlackStack::Bots::Browser.new()
            browser.goto link
            sleep(self.load_wait_time) # wait 10 seconds for javascript content to load
            # get page body content in plain text
@@ -142,11 +214,14 @@ module BlackStack
            l.logf i == 0 ? 'no keywords found'.yellow : "#{i} keywords found".green # find_keywords
 
          rescue Net::ReadTimeout => e
-            l.logf "Timeout Error: #{e.message}".red
+            l.logf "Timeout Error: #{e.message}".red
+
+            l.logs "Restarting browser..."
+            browser.close if browser
+            self.browser = BlackStack::Bots::Browser.new()
+            l.done
          rescue => e
            l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
-          ensure
-            browser.close if browser
          end
        } # each
        # return
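
Taken together, the scraper.rb changes do two things: each Scraper now owns one long-lived Browser, recreated only when a Net::ReadTimeout occurs (and abandoned after three consecutive timeouts in the sitemap loop) instead of opening and closing a browser per request, and get_links_from_sitemap is now a worklist over nested sitemaps, where <loc> entries ending in .xml are queued as further sitemaps and all other <loc> entries are collected as page links. A minimal sketch of that worklist pattern, assuming the nokogiri gem; fetch is a hypothetical stand-in for the gem's browser.goto/browser.text pair:

require 'nokogiri'
require 'open-uri'

# Hypothetical fetch; the gem drives a real headless browser instead.
def fetch(url)
  URI.open(url, read_timeout: 20).read
end

def crawl_sitemaps(seed_sitemaps, stop_at: 100)
  links      = []
  sitemaps   = seed_sitemaps.dup
  processed  = []
  to_process = sitemaps - processed

  while to_process.any? && links.size < stop_at
    to_process.each do |sm|
      begin
        locs = Nokogiri::HTML(fetch(sm)).xpath('//loc').map { |n| n.text.downcase }
        # <loc> values ending in .xml are nested sitemaps; the rest are page links
        sitemaps |= locs.select { |u| u.end_with?('.xml') }
        links    |= locs.reject { |u| u.end_with?('.xml') }
      rescue Net::ReadTimeout
        # the gem restarts its browser here and retries, giving up after
        # three consecutive timeouts; this sketch just skips the sitemap
      end
      processed << sm
      break if links.size >= stop_at
    end
    to_process = sitemaps - processed
  end
  links
end

The termination conditions mirror the gem's: the loop ends once every discovered sitemap has been visited or the link budget (stop_at) is exhausted.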
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: bots
 version: !ruby/object:Gem::Version
-  version: 1.0.9
+  version: 1.0.11
 platform: ruby
 authors:
 - Leandro Daniel Sardi