bots 1.0.10 → 1.0.11

Sign up to get free protection for your applications and to get access to all the features.
Files changed (4) hide show
  1. checksums.yaml +4 -4
  2. data/lib/browser.rb +6 -3
  3. data/lib/scraper.rb +96 -21
  4. metadata +1 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: be18e066f6136f7a3d57a2b2b0519a3c570187c27fa981993062839c6b4cc5c9
4
- data.tar.gz: 3e3d8d83d56d838e34cdbdeb29624f4eac80914c1d13aa5cf2592feeb365bc97
3
+ metadata.gz: 3d4d967dc18df73987b5f5d812fc91695b171b4ed5af8f274ca1adc2c3b522d4
4
+ data.tar.gz: dd12b36f5e74842b985f433130d6d5c1fb6dbc0c5d6c8f973bb7c243d72f7891
5
5
  SHA512:
6
- metadata.gz: 7a9508b65f0bcb71f30c9c2bed17828dc8af29ed7bbb7fad89663941527e558d8c60189e590ef539303a7324b0cd2fff6a2cf4ab005bf2aa8b2b377db728f003
7
- data.tar.gz: 1b8ef1c3c7817d9b5fe4c4d740ae1175f8db1acef0c0c695ceb68dc63c3fdb6bcd4fdb2d6bef0fb77fb563f3a47240c1d3d9f3b3e38456ffc0836be1b1e682c7
6
+ metadata.gz: a9c99e15e50f3ea6b78f7be2011a44f927b3f9f8c3ac9aeea5d54232559b6af4b72581aeb17b5cf23afc7701ab9f4ef2eaffd4e6283187f7d1f682b22dc1fd55
7
+ data.tar.gz: d1122c000ee843f027a6b0ab788b44fc6e2859e99420f77496764e874b084eb78507fc45355ee86b390c832b0faacb55185c02331c64565a8c7437f2721eaace
data/lib/browser.rb CHANGED
@@ -7,15 +7,13 @@ module BlackStack
7
7
  def initialize()
8
8
  self.lockfile = File.open(LOCKFILENAME, 'w+')
9
9
 
10
- n = 10 # timeout in seconds
10
+ n = 20 # timeout in seconds
11
11
 
12
12
  # wait for the lock file /tmp/blackstack.bots.browser.lock
13
13
  self.lockfile.flock(File::LOCK_EX)
14
14
  begin
15
15
  # get list of PID of all opened chrome browsers, before launching this one
16
16
  pids_before = `pgrep -f chrome`.split("\n")
17
- # track # of chrome processes
18
- #print "(#{pids_before.size})"
19
17
  # setup driver
20
18
  client = Selenium::WebDriver::Remote::Http::Default.new
21
19
  begin
@@ -25,6 +23,11 @@ module BlackStack
25
23
  end
26
24
  options = Selenium::WebDriver::Chrome::Options.new
27
25
  options.add_argument('--headless')
26
+ # set up the user agent without the keyword "headless"
27
+ # otherwise, our scraper may be detected as a bot and blocked
28
+ options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64)")
29
+ #+"AppleWebKit/537.36 (KHTML, like Gecko)"
30
+ #+"Chrome/87.0.4280.141 Safari/537.36")
28
31
 
29
32
  # Add this parameter to run Chrome from a root user.
30
33
  # https://stackoverflow.com/questions/50642308/webdriverexception-unknown-error-devtoolsactiveport-file-doesnt-exist-while-t
data/lib/scraper.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  module BlackStack
2
2
  module Bots
3
3
  class Scraper
4
- attr_accessor :domain, :links, :timeout, :load_wait_time, :stop_scraping_at_page_number, :stop_scraping_at_match_number
4
+ attr_accessor :browser, :domain, :links, :timeout, :load_wait_time, :stop_scraping_at_page_number, :stop_scraping_at_match_number
5
5
  # auxiliary array of links whose pages I have already extracted links from
6
6
  attr_accessor :links_processed
7
7
 
@@ -14,24 +14,94 @@ module BlackStack
14
14
  #self.agent.agent.http.verify_mode = OpenSSL::SSL::VERIFY_NONE
15
15
  self.links = []
16
16
  self.links_processed = []
17
+ self.browser = BlackStack::Bots::Browser.new()
17
18
  end # def initialize
18
19
 
19
- def get_links_from_sitemap(l=nil)
20
- i = 0
20
+ def get_links_from_sitemap(stop_at=100, l=nil)
21
+ max_allowed_timeout_errors = 3
22
+ timeout_errors = 0
23
+ max_links = self.links.size + stop_at
24
+
21
25
  l.logs "Scrape sitemaps... "
22
26
  begin
27
+ l.logs "get_sitemaps from #{self.domain}... "
28
+
23
29
  # download the robots.txt
24
30
  url = "http://#{domain}/robots.txt"
31
+
25
32
  # get the content of robots.txt from url
26
- s = Timeout::timeout(self.timeout) { URI.open(url).read }
33
+ browser.goto url
34
+ s = browser.text
35
+
27
36
  # get the sitemap
28
37
  sitemaps = s.split("\n").select { |line| line =~ /^sitemap:/i }.map { |a| a.downcase.split('sitemap:').last.strip }.uniq
29
- sitemaps.each { |b|
30
- parser = Timeout::timeout(self.timeout) { SitemapParser.new b }
31
- self.links += Timeout::timeout(self.timeout) { parser.to_a }
32
- self.links.uniq!
33
- }
38
+ processed = []
39
+ to_process = sitemaps - processed
40
+ l.logf sitemaps.size == 0 ? 'no sitemap found'.yellow : "#{sitemaps.size} sitemaps found".green # get_links
41
+
42
+ # while there are sitemaps to process
43
+ while to_process.size > 0 && timeout_errors < max_allowed_timeout_errors && max_links >= self.links.size
44
+ to_process.each { |b|
45
+ l.logs "go to #{b}... "
46
+ begin
47
+ browser.goto b
48
+ l.done
49
+
50
+ l.logs "parsing #{b}... "
51
+ s = browser.text
52
+ # extract all URLs
53
+ doc = Nokogiri::HTML(s)
54
+ l.done
55
+
56
+ # get the value of all <loc> tags with .xml extension
57
+ l.logs "get_sitemaps from #{b}... "
58
+ sitemaps += doc.xpath('//loc').map(&:text).select { |s| s =~ /\.xml$/ }.map { |s| s.downcase }
59
+ sitemaps.uniq!
60
+ l.logf sitemaps.size == 0 ? 'no sitemap found'.yellow : "#{sitemaps.size} sitemaps found".green # get_links
61
+
62
+ # get the value of all <loc> tags without .xml extension
63
+ l.logs "get_links from #{b}..."
64
+ self.links += doc.xpath('//loc').map(&:text).select { |s| s !~ /\.xml$/ }.map { |s| s.downcase }
65
+ self.links.uniq!
66
+ l.logf self.links.size == 0 ? 'no links found'.yellow : "#{self.links.size} links found".green # get_links
67
+
68
+ # add the sitemap to the list of processed sitemaps
69
+ processed << b
70
+
71
+ # reset timeout errors
72
+ timeout_errors = 0
73
+
74
+ # break if I exceeded the limit of links
75
+ break if max_links <= self.links.size
76
+
77
+ rescue Net::ReadTimeout => e
78
+ l.logf "Timeout Error: #{e.message}".red
79
+
80
+ l.logs "Restarting browser..."
81
+ browser.close if browser
82
+ self.browser = BlackStack::Bots::Browser.new()
83
+ l.done
84
+
85
+ timeout_errors += 1
86
+ break if timeout_errors >= max_allowed_timeout_errors
87
+
88
+ rescue => e
89
+ l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
90
+ end
91
+ }
92
+ # update the list of sitemaps to process
93
+ processed.uniq!
94
+ to_process = sitemaps - processed
95
+ end
34
96
  l.logf sitemaps.size == 0 ? 'no sitemap found'.yellow : "#{sitemaps.size} sitemaps found".green # get_links
97
+
98
+ rescue Net::ReadTimeout => e
99
+ l.logf "Timeout Error: #{e.message}".red
100
+
101
+ l.logs "Restarting browser..."
102
+ browser.close if browser
103
+ self.browser = BlackStack::Bots::Browser.new()
104
+ l.done
35
105
  rescue => e
36
106
  l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
37
107
  end
@@ -42,14 +112,12 @@ module BlackStack
42
112
  l = BlackStack::DummyLogger.new(nil) if l.nil?
43
113
  l.logs "get_links (#{url})... "
44
114
  aux = []
45
- browser = nil
46
115
  begin
47
116
  # trim url
48
117
  url = url.strip
49
118
  # get domain of the url using open-uri
50
119
  domain = URI.parse(url).host
51
120
  # visit the main page of the website
52
- browser = BlackStack::Bots::Browser.new()
53
121
  browser.goto url
54
122
  sleep(self.load_wait_time) # wait 10 seconds for javascript content to load
55
123
  # get the self.links to the pages of the website
@@ -76,16 +144,23 @@ module BlackStack
76
144
  l.logf "done".green + " (#{a} links found, #{b} new, #{self.links.size} total)" # get_links
77
145
  rescue Net::ReadTimeout => e
78
146
  l.logf "Timeout Error: #{e.message}".red
147
+
148
+ l.logs "Restarting browser..."
149
+ browser.close if browser
150
+ self.browser = BlackStack::Bots::Browser.new()
151
+ l.done
79
152
  rescue => e
80
153
  l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
81
- ensure
82
- browser.close if browser
83
154
  end
84
155
  self.links += aux
85
156
  end # def get_links_from_url
86
157
 
87
- def get_links(stop_at=10, l=nil)
158
+ def get_links(stop_at=100, l=nil)
88
159
  l = BlackStack::DummyLogger.new(nil) if l.nil?
160
+
161
+ # get links from the sitemap
162
+ self.get_links_from_sitemap(stop_at, l)
163
+ =begin
89
164
  # working with root url
90
165
  url = "http://#{self.domain}/"
91
166
  self.links << url if self.links.select { |link| link == url }.empty?
@@ -99,13 +174,11 @@ module BlackStack
99
174
  self.links_processed << link
100
175
  }
101
176
  end # while
102
- # get links from the sitemap
103
- self.get_links_from_sitemap(l)
177
+ =end
104
178
  end # def get_links
105
179
 
106
180
  def find_keywords(a, stop_at=25, stop_on_first_link_found=false, l=nil)
107
181
  pages = []
108
- browser = nil
109
182
  l = BlackStack::DummyLogger.new(nil) if l.nil?
110
183
  # iterate the links
111
184
  j = 0
@@ -115,7 +188,6 @@ module BlackStack
115
188
  l.logs "#{j.to_s}. find_keywords (#{link})... "
116
189
  begin
117
190
  # get the page
118
- browser = BlackStack::Bots::Browser.new()
119
191
  browser.goto link
120
192
  sleep(self.load_wait_time) # wait 10 seconds for javascript content to load
121
193
  # get page body content in plain text
@@ -142,11 +214,14 @@ module BlackStack
142
214
  l.logf i == 0 ? 'no keywords found'.yellow : "#{i} keywords found".green # find_keywords
143
215
 
144
216
  rescue Net::ReadTimeout => e
145
- l.logf "Timeout Error: #{e.message}".red
217
+ l.logf "Timeout Error: #{e.message}".red
218
+
219
+ l.logs "Restarting browser..."
220
+ browser.close if browser
221
+ self.browser = BlackStack::Bots::Browser.new()
222
+ l.done
146
223
  rescue => e
147
224
  l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
148
- ensure
149
- browser.close if browser
150
225
  end
151
226
  } # each
152
227
  # return
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bots
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.10
4
+ version: 1.0.11
5
5
  platform: ruby
6
6
  authors:
7
7
  - Leandro Daniel Sardi