bots 1.0.9 → 1.0.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/browser.rb +6 -3
- data/lib/scraper.rb +96 -21
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 3d4d967dc18df73987b5f5d812fc91695b171b4ed5af8f274ca1adc2c3b522d4
+  data.tar.gz: dd12b36f5e74842b985f433130d6d5c1fb6dbc0c5d6c8f973bb7c243d72f7891
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a9c99e15e50f3ea6b78f7be2011a44f927b3f9f8c3ac9aeea5d54232559b6af4b72581aeb17b5cf23afc7701ab9f4ef2eaffd4e6283187f7d1f682b22dc1fd55
+  data.tar.gz: d1122c000ee843f027a6b0ab788b44fc6e2859e99420f77496764e874b084eb78507fc45355ee86b390c832b0faacb55185c02331c64565a8c7437f2721eaace
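The digests above are what the registry records for the release artifacts. As a hedged aside, they can be recomputed locally with Ruby's standard Digest library; this minimal sketch assumes the extracted metadata.gz and data.tar.gz files sit in the current directory (hypothetical paths, not part of the diff):

require 'digest'

# recompute the checksums the registry publishes for a release
puts "metadata.gz SHA256: #{Digest::SHA256.file('metadata.gz').hexdigest}"
puts "data.tar.gz SHA512: #{Digest::SHA512.file('data.tar.gz').hexdigest}"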
data/lib/browser.rb
CHANGED
@@ -7,15 +7,13 @@ module BlackStack
     def initialize()
       self.lockfile = File.open(LOCKFILENAME, 'w+')
 
-      n =
+      n = 20 # timeout in seconds
 
       # wait the lock file /tmp/blackstack.bots.browser.lock
       self.lockfile.flock(File::LOCK_EX)
       begin
         # get list of PID of all opened chrome browsers, before launching this one
         pids_before = `pgrep -f chrome`.split("\n")
-        # track # of chrome processes
-        #print "(#{pids_before.size})"
         # setup driver
         client = Selenium::WebDriver::Remote::Http::Default.new
         begin
@@ -25,6 +23,11 @@ module BlackStack
         end
         options = Selenium::WebDriver::Chrome::Options.new
         options.add_argument('--headless')
+        # setup user agent with-out the keyword "headless"
+        # otherwise, our scraper may be detected as a bot and blocked
+        options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64)")
+        #+"AppleWebKit/537.36 (KHTML, like Gecko)"
+        #+"Chrome/87.0.4280.141 Safari/537.36")
 
         # Add this parameter to run Chrome from a root user.
         # https://stackoverflow.com/questions/50642308/webdriverexception-unknown-error-devtoolsactiveport-file-doesnt-exist-while-t
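The user-agent override above exists because headless Chrome advertises itself as "HeadlessChrome" in its default user agent, which many sites use to detect and block bots. A minimal standalone sketch of the same setup, assuming a recent selenium-webdriver gem (4.x); the gem's full driver wiring is not shown in this diff:

require 'selenium-webdriver'

options = Selenium::WebDriver::Chrome::Options.new
options.add_argument('--headless')
# override the default UA so "HeadlessChrome" never reaches the server
options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64)')

driver = Selenium::WebDriver.for(:chrome, options: options)
driver.get 'https://example.com'
puts driver.execute_script('return navigator.userAgent') # no "HeadlessChrome"
driver.quit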
data/lib/scraper.rb
CHANGED
@@ -1,7 +1,7 @@
 module BlackStack
   module Bots
     class Scraper
-      attr_accessor :domain, :links, :timeout, :load_wait_time, :stop_scraping_at_page_number, :stop_scraping_at_match_number
+      attr_accessor :browser, :domain, :links, :timeout, :load_wait_time, :stop_scraping_at_page_number, :stop_scraping_at_match_number
       # auxiliar array of links that I have extracted links from
       attr_accessor :links_processed
 
@@ -14,24 +14,94 @@ module BlackStack
         #self.agent.agent.http.verify_mode = OpenSSL::SSL::VERIFY_NONE
         self.links = []
         self.links_processed = []
+        self.browser = BlackStack::Bots::Browser.new()
       end # def initialize
 
-      def get_links_from_sitemap(l=nil)
-
+      def get_links_from_sitemap(stop_at=100, l=nil)
+        max_allowed_timeout_errors = 3
+        timeout_errors = 0
+        max_links = self.links.size + stop_at
+
         l.logs "Scrape sitemaps... "
         begin
+          l.logs "get_sitemaps from #{self.domain}... "
+
           # download the robots.txt
           url = "http://#{domain}/robots.txt"
+
           # get the content of robots.txt from url
-
+          browser.goto url
+          s = browser.text
+
           # get the sitemap
           sitemaps = s.split("\n").select { |line| line =~ /^sitemap:/i }.map { |a| a.downcase.split('sitemap:').last.strip }.uniq
-
-
-
-
-
+          processed = []
+          to_process = sitemaps - processed
+          l.logf sitemaps.size == 0 ? 'no sitemap found'.yellow : "#{sitemaps.size} sitemaps found".green # get_links
+
+          # while there are sitemaps to process
+          while to_process.size > 0 && timeout_errors < max_allowed_timeout_errors && max_links >= self.links.size
+            to_process.each { |b|
+              l.logs "go to #{b}... "
+              begin
+                browser.goto b
+                l.done
+
+                l.logs "parsing #{b}... "
+                s = browser.text
+                # extract all URLs
+                doc = Nokogiri::HTML(s)
+                l.done
+
+                # get the value of all <loc> tags with .xml extension
+                l.logs "get_sitemaps from #{b}... "
+                sitemaps += doc.xpath('//loc').map(&:text).select { |s| s =~ /\.xml$/ }.map { |s| s.downcase }
+                sitemaps.uniq!
+                l.logf sitemaps.size == 0 ? 'no sitemap found'.yellow : "#{sitemaps.size} sitemaps found".green # get_links
+
+                # get the value of all <loc> tags without .xml extension
+                l.logs "get_links from #{b}..."
+                self.links += doc.xpath('//loc').map(&:text).select { |s| s !~ /\.xml$/ }.map { |s| s.downcase }
+                self.links.uniq!
+                l.logf self.links.size == 0 ? 'no links found'.yellow : "#{self.links.size} links found".green # get_links
+
+                # add the sitemap to the list of processed sitemaps
+                processed << b
+
+                # reset timeout errors
+                timeout_errors = 0
+
+                # break if I exceeded the limit of links
+                break if max_links <= self.links.size
+
+              rescue Net::ReadTimeout => e
+                l.logf "Timeout Error: #{e.message}".red
+
+                l.logs "Restarting browser..."
+                browser.close if browser
+                self.browser = BlackStack::Bots::Browser.new()
+                l.done
+
+                timeout_errors += 1
+                break if timeout_errors >= max_allowed_timeout_errors
+
+              rescue => e
+                l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
+              end
+            }
+            # update the list of sitemaps to process
+            processed.uniq!
+            to_process = sitemaps - processed
+          end
          l.logf sitemaps.size == 0 ? 'no sitemap found'.yellow : "#{sitemaps.size} sitemaps found".green # get_links
+
+        rescue Net::ReadTimeout => e
+          l.logf "Timeout Error: #{e.message}".red
+
+          l.logs "Restarting browser..."
+          browser.close if browser
+          self.browser = BlackStack::Bots::Browser.new()
+          l.done
        rescue => e
          l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
        end
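The new get_links_from_sitemap drives the shared browser so robots.txt and the sitemaps are fetched with the stealth user agent, and it keeps growing the sitemap list as nested sitemap indexes are discovered. A stripped-down sketch of the same walk without the browser, logging, or timeout bookkeeping, assuming open-uri and Nokogiri and a hypothetical domain (note: the gem parses the browser's rendered text with Nokogiri::HTML, while this sketch fetches the XML directly):

require 'open-uri'
require 'nokogiri'

domain = 'example.com' # hypothetical
robots = URI.open("http://#{domain}/robots.txt").read

# "Sitemap:" entries in robots.txt seed the crawl
sitemaps = robots.split("\n")
                 .select { |line| line =~ /^sitemap:/i }
                 .map { |line| line.split(/sitemap:/i).last.strip }

links, processed = [], []
until (to_process = sitemaps - processed).empty?
  to_process.each do |sm|
    doc = Nokogiri::XML(URI.open(sm).read)
    doc.remove_namespaces! # sitemap XML declares a default namespace
    locs = doc.xpath('//loc').map { |n| n.text.strip.downcase }
    sitemaps |= locs.select { |u| u.end_with?('.xml') } # nested sitemap indexes
    links    |= locs.reject { |u| u.end_with?('.xml') } # page URLs
    processed << sm
  end
end

puts "#{links.size} links found"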
@@ -42,14 +112,12 @@ module BlackStack
         l = BlackStack::DummyLogger.new(nil) if l.nil?
         l.logs "get_links (#{url})... "
         aux = []
-        browser = nil
         begin
           # trim url
           url = url.strip
           # get domain of the url using open-uri
           domain = URI.parse(url).host
           # visit the main page of the website
-          browser = BlackStack::Bots::Browser.new()
           browser.goto url
           sleep(self.load_wait_time) # wait 10 seconds for javascript content to load
           # get the self.links to the pages of the website
@@ -76,16 +144,23 @@ module BlackStack
         l.logf "done".green + " (#{a} links found, #{b} new, #{self.links.size} total)" # get_links
       rescue Net::ReadTimeout => e
         l.logf "Timeout Error: #{e.message}".red
+
+        l.logs "Restarting browser..."
+        browser.close if browser
+        self.browser = BlackStack::Bots::Browser.new()
+        l.done
       rescue => e
         l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
-      ensure
-        browser.close if browser
       end
       self.links += aux
     end # def get_links_from_url
 
-    def get_links(stop_at=
+    def get_links(stop_at=100, l=nil)
       l = BlackStack::DummyLogger.new(nil) if l.nil?
+
+      # get links from the sitemap
+      self.get_links_from_sitemap(stop_at, l)
+=begin
       # working with root url
       url = "http://#{self.domain}/"
       self.links << url if self.links.select { |link| link == url }.empty?
@@ -99,13 +174,11 @@ module BlackStack
           self.links_processed << link
         }
       end # while
-
-      self.get_links_from_sitemap(l)
+=end
     end # def get_links
 
     def find_keywords(a, stop_at=25, stop_on_first_link_found=false, l=nil)
       pages = []
-      browser = nil
       l = BlackStack::DummyLogger.new(nil) if l.nil?
       # iterate the links
       j = 0
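Two patterns recur across these hunks: the old page-by-page crawl in get_links is fenced off between =begin/=end so only the sitemap walk runs, and every rescue for Net::ReadTimeout now repeats the same recovery block (log, close the wedged browser, recreate it) instead of the old ensure that closed the browser after each call. That recovery block factors out naturally; a hypothetical helper that could live on Scraper (with_browser_restart is not part of the gem):

require 'net/protocol' # defines Net::ReadTimeout

# hypothetical helper: run a block and, on a read timeout, swap in a
# fresh browser and retry, mirroring the recovery block the diff
# repeats in each rescue clause
def with_browser_restart(max_retries = 3)
  retries = 0
  begin
    yield
  rescue Net::ReadTimeout
    self.browser.close if self.browser
    self.browser = BlackStack::Bots::Browser.new()
    retries += 1
    retry if retries < max_retries
    raise
  end
end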
@@ -115,7 +188,6 @@ module BlackStack
         l.logs "#{j.to_s}. find_keywords (#{link})... "
         begin
           # get the page
-          browser = BlackStack::Bots::Browser.new()
           browser.goto link
           sleep(self.load_wait_time) # wait 10 seconds for javascript content to load
           # get page body content in plain text
@@ -142,11 +214,14 @@ module BlackStack
           l.logf i == 0 ? 'no keywords found'.yellow : "#{i} keywords found".green # find_keywords
 
         rescue Net::ReadTimeout => e
-          l.logf "Timeout Error: #{e.message}".red
+          l.logf "Timeout Error: #{e.message}".red
+
+          l.logs "Restarting browser..."
+          browser.close if browser
+          self.browser = BlackStack::Bots::Browser.new()
+          l.done
         rescue => e
           l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
-        ensure
-          browser.close if browser
         end
       } # each
       # return
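In find_keywords only the error handling changed; the context lines show the surrounding loop (visit each collected link with the shared browser, wait for JavaScript, scan the body text, count matches). The matching logic itself is not shown in this diff, so the following is a simplified, hypothetical sketch of a per-page keyword scan, not the gem's actual implementation:

# hypothetical per-page scan: `body` is rendered page text,
# `keywords` is the array passed as `a` to find_keywords
def count_keywords(body, keywords)
  text = body.downcase
  keywords.count { |k| text.include?(k.downcase) }
end

count_keywords('We are hiring! See our pricing page.', ['pricing', 'careers']) # => 1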