bots 1.0.10 → 1.0.11
- checksums.yaml +4 -4
- data/lib/browser.rb +6 -3
- data/lib/scraper.rb +96 -21
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 3d4d967dc18df73987b5f5d812fc91695b171b4ed5af8f274ca1adc2c3b522d4
+  data.tar.gz: dd12b36f5e74842b985f433130d6d5c1fb6dbc0c5d6c8f973bb7c243d72f7891
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a9c99e15e50f3ea6b78f7be2011a44f927b3f9f8c3ac9aeea5d54232559b6af4b72581aeb17b5cf23afc7701ab9f4ef2eaffd4e6283187f7d1f682b22dc1fd55
+  data.tar.gz: d1122c000ee843f027a6b0ab788b44fc6e2859e99420f77496764e874b084eb78507fc45355ee86b390c832b0faacb55185c02331c64565a8c7437f2721eaace
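
The old digest values were dropped by this rendering; only the 1.0.11 values survive. As context, checksums.yaml ships gzipped inside the .gem archive alongside metadata.gz and data.tar.gz, so the values above can be re-checked locally. A minimal sketch, not part of the gem, assuming the package was fetched first with `gem fetch bots -v 1.0.11`:

# sketch only -- verify the digests recorded in checksums.yaml
# against the actual members of the bots-1.0.11.gem archive
require 'digest'
require 'yaml'
require 'zlib'
require 'rubygems/package'

sums   = nil
actual = {}
File.open('bots-1.0.11.gem', 'rb') do |io|
  Gem::Package::TarReader.new(io).each do |entry|
    case entry.full_name
    when 'checksums.yaml.gz'
      # the checksum manifest is stored gzipped inside the outer tar
      sums = YAML.safe_load(Zlib.gunzip(entry.read))
    when 'metadata.gz', 'data.tar.gz'
      actual[entry.full_name] = Digest::SHA256.hexdigest(entry.read)
    end
  end
end

sums['SHA256'].each do |file, expected|
  puts "#{file}: #{actual[file] == expected ? 'OK' : 'MISMATCH'}"
end
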
data/lib/browser.rb
CHANGED
@@ -7,15 +7,13 @@ module BlackStack
       def initialize()
         self.lockfile = File.open(LOCKFILENAME, 'w+')
 
-        n =
+        n = 20 # timeout in seconds
 
         # wait the lock file /tmp/blackstack.bots.browser.lock
         self.lockfile.flock(File::LOCK_EX)
         begin
           # get list of PID of all opened chrome browsers, before launching this one
           pids_before = `pgrep -f chrome`.split("\n")
-          # track # of chrome processes
-          #print "(#{pids_before.size})"
           # setup driver
           client = Selenium::WebDriver::Remote::Http::Default.new
           begin
@@ -25,6 +23,11 @@ module BlackStack
           end
           options = Selenium::WebDriver::Chrome::Options.new
           options.add_argument('--headless')
+          # set up a user agent without the keyword "headless";
+          # otherwise, our scraper may be detected as a bot and blocked
+          options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64)")
+          #+"AppleWebKit/537.36 (KHTML, like Gecko)"
+          #+"Chrome/87.0.4280.141 Safari/537.36")
 
           # Add this parameter to run Chrome from a root user.
           # https://stackoverflow.com/questions/50642308/webdriverexception-unknown-error-devtoolsactiveport-file-doesnt-exist-while-t
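
The user-agent override is the substantive change here: headless Chrome's default user agent contains the token "HeadlessChrome", which naive bot filters match on, so the gem now advertises a plain desktop UA (the commented-out lines show the full string the author had in mind). A standalone sketch of the same idea, assuming selenium-webdriver and a local chromedriver:

# sketch: launch headless Chrome with a UA that hides "HeadlessChrome"
require 'selenium-webdriver'

options = Selenium::WebDriver::Chrome::Options.new
options.add_argument('--headless')
# a complete UA string, mirroring the commented-out continuation above
options.add_argument(
  'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' \
  'AppleWebKit/537.36 (KHTML, like Gecko) ' \
  'Chrome/87.0.4280.141 Safari/537.36'
)

driver = Selenium::WebDriver.for(:chrome, options: options)
driver.navigate.to 'https://example.com'
puts driver.title
driver.quit
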
data/lib/scraper.rb
CHANGED
@@ -1,7 +1,7 @@
 module BlackStack
   module Bots
     class Scraper
-      attr_accessor :domain, :links, :timeout, :load_wait_time, :stop_scraping_at_page_number, :stop_scraping_at_match_number
+      attr_accessor :browser, :domain, :links, :timeout, :load_wait_time, :stop_scraping_at_page_number, :stop_scraping_at_match_number
       # auxiliary array of links that I have extracted links from
       attr_accessor :links_processed
 
@@ -14,24 +14,94 @@ module BlackStack
         #self.agent.agent.http.verify_mode = OpenSSL::SSL::VERIFY_NONE
         self.links = []
         self.links_processed = []
+        self.browser = BlackStack::Bots::Browser.new()
       end # def initialize
 
-      def get_links_from_sitemap(l=nil)
-
+      def get_links_from_sitemap(stop_at=100, l=nil)
+        max_allowed_timeout_errors = 3
+        timeout_errors = 0
+        max_links = self.links.size + stop_at
+
         l.logs "Scrape sitemaps... "
         begin
+          l.logs "get_sitemaps from #{self.domain}... "
+
           # download the robots.txt
           url = "http://#{domain}/robots.txt"
+
           # get the content of robots.txt from url
-
+          browser.goto url
+          s = browser.text
+
           # get the sitemap
           sitemaps = s.split("\n").select { |line| line =~ /^sitemap:/i }.map { |a| a.downcase.split('sitemap:').last.strip }.uniq
-
-
-
-
-
+          processed = []
+          to_process = sitemaps - processed
+          l.logf sitemaps.size == 0 ? 'no sitemap found'.yellow : "#{sitemaps.size} sitemaps found".green # get_links
+
+          # while there are sitemaps to process
+          while to_process.size > 0 && timeout_errors < max_allowed_timeout_errors && max_links >= self.links.size
+            to_process.each { |b|
+              l.logs "go to #{b}... "
+              begin
+                browser.goto b
+                l.done
+
+                l.logs "parsing #{b}... "
+                s = browser.text
+                # extract all URLs
+                doc = Nokogiri::HTML(s)
+                l.done
+
+                # get the value of all <loc> tags with .xml extension
+                l.logs "get_sitemaps from #{b}... "
+                sitemaps += doc.xpath('//loc').map(&:text).select { |s| s =~ /\.xml$/ }.map { |s| s.downcase }
+                sitemaps.uniq!
+                l.logf sitemaps.size == 0 ? 'no sitemap found'.yellow : "#{sitemaps.size} sitemaps found".green # get_links
+
+                # get the value of all <loc> tags without .xml extension
+                l.logs "get_links from #{b}..."
+                self.links += doc.xpath('//loc').map(&:text).select { |s| s !~ /\.xml$/ }.map { |s| s.downcase }
+                self.links.uniq!
+                l.logf self.links.size == 0 ? 'no links found'.yellow : "#{self.links.size} links found".green # get_links
+
+                # add the sitemap to the list of processed sitemaps
+                processed << b
+
+                # reset timeout errors
+                timeout_errors = 0
+
+                # break if I exceeded the limit of links
+                break if max_links <= self.links.size
+
+              rescue Net::ReadTimeout => e
+                l.logf "Timeout Error: #{e.message}".red
+
+                l.logs "Restarting browser..."
+                browser.close if browser
+                self.browser = BlackStack::Bots::Browser.new()
+                l.done
+
+                timeout_errors += 1
+                break if timeout_errors >= max_allowed_timeout_errors
+
+              rescue => e
+                l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
+              end
+            }
+            # update the list of sitemaps to process
+            processed.uniq!
+            to_process = sitemaps - processed
+          end
           l.logf sitemaps.size == 0 ? 'no sitemap found'.yellow : "#{sitemaps.size} sitemaps found".green # get_links
+
+        rescue Net::ReadTimeout => e
+          l.logf "Timeout Error: #{e.message}".red
+
+          l.logs "Restarting browser..."
+          browser.close if browser
+          self.browser = BlackStack::Bots::Browser.new()
+          l.done
         rescue => e
           l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
         end
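
The new loop is a small breadth-first crawl: seed the sitemap queue from robots.txt, then keep expanding it, since a sitemap's <loc> entries may point at further .xml sitemap indexes (queued) or at page URLs (collected as links), until the queue drains, the link budget (stop_at) is spent, or three consecutive timeouts occur. A condensed, self-contained sketch of the technique using plain HTTP and Nokogiri's XML parser instead of the gem's Browser class (names are illustrative):

# sketch of the sitemap crawl: robots.txt -> sitemap queue -> page links
require 'open-uri'
require 'nokogiri'

domain   = 'example.com'
robots   = URI.open("http://#{domain}/robots.txt").read
sitemaps = robots.split("\n")
                 .select { |line| line =~ /^sitemap:/i }
                 .map { |line| line.split(/sitemap:/i).last.strip }
                 .uniq
links     = []
processed = []

until (to_process = sitemaps - processed).empty?
  to_process.each do |sm|
    doc = Nokogiri::XML(URI.open(sm).read)
    doc.remove_namespaces!
    locs = doc.xpath('//loc').map(&:text)
    # .xml entries are nested sitemap indexes; everything else is a page URL
    sitemaps |= locs.select { |u| u =~ /\.xml$/ }
    links    |= locs.reject { |u| u =~ /\.xml$/ }
    processed << sm
  end
end

puts "#{links.size} links collected"
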
@@ -42,14 +112,12 @@ module BlackStack
         l = BlackStack::DummyLogger.new(nil) if l.nil?
         l.logs "get_links (#{url})... "
         aux = []
-        browser = nil
         begin
           # trim url
           url = url.strip
           # get domain of the url using open-uri
           domain = URI.parse(url).host
           # visit the main page of the website
-          browser = BlackStack::Bots::Browser.new()
           browser.goto url
           sleep(self.load_wait_time) # wait 10 seconds for javascript content to load
           # get the self.links to the pages of the website
@@ -76,16 +144,23 @@ module BlackStack
           l.logf "done".green + " (#{a} links found, #{b} new, #{self.links.size} total)" # get_links
         rescue Net::ReadTimeout => e
           l.logf "Timeout Error: #{e.message}".red
+
+          l.logs "Restarting browser..."
+          browser.close if browser
+          self.browser = BlackStack::Bots::Browser.new()
+          l.done
         rescue => e
           l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
-        ensure
-          browser.close if browser
         end
         self.links += aux
       end # def get_links_from_url
 
-      def get_links(stop_at=
+      def get_links(stop_at=100, l=nil)
         l = BlackStack::DummyLogger.new(nil) if l.nil?
+
+        # get links from the sitemap
+        self.get_links_from_sitemap(stop_at, l)
+=begin
         # working with root url
         url = "http://#{self.domain}/"
         self.links << url if self.links.select { |link| link == url }.empty?
@@ -99,13 +174,11 @@ module BlackStack
             self.links_processed << link
           }
         end # while
-
-        self.get_links_from_sitemap(l)
+=end
       end # def get_links
 
       def find_keywords(a, stop_at=25, stop_on_first_link_found=false, l=nil)
         pages = []
-        browser = nil
         l = BlackStack::DummyLogger.new(nil) if l.nil?
         # iterate the links
         j = 0
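
After this change get_links delegates entirely to the sitemap scan; the old page-crawling loop survives only inside the =begin/=end comment fence, and the caller bounds the run with stop_at. A hypothetical usage sketch (the Scraper constructor's arguments are not shown in this diff, so the single-argument form is an assumption):

# hypothetical usage -- constructor signature assumed, not shown in the diff
scraper = BlackStack::Bots::Scraper.new('example.com')
logger  = BlackStack::DummyLogger.new(nil)

scraper.get_links(100, logger)   # collect at most ~100 links from sitemaps
puts scraper.links.first(10)
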
@@ -115,7 +188,6 @@ module BlackStack
           l.logs "#{j.to_s}. find_keywords (#{link})... "
           begin
             # get the page
-            browser = BlackStack::Bots::Browser.new()
             browser.goto link
             sleep(self.load_wait_time) # wait 10 seconds for javascript content to load
             # get page body content in plain text
@@ -142,11 +214,14 @@ module BlackStack
             l.logf i == 0 ? 'no keywords found'.yellow : "#{i} keywords found".green # find_keywords
 
           rescue Net::ReadTimeout => e
-            l.logf "Timeout Error: #{e.message}".red
+            l.logf "Timeout Error: #{e.message}".red
+
+            l.logs "Restarting browser..."
+            browser.close if browser
+            self.browser = BlackStack::Bots::Browser.new()
+            l.done
           rescue => e
             l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
-          ensure
-            browser.close if browser
           end
         } # each
         # return
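
A recurring pattern in this release: every Net::ReadTimeout rescue now runs the same four-line recovery (log, close the shared browser, build a new one, log done), replacing the old per-call ensure/close. If it spreads further it could be folded into one helper; a sketch of that refactor (a suggestion, not part of the gem):

# sketch: factor the repeated timeout recovery into a single helper
def restart_browser(l)
  l.logs "Restarting browser..."
  self.browser.close if self.browser
  self.browser = BlackStack::Bots::Browser.new()
  l.done
end
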
|