arachnid2 0.2.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +51 -28
- data/README.md +94 -22
- data/arachnid2.gemspec +4 -1
- data/lib/arachnid2/{cashed_arachnid_responses.rb → cached_arachnid_responses.rb} +1 -1
- data/lib/arachnid2/exoskeleton.rb +133 -0
- data/lib/arachnid2/typhoeus.rb +99 -0
- data/lib/arachnid2/version.rb +1 -1
- data/lib/arachnid2/watir.rb +102 -0
- data/lib/arachnid2.rb +17 -226
- metadata +50 -5
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 90efb51a783ec434d1e269d61a9c550e6fe8740eea229b24d3820790c1e5d296
+  data.tar.gz: f731d4cc5ab87ee603a69d795216b9dacd322af653a873273ffb226e6bb6b704
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 4fe796d93a1d87ba260b269a5ff9a290280b5354e02cd596f305ea0a342edbf9d48fb29134a83197bcc00266e13e16b5c3b482b448525b380e82f7ffaa69e9b2
+  data.tar.gz: 9c80660fd0b7e9003ea70163fe87d142e1b1ef730b662d251992fdc1aff5ac773ec8e73ffb4c7bf300ebe0543b6b452d83d975b478bcf00ce6bb99347e171dc3
data/Gemfile.lock
CHANGED

@@ -1,52 +1,76 @@
 PATH
   remote: .
   specs:
-    arachnid2 (0.
+    arachnid2 (0.3.0)
       addressable
       adomain
       bloomfilter-rb
-      nokogiri
+      nokogiri (>= 1.8.5)
       typhoeus
+      watir
+      webdriver-user-agent (>= 7.6)
+      webdrivers

 GEM
   remote: https://rubygems.org/
   specs:
-    addressable (2.
+    addressable (2.6.0)
       public_suffix (>= 2.0.2, < 4.0)
     adomain (0.1.1)
       addressable (~> 2.5)
     bloomfilter-rb (2.1.1)
       redis
-
+    childprocess (0.9.0)
+      ffi (~> 1.0, >= 1.0.11)
     diff-lcs (1.3)
-    ethon (0.
+    ethon (0.12.0)
       ffi (>= 1.3.0)
-
-
-
-
-
-
-
-
+    facets (3.1.0)
+    ffi (1.10.0)
+    json (2.1.0)
+    mini_portile2 (2.4.0)
+    net_http_ssl_fix (0.0.10)
+    nokogiri (1.10.1)
+      mini_portile2 (~> 2.4.0)
+    os (1.0.0)
+    psych (3.1.0)
     public_suffix (3.0.3)
     rake (10.5.0)
-    redis (4.0
-
-
-    rspec-
-    rspec-
-
-
-
+    redis (4.1.0)
+    regexp_parser (1.3.0)
+    rspec (3.8.0)
+      rspec-core (~> 3.8.0)
+      rspec-expectations (~> 3.8.0)
+      rspec-mocks (~> 3.8.0)
+    rspec-core (3.8.0)
+      rspec-support (~> 3.8.0)
+    rspec-expectations (3.8.2)
       diff-lcs (>= 1.2.0, < 2.0)
-      rspec-support (~> 3.
-    rspec-mocks (3.
+      rspec-support (~> 3.8.0)
+    rspec-mocks (3.8.0)
       diff-lcs (>= 1.2.0, < 2.0)
-      rspec-support (~> 3.
-    rspec-support (3.
-
+      rspec-support (~> 3.8.0)
+    rspec-support (3.8.0)
+    rubyzip (1.2.2)
+    selenium-webdriver (3.141.0)
+      childprocess (~> 0.5)
+      rubyzip (~> 1.2, >= 1.2.2)
+    typhoeus (1.3.1)
       ethon (>= 0.9.0)
+    watir (6.16.5)
+      regexp_parser (~> 1.2)
+      selenium-webdriver (~> 3.6)
+    webdriver-user-agent (7.6)
+      facets
+      json
+      os
+      psych
+      selenium-webdriver (>= 3.4.0)
+    webdrivers (3.6.0)
+      net_http_ssl_fix
+      nokogiri (~> 1.6)
+      rubyzip (~> 1.0)
+      selenium-webdriver (~> 3.0)

 PLATFORMS
   ruby

@@ -54,9 +78,8 @@ PLATFORMS

 DEPENDENCIES
   arachnid2!
   bundler (~> 1.16)
-  pry
   rake (~> 10.0)
   rspec (~> 3.0)

 BUNDLED WITH
-   1.16.
+   1.16.5
data/README.md
CHANGED

@@ -3,18 +3,22 @@
 ## About

 Arachnid2 is a simple, fast web-crawler written in Ruby.
-
-to get HTTP requests,
+You can use [typhoeus](https://github.com/typhoeus/typhoeus)
+to get HTTP requests, or [Watir](https://github.com/watir/watir)
+to render pages.
+
 [bloomfilter-rb](https://github.com/igrigorik/bloomfilter-rb)
-
+stores the URLs it will get and has gotten,
 and [nokogiri](https://github.com/sparklemotion/nokogiri)
-to find the URLs on each webpage.
+to find the URLs on each webpage, adding them to the bloomfilter queue.

 Arachnid2 is a successor to [Arachnid](https://github.com/dchuk/Arachnid),
 and was abstracted out of the [Tellurion Bot](https://github.com/samnissen/tellurion_bot).

 ## Usage

+### Typheous (cURL)
+
 The basic use of Arachnid2 is surfacing the responses from a domains'
 URLs by visiting a URL, collecting any links to the same domain
 on that page, and visiting those to do the same.

@@ -22,9 +26,6 @@ on that page, and visiting those to do the same.
 Hence, the simplest output would be to collect all of the responses
 while spidering from some URL.

-Set cached service url(optional)
-`export ARACHNID_CACHED_SERVICE_ADDRESS=http://localhost:9000`
-
 ```ruby
 require "arachnid2"

@@ -58,7 +59,7 @@ spider.crawl { |response|

 `Arachnid2#crawl` will return always `nil`.

-
+#### Options

 ```ruby
 require "arachnid2"

@@ -67,7 +68,7 @@ url = "http://sixcolours.com"
 spider = Arachnid2.new(url)
 opts = {
   followlocation: true,
-  timeout:
+  timeout: 300,
   time_box: 60,
   max_urls: 50,
   :headers => {

@@ -95,26 +96,37 @@ spider.crawl(opts) { |response|
 }
 ```

-
+##### `followlocation`
+
+Tell Typhoeus to follow redirections.

-
-
-
+##### `timeout`
+
+Tell Typheous or Watir how long to wait for page load.
+
+##### `time_box`
+
+The crawler will time-bound your spidering.
+If no valid integer is provided,
+it will crawl for 15 seconds before exiting.
+10000 seconds is the current maximum,
+and any value above it will be reduced to 10000.

-
+##### `max_urls`

 The crawler will crawl a limited number of URLs before stopping.
-If no valid integer is provided,
+If no valid integer is provided,
+it will crawl for 50 URLs before exiting.
 10000 seconds is the current maximum,
 and any value above it will be reduced to 10000.

-
+##### `headers`

 This is a hash that represents any HTTP header key/value pairs you desire,
 and is passed directly to Typheous. Before it is sent, a default
 language and user agent are created:

-
+###### Defaults

 The HTTP header `Accept-Language` default is
 `en-IE, en-UK;q=0.9, en-NL;q=0.8, en-MT;q=0.7, en-LU;q=0.6, en;q=0.5, \*;0.4`

@@ -122,19 +134,19 @@ The HTTP header `Accept-Language` default is
 The HTTP header `User-Agent` default is
 `Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15`

-
+##### `proxy`

 Provide your IP, port for a proxy. If required, provide credentials for
 authenticating to that proxy. Proxy options and handling are done
 by Typhoeus.

-
+##### `non_html_extensions`

 This is the list of TLDs to ignore when collecting URLs from the page.
 The extensions are formatted as a hash of key/value pairs, where the value
 is an array of TLDs, and the keys represent the length of those TLDs.

-
+##### `memory_limit` and Docker

 In case you are operating the crawler within a container, Arachnid2
 can attempt to prevent the container from running out of memory.

@@ -142,15 +154,75 @@ By default, it will end the crawl when the container uses >= 80%
 of its available memory. You can override this with the
 option.

-
+##### Non-HTML links

 The crawler attempts to stop itself from returning data from
 links that are not indicative of HTML, as detailed in
 `Arachnid2::NON_HTML_EXTENSIONS`.

+#### Caching (optional)
+
+If you have setup a cache to deduplicate crawls,
+set a cached service url
+`export ARACHNID_CACHED_SERVICE_ADDRESS=http://localhost:9000`
+
+This expects a push and get JSON API to respond
+to `/typhoeus_responses`, with a URL and the options pushed
+exactly as received as parameters. It will push any crawls
+to the service, and re-use any crawled pages
+if they are found to match.
+
+### With Watir
+
+Crawling with Watir works similarly, but requires you setup your
+environment for Watir, and headless web browsing if required.
+See the Watir documentation for more information.
+
+```ruby
+# ...
+Arachnid2.new(url).crawl_watir(opts)
+# -or-
+with_watir = true
+Arachnid2.new(url).crawl(opts, with_watir)
+```
+
+#### Options
+
+See the Typhoeus options above — most apply to Watir as well, with
+some exceptions:
+
+##### `proxy`
+
+Watir proxy options are formatted differently:
+
+```ruby
+proxy: {
+  http: "troy.show:8080",
+  ssl: "abed.show:8080"
+},
+```
+
+Proxy options handling is done by Watir.
+
+##### `headless`
+
+And it accepts an argument to make browse headlessly
+
+```ruby
+opts = { headless: true }
+```
+
+##### `followlocation` and `max_concurrency`
+
+These options do not apply to Watir, and will be ignored.
+
 ## Development

-
+Fork the repo and run the tests
+
+```ruby
+bundle exec rspec spec/
+```

 ## Contributing

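To tie the options documented above together, here is a minimal sketch of a Typhoeus-backed crawl using the option names from the 0.3.1 README; the target URL and header value are placeholders, and the commented cache line only applies if a caching service is actually running:

```ruby
require "arachnid2"

# Optional: point the crawler at a caching service (see "Caching (optional)")
# ENV["ARACHNID_CACHED_SERVICE_ADDRESS"] = "http://localhost:9000"

responses = []
spider = Arachnid2.new("http://sixcolours.com")

opts = {
  followlocation: true,                 # Typhoeus only: follow redirects
  timeout: 300,                         # per-page load timeout
  time_box: 60,                         # stop after 60 seconds of crawling
  max_urls: 50,                         # stop after 50 URLs
  headers: { "User-Agent" => "ExampleBot/1.0" },
  memory_limit: 80.0                    # in Docker: stop at 80% memory use
}

spider.crawl(opts) do |response|
  responses << response.effective_url   # each response is a Typhoeus::Response
end
```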
data/arachnid2.gemspec
CHANGED

@@ -25,9 +25,12 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency "rake", "~> 10.0"
   spec.add_development_dependency "rspec", "~> 3.0"

+  spec.add_dependency "webdriver-user-agent", ">= 7.6"
+  spec.add_dependency "watir"
+  spec.add_dependency "webdrivers"
   spec.add_dependency "typhoeus"
   spec.add_dependency "bloomfilter-rb"
   spec.add_dependency "adomain"
   spec.add_dependency "addressable"
-  spec.add_dependency "nokogiri"
+  spec.add_dependency "nokogiri", ">= 1.8.5"
 end
data/lib/arachnid2/exoskeleton.rb
ADDED

@@ -0,0 +1,133 @@
+class Arachnid2
+  module Exoskeleton
+    def browser_type
+      unless @browser_type
+        @browser_type = "#{@options[:browser_type]}".to_sym if @options[:browser_type]
+        @browser_type ||= :firefox
+      end
+
+      @browser_type
+    end
+
+    def process(url, html)
+      return false unless Adomain["#{url}"].include? @domain
+
+      extract_hrefs(html)
+    end
+
+    def extract_hrefs(body)
+      elements = Nokogiri::HTML.parse(body).css('a')
+      return elements.map {|link| link.attribute('href').to_s}.uniq.sort.delete_if {|href| href.empty? }
+    end
+
+    def vacuum(links, url)
+      links.each do |link|
+        next if link.match(/^\(|^javascript:|^mailto:|^#|^\s*$|^about:/)
+
+        begin
+          absolute_link = make_absolute(link, url)
+
+          next if skip_link?(absolute_link)
+
+          @global_queue << absolute_link
+        rescue Addressable::URI::InvalidURIError
+        end
+      end
+    end
+
+    def skip_link?(absolute_link)
+      !internal_link?(absolute_link) || \
+        @global_visited.include?(absolute_link) || \
+        extension_ignored?(absolute_link) || \
+        @global_queue.include?(absolute_link)
+    end
+
+    def preflight(opts)
+      @options = opts
+      @global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => true)
+      @global_queue = [@url]
+    end
+
+    def proxy
+      @options[:proxy]
+    end
+
+    def non_html_extensions
+      return @non_html_extensions if @non_html_extensions
+
+      @non_html_extensions = @options[:non_html_extensions]
+      @non_html_extensions ||= DEFAULT_NON_HTML_EXTENSIONS
+    end
+
+    def bound_time
+      boundary = "#{@options[:time_box]}".to_i
+      boundary = BASE_CRAWL_TIME if boundary <= 0
+      boundary = MAX_CRAWL_TIME if boundary > MAX_CRAWL_TIME
+
+      return Time.now + boundary
+    end
+
+    def bound_urls
+      amount = "#{@options[:max_urls]}".to_i
+      amount = BASE_URLS if amount <= 0
+      amount = MAX_URLS if amount > MAX_URLS
+
+      amount
+    end
+
+    def timeout
+      unless @timeout
+        @timeout = @options[:timeout]
+        @timeout = DEFAULT_TIMEOUT unless @timeout.is_a?(Integer)
+        @timeout = DEFAULT_TIMEOUT if @timeout > MAXIMUM_TIMEOUT
+        @timeout = DEFAULT_TIMEOUT if @timeout < MINIMUM_TIMEOUT
+      end
+      @timeout
+    end
+
+    def crawl_options
+      @crawl_options ||= { max_urls: max_urls, time_limit: time_limit }
+    end
+
+    alias_method :max_urls, :bound_urls
+
+    alias_method :time_limit, :bound_time
+
+    def make_absolute(href, root)
+      Addressable::URI.parse(root).join(Addressable::URI.parse(href)).to_s
+    end
+
+    def internal_link?(absolute_url)
+      "#{Adomain[absolute_url]}".include? @domain
+    end
+
+    def extension_ignored?(url)
+      return false if url.empty?
+
+      !non_html_extensions.values.flatten.find { |e| url.downcase.end_with? e.downcase }.nil?
+    end
+
+    def memory_danger?
+      return false unless in_docker?
+
+      use = "#{File.open(MEMORY_USE_FILE, "rb").read}".to_f
+      @limit ||= "#{File.open(MEMORY_LIMIT_FILE, "rb").read}".to_f
+
+      return false unless ( (use > 0.0) && (@limit > 0.0) )
+
+      return ( ( (use / @limit) * 100.0 ) >= maximum_load_rate )
+    end
+
+    def in_docker?
+      File.file?(MEMORY_USE_FILE)
+    end
+
+    def maximum_load_rate
+      return @maximum_load_rate if @maximum_load_rate
+
+      @maximum_load_rate = "#{@options[:memory_limit]}".to_f
+      @maximum_load_rate = DEFAULT_MAXIMUM_LOAD_RATE unless ((@maximum_load_rate > 0.0) && (@maximum_load_rate < 100.0))
+      @maximum_load_rate
+    end
+  end
+end
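To make the mixin's role concrete, here is a hedged sketch of a toy class that includes `Arachnid2::Exoskeleton` and exercises its preflight, bounding, and link-filtering helpers without fetching anything; `ToyCrawler`, `dry_run`, and the example URL are invented for illustration:

```ruby
require "arachnid2"

class ToyCrawler
  include Arachnid2::Exoskeleton

  def initialize(url)
    @url    = url
    @domain = Adomain[@url]
  end

  # Drain the queue without making any HTTP requests, just to show the helpers.
  def dry_run(opts = {})
    preflight(opts)          # sets @options, the bloomfilter and the URL queue
    limit    = bound_urls    # max_urls, defaulting to 50, capped at 10000
    deadline = bound_time    # Time.now + time_box, defaulting to 15 seconds

    while (link = @global_queue.shift)
      break if @global_visited.size >= limit || Time.now > deadline
      next  if skip_link?(link)   # external, already seen, queued, or non-HTML
      @global_visited.insert(link)
    end

    @global_visited.size
  end
end

ToyCrawler.new("http://sixcolours.com").dry_run(time_box: 5, max_urls: 10)
```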
data/lib/arachnid2/typhoeus.rb
ADDED

@@ -0,0 +1,99 @@
+class Arachnid2
+  class Typhoeus
+    include CachedArachnidResponses
+    include Arachnid2::Exoskeleton
+
+    def initialize(url)
+      @url = url
+      @domain = Adomain[@url]
+      @cached_data = []
+    end
+
+    def crawl(opts = {})
+      preflight(opts)
+      typhoeus_preflight
+
+      until @global_queue.empty?
+        max_concurrency.times do
+          q = @global_queue.shift
+
+          break if @global_visited.size >= crawl_options[:max_urls] || \
+                   Time.now > crawl_options[:time_limit] || \
+                   memory_danger?
+
+          @global_visited.insert(q)
+
+          request = ::Typhoeus::Request.new(q, request_options)
+
+          data = load_data(@url, opts)
+          data.each { |response| yield response } and return unless data.nil?
+
+          request.on_complete do |response|
+            @cached_data.push(response)
+            links = process(response.effective_url, response.body)
+            next unless links
+
+            yield response
+
+            vacuum(links, response.effective_url)
+          end
+
+          @hydra.queue(request)
+        end # max_concurrency.times do
+
+        @hydra.run
+
+      end # until @global_queue.empty?
+      put_cached_data(@url, opts, @cached_data) unless @cached_data.empty?
+    ensure
+      @cookie_file.close! if @cookie_file
+    end # def crawl(opts = {})
+
+    private
+    def typhoeus_preflight
+      @hydra = ::Typhoeus::Hydra.new(:max_concurrency => max_concurrency)
+      typhoeus_proxy_options
+    end
+
+    def max_concurrency
+      return @max_concurrency if @max_concurrency
+
+      @max_concurrency = "#{@options[:max_concurrency]}".to_i
+      @max_concurrency = 1 unless (@max_concurrency > 0)
+      @max_concurrency
+    end
+
+    def followlocation
+      return @followlocation unless @followlocation.nil?
+
+      @followlocation = @options[:followlocation]
+      @followlocation = true unless @followlocation.is_a?(FalseClass)
+    end
+
+    def request_options
+      @cookie_file ||= Tempfile.new('cookies')
+
+      @request_options = {
+        timeout: timeout,
+        followlocation: followlocation,
+        cookiefile: @cookie_file.path,
+        cookiejar: @cookie_file.path,
+        headers: @options[:headers]
+      }.merge(crawl_options[:proxy])
+
+      @request_options[:headers] ||= {}
+      @request_options[:headers]['Accept-Language'] ||= DEFAULT_LANGUAGE
+      @request_options[:headers]['User-Agent'] ||= DEFAULT_USER_AGENT
+
+      @request_options
+    end
+
+    def typhoeus_proxy_options
+      crawl_options[:proxy] = {}
+
+      crawl_options[:proxy][:proxy] = "#{@options[:proxy][:ip]}:#{@options[:proxy][:port]}" if @options.dig(:proxy, :ip)
+      crawl_options[:proxy][:proxyuserpwd] = "#{@options[:proxy][:username]}:#{@options[:proxy][:password]}" if @options.dig(:proxy, :username)
+    end
+
+  end
+end
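A short usage sketch for the class above: the block receives each `Typhoeus::Response` for an internal page, and `max_concurrency` sets how many requests Hydra runs in parallel (the URL is a placeholder):

```ruby
require "arachnid2"

crawler = Arachnid2::Typhoeus.new("http://sixcolours.com")

crawler.crawl(max_concurrency: 5, time_box: 30, max_urls: 25) do |response|
  # response is a Typhoeus::Response for a page on the same domain
  puts "#{response.code} #{response.effective_url}"
end
```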
data/lib/arachnid2/version.rb
CHANGED

data/lib/arachnid2/watir.rb
ADDED

@@ -0,0 +1,102 @@
+class Arachnid2
+  class Watir
+    include Arachnid2::Exoskeleton
+
+    def initialize(url)
+      @url = url
+      @domain = Adomain[@url]
+    end
+
+    def crawl(opts)
+      preflight(opts)
+      watir_preflight
+
+      until @global_queue.empty?
+        @already_retried = false
+        q = @global_queue.shift
+
+        break if @global_visited.size >= crawl_options[:max_urls]
+        break if Time.now > crawl_options[:time_limit]
+        break if memory_danger?
+
+        @global_visited.insert(q)
+
+        begin
+          browser.goto q
+          links = process(browser.url, browser.body.html)
+          next unless links
+
+          yield browser
+
+          vacuum(links, browser.url)
+        rescue => e
+          raise e if @already_retried
+          raise e unless "#{e.class}".include?("Selenium") || "#{e.class}".include?("Watir")
+          @browser = nil
+          @already_retried = true
+          retry
+        end
+
+      end # until @global_queue.empty?
+    ensure
+      @browser.close if @browser rescue nil
+      @headless.destroy if @headless rescue nil
+    end
+
+    private
+    def browser
+      unless @browser
+        behead if @make_headless
+
+        @browser = create_browser
+
+        set_timeout
+      end
+
+      return @browser
+    end
+
+    def create_browser
+      return ::Watir::Browser.new(driver, proxy: @proxy) if @proxy
+
+      ::Watir::Browser.new driver
+    end
+
+    def set_timeout
+      @browser.driver.manage.timeouts.page_load = timeout
+    end
+
+    def behead
+      @headless = Headless.new
+      @headless.start
+    end
+
+    def driver
+      unless @driver
+        language = @options.dig(:headers, "Accept-Language") || DEFAULT_LANGUAGE
+        user_agent = @options.dig(:headers, "User-Agent") || DEFAULT_USER_AGENT
+
+        @driver = Webdriver::UserAgent.driver(
+          browser: browser_type,
+          accept_language_string: language,
+          user_agent_string: user_agent
+        )
+      end
+
+      @driver
+    end
+
+    def watir_preflight
+      watir_proxy_options
+      @make_headless = @options[:headless]
+    end
+
+    def watir_proxy_options
+      crawl_options[:proxy] = {}
+
+      crawl_options[:proxy][:http] = @options[:proxy][:http] if @options.dig(:proxy, :http)
+      crawl_options[:proxy][:ssl] = @options[:proxy][:ssl] if @options.dig(:proxy, :ssl)
+    end
+  end
+
+end
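And a corresponding sketch for driving the Watir-backed crawler directly; note that the block here receives the live `Watir::Browser`, not an HTTP response. The URL and proxy hosts are placeholders, and `headless: true` assumes a headless-capable environment:

```ruby
require "arachnid2"

opts = {
  headless: true,                  # wrap the session in Headless
  browser_type: :firefox,          # handed to Webdriver::UserAgent.driver
  proxy: { http: "troy.show:8080", ssl: "abed.show:8080" },
  time_box: 60,
  max_urls: 20
}

Arachnid2::Watir.new("http://sixcolours.com").crawl(opts) do |browser|
  # browser is the Watir::Browser currently sitting on an internal page
  puts "#{browser.title} (#{browser.url})"
end
```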
data/lib/arachnid2.rb
CHANGED

@@ -1,5 +1,8 @@
 require "arachnid2/version"
-require "arachnid2/
+require "arachnid2/cached_arachnid_responses"
+require "arachnid2/exoskeleton"
+require "arachnid2/typhoeus"
+require "arachnid2/watir"

 require 'tempfile'
 require "typhoeus"

@@ -8,9 +11,12 @@ require "adomain"
 require "addressable/uri"
 require "nokogiri"
 require "base64"
+require "webdrivers"
+require "webdriver-user-agent"
+require "watir"
+

 class Arachnid2
-  include CashedArachnidResponses
   # META:
   # About the origins of this crawling approach
   # The Crawler is heavily borrowed from by Arachnid.

@@ -22,7 +28,7 @@ class Arachnid2
   # And this was originally written as a part of Tellurion's bot
   # https://github.com/samnissen/tellurion_bot

-  MAX_CRAWL_TIME =
+  MAX_CRAWL_TIME = 10000
   BASE_CRAWL_TIME = 15
   MAX_URLS = 10000
   BASE_URLS = 50

@@ -58,8 +64,6 @@ class Arachnid2
   #
   def initialize(url)
     @url = url
-    @domain = Adomain[@url]
-    @cached_data = []
   end

   #

@@ -101,228 +105,15 @@ class Arachnid2
   #
   # @return nil
   #
-  def crawl(opts = {})
-
-
-    until @global_queue.empty?
-      @max_concurrency.times do
-        q = @global_queue.shift
-
-        break if @global_visited.size >= @crawl_options[:max_urls]
-        break if Time.now > @crawl_options[:time_limit]
-        break if memory_danger?
-
-        @global_visited.insert(q)
-
-        request = Typhoeus::Request.new(q, request_options)
-
-        data = load_data(@url, opts)
-        unless data.nil?
-          data.each do |response|
-            yield response
-          end
-          return
-        end
-        request.on_complete do |response|
-          @cached_data.push(response)
-          links = process(response)
-          next unless links
-
-          yield response
-
-          vacuum(links, response)
-        end
-
-        @hydra.queue(request)
-      end # @max_concurrency.times do
-
-      @hydra.run
-
-    end # until @global_queue.empty?
-    put_cached_data(@url, opts, @cached_data) unless @cached_data.empty?
-  ensure
-    @cookie_file.close! if @cookie_file
-
-
-  end # def crawl(opts = {})
-
-  private
-  def process(response)
-    return false unless Adomain["#{response.effective_url}"].include? @domain
-
-    elements = Nokogiri::HTML.parse(response.body).css('a')
-    return elements.map {|link| link.attribute('href').to_s}.uniq.sort.delete_if {|href| href.empty? }
-  end
-
-  def vacuum(links, response)
-    links.each do |link|
-      next if link.match(/^\(|^javascript:|^mailto:|^#|^\s*$|^about:/)
-
-      begin
-        absolute_link = make_absolute(link, response.effective_url)
-
-        next if skip_link?(absolute_link)
-
-        @global_queue << absolute_link
-      rescue Addressable::URI::InvalidURIError
-      end
-    end
-  end
-
-  def skip_link?(absolute_link)
-    internal = internal_link?(absolute_link)
-    visited = @global_visited.include?(absolute_link)
-    ignored = extension_ignored?(absolute_link)
-    known = @global_queue.include?(absolute_link)
-
-    !internal || visited || ignored || known
-  end
-
-  def preflight(opts)
-    @options = opts
-    @crawl_options = crawl_options
-    @maximum_load_rate = maximum_load_rate
-    @max_concurrency = max_concurrency
-    @non_html_extensions = non_html_extensions
-    @hydra = Typhoeus::Hydra.new(:max_concurrency => @max_concurrency)
-    @global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => true)
-    @global_queue = [@url]
-  end
-
-  def non_html_extensions
-    @non_html_extensions ||= nil
-
-    if !@non_html_extensions
-      @non_html_extensions = @options[:non_html_extensions]
-      @non_html_extensions ||= DEFAULT_NON_HTML_EXTENSIONS
-    end
-
-    @non_html_extensions
-  end
-
-  def max_concurrency
-    @max_concurrency ||= nil
-
-    if !@max_concurrency
-      @max_concurrency = "#{@options[:max_concurrency]}".to_i
-      @max_concurrency = 1 unless (@max_concurrency > 0)
-    end
-
-    @max_concurrency
-  end
+  def crawl(opts = {}, with_watir = false)
+    crawl_watir and return if with_watir

-
-
-    boundary = BASE_CRAWL_TIME if boundary <= 0
-    boundary = MAX_CRAWL_TIME if boundary > MAX_CRAWL_TIME
-
-    return Time.now + boundary
-  end
-
-  def bound_urls
-    amount = "#{@options[:max_urls]}".to_i
-    amount = BASE_URLS if amount <= 0
-    amount = MAX_URLS if amount > MAX_URLS
-
-    amount
-  end
-
-  def followlocation
-    if @followlocation.is_a?(NilClass)
-      @followlocation = @options[:followlocation]
-      @followlocation = true unless @followlocation.is_a?(FalseClass)
-    end
-    @followlocation
-  end
-
-  def timeout
-    if !@timeout
-      @timeout = @options[:timeout]
-      @timeout = DEFAULT_TIMEOUT unless @timeout.is_a?(Integer)
-      @timeout = DEFAULT_TIMEOUT if @timeout > MAXIMUM_TIMEOUT
-      @timeout = DEFAULT_TIMEOUT if @timeout < MINIMUM_TIMEOUT
-    end
-    @timeout
-  end
-
-  def request_options
-    @cookie_file ||= Tempfile.new('cookies')
-
-    @request_options = {
-      timeout: timeout,
-      followlocation: followlocation,
-      cookiefile: @cookie_file.path,
-      cookiejar: @cookie_file.path,
-      headers: @options[:headers]
-    }
-
-    @request_options[:headers] ||= {}
-    @request_options[:headers]['Accept-Language'] ||= DEFAULT_LANGUAGE
-    @request_options[:headers]['User-Agent'] ||= DEFAULT_USER_AGENT
-
-    @request_options
-  end
-
-  def crawl_options
-    @crawl_options ||= nil
-
-    if !@crawl_options
-      @crawl_options = { :max_urls => max_urls, :time_limit => time_limit }
-
-      @crawl_options[:proxy] = "#{@options[:proxy][:ip]}:#{@options[:proxy][:port]}" if @options.dig(:proxy, :ip)
-      @crawl_options[:proxyuserpwd] = "#{@options[:proxy][:username]}:#{@options[:proxy][:password]}" if @options.dig(:proxy, :username)
-    end
-
-    @crawl_options
-  end
-
-  def max_urls
-    bound_urls
-  end
-
-  def time_limit
-    bound_time
-  end
-
-  def make_absolute(href, root)
-    Addressable::URI.parse(root).join(Addressable::URI.parse(href)).to_s
-  end
-
-  def internal_link?(absolute_url)
-    "#{Adomain[absolute_url]}".include? @domain
-  end
-
-  def extension_ignored?(url)
-    return false if url.empty?
-
-    !@non_html_extensions.values.flatten.find { |e| url.downcase.end_with? e.downcase }.nil?
-  end
-
-  def memory_danger?
-    return false unless in_docker?
-
-    use = "#{File.open(MEMORY_USE_FILE, "rb").read}".to_f
-    @limit ||= "#{File.open(MEMORY_LIMIT_FILE, "rb").read}".to_f
-
-    return false unless ( (use > 0.0) && (@limit > 0.0) )
-
-    return ( ( (use / @limit) * 100.0 ) >= @maximum_load_rate )
-  end
-
-  def in_docker?
-    return false unless File.file?(MEMORY_USE_FILE)
-    true
-  end
-
-  def maximum_load_rate
-    @maximum_load_rate ||= nil
-
-    if !@maximum_load_rate
-      @maximum_load_rate = "#{@options[:memory_limit]}".to_f
-      @maximum_load_rate = DEFAULT_MAXIMUM_LOAD_RATE unless ((@maximum_load_rate > 0.0) && (@maximum_load_rate < 100.0))
-    end
+    Arachnid2::Typhoeus.new(@url).crawl(opts, &Proc.new)
+  end

-
-
+  def crawl_watir(opts)
+    Arachnid2::Watir.new(@url).crawl(opts, &Proc.new)
+  end
+  # https://mudge.name/2011/01/26/passing-blocks-in-ruby-without-block.html

 end
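The slimmed-down entry point now only dispatches: `Arachnid2#crawl` forwards its block (via `&Proc.new`, per the linked article) to `Arachnid2::Typhoeus` by default, or to `Arachnid2::Watir` when `with_watir` is true. A minimal sketch mirroring the README, with a placeholder URL:

```ruby
require "arachnid2"

spider = Arachnid2.new("http://sixcolours.com")

# Default: Typhoeus-backed crawl, the block receives HTTP responses
spider.crawl(max_urls: 10) { |response| puts response.effective_url }

# Watir-backed crawl, the block receives the browser
spider.crawl({ max_urls: 10, headless: true }, true) { |browser| puts browser.url }
# -or-
spider.crawl_watir(max_urls: 10, headless: true) { |browser| puts browser.url }
```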
metadata
CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: arachnid2
 version: !ruby/object:Gem::Version
-  version: 0.
+  version: 0.3.1
 platform: ruby
 authors:
 - Sam Nissen
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2019-
+date: 2019-02-26 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler

@@ -52,6 +52,48 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '3.0'
+- !ruby/object:Gem::Dependency
+  name: webdriver-user-agent
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '7.6'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '7.6'
+- !ruby/object:Gem::Dependency
+  name: watir
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: webdrivers
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: typhoeus
   requirement: !ruby/object:Gem::Requirement

@@ -114,14 +156,14 @@ dependencies:
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version:
+        version: 1.8.5
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version:
+        version: 1.8.5
 description:
 email:
 - scnissen@gmail.com

@@ -142,8 +184,11 @@ files:
 - bin/console
 - bin/setup
 - lib/arachnid2.rb
-- lib/arachnid2/
+- lib/arachnid2/cached_arachnid_responses.rb
+- lib/arachnid2/exoskeleton.rb
+- lib/arachnid2/typhoeus.rb
 - lib/arachnid2/version.rb
+- lib/arachnid2/watir.rb
 homepage: https://github.com/samnissen/arachnid2
 licenses:
 - MIT