arachnid2 0.2.0 → 0.3.1
- checksums.yaml +4 -4
- data/Gemfile.lock +51 -28
- data/README.md +94 -22
- data/arachnid2.gemspec +4 -1
- data/lib/arachnid2/{cashed_arachnid_responses.rb → cached_arachnid_responses.rb} +1 -1
- data/lib/arachnid2/exoskeleton.rb +133 -0
- data/lib/arachnid2/typhoeus.rb +99 -0
- data/lib/arachnid2/version.rb +1 -1
- data/lib/arachnid2/watir.rb +102 -0
- data/lib/arachnid2.rb +17 -226
- metadata +50 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 90efb51a783ec434d1e269d61a9c550e6fe8740eea229b24d3820790c1e5d296
+  data.tar.gz: f731d4cc5ab87ee603a69d795216b9dacd322af653a873273ffb226e6bb6b704
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 4fe796d93a1d87ba260b269a5ff9a290280b5354e02cd596f305ea0a342edbf9d48fb29134a83197bcc00266e13e16b5c3b482b448525b380e82f7ffaa69e9b2
+  data.tar.gz: 9c80660fd0b7e9003ea70163fe87d142e1b1ef730b662d251992fdc1aff5ac773ec8e73ffb4c7bf300ebe0543b6b452d83d975b478bcf00ce6bb99347e171dc3
data/Gemfile.lock
CHANGED
@@ -1,52 +1,76 @@
 PATH
   remote: .
   specs:
-    arachnid2 (0.
+    arachnid2 (0.3.0)
       addressable
       adomain
       bloomfilter-rb
-      nokogiri
+      nokogiri (>= 1.8.5)
       typhoeus
+      watir
+      webdriver-user-agent (>= 7.6)
+      webdrivers

 GEM
   remote: https://rubygems.org/
   specs:
-    addressable (2.
+    addressable (2.6.0)
       public_suffix (>= 2.0.2, < 4.0)
     adomain (0.1.1)
       addressable (~> 2.5)
     bloomfilter-rb (2.1.1)
       redis
-
+    childprocess (0.9.0)
+      ffi (~> 1.0, >= 1.0.11)
     diff-lcs (1.3)
-    ethon (0.
+    ethon (0.12.0)
       ffi (>= 1.3.0)
-
-
-
-
-
-
-
-
+    facets (3.1.0)
+    ffi (1.10.0)
+    json (2.1.0)
+    mini_portile2 (2.4.0)
+    net_http_ssl_fix (0.0.10)
+    nokogiri (1.10.1)
+      mini_portile2 (~> 2.4.0)
+    os (1.0.0)
+    psych (3.1.0)
     public_suffix (3.0.3)
     rake (10.5.0)
-    redis (4.0
-
-
-    rspec-
-    rspec-
-
-
-
+    redis (4.1.0)
+    regexp_parser (1.3.0)
+    rspec (3.8.0)
+      rspec-core (~> 3.8.0)
+      rspec-expectations (~> 3.8.0)
+      rspec-mocks (~> 3.8.0)
+    rspec-core (3.8.0)
+      rspec-support (~> 3.8.0)
+    rspec-expectations (3.8.2)
       diff-lcs (>= 1.2.0, < 2.0)
-      rspec-support (~> 3.
-    rspec-mocks (3.
+      rspec-support (~> 3.8.0)
+    rspec-mocks (3.8.0)
       diff-lcs (>= 1.2.0, < 2.0)
-      rspec-support (~> 3.
-    rspec-support (3.
-
+      rspec-support (~> 3.8.0)
+    rspec-support (3.8.0)
+    rubyzip (1.2.2)
+    selenium-webdriver (3.141.0)
+      childprocess (~> 0.5)
+      rubyzip (~> 1.2, >= 1.2.2)
+    typhoeus (1.3.1)
       ethon (>= 0.9.0)
+    watir (6.16.5)
+      regexp_parser (~> 1.2)
+      selenium-webdriver (~> 3.6)
+    webdriver-user-agent (7.6)
+      facets
+      json
+      os
+      psych
+      selenium-webdriver (>= 3.4.0)
+    webdrivers (3.6.0)
+      net_http_ssl_fix
+      nokogiri (~> 1.6)
+      rubyzip (~> 1.0)
+      selenium-webdriver (~> 3.0)

 PLATFORMS
   ruby
@@ -54,9 +78,8 @@ PLATFORMS
 DEPENDENCIES
   arachnid2!
   bundler (~> 1.16)
-  pry
   rake (~> 10.0)
   rspec (~> 3.0)

 BUNDLED WITH
-   1.16.
+   1.16.5
data/README.md
CHANGED
@@ -3,18 +3,22 @@
 ## About

 Arachnid2 is a simple, fast web-crawler written in Ruby.
-
-to get HTTP requests,
+You can use [typhoeus](https://github.com/typhoeus/typhoeus)
+to get HTTP requests, or [Watir](https://github.com/watir/watir)
+to render pages.
+
 [bloomfilter-rb](https://github.com/igrigorik/bloomfilter-rb)
-
+stores the URLs it will get and has gotten,
 and [nokogiri](https://github.com/sparklemotion/nokogiri)
-to find the URLs on each webpage.
+to find the URLs on each webpage, adding them to the bloomfilter queue.

 Arachnid2 is a successor to [Arachnid](https://github.com/dchuk/Arachnid),
 and was abstracted out of the [Tellurion Bot](https://github.com/samnissen/tellurion_bot).

 ## Usage

+### Typheous (cURL)
+
 The basic use of Arachnid2 is surfacing the responses from a domains'
 URLs by visiting a URL, collecting any links to the same domain
 on that page, and visiting those to do the same.
@@ -22,9 +26,6 @@ on that page, and visiting those to do the same.
 Hence, the simplest output would be to collect all of the responses
 while spidering from some URL.

-Set cached service url(optional)
-`export ARACHNID_CACHED_SERVICE_ADDRESS=http://localhost:9000`
-
 ```ruby
 require "arachnid2"

@@ -58,7 +59,7 @@ spider.crawl { |response|

 `Arachnid2#crawl` will return always `nil`.

-
+#### Options

 ```ruby
 require "arachnid2"
@@ -67,7 +68,7 @@ url = "http://sixcolours.com"
 spider = Arachnid2.new(url)
 opts = {
   followlocation: true,
-  timeout:
+  timeout: 300,
   time_box: 60,
   max_urls: 50,
   :headers => {
@@ -95,26 +96,37 @@ spider.crawl(opts) { |response|
 }
 ```

-
+##### `followlocation`
+
+Tell Typhoeus to follow redirections.

-
-
-
+##### `timeout`
+
+Tell Typheous or Watir how long to wait for page load.
+
+##### `time_box`
+
+The crawler will time-bound your spidering.
+If no valid integer is provided,
+it will crawl for 15 seconds before exiting.
+10000 seconds is the current maximum,
+and any value above it will be reduced to 10000.

-
+##### `max_urls`

 The crawler will crawl a limited number of URLs before stopping.
-If no valid integer is provided,
+If no valid integer is provided,
+it will crawl for 50 URLs before exiting.
 10000 seconds is the current maximum,
 and any value above it will be reduced to 10000.

-
+##### `headers`

 This is a hash that represents any HTTP header key/value pairs you desire,
 and is passed directly to Typheous. Before it is sent, a default
 language and user agent are created:

-
+###### Defaults

 The HTTP header `Accept-Language` default is
 `en-IE, en-UK;q=0.9, en-NL;q=0.8, en-MT;q=0.7, en-LU;q=0.6, en;q=0.5, \*;0.4`
@@ -122,19 +134,19 @@ The HTTP header `Accept-Language` default is
 The HTTP header `User-Agent` default is
 `Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15`

-
+##### `proxy`

 Provide your IP, port for a proxy. If required, provide credentials for
 authenticating to that proxy. Proxy options and handling are done
 by Typhoeus.

-
+##### `non_html_extensions`

 This is the list of TLDs to ignore when collecting URLs from the page.
 The extensions are formatted as a hash of key/value pairs, where the value
 is an array of TLDs, and the keys represent the length of those TLDs.

-
+##### `memory_limit` and Docker

 In case you are operating the crawler within a container, Arachnid2
 can attempt to prevent the container from running out of memory.
@@ -142,15 +154,75 @@ By default, it will end the crawl when the container uses >= 80%
 of its available memory. You can override this with the
 option.

-
+##### Non-HTML links

 The crawler attempts to stop itself from returning data from
 links that are not indicative of HTML, as detailed in
 `Arachnid2::NON_HTML_EXTENSIONS`.

+#### Caching (optional)
+
+If you have setup a cache to deduplicate crawls,
+set a cached service url
+`export ARACHNID_CACHED_SERVICE_ADDRESS=http://localhost:9000`
+
+This expects a push and get JSON API to respond
+to `/typhoeus_responses`, with a URL and the options pushed
+exactly as received as parameters. It will push any crawls
+to the service, and re-use any crawled pages
+if they are found to match.
+
+### With Watir
+
+Crawling with Watir works similarly, but requires you setup your
+environment for Watir, and headless web browsing if required.
+See the Watir documentation for more information.
+
+```ruby
+# ...
+Arachnid2.new(url).crawl_watir(opts)
+# -or-
+with_watir = true
+Arachnid2.new(url).crawl(opts, with_watir)
+```
+
+#### Options
+
+See the Typhoeus options above — most apply to Watir as well, with
+some exceptions:
+
+##### `proxy`
+
+Watir proxy options are formatted differently:
+
+```ruby
+proxy: {
+  http: "troy.show:8080",
+  ssl: "abed.show:8080"
+},
+```
+
+Proxy options handling is done by Watir.
+
+##### `headless`
+
+And it accepts an argument to make browse headlessly
+
+```ruby
+opts = { headless: true }
+```
+
+##### `followlocation` and `max_concurrency`
+
+These options do not apply to Watir, and will be ignored.
+
 ## Development

-
+Fork the repo and run the tests
+
+```ruby
+bundle exec rspec spec/
+```

 ## Contributing

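Read together, the options documented in the README above can be combined in a single call. The sketch below is illustrative only: the URL, the option values, and the `non_html_extensions` hash are invented for the example, assuming the interface described in that README.

```ruby
require "arachnid2"

# Hypothetical target URL and option values, following the README above.
url  = "http://example.com"
opts = {
  followlocation: true,
  timeout: 300,                       # per-request page-load timeout
  time_box: 60,                       # stop crawling after 60 seconds
  max_urls: 50,                       # stop after 50 URLs
  headers: { "Accept-Language" => "en-UK" },
  memory_limit: 80.0,                 # end the crawl at >= 80% of container memory
  # keys are extension lengths, values are the extensions to skip
  non_html_extensions: { 4 => [".png", ".jpg"], 5 => [".jpeg"] }
}

# Typhoeus-backed crawl: the block receives each Typhoeus response.
Arachnid2.new(url).crawl(opts) { |response| puts response.effective_url }

# Watir-backed crawl of the same site (requires a working Watir/WebDriver setup);
# the block receives the browser after each page load.
Arachnid2.new(url).crawl_watir(opts.merge(headless: true)) { |browser| puts browser.url }
```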
data/arachnid2.gemspec
CHANGED
@@ -25,9 +25,12 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency "rake", "~> 10.0"
   spec.add_development_dependency "rspec", "~> 3.0"

+  spec.add_dependency "webdriver-user-agent", ">= 7.6"
+  spec.add_dependency "watir"
+  spec.add_dependency "webdrivers"
   spec.add_dependency "typhoeus"
   spec.add_dependency "bloomfilter-rb"
   spec.add_dependency "adomain"
   spec.add_dependency "addressable"
-  spec.add_dependency "nokogiri"
+  spec.add_dependency "nokogiri", ">= 1.8.5"
 end
data/lib/arachnid2/exoskeleton.rb
ADDED
@@ -0,0 +1,133 @@
+class Arachnid2
+  module Exoskeleton
+    def browser_type
+      unless @browser_type
+        @browser_type = "#{@options[:browser_type]}".to_sym if @options[:browser_type]
+        @browser_type ||= :firefox
+      end
+
+      @browser_type
+    end
+
+    def process(url, html)
+      return false unless Adomain["#{url}"].include? @domain
+
+      extract_hrefs(html)
+    end
+
+    def extract_hrefs(body)
+      elements = Nokogiri::HTML.parse(body).css('a')
+      return elements.map {|link| link.attribute('href').to_s}.uniq.sort.delete_if {|href| href.empty? }
+    end
+
+    def vacuum(links, url)
+      links.each do |link|
+        next if link.match(/^\(|^javascript:|^mailto:|^#|^\s*$|^about:/)
+
+        begin
+          absolute_link = make_absolute(link, url)
+
+          next if skip_link?(absolute_link)
+
+          @global_queue << absolute_link
+        rescue Addressable::URI::InvalidURIError
+        end
+      end
+    end
+
+    def skip_link?(absolute_link)
+      !internal_link?(absolute_link) || \
+        @global_visited.include?(absolute_link) || \
+        extension_ignored?(absolute_link) || \
+        @global_queue.include?(absolute_link)
+    end
+
+    def preflight(opts)
+      @options = opts
+      @global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => true)
+      @global_queue = [@url]
+    end
+
+    def proxy
+      @options[:proxy]
+    end
+
+    def non_html_extensions
+      return @non_html_extensions if @non_html_extensions
+
+      @non_html_extensions = @options[:non_html_extensions]
+      @non_html_extensions ||= DEFAULT_NON_HTML_EXTENSIONS
+    end
+
+    def bound_time
+      boundary = "#{@options[:time_box]}".to_i
+      boundary = BASE_CRAWL_TIME if boundary <= 0
+      boundary = MAX_CRAWL_TIME if boundary > MAX_CRAWL_TIME
+
+      return Time.now + boundary
+    end
+
+    def bound_urls
+      amount = "#{@options[:max_urls]}".to_i
+      amount = BASE_URLS if amount <= 0
+      amount = MAX_URLS if amount > MAX_URLS
+
+      amount
+    end
+
+    def timeout
+      unless @timeout
+        @timeout = @options[:timeout]
+        @timeout = DEFAULT_TIMEOUT unless @timeout.is_a?(Integer)
+        @timeout = DEFAULT_TIMEOUT if @timeout > MAXIMUM_TIMEOUT
+        @timeout = DEFAULT_TIMEOUT if @timeout < MINIMUM_TIMEOUT
+      end
+      @timeout
+    end
+
+    def crawl_options
+      @crawl_options ||= { max_urls: max_urls, time_limit: time_limit }
+    end
+
+    alias_method :max_urls, :bound_urls
+
+    alias_method :time_limit, :bound_time
+
+    def make_absolute(href, root)
+      Addressable::URI.parse(root).join(Addressable::URI.parse(href)).to_s
+    end
+
+    def internal_link?(absolute_url)
+      "#{Adomain[absolute_url]}".include? @domain
+    end
+
+    def extension_ignored?(url)
+      return false if url.empty?
+
+      !non_html_extensions.values.flatten.find { |e| url.downcase.end_with? e.downcase }.nil?
+    end
+
+    def memory_danger?
+      return false unless in_docker?
+
+      use = "#{File.open(MEMORY_USE_FILE, "rb").read}".to_f
+      @limit ||= "#{File.open(MEMORY_LIMIT_FILE, "rb").read}".to_f
+
+      return false unless ( (use > 0.0) && (@limit > 0.0) )
+
+      return ( ( (use / @limit) * 100.0 ) >= maximum_load_rate )
+    end
+
+    def in_docker?
+      File.file?(MEMORY_USE_FILE)
+    end
+
+    def maximum_load_rate
+      return @maximum_load_rate if @maximum_load_rate
+
+      @maximum_load_rate = "#{@options[:memory_limit]}".to_f
+      @maximum_load_rate = DEFAULT_MAXIMUM_LOAD_RATE unless ((@maximum_load_rate > 0.0) && (@maximum_load_rate < 100.0))
+      @maximum_load_rate
+    end
+  end
+end
data/lib/arachnid2/typhoeus.rb
ADDED
@@ -0,0 +1,99 @@
+class Arachnid2
+  class Typhoeus
+    include CachedArachnidResponses
+    include Arachnid2::Exoskeleton
+
+    def initialize(url)
+      @url = url
+      @domain = Adomain[@url]
+      @cached_data = []
+    end
+
+    def crawl(opts = {})
+      preflight(opts)
+      typhoeus_preflight
+
+      until @global_queue.empty?
+        max_concurrency.times do
+          q = @global_queue.shift
+
+          break if @global_visited.size >= crawl_options[:max_urls] || \
+                   Time.now > crawl_options[:time_limit] || \
+                   memory_danger?
+
+          @global_visited.insert(q)
+
+          request = ::Typhoeus::Request.new(q, request_options)
+
+          data = load_data(@url, opts)
+          data.each { |response| yield response } and return unless data.nil?
+
+          request.on_complete do |response|
+            @cached_data.push(response)
+            links = process(response.effective_url, response.body)
+            next unless links
+
+            yield response
+
+            vacuum(links, response.effective_url)
+          end
+
+          @hydra.queue(request)
+        end # max_concurrency.times do
+
+        @hydra.run
+
+      end # until @global_queue.empty?
+      put_cached_data(@url, opts, @cached_data) unless @cached_data.empty?
+    ensure
+      @cookie_file.close! if @cookie_file
+    end # def crawl(opts = {})
+
+    private
+    def typhoeus_preflight
+      @hydra = ::Typhoeus::Hydra.new(:max_concurrency => max_concurrency)
+      typhoeus_proxy_options
+    end
+
+    def max_concurrency
+      return @max_concurrency if @max_concurrency
+
+      @max_concurrency = "#{@options[:max_concurrency]}".to_i
+      @max_concurrency = 1 unless (@max_concurrency > 0)
+      @max_concurrency
+    end
+
+    def followlocation
+      return @followlocation unless @followlocation.nil?
+
+      @followlocation = @options[:followlocation]
+      @followlocation = true unless @followlocation.is_a?(FalseClass)
+    end
+
+    def request_options
+      @cookie_file ||= Tempfile.new('cookies')
+
+      @request_options = {
+        timeout: timeout,
+        followlocation: followlocation,
+        cookiefile: @cookie_file.path,
+        cookiejar: @cookie_file.path,
+        headers: @options[:headers]
+      }.merge(crawl_options[:proxy])
+
+      @request_options[:headers] ||= {}
+      @request_options[:headers]['Accept-Language'] ||= DEFAULT_LANGUAGE
+      @request_options[:headers]['User-Agent'] ||= DEFAULT_USER_AGENT
+
+      @request_options
+    end
+
+    def typhoeus_proxy_options
+      crawl_options[:proxy] = {}
+
+      crawl_options[:proxy][:proxy] = "#{@options[:proxy][:ip]}:#{@options[:proxy][:port]}" if @options.dig(:proxy, :ip)
+      crawl_options[:proxy][:proxyuserpwd] = "#{@options[:proxy][:username]}:#{@options[:proxy][:password]}" if @options.dig(:proxy, :username)
+    end
+
+  end
+end
data/lib/arachnid2/version.rb
CHANGED
data/lib/arachnid2/watir.rb
ADDED
@@ -0,0 +1,102 @@
+class Arachnid2
+  class Watir
+    include Arachnid2::Exoskeleton
+
+    def initialize(url)
+      @url = url
+      @domain = Adomain[@url]
+    end
+
+    def crawl(opts)
+      preflight(opts)
+      watir_preflight
+
+      until @global_queue.empty?
+        @already_retried = false
+        q = @global_queue.shift
+
+        break if @global_visited.size >= crawl_options[:max_urls]
+        break if Time.now > crawl_options[:time_limit]
+        break if memory_danger?
+
+        @global_visited.insert(q)
+
+        begin
+          browser.goto q
+          links = process(browser.url, browser.body.html)
+          next unless links
+
+          yield browser
+
+          vacuum(links, browser.url)
+        rescue => e
+          raise e if @already_retried
+          raise e unless "#{e.class}".include?("Selenium") || "#{e.class}".include?("Watir")
+          @browser = nil
+          @already_retried = true
+          retry
+        end
+
+      end # until @global_queue.empty?
+    ensure
+      @browser.close if @browser rescue nil
+      @headless.destroy if @headless rescue nil
+    end
+
+    private
+    def browser
+      unless @browser
+        behead if @make_headless
+
+        @browser = create_browser
+
+        set_timeout
+      end
+
+      return @browser
+    end
+
+    def create_browser
+      return ::Watir::Browser.new(driver, proxy: @proxy) if @proxy
+
+      ::Watir::Browser.new driver
+    end
+
+    def set_timeout
+      @browser.driver.manage.timeouts.page_load = timeout
+    end
+
+    def behead
+      @headless = Headless.new
+      @headless.start
+    end
+
+    def driver
+      unless @driver
+        language = @options.dig(:headers, "Accept-Language") || DEFAULT_LANGUAGE
+        user_agent = @options.dig(:headers, "User-Agent") || DEFAULT_USER_AGENT
+
+        @driver = Webdriver::UserAgent.driver(
+          browser: browser_type,
+          accept_language_string: language,
+          user_agent_string: user_agent
+        )
+      end
+
+      @driver
+    end
+
+    def watir_preflight
+      watir_proxy_options
+      @make_headless = @options[:headless]
+    end
+
+    def watir_proxy_options
+      crawl_options[:proxy] = {}
+
+      crawl_options[:proxy][:http] = @options[:proxy][:http] if @options.dig(:proxy, :http)
+      crawl_options[:proxy][:ssl] = @options[:proxy][:ssl] if @options.dig(:proxy, :ssl)
+    end
+  end
+
+end
data/lib/arachnid2.rb
CHANGED
@@ -1,5 +1,8 @@
 require "arachnid2/version"
-require "arachnid2/
+require "arachnid2/cached_arachnid_responses"
+require "arachnid2/exoskeleton"
+require "arachnid2/typhoeus"
+require "arachnid2/watir"

 require 'tempfile'
 require "typhoeus"
@@ -8,9 +11,12 @@ require "adomain"
 require "addressable/uri"
 require "nokogiri"
 require "base64"
+require "webdrivers"
+require "webdriver-user-agent"
+require "watir"
+

 class Arachnid2
-  include CashedArachnidResponses
   # META:
   # About the origins of this crawling approach
   # The Crawler is heavily borrowed from by Arachnid.
@@ -22,7 +28,7 @@ class Arachnid2
   # And this was originally written as a part of Tellurion's bot
   # https://github.com/samnissen/tellurion_bot

-  MAX_CRAWL_TIME =
+  MAX_CRAWL_TIME = 10000
   BASE_CRAWL_TIME = 15
   MAX_URLS = 10000
   BASE_URLS = 50
@@ -58,8 +64,6 @@ class Arachnid2
   #
   def initialize(url)
     @url = url
-    @domain = Adomain[@url]
-    @cached_data = []
   end

   #
@@ -101,228 +105,15 @@ class Arachnid2
   #
   # @return nil
   #
-  def crawl(opts = {})
-
-
-    until @global_queue.empty?
-      @max_concurrency.times do
-        q = @global_queue.shift
-
-        break if @global_visited.size >= @crawl_options[:max_urls]
-        break if Time.now > @crawl_options[:time_limit]
-        break if memory_danger?
-
-        @global_visited.insert(q)
-
-        request = Typhoeus::Request.new(q, request_options)
-
-        data = load_data(@url, opts)
-        unless data.nil?
-          data.each do |response|
-            yield response
-          end
-          return
-        end
-        request.on_complete do |response|
-          @cached_data.push(response)
-          links = process(response)
-          next unless links
-
-          yield response
-
-          vacuum(links, response)
-        end
-
-        @hydra.queue(request)
-      end # @max_concurrency.times do
-
-      @hydra.run
-
-    end # until @global_queue.empty?
-    put_cached_data(@url, opts, @cached_data) unless @cached_data.empty?
-  ensure
-    @cookie_file.close! if @cookie_file
-
-
-  end # def crawl(opts = {})
-
-  private
-  def process(response)
-    return false unless Adomain["#{response.effective_url}"].include? @domain
-
-    elements = Nokogiri::HTML.parse(response.body).css('a')
-    return elements.map {|link| link.attribute('href').to_s}.uniq.sort.delete_if {|href| href.empty? }
-  end
-
-  def vacuum(links, response)
-    links.each do |link|
-      next if link.match(/^\(|^javascript:|^mailto:|^#|^\s*$|^about:/)
-
-      begin
-        absolute_link = make_absolute(link, response.effective_url)
-
-        next if skip_link?(absolute_link)
-
-        @global_queue << absolute_link
-      rescue Addressable::URI::InvalidURIError
-      end
-    end
-  end
-
-  def skip_link?(absolute_link)
-    internal = internal_link?(absolute_link)
-    visited = @global_visited.include?(absolute_link)
-    ignored = extension_ignored?(absolute_link)
-    known = @global_queue.include?(absolute_link)
-
-    !internal || visited || ignored || known
-  end
-
-  def preflight(opts)
-    @options = opts
-    @crawl_options = crawl_options
-    @maximum_load_rate = maximum_load_rate
-    @max_concurrency = max_concurrency
-    @non_html_extensions = non_html_extensions
-    @hydra = Typhoeus::Hydra.new(:max_concurrency => @max_concurrency)
-    @global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => true)
-    @global_queue = [@url]
-  end
-
-  def non_html_extensions
-    @non_html_extensions ||= nil
-
-    if !@non_html_extensions
-      @non_html_extensions = @options[:non_html_extensions]
-      @non_html_extensions ||= DEFAULT_NON_HTML_EXTENSIONS
-    end
-
-    @non_html_extensions
-  end
-
-  def max_concurrency
-    @max_concurrency ||= nil
-
-    if !@max_concurrency
-      @max_concurrency = "#{@options[:max_concurrency]}".to_i
-      @max_concurrency = 1 unless (@max_concurrency > 0)
-    end
-
-    @max_concurrency
-  end
+  def crawl(opts = {}, with_watir = false)
+    crawl_watir and return if with_watir

-
-
-    boundary = BASE_CRAWL_TIME if boundary <= 0
-    boundary = MAX_CRAWL_TIME if boundary > MAX_CRAWL_TIME
-
-    return Time.now + boundary
-  end
-
-  def bound_urls
-    amount = "#{@options[:max_urls]}".to_i
-    amount = BASE_URLS if amount <= 0
-    amount = MAX_URLS if amount > MAX_URLS
-
-    amount
-  end
-
-  def followlocation
-    if @followlocation.is_a?(NilClass)
-      @followlocation = @options[:followlocation]
-      @followlocation = true unless @followlocation.is_a?(FalseClass)
-    end
-    @followlocation
-  end
-
-  def timeout
-    if !@timeout
-      @timeout = @options[:timeout]
-      @timeout = DEFAULT_TIMEOUT unless @timeout.is_a?(Integer)
-      @timeout = DEFAULT_TIMEOUT if @timeout > MAXIMUM_TIMEOUT
-      @timeout = DEFAULT_TIMEOUT if @timeout < MINIMUM_TIMEOUT
-    end
-    @timeout
-  end
-
-  def request_options
-    @cookie_file ||= Tempfile.new('cookies')
-
-    @request_options = {
-      timeout: timeout,
-      followlocation: followlocation,
-      cookiefile: @cookie_file.path,
-      cookiejar: @cookie_file.path,
-      headers: @options[:headers]
-    }
-
-    @request_options[:headers] ||= {}
-    @request_options[:headers]['Accept-Language'] ||= DEFAULT_LANGUAGE
-    @request_options[:headers]['User-Agent'] ||= DEFAULT_USER_AGENT
-
-    @request_options
-  end
-
-  def crawl_options
-    @crawl_options ||= nil
-
-    if !@crawl_options
-      @crawl_options = { :max_urls => max_urls, :time_limit => time_limit }
-
-      @crawl_options[:proxy] = "#{@options[:proxy][:ip]}:#{@options[:proxy][:port]}" if @options.dig(:proxy, :ip)
-      @crawl_options[:proxyuserpwd] = "#{@options[:proxy][:username]}:#{@options[:proxy][:password]}" if @options.dig(:proxy, :username)
-    end
-
-    @crawl_options
-  end
-
-  def max_urls
-    bound_urls
-  end
-
-  def time_limit
-    bound_time
-  end
-
-  def make_absolute(href, root)
-    Addressable::URI.parse(root).join(Addressable::URI.parse(href)).to_s
-  end
-
-  def internal_link?(absolute_url)
-    "#{Adomain[absolute_url]}".include? @domain
-  end
-
-  def extension_ignored?(url)
-    return false if url.empty?
-
-    !@non_html_extensions.values.flatten.find { |e| url.downcase.end_with? e.downcase }.nil?
-  end
-
-  def memory_danger?
-    return false unless in_docker?
-
-    use = "#{File.open(MEMORY_USE_FILE, "rb").read}".to_f
-    @limit ||= "#{File.open(MEMORY_LIMIT_FILE, "rb").read}".to_f
-
-    return false unless ( (use > 0.0) && (@limit > 0.0) )
-
-    return ( ( (use / @limit) * 100.0 ) >= @maximum_load_rate )
-  end
-
-  def in_docker?
-    return false unless File.file?(MEMORY_USE_FILE)
-    true
-  end
-
-  def maximum_load_rate
-    @maximum_load_rate ||= nil
-
-    if !@maximum_load_rate
-      @maximum_load_rate = "#{@options[:memory_limit]}".to_f
-      @maximum_load_rate = DEFAULT_MAXIMUM_LOAD_RATE unless ((@maximum_load_rate > 0.0) && (@maximum_load_rate < 100.0))
-    end
+    Arachnid2::Typhoeus.new(@url).crawl(opts, &Proc.new)
+  end

-
-
+  def crawl_watir(opts)
+    Arachnid2::Watir.new(@url).crawl(opts, &Proc.new)
+  end
+  # https://mudge.name/2011/01/26/passing-blocks-in-ruby-without-block.html

 end
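The rewritten `crawl` and `crawl_watir` forward the caller's block with `&Proc.new`, per the mudge.name link in the comment above: inside a method, `Proc.new` called without a block captures the block passed to that method. A minimal stand-alone sketch of the idiom follows; the class and method names are invented for illustration, and later Ruby versions deprecate and then remove this use of `Proc.new`, so it reflects the Rubies current when 0.3.1 shipped.

```ruby
# Stand-alone illustration of the &Proc.new forwarding used above.
# `Delegator` and `worker_each` are made-up names for this sketch.
class Delegator
  def each_item
    # Proc.new with no block captures the block passed to each_item,
    # so it can be handed on without declaring a &block parameter.
    worker_each(&Proc.new)
  end

  private

  def worker_each
    [1, 2, 3].each { |i| yield i }
  end
end

Delegator.new.each_item { |i| puts i } # prints 1, 2, 3
```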
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: arachnid2
 version: !ruby/object:Gem::Version
-  version: 0.
+  version: 0.3.1
 platform: ruby
 authors:
 - Sam Nissen
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2019-
+date: 2019-02-26 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -52,6 +52,48 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '3.0'
+- !ruby/object:Gem::Dependency
+  name: webdriver-user-agent
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '7.6'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '7.6'
+- !ruby/object:Gem::Dependency
+  name: watir
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: webdrivers
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: typhoeus
   requirement: !ruby/object:Gem::Requirement
@@ -114,14 +156,14 @@ dependencies:
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version:
+        version: 1.8.5
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - ">="
      - !ruby/object:Gem::Version
-        version:
+        version: 1.8.5
 description:
 email:
 - scnissen@gmail.com
@@ -142,8 +184,11 @@ files:
 - bin/console
 - bin/setup
 - lib/arachnid2.rb
-- lib/arachnid2/
+- lib/arachnid2/cached_arachnid_responses.rb
+- lib/arachnid2/exoskeleton.rb
+- lib/arachnid2/typhoeus.rb
 - lib/arachnid2/version.rb
+- lib/arachnid2/watir.rb
 homepage: https://github.com/samnissen/arachnid2
 licenses:
 - MIT