arachnid2 0.2.0 → 0.3.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 00cef9d45ae8be8b0747d47e254737fdfdb94e3f40cfe85a99faff283653f87b
- data.tar.gz: 24ecf163c8b2eeda25908067a0efa68aec661441c59b06c466ee22828d154f5d
+ metadata.gz: 90efb51a783ec434d1e269d61a9c550e6fe8740eea229b24d3820790c1e5d296
+ data.tar.gz: f731d4cc5ab87ee603a69d795216b9dacd322af653a873273ffb226e6bb6b704
  SHA512:
- metadata.gz: 2830f48686f9c2e9a921da58cca907580c800716e856982ae5e836f6dd51ab899192456bc4090ad78aac89eb6db49496d2ad953713d4d52548f1e248a7198df2
- data.tar.gz: d3175c6f6574dc5a6feb9955e2a7804accfdd09c705cd90dea11586bd774fb79a67fdca499afe628739eb85b8d6adf6ff13d831658b23f4a993fc210e429c8f1
+ metadata.gz: 4fe796d93a1d87ba260b269a5ff9a290280b5354e02cd596f305ea0a342edbf9d48fb29134a83197bcc00266e13e16b5c3b482b448525b380e82f7ffaa69e9b2
+ data.tar.gz: 9c80660fd0b7e9003ea70163fe87d142e1b1ef730b662d251992fdc1aff5ac773ec8e73ffb4c7bf300ebe0543b6b452d83d975b478bcf00ce6bb99347e171dc3
data/Gemfile.lock CHANGED
@@ -1,52 +1,76 @@
  PATH
  remote: .
  specs:
- arachnid2 (0.1.4)
+ arachnid2 (0.3.0)
  addressable
  adomain
  bloomfilter-rb
- nokogiri
+ nokogiri (>= 1.8.5)
  typhoeus
+ watir
+ webdriver-user-agent (>= 7.6)
+ webdrivers

  GEM
  remote: https://rubygems.org/
  specs:
- addressable (2.5.2)
+ addressable (2.6.0)
  public_suffix (>= 2.0.2, < 4.0)
  adomain (0.1.1)
  addressable (~> 2.5)
  bloomfilter-rb (2.1.1)
  redis
- coderay (1.1.2)
+ childprocess (0.9.0)
+ ffi (~> 1.0, >= 1.0.11)
  diff-lcs (1.3)
- ethon (0.11.0)
+ ethon (0.12.0)
  ffi (>= 1.3.0)
- ffi (1.9.25)
- method_source (0.9.0)
- mini_portile2 (2.3.0)
- nokogiri (1.8.4)
- mini_portile2 (~> 2.3.0)
- pry (0.11.3)
- coderay (~> 1.1.0)
- method_source (~> 0.9.0)
+ facets (3.1.0)
+ ffi (1.10.0)
+ json (2.1.0)
+ mini_portile2 (2.4.0)
+ net_http_ssl_fix (0.0.10)
+ nokogiri (1.10.1)
+ mini_portile2 (~> 2.4.0)
+ os (1.0.0)
+ psych (3.1.0)
  public_suffix (3.0.3)
  rake (10.5.0)
- redis (4.0.2)
- rspec (3.7.0)
- rspec-core (~> 3.7.0)
- rspec-expectations (~> 3.7.0)
- rspec-mocks (~> 3.7.0)
- rspec-core (3.7.1)
- rspec-support (~> 3.7.0)
- rspec-expectations (3.7.0)
+ redis (4.1.0)
+ regexp_parser (1.3.0)
+ rspec (3.8.0)
+ rspec-core (~> 3.8.0)
+ rspec-expectations (~> 3.8.0)
+ rspec-mocks (~> 3.8.0)
+ rspec-core (3.8.0)
+ rspec-support (~> 3.8.0)
+ rspec-expectations (3.8.2)
  diff-lcs (>= 1.2.0, < 2.0)
- rspec-support (~> 3.7.0)
- rspec-mocks (3.7.0)
+ rspec-support (~> 3.8.0)
+ rspec-mocks (3.8.0)
  diff-lcs (>= 1.2.0, < 2.0)
- rspec-support (~> 3.7.0)
- rspec-support (3.7.1)
- typhoeus (1.3.0)
+ rspec-support (~> 3.8.0)
+ rspec-support (3.8.0)
+ rubyzip (1.2.2)
+ selenium-webdriver (3.141.0)
+ childprocess (~> 0.5)
+ rubyzip (~> 1.2, >= 1.2.2)
+ typhoeus (1.3.1)
  ethon (>= 0.9.0)
+ watir (6.16.5)
+ regexp_parser (~> 1.2)
+ selenium-webdriver (~> 3.6)
+ webdriver-user-agent (7.6)
+ facets
+ json
+ os
+ psych
+ selenium-webdriver (>= 3.4.0)
+ webdrivers (3.6.0)
+ net_http_ssl_fix
+ nokogiri (~> 1.6)
+ rubyzip (~> 1.0)
+ selenium-webdriver (~> 3.0)

  PLATFORMS
  ruby
@@ -54,9 +78,8 @@ PLATFORMS
  DEPENDENCIES
  arachnid2!
  bundler (~> 1.16)
- pry
  rake (~> 10.0)
  rspec (~> 3.0)

  BUNDLED WITH
- 1.16.3
+ 1.16.5
data/README.md CHANGED
@@ -3,18 +3,22 @@
  ## About

  Arachnid2 is a simple, fast web-crawler written in Ruby.
- It uses [typhoeus](https://github.com/typhoeus/typhoeus)
- to get HTTP requests,
+ You can use [typhoeus](https://github.com/typhoeus/typhoeus)
+ to make HTTP requests, or [Watir](https://github.com/watir/watir)
+ to render pages.
+
  [bloomfilter-rb](https://github.com/igrigorik/bloomfilter-rb)
- to store the URLs it will get and has gotten,
+ stores the URLs it will get and has gotten,
  and [nokogiri](https://github.com/sparklemotion/nokogiri)
- to find the URLs on each webpage.
+ to find the URLs on each webpage, adding them to the bloomfilter queue.

  Arachnid2 is a successor to [Arachnid](https://github.com/dchuk/Arachnid),
  and was abstracted out of the [Tellurion Bot](https://github.com/samnissen/tellurion_bot).

  ## Usage

+ ### Typhoeus (cURL)
+
  The basic use of Arachnid2 is surfacing the responses from a domain's
  URLs by visiting a URL, collecting any links to the same domain
  on that page, and visiting those to do the same.
@@ -22,9 +26,6 @@ on that page, and visiting those to do the same.
  Hence, the simplest output would be to collect all of the responses
  while spidering from some URL.

- Set cached service url(optional)
- `export ARACHNID_CACHED_SERVICE_ADDRESS=http://localhost:9000`
-
  ```ruby
  require "arachnid2"

@@ -58,7 +59,7 @@ spider.crawl { |response|

  `Arachnid2#crawl` will always return `nil`.

- ### Options
+ #### Options

  ```ruby
  require "arachnid2"
@@ -67,7 +68,7 @@ url = "http://sixcolours.com"
  spider = Arachnid2.new(url)
  opts = {
  followlocation: true,
- timeout: 10000,
+ timeout: 300,
  time_box: 60,
  max_urls: 50,
  :headers => {
@@ -95,26 +96,37 @@ spider.crawl(opts) { |response|
  }
  ```

- #### `time_box`
+ ##### `followlocation`
+
+ Tell Typhoeus to follow redirections.

- The crawler will time-bound your spidering. If no valid integer is provided,
- it will crawl for 15 seconds before exiting. 600 seconds (10 minutes)
- is the current maximum, and any value above it will be reduced to 600.
+ ##### `timeout`
+
+ Tell Typhoeus or Watir how long to wait for page load.
+
+ ##### `time_box`
+
+ The crawler will time-bound your spidering.
+ If no valid integer is provided,
+ it will crawl for 15 seconds before exiting.
+ 10000 seconds is the current maximum,
+ and any value above it will be reduced to 10000.

- #### `max_urls`
+ ##### `max_urls`

  The crawler will crawl a limited number of URLs before stopping.
- If no valid integer is provided, it will crawl for 50 URLs before exiting.
+ If no valid integer is provided,
+ it will crawl for 50 URLs before exiting.
  10000 URLs is the current maximum,
  and any value above it will be reduced to 10000.

- #### `headers`
+ ##### `headers`

  This is a hash that represents any HTTP header key/value pairs you desire,
  and is passed directly to Typhoeus. Before it is sent, a default
  language and user agent are created:

- ##### Defaults
+ ###### Defaults

  The HTTP header `Accept-Language` default is
  `en-IE, en-UK;q=0.9, en-NL;q=0.8, en-MT;q=0.7, en-LU;q=0.6, en;q=0.5, \*;0.4`
@@ -122,19 +134,19 @@ The HTTP header `Accept-Language` default is
  The HTTP header `User-Agent` default is
  `Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15`

- #### `proxy`
+ ##### `proxy`

  Provide your IP, port for a proxy. If required, provide credentials for
  authenticating to that proxy. Proxy options and handling are done
  by Typhoeus.

- #### `non_html_extensions`
+ ##### `non_html_extensions`

  This is the list of file extensions to ignore when collecting URLs from the page.
  The extensions are formatted as a hash of key/value pairs, where the value
  is an array of extensions, and the keys represent the length of those extensions.

- #### `memory_limit` and Docker
+ ##### `memory_limit` and Docker

  In case you are operating the crawler within a container, Arachnid2
  can attempt to prevent the container from running out of memory.
@@ -142,15 +154,75 @@ By default, it will end the crawl when the container uses >= 80%
  of its available memory. You can override this with the
  `memory_limit` option.

- ### Non-HTML links
+ ##### Non-HTML links

  The crawler attempts to stop itself from returning data from
  links that are not indicative of HTML, as detailed in
  `Arachnid2::NON_HTML_EXTENSIONS`.

+ #### Caching (optional)
+
+ If you have set up a cache to deduplicate crawls,
+ set the cached service URL:
+ `export ARACHNID_CACHED_SERVICE_ADDRESS=http://localhost:9000`
+
+ This expects a push and get JSON API to respond
+ to `/typhoeus_responses`, with a URL and the options pushed
+ exactly as received as parameters. It will push any crawls
+ to the service, and re-use any crawled pages
+ if they are found to match.
+
+ ### With Watir
+
+ Crawling with Watir works similarly, but requires you to set up your
+ environment for Watir, and headless web browsing if required.
+ See the Watir documentation for more information.
+
+ ```ruby
+ # ...
+ Arachnid2.new(url).crawl_watir(opts)
+ # -or-
+ with_watir = true
+ Arachnid2.new(url).crawl(opts, with_watir)
+ ```
+
+ #### Options
+
+ See the Typhoeus options above &mdash; most apply to Watir as well, with
+ some exceptions:
+
+ ##### `proxy`
+
+ Watir proxy options are formatted differently:
+
+ ```ruby
+ proxy: {
+ http: "troy.show:8080",
+ ssl: "abed.show:8080"
+ },
+ ```
+
+ Proxy option handling is done by Watir.
+
+ ##### `headless`
+
+ It also accepts an argument to make the browser headless:
+
+ ```ruby
+ opts = { headless: true }
+ ```
+
+ ##### `followlocation` and `max_concurrency`
+
+ These options do not apply to Watir, and will be ignored.
+
  ## Development

- TODO: this
+ Fork the repo and run the tests:
+
+ ```bash
+ bundle exec rspec spec/
+ ```

  ## Contributing

data/arachnid2.gemspec CHANGED
@@ -25,9 +25,12 @@ Gem::Specification.new do |spec|
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "rspec", "~> 3.0"

+ spec.add_dependency "webdriver-user-agent", ">= 7.6"
+ spec.add_dependency "watir"
+ spec.add_dependency "webdrivers"
  spec.add_dependency "typhoeus"
  spec.add_dependency "bloomfilter-rb"
  spec.add_dependency "adomain"
  spec.add_dependency "addressable"
- spec.add_dependency "nokogiri"
+ spec.add_dependency "nokogiri", ">= 1.8.5"
  end
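
To tie the README and gemspec changes above together, here is a minimal end-to-end sketch of the two crawl modes this release documents. The URL comes from the README example; the block bodies and option values are illustrative, not taken from the gem:

```ruby
require "arachnid2"

url  = "http://sixcolours.com"
opts = {
  followlocation: true, # Typhoeus only; ignored by the Watir crawler
  timeout: 300,         # per-page load timeout, in seconds
  time_box: 60,         # overall crawl budget, in seconds
  max_urls: 50
}

# cURL-backed crawling via Typhoeus
Arachnid2.new(url).crawl(opts) do |response|
  puts response.effective_url if response.success?
end

# Browser-backed crawling via Watir (needs a working WebDriver setup)
Arachnid2.new(url).crawl_watir(opts.merge(headless: true)) do |browser|
  puts browser.title
end
```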
data/lib/arachnid2/cashed_arachnid_responses.rb → data/lib/arachnid2/cached_arachnid_responses.rb RENAMED
@@ -1,6 +1,6 @@
  require 'net/http'
  require 'json'
- module CashedArachnidResponses
+ module CachedArachnidResponses
  CACHE_SERVICE_URL = ENV['ARACHNID_CACHED_SERVICE_ADDRESS'].freeze

  def load_data(_url, _options)
data/lib/arachnid2/exoskeleton.rb ADDED
@@ -0,0 +1,133 @@
+ class Arachnid2
+ module Exoskeleton
+ def browser_type
+ unless @browser_type
+ @browser_type = "#{@options[:browser_type]}".to_sym if @options[:browser_type]
+ @browser_type ||= :firefox
+ end
+
+ @browser_type
+ end
+
+ def process(url, html)
+ return false unless Adomain["#{url}"].include? @domain
+
+ extract_hrefs(html)
+ end
+
+ def extract_hrefs(body)
+ elements = Nokogiri::HTML.parse(body).css('a')
+ return elements.map {|link| link.attribute('href').to_s}.uniq.sort.delete_if {|href| href.empty? }
+ end
+
+ def vacuum(links, url)
+ links.each do |link|
+ next if link.match(/^\(|^javascript:|^mailto:|^#|^\s*$|^about:/)
+
+ begin
+ absolute_link = make_absolute(link, url)
+
+ next if skip_link?(absolute_link)
+
+ @global_queue << absolute_link
+ rescue Addressable::URI::InvalidURIError
+ end
+ end
+ end
+
+ def skip_link?(absolute_link)
+ !internal_link?(absolute_link) || \
+ @global_visited.include?(absolute_link) || \
+ extension_ignored?(absolute_link) || \
+ @global_queue.include?(absolute_link)
+ end
+
+ def preflight(opts)
+ @options = opts
+ @global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => true)
+ @global_queue = [@url]
+ end
+
+ def proxy
+ @options[:proxy]
+ end
+
+ def non_html_extensions
+ return @non_html_extensions if @non_html_extensions
+
+ @non_html_extensions = @options[:non_html_extensions]
+ @non_html_extensions ||= DEFAULT_NON_HTML_EXTENSIONS
+ end
+
+ def bound_time
+ boundary = "#{@options[:time_box]}".to_i
+ boundary = BASE_CRAWL_TIME if boundary <= 0
+ boundary = MAX_CRAWL_TIME if boundary > MAX_CRAWL_TIME
+
+ return Time.now + boundary
+ end
+
+ def bound_urls
+ amount = "#{@options[:max_urls]}".to_i
+ amount = BASE_URLS if amount <= 0
+ amount = MAX_URLS if amount > MAX_URLS
+
+ amount
+ end
+
+ def timeout
+ unless @timeout
+ @timeout = @options[:timeout]
+ @timeout = DEFAULT_TIMEOUT unless @timeout.is_a?(Integer)
+ @timeout = DEFAULT_TIMEOUT if @timeout > MAXIMUM_TIMEOUT
+ @timeout = DEFAULT_TIMEOUT if @timeout < MINIMUM_TIMEOUT
+ end
+ @timeout
+ end
+
+ def crawl_options
+ @crawl_options ||= { max_urls: max_urls, time_limit: time_limit }
+ end
+
+ alias_method :max_urls, :bound_urls
+
+ alias_method :time_limit, :bound_time
+
+ def make_absolute(href, root)
+ Addressable::URI.parse(root).join(Addressable::URI.parse(href)).to_s
+ end
+
+ def internal_link?(absolute_url)
+ "#{Adomain[absolute_url]}".include? @domain
+ end
+
+ def extension_ignored?(url)
+ return false if url.empty?
+
+ !non_html_extensions.values.flatten.find { |e| url.downcase.end_with? e.downcase }.nil?
+ end
+
+ def memory_danger?
+ return false unless in_docker?
+
+ use = "#{File.open(MEMORY_USE_FILE, "rb").read}".to_f
+ @limit ||= "#{File.open(MEMORY_LIMIT_FILE, "rb").read}".to_f
+
+ return false unless ( (use > 0.0) && (@limit > 0.0) )
+
+ return ( ( (use / @limit) * 100.0 ) >= maximum_load_rate )
+ end
+
+ def in_docker?
+ File.file?(MEMORY_USE_FILE)
+ end
+
+ def maximum_load_rate
+ return @maximum_load_rate if @maximum_load_rate
+
+ @maximum_load_rate = "#{@options[:memory_limit]}".to_f
+ @maximum_load_rate = DEFAULT_MAXIMUM_LOAD_RATE unless ((@maximum_load_rate > 0.0) && (@maximum_load_rate < 100.0))
+ @maximum_load_rate
+ end
+ end
+ end
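
The `bound_time` and `bound_urls` helpers in the module above clamp user-supplied options to the constants defined in `lib/arachnid2.rb`. A standalone sketch of that clamping, using the constant values visible further down in this diff (the option values here are illustrative):

```ruby
BASE_CRAWL_TIME = 15     # seconds used when no valid :time_box is given
MAX_CRAWL_TIME  = 10000  # upper bound for :time_box (raised from 600 in this release)
BASE_URLS       = 50     # used when no valid :max_urls is given
MAX_URLS        = 10000  # upper bound for :max_urls

def bound_time(options)
  boundary = "#{options[:time_box]}".to_i          # nil or junk becomes 0
  boundary = BASE_CRAWL_TIME if boundary <= 0
  boundary = MAX_CRAWL_TIME  if boundary > MAX_CRAWL_TIME
  Time.now + boundary
end

bound_time(time_box: nil)     # => roughly Time.now + 15
bound_time(time_box: 99_999)  # => roughly Time.now + 10000
```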
data/lib/arachnid2/typhoeus.rb ADDED
@@ -0,0 +1,99 @@
+ class Arachnid2
+ class Typhoeus
+ include CachedArachnidResponses
+ include Arachnid2::Exoskeleton
+
+ def initialize(url)
+ @url = url
+ @domain = Adomain[@url]
+ @cached_data = []
+ end
+
+ def crawl(opts = {})
+ preflight(opts)
+ typhoeus_preflight
+
+ until @global_queue.empty?
+ max_concurrency.times do
+ q = @global_queue.shift
+
+ break if @global_visited.size >= crawl_options[:max_urls] || \
+ Time.now > crawl_options[:time_limit] || \
+ memory_danger?
+
+ @global_visited.insert(q)
+
+ request = ::Typhoeus::Request.new(q, request_options)
+
+ data = load_data(@url, opts)
+ data.each { |response| yield response } and return unless data.nil?
+
+ request.on_complete do |response|
+ @cached_data.push(response)
+ links = process(response.effective_url, response.body)
+ next unless links
+
+ yield response
+
+ vacuum(links, response.effective_url)
+ end
+
+ @hydra.queue(request)
+ end # max_concurrency.times do
+
+ @hydra.run
+
+ end # until @global_queue.empty?
+ put_cached_data(@url, opts, @cached_data) unless @cached_data.empty?
+ ensure
+ @cookie_file.close! if @cookie_file
+ end # def crawl(opts = {})
+
+ private
+ def typhoeus_preflight
+ @hydra = ::Typhoeus::Hydra.new(:max_concurrency => max_concurrency)
+ typhoeus_proxy_options
+ end
+
+ def max_concurrency
+ return @max_concurrency if @max_concurrency
+
+ @max_concurrency = "#{@options[:max_concurrency]}".to_i
+ @max_concurrency = 1 unless (@max_concurrency > 0)
+ @max_concurrency
+ end
+
+ def followlocation
+ return @followlocation unless @followlocation.nil?
+
+ @followlocation = @options[:followlocation]
+ @followlocation = true unless @followlocation.is_a?(FalseClass)
+ end
+
+ def request_options
+ @cookie_file ||= Tempfile.new('cookies')
+
+ @request_options = {
+ timeout: timeout,
+ followlocation: followlocation,
+ cookiefile: @cookie_file.path,
+ cookiejar: @cookie_file.path,
+ headers: @options[:headers]
+ }.merge(crawl_options[:proxy])
+
+ @request_options[:headers] ||= {}
+ @request_options[:headers]['Accept-Language'] ||= DEFAULT_LANGUAGE
+ @request_options[:headers]['User-Agent'] ||= DEFAULT_USER_AGENT
+
+ @request_options
+ end
+
+ def typhoeus_proxy_options
+ crawl_options[:proxy] = {}
+
+ crawl_options[:proxy][:proxy] = "#{@options[:proxy][:ip]}:#{@options[:proxy][:port]}" if @options.dig(:proxy, :ip)
+ crawl_options[:proxy][:proxyuserpwd] = "#{@options[:proxy][:username]}:#{@options[:proxy][:password]}" if @options.dig(:proxy, :username)
+ end
+
+ end
+ end
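
`Arachnid2::Typhoeus#crawl` above queues up to `max_concurrency` requests per Hydra run, merges any proxy settings into each request, and fills in default `Accept-Language` and `User-Agent` headers. A hedged usage sketch of those options (the proxy address, credentials, and header value are made up):

```ruby
require "arachnid2"

opts = {
  max_concurrency: 5,
  headers: { "User-Agent" => "Mozilla/5.0 (compatible; ExampleBot/1.0)" },
  proxy: {
    ip: "1.2.3.4",
    port: "8080",
    username: "example_user", # optional; credentials are only sent when :username is present
    password: "example_pass"
  }
}

Arachnid2.new("http://sixcolours.com").crawl(opts) do |response|
  # Each yielded object is a Typhoeus::Response for one crawled page
  puts "#{response.code} #{response.effective_url}"
end
```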
data/lib/arachnid2/version.rb CHANGED
@@ -1,3 +1,3 @@
  class Arachnid2
- VERSION = "0.2.0"
+ VERSION = "0.3.1"
  end
data/lib/arachnid2/watir.rb ADDED
@@ -0,0 +1,102 @@
+ class Arachnid2
+ class Watir
+ include Arachnid2::Exoskeleton
+
+ def initialize(url)
+ @url = url
+ @domain = Adomain[@url]
+ end
+
+ def crawl(opts)
+ preflight(opts)
+ watir_preflight
+
+ until @global_queue.empty?
+ @already_retried = false
+ q = @global_queue.shift
+
+ break if @global_visited.size >= crawl_options[:max_urls]
+ break if Time.now > crawl_options[:time_limit]
+ break if memory_danger?
+
+ @global_visited.insert(q)
+
+ begin
+ browser.goto q
+ links = process(browser.url, browser.body.html)
+ next unless links
+
+ yield browser
+
+ vacuum(links, browser.url)
+ rescue => e
+ raise e if @already_retried
+ raise e unless "#{e.class}".include?("Selenium") || "#{e.class}".include?("Watir")
+ @browser = nil
+ @already_retried = true
+ retry
+ end
+
+ end # until @global_queue.empty?
+ ensure
+ @browser.close if @browser rescue nil
+ @headless.destroy if @headless rescue nil
+ end
+
+ private
+ def browser
+ unless @browser
+ behead if @make_headless
+
+ @browser = create_browser
+
+ set_timeout
+ end
+
+ return @browser
+ end
+
+ def create_browser
+ return ::Watir::Browser.new(driver, proxy: @proxy) if @proxy
+
+ ::Watir::Browser.new driver
+ end
+
+ def set_timeout
+ @browser.driver.manage.timeouts.page_load = timeout
+ end
+
+ def behead
+ @headless = Headless.new
+ @headless.start
+ end
+
+ def driver
+ unless @driver
+ language = @options.dig(:headers, "Accept-Language") || DEFAULT_LANGUAGE
+ user_agent = @options.dig(:headers, "User-Agent") || DEFAULT_USER_AGENT
+
+ @driver = Webdriver::UserAgent.driver(
+ browser: browser_type,
+ accept_language_string: language,
+ user_agent_string: user_agent
+ )
+ end
+
+ @driver
+ end
+
+ def watir_preflight
+ watir_proxy_options
+ @make_headless = @options[:headless]
+ end
+
+ def watir_proxy_options
+ crawl_options[:proxy] = {}
+
+ crawl_options[:proxy][:http] = @options[:proxy][:http] if @options.dig(:proxy, :http)
+ crawl_options[:proxy][:ssl] = @options[:proxy][:ssl] if @options.dig(:proxy, :ssl)
+ end
+ end
+
+ end
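
`Arachnid2::Watir#crawl` above yields the live browser rather than a response object, and retries a failed page once after discarding the browser when a Selenium or Watir error is raised. A short usage sketch (the URL is illustrative, and a working driver for the chosen browser is assumed):

```ruby
require "arachnid2"

opts = {
  browser_type: :chrome, # Exoskeleton#browser_type defaults to :firefox
  headless: true,
  timeout: 30,
  max_urls: 20,
  proxy: { http: "troy.show:8080", ssl: "abed.show:8080" } # Watir-style proxy keys
}

Arachnid2.new("http://sixcolours.com").crawl_watir(opts) do |browser|
  # Each yielded object is the Watir::Browser, already on a crawled page
  puts "#{browser.url} #{browser.title}"
end
```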
data/lib/arachnid2.rb CHANGED
@@ -1,5 +1,8 @@
  require "arachnid2/version"
- require "arachnid2/cashed_arachnid_responses"
+ require "arachnid2/cached_arachnid_responses"
+ require "arachnid2/exoskeleton"
+ require "arachnid2/typhoeus"
+ require "arachnid2/watir"

  require 'tempfile'
  require "typhoeus"
@@ -8,9 +11,12 @@ require "adomain"
  require "addressable/uri"
  require "nokogiri"
  require "base64"
+ require "webdrivers"
+ require "webdriver-user-agent"
+ require "watir"
+

  class Arachnid2
- include CashedArachnidResponses
  # META:
  # About the origins of this crawling approach
  # The Crawler is heavily borrowed from by Arachnid.
@@ -22,7 +28,7 @@ class Arachnid2
  # And this was originally written as a part of Tellurion's bot
  # https://github.com/samnissen/tellurion_bot

- MAX_CRAWL_TIME = 600
+ MAX_CRAWL_TIME = 10000
  BASE_CRAWL_TIME = 15
  MAX_URLS = 10000
  BASE_URLS = 50
@@ -58,8 +64,6 @@ class Arachnid2
  #
  def initialize(url)
  @url = url
- @domain = Adomain[@url]
- @cached_data = []
  end

  #
@@ -101,228 +105,15 @@ class Arachnid2
  #
  # @return nil
  #
- def crawl(opts = {})
- preflight(opts)
-
- until @global_queue.empty?
- @max_concurrency.times do
- q = @global_queue.shift
-
- break if @global_visited.size >= @crawl_options[:max_urls]
- break if Time.now > @crawl_options[:time_limit]
- break if memory_danger?
-
- @global_visited.insert(q)
-
- request = Typhoeus::Request.new(q, request_options)
-
- data = load_data(@url, opts)
- unless data.nil?
- data.each do |response|
- yield response
- end
- return
- end
- request.on_complete do |response|
- @cached_data.push(response)
- links = process(response)
- next unless links
-
- yield response
-
- vacuum(links, response)
- end
-
- @hydra.queue(request)
- end # @max_concurrency.times do
-
- @hydra.run
-
- end # until @global_queue.empty?
- put_cached_data(@url, opts, @cached_data) unless @cached_data.empty?
- ensure
- @cookie_file.close! if @cookie_file
-
-
- end # def crawl(opts = {})
-
- private
- def process(response)
- return false unless Adomain["#{response.effective_url}"].include? @domain
-
- elements = Nokogiri::HTML.parse(response.body).css('a')
- return elements.map {|link| link.attribute('href').to_s}.uniq.sort.delete_if {|href| href.empty? }
- end
-
- def vacuum(links, response)
- links.each do |link|
- next if link.match(/^\(|^javascript:|^mailto:|^#|^\s*$|^about:/)
-
- begin
- absolute_link = make_absolute(link, response.effective_url)
-
- next if skip_link?(absolute_link)
-
- @global_queue << absolute_link
- rescue Addressable::URI::InvalidURIError
- end
- end
- end
-
- def skip_link?(absolute_link)
- internal = internal_link?(absolute_link)
- visited = @global_visited.include?(absolute_link)
- ignored = extension_ignored?(absolute_link)
- known = @global_queue.include?(absolute_link)
-
- !internal || visited || ignored || known
- end
-
- def preflight(opts)
- @options = opts
- @crawl_options = crawl_options
- @maximum_load_rate = maximum_load_rate
- @max_concurrency = max_concurrency
- @non_html_extensions = non_html_extensions
- @hydra = Typhoeus::Hydra.new(:max_concurrency => @max_concurrency)
- @global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => true)
- @global_queue = [@url]
- end
-
- def non_html_extensions
- @non_html_extensions ||= nil
-
- if !@non_html_extensions
- @non_html_extensions = @options[:non_html_extensions]
- @non_html_extensions ||= DEFAULT_NON_HTML_EXTENSIONS
- end
-
- @non_html_extensions
- end
-
- def max_concurrency
- @max_concurrency ||= nil
-
- if !@max_concurrency
- @max_concurrency = "#{@options[:max_concurrency]}".to_i
- @max_concurrency = 1 unless (@max_concurrency > 0)
- end
-
- @max_concurrency
- end
+ def crawl(opts = {}, with_watir = false)
+ crawl_watir and return if with_watir

- def bound_time
- boundary = "#{@options[:time_box]}".to_i
- boundary = BASE_CRAWL_TIME if boundary <= 0
- boundary = MAX_CRAWL_TIME if boundary > MAX_CRAWL_TIME
-
- return Time.now + boundary
- end
-
- def bound_urls
- amount = "#{@options[:max_urls]}".to_i
- amount = BASE_URLS if amount <= 0
- amount = MAX_URLS if amount > MAX_URLS
-
- amount
- end
-
- def followlocation
- if @followlocation.is_a?(NilClass)
- @followlocation = @options[:followlocation]
- @followlocation = true unless @followlocation.is_a?(FalseClass)
- end
- @followlocation
- end
-
- def timeout
- if !@timeout
- @timeout = @options[:timeout]
- @timeout = DEFAULT_TIMEOUT unless @timeout.is_a?(Integer)
- @timeout = DEFAULT_TIMEOUT if @timeout > MAXIMUM_TIMEOUT
- @timeout = DEFAULT_TIMEOUT if @timeout < MINIMUM_TIMEOUT
- end
- @timeout
- end
-
- def request_options
- @cookie_file ||= Tempfile.new('cookies')
-
- @request_options = {
- timeout: timeout,
- followlocation: followlocation,
- cookiefile: @cookie_file.path,
- cookiejar: @cookie_file.path,
- headers: @options[:headers]
- }
-
- @request_options[:headers] ||= {}
- @request_options[:headers]['Accept-Language'] ||= DEFAULT_LANGUAGE
- @request_options[:headers]['User-Agent'] ||= DEFAULT_USER_AGENT
-
- @request_options
- end
-
- def crawl_options
- @crawl_options ||= nil
-
- if !@crawl_options
- @crawl_options = { :max_urls => max_urls, :time_limit => time_limit }
-
- @crawl_options[:proxy] = "#{@options[:proxy][:ip]}:#{@options[:proxy][:port]}" if @options.dig(:proxy, :ip)
- @crawl_options[:proxyuserpwd] = "#{@options[:proxy][:username]}:#{@options[:proxy][:password]}" if @options.dig(:proxy, :username)
- end
-
- @crawl_options
- end
-
- def max_urls
- bound_urls
- end
-
- def time_limit
- bound_time
- end
-
- def make_absolute(href, root)
- Addressable::URI.parse(root).join(Addressable::URI.parse(href)).to_s
- end
-
- def internal_link?(absolute_url)
- "#{Adomain[absolute_url]}".include? @domain
- end
-
- def extension_ignored?(url)
- return false if url.empty?
-
- !@non_html_extensions.values.flatten.find { |e| url.downcase.end_with? e.downcase }.nil?
- end
-
- def memory_danger?
- return false unless in_docker?
-
- use = "#{File.open(MEMORY_USE_FILE, "rb").read}".to_f
- @limit ||= "#{File.open(MEMORY_LIMIT_FILE, "rb").read}".to_f
-
- return false unless ( (use > 0.0) && (@limit > 0.0) )
-
- return ( ( (use / @limit) * 100.0 ) >= @maximum_load_rate )
- end
-
- def in_docker?
- return false unless File.file?(MEMORY_USE_FILE)
- true
- end
-
- def maximum_load_rate
- @maximum_load_rate ||= nil
-
- if !@maximum_load_rate
- @maximum_load_rate = "#{@options[:memory_limit]}".to_f
- @maximum_load_rate = DEFAULT_MAXIMUM_LOAD_RATE unless ((@maximum_load_rate > 0.0) && (@maximum_load_rate < 100.0))
- end
+ Arachnid2::Typhoeus.new(@url).crawl(opts, &Proc.new)
+ end

- @maximum_load_rate
- end
+ def crawl_watir(opts)
+ Arachnid2::Watir.new(@url).crawl(opts, &Proc.new)
+ end
+ # https://mudge.name/2011/01/26/passing-blocks-in-ruby-without-block.html

  end
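
The rewritten `Arachnid2#crawl` above no longer contains the crawl loop itself; it delegates to `Arachnid2::Typhoeus` or `Arachnid2::Watir` and forwards its caller's block with `&Proc.new`, the idiom referenced in the linked post. A minimal standalone illustration of that idiom (class and method names are made up; note that argument-less `Proc.new` warns on Ruby 2.7 and was removed in Ruby 3, where an explicit `&block` parameter is the replacement):

```ruby
class Delegator
  # Inside a method that was called with a block, Proc.new with no
  # arguments captures that block; &Proc.new then re-passes it.
  def run
    Worker.new.each_item(&Proc.new)
  end
end

class Worker
  def each_item
    [1, 2, 3].each { |i| yield i }
  end
end

Delegator.new.run { |i| puts i } # prints 1, 2 and 3
```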
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: arachnid2
  version: !ruby/object:Gem::Version
- version: 0.2.0
+ version: 0.3.1
  platform: ruby
  authors:
  - Sam Nissen
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2019-01-25 00:00:00.000000000 Z
+ date: 2019-02-26 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: bundler
@@ -52,6 +52,48 @@ dependencies:
  - - "~>"
  - !ruby/object:Gem::Version
  version: '3.0'
+ - !ruby/object:Gem::Dependency
+ name: webdriver-user-agent
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - ">="
+ - !ruby/object:Gem::Version
+ version: '7.6'
+ type: :runtime
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - ">="
+ - !ruby/object:Gem::Version
+ version: '7.6'
+ - !ruby/object:Gem::Dependency
+ name: watir
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - ">="
+ - !ruby/object:Gem::Version
+ version: '0'
+ type: :runtime
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - ">="
+ - !ruby/object:Gem::Version
+ version: '0'
+ - !ruby/object:Gem::Dependency
+ name: webdrivers
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - ">="
+ - !ruby/object:Gem::Version
+ version: '0'
+ type: :runtime
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - ">="
+ - !ruby/object:Gem::Version
+ version: '0'
  - !ruby/object:Gem::Dependency
  name: typhoeus
  requirement: !ruby/object:Gem::Requirement
@@ -114,14 +156,14 @@ dependencies:
  requirements:
  - - ">="
  - !ruby/object:Gem::Version
- version: '0'
+ version: 1.8.5
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
  - !ruby/object:Gem::Version
- version: '0'
+ version: 1.8.5
  description:
  email:
  - scnissen@gmail.com
@@ -142,8 +184,11 @@ files:
  - bin/console
  - bin/setup
  - lib/arachnid2.rb
- - lib/arachnid2/cashed_arachnid_responses.rb
+ - lib/arachnid2/cached_arachnid_responses.rb
+ - lib/arachnid2/exoskeleton.rb
+ - lib/arachnid2/typhoeus.rb
  - lib/arachnid2/version.rb
+ - lib/arachnid2/watir.rb
  homepage: https://github.com/samnissen/arachnid2
  licenses:
  - MIT