arachnid2 0.1.3 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +4 -4
- data/README.md +12 -0
- data/lib/arachnid2.rb +20 -2
- data/lib/arachnid2/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 420df644a588b8eac92cfda03df0ab2ca20de52e3123aa7e1990ff850fd404d5
|
4
|
+
data.tar.gz: 49cdd7681f110d9a1d53075563b84e2614e959dad4cb39f877a36bf28be4dbf4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 49229b32b3d79cb560879298d2fa54f6206c8c419bbb86e7da9e1e132f84132026689270d0a6810610921be01676b37adb494e03a4f96a094561cb55fb2f7e4b
|
7
|
+
data.tar.gz: 505716bfafcdb116f25f401355928a2b65573d90d6d315d6710cc29247ce4f29c5c121e9d12c2ec807fe48a047823429aedfc5fe64b8d69683b8a603a8081621
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
arachnid2 (0.1.3)
|
4
|
+
arachnid2 (0.1.4)
|
5
5
|
addressable
|
6
6
|
adomain
|
7
7
|
bloomfilter-rb
|
@@ -22,11 +22,11 @@ GEM
|
|
22
22
|
ffi (>= 1.3.0)
|
23
23
|
ffi (1.9.25)
|
24
24
|
mini_portile2 (2.3.0)
|
25
|
-
nokogiri (1.8.
|
25
|
+
nokogiri (1.8.4)
|
26
26
|
mini_portile2 (~> 2.3.0)
|
27
|
-
public_suffix (3.0.
|
27
|
+
public_suffix (3.0.3)
|
28
28
|
rake (10.5.0)
|
29
|
-
redis (4.0.
|
29
|
+
redis (4.0.2)
|
30
30
|
rspec (3.7.0)
|
31
31
|
rspec-core (~> 3.7.0)
|
32
32
|
rspec-expectations (~> 3.7.0)
|
data/README.md
CHANGED
@@ -78,6 +78,12 @@ opts = {
|
|
78
78
|
username: "sam",
|
79
79
|
password: "coolcoolcool",
|
80
80
|
}
|
81
|
+
:non_html_extensions => {
|
82
|
+
3 => [".abc", ".xyz"],
|
83
|
+
4 => [".abcd"],
|
84
|
+
6 => [".abcdef"],
|
85
|
+
11 => [".abcdefghijk"]
|
86
|
+
}
|
81
87
|
}
|
82
88
|
responses = []
|
83
89
|
|
@@ -119,6 +125,12 @@ Provide your IP, port for a proxy. If required, provide credentials for
|
|
119
125
|
authenticating to that proxy. Proxy options and handling are done
|
120
126
|
by Typhoeus.
|
121
127
|
|
128
|
+
#### `non_html_extensions`
|
129
|
+
|
130
|
+
This is the list of file extensions to ignore when collecting URLs from the page.
|
131
|
+
The extensions are formatted as a hash of key/value pairs, where the value
|
132
|
+
is an array of file extensions, and the keys represent the length of those extensions.
|
133
|
+
|
122
134
|
#### `memory_limit` and Docker
|
123
135
|
|
124
136
|
In case you are operating the crawler within a container, Arachnid2
|
data/lib/arachnid2.rb
CHANGED
@@ -27,7 +27,7 @@ class Arachnid2
|
|
27
27
|
DEFAULT_LANGUAGE = "en-IE, en-UK;q=0.9, en-NL;q=0.8, en-MT;q=0.7, en-LU;q=0.6, en;q=0.5, *;0.4"
|
28
28
|
DEFAULT_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15"
|
29
29
|
|
30
|
-
|
30
|
+
DEFAULT_NON_HTML_EXTENSIONS = {
|
31
31
|
3 => ['.gz'],
|
32
32
|
4 => ['.jpg', '.png', '.m4a', '.mp3', '.mp4', '.pdf', '.zip',
|
33
33
|
'.wmv', '.gif', '.doc', '.xls', '.pps', '.ppt', '.tar',
|
@@ -82,6 +82,12 @@ class Arachnid2
|
|
82
82
|
# :username => "sam",
|
83
83
|
# :password => "coolcoolcool",
|
84
84
|
# }
|
85
|
+
# :non_html_extensions => {
|
86
|
+
# 3 => [".abc", ".xyz"],
|
87
|
+
# 4 => [".abcd"],
|
88
|
+
# 6 => [".abcdef"],
|
89
|
+
# 11 => [".abcdefghijk"]
|
90
|
+
# }
|
85
91
|
# }
|
86
92
|
# responses = []
|
87
93
|
# spider.crawl(opts) { |response|
|
@@ -163,11 +169,23 @@ class Arachnid2
|
|
163
169
|
@crawl_options = crawl_options
|
164
170
|
@maximum_load_rate = maximum_load_rate
|
165
171
|
@max_concurrency = max_concurrency
|
172
|
+
@non_html_extensions = non_html_extensions
|
166
173
|
@hydra = Typhoeus::Hydra.new(:max_concurrency => @max_concurrency)
|
167
174
|
@global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => true)
|
168
175
|
@global_queue = [@url]
|
169
176
|
end
|
170
177
|
|
178
|
+
def non_html_extensions
|
179
|
+
@non_html_extensions ||= nil
|
180
|
+
|
181
|
+
if !@non_html_extensions
|
182
|
+
@non_html_extensions = @options[:non_html_extensions]
|
183
|
+
@non_html_extensions ||= DEFAULT_NON_HTML_EXTENSIONS
|
184
|
+
end
|
185
|
+
|
186
|
+
@non_html_extensions
|
187
|
+
end
|
188
|
+
|
171
189
|
def max_concurrency
|
172
190
|
@max_concurrency ||= nil
|
173
191
|
|
@@ -263,7 +281,7 @@ class Arachnid2
|
|
263
281
|
def extension_ignored?(url)
|
264
282
|
return false if url.empty?
|
265
283
|
|
266
|
-
|
284
|
+
!@non_html_extensions.values.flatten.find { |e| url.downcase.end_with? e.downcase }.nil?
|
267
285
|
end
|
268
286
|
|
269
287
|
def memory_danger?
|
data/lib/arachnid2/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: arachnid2
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.3
|
4
|
+
version: 0.1.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sam Nissen
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-
|
11
|
+
date: 2018-08-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|