arachnid2 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +4 -4
- data/README.md +12 -0
- data/lib/arachnid2.rb +20 -2
- data/lib/arachnid2/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 420df644a588b8eac92cfda03df0ab2ca20de52e3123aa7e1990ff850fd404d5
|
|
4
|
+
data.tar.gz: 49cdd7681f110d9a1d53075563b84e2614e959dad4cb39f877a36bf28be4dbf4
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 49229b32b3d79cb560879298d2fa54f6206c8c419bbb86e7da9e1e132f84132026689270d0a6810610921be01676b37adb494e03a4f96a094561cb55fb2f7e4b
|
|
7
|
+
data.tar.gz: 505716bfafcdb116f25f401355928a2b65573d90d6d315d6710cc29247ce4f29c5c121e9d12c2ec807fe48a047823429aedfc5fe64b8d69683b8a603a8081621
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
arachnid2 (0.1.3)
|
|
4
|
+
arachnid2 (0.1.4)
|
|
5
5
|
addressable
|
|
6
6
|
adomain
|
|
7
7
|
bloomfilter-rb
|
|
@@ -22,11 +22,11 @@ GEM
|
|
|
22
22
|
ffi (>= 1.3.0)
|
|
23
23
|
ffi (1.9.25)
|
|
24
24
|
mini_portile2 (2.3.0)
|
|
25
|
-
nokogiri (1.8.
|
|
25
|
+
nokogiri (1.8.4)
|
|
26
26
|
mini_portile2 (~> 2.3.0)
|
|
27
|
-
public_suffix (3.0.
|
|
27
|
+
public_suffix (3.0.3)
|
|
28
28
|
rake (10.5.0)
|
|
29
|
-
redis (4.0.
|
|
29
|
+
redis (4.0.2)
|
|
30
30
|
rspec (3.7.0)
|
|
31
31
|
rspec-core (~> 3.7.0)
|
|
32
32
|
rspec-expectations (~> 3.7.0)
|
data/README.md
CHANGED
|
@@ -78,6 +78,12 @@ opts = {
|
|
|
78
78
|
username: "sam",
|
|
79
79
|
password: "coolcoolcool",
|
|
80
80
|
}
|
|
81
|
+
:non_html_extensions => {
|
|
82
|
+
3 => [".abc", ".xyz"],
|
|
83
|
+
4 => [".abcd"],
|
|
84
|
+
6 => [".abcdef"],
|
|
85
|
+
11 => [".abcdefghijk"]
|
|
86
|
+
}
|
|
81
87
|
}
|
|
82
88
|
responses = []
|
|
83
89
|
|
|
@@ -119,6 +125,12 @@ Provide your IP, port for a proxy. If required, provide credentials for
|
|
|
119
125
|
authenticating to that proxy. Proxy options and handling are done
|
|
120
126
|
by Typhoeus.
|
|
121
127
|
|
|
128
|
+
#### `non_html_extensions`
|
|
129
|
+
|
|
130
|
+
This is the list of file extensions to ignore when collecting URLs from the page.
|
|
131
|
+
The extensions are formatted as a hash of key/value pairs, where the value
|
|
132
|
+
is an array of extensions, and the keys represent the length of those extensions.
|
|
133
|
+
|
|
122
134
|
#### `memory_limit` and Docker
|
|
123
135
|
|
|
124
136
|
In case you are operating the crawler within a container, Arachnid2
|
data/lib/arachnid2.rb
CHANGED
|
@@ -27,7 +27,7 @@ class Arachnid2
|
|
|
27
27
|
DEFAULT_LANGUAGE = "en-IE, en-UK;q=0.9, en-NL;q=0.8, en-MT;q=0.7, en-LU;q=0.6, en;q=0.5, *;0.4"
|
|
28
28
|
DEFAULT_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15"
|
|
29
29
|
|
|
30
|
-
|
|
30
|
+
DEFAULT_NON_HTML_EXTENSIONS = {
|
|
31
31
|
3 => ['.gz'],
|
|
32
32
|
4 => ['.jpg', '.png', '.m4a', '.mp3', '.mp4', '.pdf', '.zip',
|
|
33
33
|
'.wmv', '.gif', '.doc', '.xls', '.pps', '.ppt', '.tar',
|
|
@@ -82,6 +82,12 @@ class Arachnid2
|
|
|
82
82
|
# :username => "sam",
|
|
83
83
|
# :password => "coolcoolcool",
|
|
84
84
|
# }
|
|
85
|
+
# :non_html_extensions => {
|
|
86
|
+
# 3 => [".abc", ".xyz"],
|
|
87
|
+
# 4 => [".abcd"],
|
|
88
|
+
# 6 => [".abcdef"],
|
|
89
|
+
# 11 => [".abcdefghijk"]
|
|
90
|
+
# }
|
|
85
91
|
# }
|
|
86
92
|
# responses = []
|
|
87
93
|
# spider.crawl(opts) { |response|
|
|
@@ -163,11 +169,23 @@ class Arachnid2
|
|
|
163
169
|
@crawl_options = crawl_options
|
|
164
170
|
@maximum_load_rate = maximum_load_rate
|
|
165
171
|
@max_concurrency = max_concurrency
|
|
172
|
+
@non_html_extensions = non_html_extensions
|
|
166
173
|
@hydra = Typhoeus::Hydra.new(:max_concurrency => @max_concurrency)
|
|
167
174
|
@global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => true)
|
|
168
175
|
@global_queue = [@url]
|
|
169
176
|
end
|
|
170
177
|
|
|
178
|
+
def non_html_extensions
|
|
179
|
+
@non_html_extensions ||= nil
|
|
180
|
+
|
|
181
|
+
if !@non_html_extensions
|
|
182
|
+
@non_html_extensions = @options[:non_html_extensions]
|
|
183
|
+
@non_html_extensions ||= DEFAULT_NON_HTML_EXTENSIONS
|
|
184
|
+
end
|
|
185
|
+
|
|
186
|
+
@non_html_extensions
|
|
187
|
+
end
|
|
188
|
+
|
|
171
189
|
def max_concurrency
|
|
172
190
|
@max_concurrency ||= nil
|
|
173
191
|
|
|
@@ -263,7 +281,7 @@ class Arachnid2
|
|
|
263
281
|
def extension_ignored?(url)
|
|
264
282
|
return false if url.empty?
|
|
265
283
|
|
|
266
|
-
|
|
284
|
+
!@non_html_extensions.values.flatten.find { |e| url.downcase.end_with? e.downcase }.nil?
|
|
267
285
|
end
|
|
268
286
|
|
|
269
287
|
def memory_danger?
|
data/lib/arachnid2/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: arachnid2
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.3
|
|
4
|
+
version: 0.1.4
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Sam Nissen
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2018-
|
|
11
|
+
date: 2018-08-22 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: bundler
|