arachnid2 0.1.3 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0ebb9ed9cdef3106462796f1b7fcc2483d58857bf605f04c800733358ea3f486
4
- data.tar.gz: 86eaaf1bd44b85ee564b1bf3aeb08e2be326462c6bf8d3291ba2bc2f55e7c444
3
+ metadata.gz: 420df644a588b8eac92cfda03df0ab2ca20de52e3123aa7e1990ff850fd404d5
4
+ data.tar.gz: 49cdd7681f110d9a1d53075563b84e2614e959dad4cb39f877a36bf28be4dbf4
5
5
  SHA512:
6
- metadata.gz: 1b8f1d5798379c75502cf36046c3110f02ae6abbd12a8ecaa7c501e1efd0d8d86393abd64212f12e3c3f951ddf5f8f6fed5ed15a9937305cf63ff92523087c89
7
- data.tar.gz: 4f02afa25d537346b2cc6daaa16ae7b19956c5478bdfb5f4e2a9188825ad3f687691ad96025570661fd97dd12954c0e367be9cbdf9267f726b34ad13ea50bff0
6
+ metadata.gz: 49229b32b3d79cb560879298d2fa54f6206c8c419bbb86e7da9e1e132f84132026689270d0a6810610921be01676b37adb494e03a4f96a094561cb55fb2f7e4b
7
+ data.tar.gz: 505716bfafcdb116f25f401355928a2b65573d90d6d315d6710cc29247ce4f29c5c121e9d12c2ec807fe48a047823429aedfc5fe64b8d69683b8a603a8081621
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- arachnid2 (0.1.2)
4
+ arachnid2 (0.1.4)
5
5
  addressable
6
6
  adomain
7
7
  bloomfilter-rb
@@ -22,11 +22,11 @@ GEM
22
22
  ffi (>= 1.3.0)
23
23
  ffi (1.9.25)
24
24
  mini_portile2 (2.3.0)
25
- nokogiri (1.8.2)
25
+ nokogiri (1.8.4)
26
26
  mini_portile2 (~> 2.3.0)
27
- public_suffix (3.0.2)
27
+ public_suffix (3.0.3)
28
28
  rake (10.5.0)
29
- redis (4.0.1)
29
+ redis (4.0.2)
30
30
  rspec (3.7.0)
31
31
  rspec-core (~> 3.7.0)
32
32
  rspec-expectations (~> 3.7.0)
data/README.md CHANGED
@@ -78,6 +78,12 @@ opts = {
78
78
  username: "sam",
79
79
  password: "coolcoolcool",
80
80
  }
81
+ :non_html_extensions => {
82
+ 3 => [".abc", ".xyz"],
83
+ 4 => [".abcd"],
84
+ 6 => [".abcdef"],
85
+ 11 => [".abcdefghijk"]
86
+ }
81
87
  }
82
88
  responses = []
83
89
 
@@ -119,6 +125,12 @@ Provide your IP, port for a proxy. If required, provide credentials for
119
125
  authenticating to that proxy. Proxy options and handling are done
120
126
  by Typhoeus.
121
127
 
128
+ #### `non_html_extensions`
129
+
130
+ This is the list of file extensions to ignore when collecting URLs from the page.
131
+ The extensions are formatted as a hash of key/value pairs, where each value
132
+ is an array of file extensions, and each key represents the length of those extensions.
133
+
122
134
  #### `memory_limit` and Docker
123
135
 
124
136
  In case you are operating the crawler within a container, Arachnid2
data/lib/arachnid2.rb CHANGED
@@ -27,7 +27,7 @@ class Arachnid2
27
27
  DEFAULT_LANGUAGE = "en-IE, en-UK;q=0.9, en-NL;q=0.8, en-MT;q=0.7, en-LU;q=0.6, en;q=0.5, *;0.4"
28
28
  DEFAULT_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15"
29
29
 
30
- NON_HTML_EXTENSIONS = {
30
+ DEFAULT_NON_HTML_EXTENSIONS = {
31
31
  3 => ['.gz'],
32
32
  4 => ['.jpg', '.png', '.m4a', '.mp3', '.mp4', '.pdf', '.zip',
33
33
  '.wmv', '.gif', '.doc', '.xls', '.pps', '.ppt', '.tar',
@@ -82,6 +82,12 @@ class Arachnid2
82
82
  # :username => "sam",
83
83
  # :password => "coolcoolcool",
84
84
  # }
85
+ # :non_html_extensions => {
86
+ # 3 => [".abc", ".xyz"],
87
+ # 4 => [".abcd"],
88
+ # 6 => [".abcdef"],
89
+ # 11 => [".abcdefghijk"]
90
+ # }
85
91
  # }
86
92
  # responses = []
87
93
  # spider.crawl(opts) { |response|
@@ -163,11 +169,23 @@ class Arachnid2
163
169
  @crawl_options = crawl_options
164
170
  @maximum_load_rate = maximum_load_rate
165
171
  @max_concurrency = max_concurrency
172
+ @non_html_extensions = non_html_extensions
166
173
  @hydra = Typhoeus::Hydra.new(:max_concurrency => @max_concurrency)
167
174
  @global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => true)
168
175
  @global_queue = [@url]
169
176
  end
170
177
 
178
+ def non_html_extensions
179
+ @non_html_extensions ||= nil
180
+
181
+ if !@non_html_extensions
182
+ @non_html_extensions = @options[:non_html_extensions]
183
+ @non_html_extensions ||= DEFAULT_NON_HTML_EXTENSIONS
184
+ end
185
+
186
+ @non_html_extensions
187
+ end
188
+
171
189
  def max_concurrency
172
190
  @max_concurrency ||= nil
173
191
 
@@ -263,7 +281,7 @@ class Arachnid2
263
281
  def extension_ignored?(url)
264
282
  return false if url.empty?
265
283
 
266
- !NON_HTML_EXTENSIONS.values.flatten.find { |e| url.downcase.end_with? e.downcase }.nil?
284
+ !@non_html_extensions.values.flatten.find { |e| url.downcase.end_with? e.downcase }.nil?
267
285
  end
268
286
 
269
287
  def memory_danger?
@@ -1,3 +1,3 @@
1
1
  class Arachnid2
2
- VERSION = "0.1.3"
2
+ VERSION = "0.1.4"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: arachnid2
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sam Nissen
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2018-07-04 00:00:00.000000000 Z
11
+ date: 2018-08-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler