arachnid2 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0ebb9ed9cdef3106462796f1b7fcc2483d58857bf605f04c800733358ea3f486
4
- data.tar.gz: 86eaaf1bd44b85ee564b1bf3aeb08e2be326462c6bf8d3291ba2bc2f55e7c444
3
+ metadata.gz: 420df644a588b8eac92cfda03df0ab2ca20de52e3123aa7e1990ff850fd404d5
4
+ data.tar.gz: 49cdd7681f110d9a1d53075563b84e2614e959dad4cb39f877a36bf28be4dbf4
5
5
  SHA512:
6
- metadata.gz: 1b8f1d5798379c75502cf36046c3110f02ae6abbd12a8ecaa7c501e1efd0d8d86393abd64212f12e3c3f951ddf5f8f6fed5ed15a9937305cf63ff92523087c89
7
- data.tar.gz: 4f02afa25d537346b2cc6daaa16ae7b19956c5478bdfb5f4e2a9188825ad3f687691ad96025570661fd97dd12954c0e367be9cbdf9267f726b34ad13ea50bff0
6
+ metadata.gz: 49229b32b3d79cb560879298d2fa54f6206c8c419bbb86e7da9e1e132f84132026689270d0a6810610921be01676b37adb494e03a4f96a094561cb55fb2f7e4b
7
+ data.tar.gz: 505716bfafcdb116f25f401355928a2b65573d90d6d315d6710cc29247ce4f29c5c121e9d12c2ec807fe48a047823429aedfc5fe64b8d69683b8a603a8081621
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- arachnid2 (0.1.2)
4
+ arachnid2 (0.1.4)
5
5
  addressable
6
6
  adomain
7
7
  bloomfilter-rb
@@ -22,11 +22,11 @@ GEM
22
22
  ffi (>= 1.3.0)
23
23
  ffi (1.9.25)
24
24
  mini_portile2 (2.3.0)
25
- nokogiri (1.8.2)
25
+ nokogiri (1.8.4)
26
26
  mini_portile2 (~> 2.3.0)
27
- public_suffix (3.0.2)
27
+ public_suffix (3.0.3)
28
28
  rake (10.5.0)
29
- redis (4.0.1)
29
+ redis (4.0.2)
30
30
  rspec (3.7.0)
31
31
  rspec-core (~> 3.7.0)
32
32
  rspec-expectations (~> 3.7.0)
data/README.md CHANGED
@@ -78,6 +78,12 @@ opts = {
78
78
  username: "sam",
79
79
  password: "coolcoolcool",
80
80
  },
81
+ :non_html_extensions => {
82
+ 3 => [".abc", ".xyz"],
83
+ 4 => [".abcd"],
84
+ 6 => [".abcdef"],
85
+ 11 => [".abcdefghijk"]
86
+ }
81
87
  }
82
88
  responses = []
83
89
 
@@ -119,6 +125,12 @@ Provide your IP, port for a proxy. If required, provide credentials for
119
125
  authenticating to that proxy. Proxy options and handling are done
120
126
  by Typhoeus.
121
127
 
128
+ #### `non_html_extensions`
129
+
130
+ This is the list of file extensions to ignore when collecting URLs from the page.
131
+ The extensions are formatted as a hash of key/value pairs, where the value
132
+ is an array of extensions, and the keys represent the length of those extensions (including the leading dot).
133
+
122
134
  #### `memory_limit` and Docker
123
135
 
124
136
  In case you are operating the crawler within a container, Arachnid2
data/lib/arachnid2.rb CHANGED
@@ -27,7 +27,7 @@ class Arachnid2
27
27
  DEFAULT_LANGUAGE = "en-IE, en-UK;q=0.9, en-NL;q=0.8, en-MT;q=0.7, en-LU;q=0.6, en;q=0.5, *;0.4"
28
28
  DEFAULT_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15"
29
29
 
30
- NON_HTML_EXTENSIONS = {
30
+ DEFAULT_NON_HTML_EXTENSIONS = {
31
31
  3 => ['.gz'],
32
32
  4 => ['.jpg', '.png', '.m4a', '.mp3', '.mp4', '.pdf', '.zip',
33
33
  '.wmv', '.gif', '.doc', '.xls', '.pps', '.ppt', '.tar',
@@ -82,6 +82,12 @@ class Arachnid2
82
82
  # :username => "sam",
83
83
  # :password => "coolcoolcool",
84
84
  # },
85
+ # :non_html_extensions => {
86
+ # 3 => [".abc", ".xyz"],
87
+ # 4 => [".abcd"],
88
+ # 6 => [".abcdef"],
89
+ # 11 => [".abcdefghijk"]
90
+ # }
85
91
  # }
86
92
  # responses = []
87
93
  # spider.crawl(opts) { |response|
@@ -163,11 +169,23 @@ class Arachnid2
163
169
  @crawl_options = crawl_options
164
170
  @maximum_load_rate = maximum_load_rate
165
171
  @max_concurrency = max_concurrency
172
+ @non_html_extensions = non_html_extensions
166
173
  @hydra = Typhoeus::Hydra.new(:max_concurrency => @max_concurrency)
167
174
  @global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => true)
168
175
  @global_queue = [@url]
169
176
  end
170
177
 
178
+ def non_html_extensions
179
+ @non_html_extensions ||= nil
180
+
181
+ if !@non_html_extensions
182
+ @non_html_extensions = @options[:non_html_extensions]
183
+ @non_html_extensions ||= DEFAULT_NON_HTML_EXTENSIONS
184
+ end
185
+
186
+ @non_html_extensions
187
+ end
188
+
171
189
  def max_concurrency
172
190
  @max_concurrency ||= nil
173
191
 
@@ -263,7 +281,7 @@ class Arachnid2
263
281
  def extension_ignored?(url)
264
282
  return false if url.empty?
265
283
 
266
- !NON_HTML_EXTENSIONS.values.flatten.find { |e| url.downcase.end_with? e.downcase }.nil?
284
+ !@non_html_extensions.values.flatten.find { |e| url.downcase.end_with? e.downcase }.nil?
267
285
  end
268
286
 
269
287
  def memory_danger?
@@ -1,3 +1,3 @@
1
1
  class Arachnid2
2
- VERSION = "0.1.3"
2
+ VERSION = "0.1.4"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: arachnid2
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sam Nissen
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2018-07-04 00:00:00.000000000 Z
11
+ date: 2018-08-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler