arachnid2 0.1.4 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +7 -1
- data/README.md +4 -1
- data/lib/arachnid2/cashed_arachnid_responses.rb +41 -0
- data/lib/arachnid2/version.rb +1 -1
- data/lib/arachnid2.rb +16 -2
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 00cef9d45ae8be8b0747d47e254737fdfdb94e3f40cfe85a99faff283653f87b
|
4
|
+
data.tar.gz: 24ecf163c8b2eeda25908067a0efa68aec661441c59b06c466ee22828d154f5d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2830f48686f9c2e9a921da58cca907580c800716e856982ae5e836f6dd51ab899192456bc4090ad78aac89eb6db49496d2ad953713d4d52548f1e248a7198df2
|
7
|
+
data.tar.gz: d3175c6f6574dc5a6feb9955e2a7804accfdd09c705cd90dea11586bd774fb79a67fdca499afe628739eb85b8d6adf6ff13d831658b23f4a993fc210e429c8f1
|
data/Gemfile.lock
CHANGED
@@ -17,13 +17,18 @@ GEM
|
|
17
17
|
addressable (~> 2.5)
|
18
18
|
bloomfilter-rb (2.1.1)
|
19
19
|
redis
|
20
|
+
coderay (1.1.2)
|
20
21
|
diff-lcs (1.3)
|
21
22
|
ethon (0.11.0)
|
22
23
|
ffi (>= 1.3.0)
|
23
24
|
ffi (1.9.25)
|
25
|
+
method_source (0.9.0)
|
24
26
|
mini_portile2 (2.3.0)
|
25
27
|
nokogiri (1.8.4)
|
26
28
|
mini_portile2 (~> 2.3.0)
|
29
|
+
pry (0.11.3)
|
30
|
+
coderay (~> 1.1.0)
|
31
|
+
method_source (~> 0.9.0)
|
27
32
|
public_suffix (3.0.3)
|
28
33
|
rake (10.5.0)
|
29
34
|
redis (4.0.2)
|
@@ -49,8 +54,9 @@ PLATFORMS
|
|
49
54
|
DEPENDENCIES
|
50
55
|
arachnid2!
|
51
56
|
bundler (~> 1.16)
|
57
|
+
pry
|
52
58
|
rake (~> 10.0)
|
53
59
|
rspec (~> 3.0)
|
54
60
|
|
55
61
|
BUNDLED WITH
|
56
|
-
1.16.
|
62
|
+
1.16.3
|
data/README.md
CHANGED
@@ -22,6 +22,9 @@ on that page, and visiting those to do the same.
|
|
22
22
|
Hence, the simplest output would be to collect all of the responses
|
23
23
|
while spidering from some URL.
|
24
24
|
|
25
|
+
Set the cache service URL (optional):
|
26
|
+
`export ARACHNID_CACHED_SERVICE_ADDRESS=http://localhost:9000`
|
27
|
+
|
25
28
|
```ruby
|
26
29
|
require "arachnid2"
|
27
30
|
|
@@ -129,7 +132,7 @@ by Typhoeus.
|
|
129
132
|
|
130
133
|
This is the list of TLDs to ignore when collecting URLs from the page.
|
131
134
|
The extensions are formatted as a hash of key/value pairs, where the value
|
132
|
-
is an array of TLDs, and the keys represent the length of those TLDs.
|
135
|
+
is an array of TLDs, and the keys represent the length of those TLDs.
|
133
136
|
|
134
137
|
#### `memory_limit` and Docker
|
135
138
|
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'net/http'
|
2
|
+
require 'json'
|
3
|
+
# Optional cache for crawl results, backed by an external HTTP service.
#
# The service address is read once, at load time, from the
# ARACHNID_CACHED_SERVICE_ADDRESS environment variable. When it is unset or
# blank every method here is a no-op that returns nil. All network and
# decoding failures are swallowed on purpose: the cache is an optimisation and
# must never make a crawl fail.
module CashedArachnidResponses
  # Base address of the cache service (e.g. "http://localhost:9000"), or nil.
  CACHE_SERVICE_URL = ENV['ARACHNID_CACHED_SERVICE_ADDRESS'].freeze

  # Fetches previously stored responses for a url/options pair.
  #
  # @param url [String] the crawl's starting URL, used as part of the cache key
  # @param options [Hash, nil] the crawl options, used as part of the cache key
  # @return [Object, nil] the unmarshalled payload (an array of
  #   Typhoeus::Response according to the original author's note), or nil when
  #   the cache is not configured, the service does not answer 200, or any
  #   error occurs
  def load_data(url, options)
    return if check_config

    uri = cache_lookup_uri(url, options)
    req = Net::HTTP::Get.new(uri)
    req['Accept'] = 'application/json'

    response = cache_request(uri, req)
    return unless response.code == '200'

    body = ::JSON.parse(response.body)
    decode_responses(body['encrypted_response'])
  rescue StandardError
    # Best effort: a broken or unreachable cache is treated as a cache miss.
    nil
  end

  # Stores the responses collected during a crawl.
  #
  # @param url [String] the crawl's starting URL
  # @param options [Hash, nil] the crawl options
  # @param data [Object] the object to cache; it must be Marshal-dumpable
  # @return [Net::HTTPResponse, nil] the service's response, or nil when the
  #   cache is not configured or the request could not be completed
  def put_cached_data(url, options, data)
    return if check_config

    uri = URI("#{CACHE_SERVICE_URL}/typhoeus_responses")
    # String header key: Net::HTTP header names are strings.
    req = Net::HTTP::Post.new(uri, 'Content-Type' => 'application/json')
    req.body = { url: url, options: options, encrypted_response: encode_responses(data) }.to_json

    cache_request(uri, req)
  rescue StandardError
    # Best effort, mirroring load_data: failing to store must not turn an
    # otherwise successful crawl into an exception.
    nil
  end

  # NOTE: despite the name, this returns true when the cache is NOT usable
  # (address missing or blank), which is why callers bail out on a truthy
  # result.
  #
  # @return [Boolean]
  def check_config
    CACHE_SERVICE_URL.nil? || CACHE_SERVICE_URL.strip.empty?
  end

  private

  # Builds the lookup URI with both key parts form-encoded, so characters such
  # as "?", "&" or spaces inside the crawled URL cannot corrupt the query.
  # NOTE(review): options are sent in their #to_s form, as the original
  # interpolation did; confirm this is the representation the service expects.
  def cache_lookup_uri(url, options)
    query = URI.encode_www_form(url: url.to_s, options: options.to_s)
    URI("#{CACHE_SERVICE_URL}/typhoeus_responses?#{query}")
  end

  # Sends +req+ to the host named by +uri+, enabling TLS for https addresses.
  def cache_request(uri, req)
    Net::HTTP.start(uri.hostname, uri.port, use_ssl: uri.scheme == 'https') do |http|
      http.request(req)
    end
  end

  # Marshals +data+ and Base64-encodes it. Array#pack('m') yields the same
  # output as Base64.encode64 without requiring the base64 library here.
  def encode_responses(data)
    [Marshal.dump(data)].pack('m')
  end

  # Reverses encode_responses.
  # SECURITY: Marshal.load can instantiate arbitrary objects. Only point
  # ARACHNID_CACHED_SERVICE_ADDRESS at a service you fully trust.
  def decode_responses(encoded)
    Marshal.load(encoded.unpack('m').first)
  end
end
|
data/lib/arachnid2/version.rb
CHANGED
data/lib/arachnid2.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require "arachnid2/version"
|
2
|
+
require "arachnid2/cashed_arachnid_responses"
|
2
3
|
|
3
4
|
require 'tempfile'
|
4
5
|
require "typhoeus"
|
@@ -6,9 +7,10 @@ require "bloomfilter-rb"
|
|
6
7
|
require "adomain"
|
7
8
|
require "addressable/uri"
|
8
9
|
require "nokogiri"
|
10
|
+
require "base64"
|
9
11
|
|
10
12
|
class Arachnid2
|
11
|
-
|
13
|
+
include CashedArachnidResponses
|
12
14
|
# META:
|
13
15
|
# About the origins of this crawling approach
|
14
16
|
# The Crawler is heavily borrowed from by Arachnid.
|
@@ -57,6 +59,7 @@ class Arachnid2
|
|
57
59
|
def initialize(url)
|
58
60
|
@url = url
|
59
61
|
@domain = Adomain[@url]
|
62
|
+
@cached_data = []
|
60
63
|
end
|
61
64
|
|
62
65
|
#
|
@@ -113,7 +116,15 @@ class Arachnid2
|
|
113
116
|
|
114
117
|
request = Typhoeus::Request.new(q, request_options)
|
115
118
|
|
119
|
+
data = load_data(@url, opts)
|
120
|
+
unless data.nil?
|
121
|
+
data.each do |response|
|
122
|
+
yield response
|
123
|
+
end
|
124
|
+
return
|
125
|
+
end
|
116
126
|
request.on_complete do |response|
|
127
|
+
@cached_data.push(response)
|
117
128
|
links = process(response)
|
118
129
|
next unless links
|
119
130
|
|
@@ -126,10 +137,13 @@ class Arachnid2
|
|
126
137
|
end # @max_concurrency.times do
|
127
138
|
|
128
139
|
@hydra.run
|
129
|
-
end # until @global_queue.empty?
|
130
140
|
|
141
|
+
end # until @global_queue.empty?
|
142
|
+
put_cached_data(@url, opts, @cached_data) unless @cached_data.empty?
|
131
143
|
ensure
|
132
144
|
@cookie_file.close! if @cookie_file
|
145
|
+
|
146
|
+
|
133
147
|
end # def crawl(opts = {})
|
134
148
|
|
135
149
|
private
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: arachnid2
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.4
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sam Nissen
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2019-01-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -142,6 +142,7 @@ files:
|
|
142
142
|
- bin/console
|
143
143
|
- bin/setup
|
144
144
|
- lib/arachnid2.rb
|
145
|
+
- lib/arachnid2/cashed_arachnid_responses.rb
|
145
146
|
- lib/arachnid2/version.rb
|
146
147
|
homepage: https://github.com/samnissen/arachnid2
|
147
148
|
licenses:
|