arachnid2 0.1.4 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +7 -1
- data/README.md +4 -1
- data/lib/arachnid2/cashed_arachnid_responses.rb +41 -0
- data/lib/arachnid2/version.rb +1 -1
- data/lib/arachnid2.rb +16 -2
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 00cef9d45ae8be8b0747d47e254737fdfdb94e3f40cfe85a99faff283653f87b
|
4
|
+
data.tar.gz: 24ecf163c8b2eeda25908067a0efa68aec661441c59b06c466ee22828d154f5d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2830f48686f9c2e9a921da58cca907580c800716e856982ae5e836f6dd51ab899192456bc4090ad78aac89eb6db49496d2ad953713d4d52548f1e248a7198df2
|
7
|
+
data.tar.gz: d3175c6f6574dc5a6feb9955e2a7804accfdd09c705cd90dea11586bd774fb79a67fdca499afe628739eb85b8d6adf6ff13d831658b23f4a993fc210e429c8f1
|
data/Gemfile.lock
CHANGED
@@ -17,13 +17,18 @@ GEM
|
|
17
17
|
addressable (~> 2.5)
|
18
18
|
bloomfilter-rb (2.1.1)
|
19
19
|
redis
|
20
|
+
coderay (1.1.2)
|
20
21
|
diff-lcs (1.3)
|
21
22
|
ethon (0.11.0)
|
22
23
|
ffi (>= 1.3.0)
|
23
24
|
ffi (1.9.25)
|
25
|
+
method_source (0.9.0)
|
24
26
|
mini_portile2 (2.3.0)
|
25
27
|
nokogiri (1.8.4)
|
26
28
|
mini_portile2 (~> 2.3.0)
|
29
|
+
pry (0.11.3)
|
30
|
+
coderay (~> 1.1.0)
|
31
|
+
method_source (~> 0.9.0)
|
27
32
|
public_suffix (3.0.3)
|
28
33
|
rake (10.5.0)
|
29
34
|
redis (4.0.2)
|
@@ -49,8 +54,9 @@ PLATFORMS
|
|
49
54
|
DEPENDENCIES
|
50
55
|
arachnid2!
|
51
56
|
bundler (~> 1.16)
|
57
|
+
pry
|
52
58
|
rake (~> 10.0)
|
53
59
|
rspec (~> 3.0)
|
54
60
|
|
55
61
|
BUNDLED WITH
|
56
|
-
1.16.
|
62
|
+
1.16.3
|
data/README.md
CHANGED
@@ -22,6 +22,9 @@ on that page, and visiting those to do the same.
|
|
22
22
|
Hence, the simplest output would be to collect all of the responses
|
23
23
|
while spidering from some URL.
|
24
24
|
|
25
|
+
Set the cache service URL (optional):
|
26
|
+
`export ARACHNID_CACHED_SERVICE_ADDRESS=http://localhost:9000`
|
27
|
+
|
25
28
|
```ruby
|
26
29
|
require "arachnid2"
|
27
30
|
|
@@ -129,7 +132,7 @@ by Typhoeus.
|
|
129
132
|
|
130
133
|
This is the list of TLDs to ignore when collecting URLs from the page.
|
131
134
|
The extensions are formatted as a hash of key/value pairs, where the value
|
132
|
-
is an array of TLDs, and the keys represent the length of those TLDs.
|
135
|
+
is an array of TLDs, and the keys represent the length of those TLDs.
|
133
136
|
|
134
137
|
#### `memory_limit` and Docker
|
135
138
|
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'net/http'
|
2
|
+
require 'json'
|
3
|
+
# Optional HTTP-backed cache for crawl results.
#
# When the ARACHNID_CACHED_SERVICE_ADDRESS environment variable is set, the
# including class can look up a previously stored list of responses for a
# (url, options) pair and store a fresh list after crawling. When it is not
# set (or blank), every method here is a no-op that returns nil.
#
# NOTE: "Cashed" in the module name is a historical misspelling of "Cached";
# it is kept because the name is part of the public interface.
module CashedArachnidResponses
  # Base address of the cache service, read once at load time.
  # nil when the environment variable is not set.
  CACHE_SERVICE_URL = ENV['ARACHNID_CACHED_SERVICE_ADDRESS'].freeze

  # Fetches the cached responses stored for +url+ and +options+.
  #
  # @param url [String] the crawl start URL used as the cache key
  # @param options [Object] crawl options; sent as their +to_s+ form
  #   (NOTE(review): the server-side key format is not visible here — confirm
  #   that it matches what #put_cached_data stores)
  # @return [Object, nil] the unmarshalled payload (the crawler expects an
  #   array of Typhoeus::Response), or nil when the cache is not configured,
  #   the service does not answer 200, or anything goes wrong
  def load_data(url, options)
    return if check_config

    # encode_www_form escapes '&', '#', spaces etc. so the crawl URL cannot
    # corrupt the query string.
    uri = cache_service_uri(url: url, options: options.to_s)
    req = Net::HTTP::Get.new(uri)
    # NOTE(review): 'json' is not a registered media type; left untouched
    # because the service may match on this exact value.
    req['Accept'] = 'json'

    response = cache_service_request(uri, req)
    return unless response.code == '200'

    body = ::JSON.parse(response.body)
    # SECURITY: Marshal.load can instantiate arbitrary objects. Only point
    # ARACHNID_CACHED_SERVICE_ADDRESS at a service you fully trust.
    Marshal.load(Base64.decode64(body['encrypted_response']))
  rescue StandardError
    # The cache is best-effort: any failure is treated as a cache miss.
    nil
  end

  # Stores +data+ in the cache under +url+ and +options+.
  #
  # The payload is Marshal-dumped and Base64-encoded; despite the
  # 'encrypted_response' field name it is NOT encrypted.
  #
  # @param url [String] the crawl start URL used as the cache key
  # @param options [Object] crawl options, serialised as JSON
  # @param data [Object] the responses to store (must be Marshal-dumpable)
  # @return [Net::HTTPResponse, nil] the service response, or nil when the
  #   cache is not configured or the write failed
  def put_cached_data(url, options, data)
    return if check_config

    uri = cache_service_uri
    req = Net::HTTP::Post.new(uri, 'Content-Type' => 'application/json')
    processed_data = Base64.encode64(Marshal.dump(data))
    req.body = { url: url, options: options, encrypted_response: processed_data }.to_json

    cache_service_request(uri, req)
  rescue StandardError => e
    # A failed cache write must not fail a crawl that already completed,
    # but it should not go unnoticed either.
    warn "arachnid2: could not store responses in cache: #{e.class}: #{e.message}"
    nil
  end

  # Reports whether caching is DISABLED.
  #
  # @return [Boolean] true when no cache service address is configured
  #   (unset or blank), false when one is available
  def check_config
    CACHE_SERVICE_URL.nil? || CACHE_SERVICE_URL.strip.empty?
  end

  private

  # Builds the service endpoint URI, optionally with an escaped query string.
  def cache_service_uri(params = nil)
    uri = URI("#{CACHE_SERVICE_URL.chomp('/')}/typhoeus_responses")
    uri.query = URI.encode_www_form(params) if params
    uri
  end

  # Sends +req+ to the cache service, enabling TLS for https addresses.
  def cache_service_request(uri, req)
    Net::HTTP.start(uri.hostname, uri.port, use_ssl: uri.scheme == 'https') do |http|
      http.request(req)
    end
  end
end
|
data/lib/arachnid2/version.rb
CHANGED
data/lib/arachnid2.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require "arachnid2/version"
|
2
|
+
require "arachnid2/cashed_arachnid_responses"
|
2
3
|
|
3
4
|
require 'tempfile'
|
4
5
|
require "typhoeus"
|
@@ -6,9 +7,10 @@ require "bloomfilter-rb"
|
|
6
7
|
require "adomain"
|
7
8
|
require "addressable/uri"
|
8
9
|
require "nokogiri"
|
10
|
+
require "base64"
|
9
11
|
|
10
12
|
class Arachnid2
|
11
|
-
|
13
|
+
include CashedArachnidResponses
|
12
14
|
# META:
|
13
15
|
# About the origins of this crawling approach
|
14
16
|
# The Crawler borrows heavily from Arachnid.
|
@@ -57,6 +59,7 @@ class Arachnid2
|
|
57
59
|
def initialize(url)
|
58
60
|
@url = url
|
59
61
|
@domain = Adomain[@url]
|
62
|
+
@cached_data = []
|
60
63
|
end
|
61
64
|
|
62
65
|
#
|
@@ -113,7 +116,15 @@ class Arachnid2
|
|
113
116
|
|
114
117
|
request = Typhoeus::Request.new(q, request_options)
|
115
118
|
|
119
|
+
data = load_data(@url, opts)
|
120
|
+
unless data.nil?
|
121
|
+
data.each do |response|
|
122
|
+
yield response
|
123
|
+
end
|
124
|
+
return
|
125
|
+
end
|
116
126
|
request.on_complete do |response|
|
127
|
+
@cached_data.push(response)
|
117
128
|
links = process(response)
|
118
129
|
next unless links
|
119
130
|
|
@@ -126,10 +137,13 @@ class Arachnid2
|
|
126
137
|
end # @max_concurrency.times do
|
127
138
|
|
128
139
|
@hydra.run
|
129
|
-
end # until @global_queue.empty?
|
130
140
|
|
141
|
+
end # until @global_queue.empty?
|
142
|
+
put_cached_data(@url, opts, @cached_data) unless @cached_data.empty?
|
131
143
|
ensure
|
132
144
|
@cookie_file.close! if @cookie_file
|
145
|
+
|
146
|
+
|
133
147
|
end # def crawl(opts = {})
|
134
148
|
|
135
149
|
private
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: arachnid2
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sam Nissen
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2019-01-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -142,6 +142,7 @@ files:
|
|
142
142
|
- bin/console
|
143
143
|
- bin/setup
|
144
144
|
- lib/arachnid2.rb
|
145
|
+
- lib/arachnid2/cashed_arachnid_responses.rb
|
145
146
|
- lib/arachnid2/version.rb
|
146
147
|
homepage: https://github.com/samnissen/arachnid2
|
147
148
|
licenses:
|