arachnid2 0.1.4 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 420df644a588b8eac92cfda03df0ab2ca20de52e3123aa7e1990ff850fd404d5
4
- data.tar.gz: 49cdd7681f110d9a1d53075563b84e2614e959dad4cb39f877a36bf28be4dbf4
3
+ metadata.gz: 00cef9d45ae8be8b0747d47e254737fdfdb94e3f40cfe85a99faff283653f87b
4
+ data.tar.gz: 24ecf163c8b2eeda25908067a0efa68aec661441c59b06c466ee22828d154f5d
5
5
  SHA512:
6
- metadata.gz: 49229b32b3d79cb560879298d2fa54f6206c8c419bbb86e7da9e1e132f84132026689270d0a6810610921be01676b37adb494e03a4f96a094561cb55fb2f7e4b
7
- data.tar.gz: 505716bfafcdb116f25f401355928a2b65573d90d6d315d6710cc29247ce4f29c5c121e9d12c2ec807fe48a047823429aedfc5fe64b8d69683b8a603a8081621
6
+ metadata.gz: 2830f48686f9c2e9a921da58cca907580c800716e856982ae5e836f6dd51ab899192456bc4090ad78aac89eb6db49496d2ad953713d4d52548f1e248a7198df2
7
+ data.tar.gz: d3175c6f6574dc5a6feb9955e2a7804accfdd09c705cd90dea11586bd774fb79a67fdca499afe628739eb85b8d6adf6ff13d831658b23f4a993fc210e429c8f1
data/Gemfile.lock CHANGED
@@ -17,13 +17,18 @@ GEM
17
17
  addressable (~> 2.5)
18
18
  bloomfilter-rb (2.1.1)
19
19
  redis
20
+ coderay (1.1.2)
20
21
  diff-lcs (1.3)
21
22
  ethon (0.11.0)
22
23
  ffi (>= 1.3.0)
23
24
  ffi (1.9.25)
25
+ method_source (0.9.0)
24
26
  mini_portile2 (2.3.0)
25
27
  nokogiri (1.8.4)
26
28
  mini_portile2 (~> 2.3.0)
29
+ pry (0.11.3)
30
+ coderay (~> 1.1.0)
31
+ method_source (~> 0.9.0)
27
32
  public_suffix (3.0.3)
28
33
  rake (10.5.0)
29
34
  redis (4.0.2)
@@ -49,8 +54,9 @@ PLATFORMS
49
54
  DEPENDENCIES
50
55
  arachnid2!
51
56
  bundler (~> 1.16)
57
+ pry
52
58
  rake (~> 10.0)
53
59
  rspec (~> 3.0)
54
60
 
55
61
  BUNDLED WITH
56
- 1.16.2
62
+ 1.16.3
data/README.md CHANGED
@@ -22,6 +22,9 @@ on that page, and visiting those to do the same.
22
22
  Hence, the simplest output would be to collect all of the responses
23
23
  while spidering from some URL.
24
24
 
25
+ Set the cached service URL (optional):
26
+ `export ARACHNID_CACHED_SERVICE_ADDRESS=http://localhost:9000`
27
+
25
28
  ```ruby
26
29
  require "arachnid2"
27
30
 
@@ -129,7 +132,7 @@ by Typhoeus.
129
132
 
130
133
  This is the list of TLDs to ignore when collecting URLs from the page.
131
134
  The extensions are formatted as a hash of key/value pairs, where the value
132
- is an array of TLDs, and the keys represent the length of those TLDs.
135
+ is an array of TLDs, and the keys represent the length of those TLDs.
133
136
 
134
137
  #### `memory_limit` and Docker
135
138
 
@@ -0,0 +1,41 @@
1
+ require 'net/http'
2
+ require 'json'
3
+ module CashedArachnidResponses
4
+ CACHE_SERVICE_URL = ENV['ARACHNID_CACHED_SERVICE_ADDRESS'].freeze
5
+
6
+ def load_data(_url, _options)
7
+ return if check_config
8
+
9
+ uri = URI("#{CACHE_SERVICE_URL}/typhoeus_responses?url=#{@url}&options=#{@options}")
10
+ req = Net::HTTP::Get.new(uri)
11
+ req['Accept'] = 'json'
12
+ Net::HTTP.start(uri.hostname, uri.port) do |http|
13
+ response = http.request(req)
14
+ return nil if response.code != '200'
15
+
16
+ body = ::JSON.parse(response.body)
17
+ responses_list = Base64.decode64(body['encrypted_response'])
18
+ return Marshal.load responses_list # here we get array of Typhoeus::Response
19
+ end
20
+ rescue StandardError
21
+ nil
22
+ end
23
+
24
+ def put_cached_data(url, options, data)
25
+ return if check_config
26
+
27
+ uri = URI("#{CACHE_SERVICE_URL}/typhoeus_responses")
28
+
29
+ header = { 'Content-Type': 'application/json' }
30
+ req = Net::HTTP::Post.new(uri, header)
31
+ processed_data = Base64.encode64(Marshal.dump(data))
32
+ req.body = { url: url, options: options, encrypted_response: processed_data }.to_json
33
+ Net::HTTP.start(uri.hostname, uri.port) do |http|
34
+ http.request(req)
35
+ end
36
+ end
37
+
38
+ def check_config
39
+ CACHE_SERVICE_URL.nil?
40
+ end
41
+ end
@@ -1,3 +1,3 @@
1
1
  class Arachnid2
2
- VERSION = "0.1.4"
2
+ VERSION = "0.2.0"
3
3
  end
data/lib/arachnid2.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  require "arachnid2/version"
2
+ require "arachnid2/cashed_arachnid_responses"
2
3
 
3
4
  require 'tempfile'
4
5
  require "typhoeus"
@@ -6,9 +7,10 @@ require "bloomfilter-rb"
6
7
  require "adomain"
7
8
  require "addressable/uri"
8
9
  require "nokogiri"
10
+ require "base64"
9
11
 
10
12
  class Arachnid2
11
-
13
+ include CashedArachnidResponses
12
14
  # META:
13
15
  # About the origins of this crawling approach
14
16
  # The Crawler is heavily borrowed from by Arachnid.
@@ -57,6 +59,7 @@ class Arachnid2
57
59
  def initialize(url)
58
60
  @url = url
59
61
  @domain = Adomain[@url]
62
+ @cached_data = []
60
63
  end
61
64
 
62
65
  #
@@ -113,7 +116,15 @@ class Arachnid2
113
116
 
114
117
  request = Typhoeus::Request.new(q, request_options)
115
118
 
119
+ data = load_data(@url, opts)
120
+ unless data.nil?
121
+ data.each do |response|
122
+ yield response
123
+ end
124
+ return
125
+ end
116
126
  request.on_complete do |response|
127
+ @cached_data.push(response)
117
128
  links = process(response)
118
129
  next unless links
119
130
 
@@ -126,10 +137,13 @@ class Arachnid2
126
137
  end # @max_concurrency.times do
127
138
 
128
139
  @hydra.run
129
- end # until @global_queue.empty?
130
140
 
141
+ end # until @global_queue.empty?
142
+ put_cached_data(@url, opts, @cached_data) unless @cached_data.empty?
131
143
  ensure
132
144
  @cookie_file.close! if @cookie_file
145
+
146
+
133
147
  end # def crawl(opts = {})
134
148
 
135
149
  private
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: arachnid2
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.4
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sam Nissen
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2018-08-22 00:00:00.000000000 Z
11
+ date: 2019-01-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -142,6 +142,7 @@ files:
142
142
  - bin/console
143
143
  - bin/setup
144
144
  - lib/arachnid2.rb
145
+ - lib/arachnid2/cashed_arachnid_responses.rb
145
146
  - lib/arachnid2/version.rb
146
147
  homepage: https://github.com/samnissen/arachnid2
147
148
  licenses: