arachnid2 0.1.4 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
 - data/Gemfile.lock +7 -1
 - data/README.md +4 -1
 - data/lib/arachnid2/cashed_arachnid_responses.rb +41 -0
 - data/lib/arachnid2/version.rb +1 -1
 - data/lib/arachnid2.rb +16 -2
 - metadata +3 -2
 
    
        checksums.yaml
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            ---
         
     | 
| 
       2 
2 
     | 
    
         
             
            SHA256:
         
     | 
| 
       3 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       4 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 3 
     | 
    
         
            +
              metadata.gz: 00cef9d45ae8be8b0747d47e254737fdfdb94e3f40cfe85a99faff283653f87b
         
     | 
| 
      
 4 
     | 
    
         
            +
              data.tar.gz: 24ecf163c8b2eeda25908067a0efa68aec661441c59b06c466ee22828d154f5d
         
     | 
| 
       5 
5 
     | 
    
         
             
            SHA512:
         
     | 
| 
       6 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       7 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 6 
     | 
    
         
            +
              metadata.gz: 2830f48686f9c2e9a921da58cca907580c800716e856982ae5e836f6dd51ab899192456bc4090ad78aac89eb6db49496d2ad953713d4d52548f1e248a7198df2
         
     | 
| 
      
 7 
     | 
    
         
            +
              data.tar.gz: d3175c6f6574dc5a6feb9955e2a7804accfdd09c705cd90dea11586bd774fb79a67fdca499afe628739eb85b8d6adf6ff13d831658b23f4a993fc210e429c8f1
         
     | 
    
        data/Gemfile.lock
    CHANGED
    
    | 
         @@ -17,13 +17,18 @@ GEM 
     | 
|
| 
       17 
17 
     | 
    
         
             
                  addressable (~> 2.5)
         
     | 
| 
       18 
18 
     | 
    
         
             
                bloomfilter-rb (2.1.1)
         
     | 
| 
       19 
19 
     | 
    
         
             
                  redis
         
     | 
| 
      
 20 
     | 
    
         
            +
                coderay (1.1.2)
         
     | 
| 
       20 
21 
     | 
    
         
             
                diff-lcs (1.3)
         
     | 
| 
       21 
22 
     | 
    
         
             
                ethon (0.11.0)
         
     | 
| 
       22 
23 
     | 
    
         
             
                  ffi (>= 1.3.0)
         
     | 
| 
       23 
24 
     | 
    
         
             
                ffi (1.9.25)
         
     | 
| 
      
 25 
     | 
    
         
            +
                method_source (0.9.0)
         
     | 
| 
       24 
26 
     | 
    
         
             
                mini_portile2 (2.3.0)
         
     | 
| 
       25 
27 
     | 
    
         
             
                nokogiri (1.8.4)
         
     | 
| 
       26 
28 
     | 
    
         
             
                  mini_portile2 (~> 2.3.0)
         
     | 
| 
      
 29 
     | 
    
         
            +
                pry (0.11.3)
         
     | 
| 
      
 30 
     | 
    
         
            +
                  coderay (~> 1.1.0)
         
     | 
| 
      
 31 
     | 
    
         
            +
                  method_source (~> 0.9.0)
         
     | 
| 
       27 
32 
     | 
    
         
             
                public_suffix (3.0.3)
         
     | 
| 
       28 
33 
     | 
    
         
             
                rake (10.5.0)
         
     | 
| 
       29 
34 
     | 
    
         
             
                redis (4.0.2)
         
     | 
| 
         @@ -49,8 +54,9 @@ PLATFORMS 
     | 
|
| 
       49 
54 
     | 
    
         
             
            DEPENDENCIES
         
     | 
| 
       50 
55 
     | 
    
         
             
              arachnid2!
         
     | 
| 
       51 
56 
     | 
    
         
             
              bundler (~> 1.16)
         
     | 
| 
      
 57 
     | 
    
         
            +
              pry
         
     | 
| 
       52 
58 
     | 
    
         
             
              rake (~> 10.0)
         
     | 
| 
       53 
59 
     | 
    
         
             
              rspec (~> 3.0)
         
     | 
| 
       54 
60 
     | 
    
         | 
| 
       55 
61 
     | 
    
         
             
            BUNDLED WITH
         
     | 
| 
       56 
     | 
    
         
            -
               1.16. 
     | 
| 
      
 62 
     | 
    
         
            +
               1.16.3
         
     | 
    
        data/README.md
    CHANGED
    
    | 
         @@ -22,6 +22,9 @@ on that page, and visiting those to do the same. 
     | 
|
| 
       22 
22 
     | 
    
         
             
            Hence, the simplest output would be to collect all of the responses
         
     | 
| 
       23 
23 
     | 
    
         
             
            while spidering from some URL.
         
     | 
| 
       24 
24 
     | 
    
         | 
| 
      
 25 
     | 
    
         
            +
            Set cached service url(optional)
         
     | 
| 
      
 26 
     | 
    
         
            +
            `export ARACHNID_CACHED_SERVICE_ADDRESS=http://localhost:9000`
         
     | 
| 
      
 27 
     | 
    
         
            +
             
     | 
| 
       25 
28 
     | 
    
         
             
            ```ruby
         
     | 
| 
       26 
29 
     | 
    
         
             
            require "arachnid2"
         
     | 
| 
       27 
30 
     | 
    
         | 
| 
         @@ -129,7 +132,7 @@ by Typhoeus. 
     | 
|
| 
       129 
132 
     | 
    
         | 
| 
       130 
133 
     | 
    
         
             
            This is the list of TLDs to ignore when collecting URLs from the page.
         
     | 
| 
       131 
134 
     | 
    
         
             
            The extensions are formatted as a hash of key/value pairs, where the value
         
     | 
| 
       132 
     | 
    
         
            -
            is an array of TLDs, and the keys represent the length of those TLDs. 
     | 
| 
      
 135 
     | 
    
         
            +
            is an array of TLDs, and the keys represent the length of those TLDs.
         
     | 
| 
       133 
136 
     | 
    
         | 
| 
       134 
137 
     | 
    
         
             
            #### `memory_limit` and Docker
         
     | 
| 
       135 
138 
     | 
    
         | 
| 
         @@ -0,0 +1,41 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require 'net/http'
         
     | 
| 
      
 2 
     | 
    
         
            +
            require 'json'
         
     | 
| 
      
 3 
     | 
    
         
            +
            module CashedArachnidResponses
         
     | 
| 
      
 4 
     | 
    
         
            +
              CACHE_SERVICE_URL = ENV['ARACHNID_CACHED_SERVICE_ADDRESS'].freeze
         
     | 
| 
      
 5 
     | 
    
         
            +
             
     | 
| 
      
 6 
     | 
    
         
            +
              def load_data(_url, _options)
         
     | 
| 
      
 7 
     | 
    
         
            +
                return if check_config
         
     | 
| 
      
 8 
     | 
    
         
            +
             
     | 
| 
      
 9 
     | 
    
         
            +
                uri = URI("#{CACHE_SERVICE_URL}/typhoeus_responses?url=#{@url}&options=#{@options}")
         
     | 
| 
      
 10 
     | 
    
         
            +
                req = Net::HTTP::Get.new(uri)
         
     | 
| 
      
 11 
     | 
    
         
            +
                req['Accept'] = 'json'
         
     | 
| 
      
 12 
     | 
    
         
            +
                Net::HTTP.start(uri.hostname, uri.port) do |http|
         
     | 
| 
      
 13 
     | 
    
         
            +
                  response = http.request(req)
         
     | 
| 
      
 14 
     | 
    
         
            +
                  return nil if response.code != '200'
         
     | 
| 
      
 15 
     | 
    
         
            +
             
     | 
| 
      
 16 
     | 
    
         
            +
                  body = ::JSON.parse(response.body)
         
     | 
| 
      
 17 
     | 
    
         
            +
                  responses_list = Base64.decode64(body['encrypted_response'])
         
     | 
| 
      
 18 
     | 
    
         
            +
                  return Marshal.load responses_list # here we get array of Typhoeus::Response
         
     | 
| 
      
 19 
     | 
    
         
            +
                end
         
     | 
| 
      
 20 
     | 
    
         
            +
              rescue StandardError
         
     | 
| 
      
 21 
     | 
    
         
            +
                nil
         
     | 
| 
      
 22 
     | 
    
         
            +
              end
         
     | 
| 
      
 23 
     | 
    
         
            +
             
     | 
| 
      
 24 
     | 
    
         
            +
              def put_cached_data(url, options, data)
         
     | 
| 
      
 25 
     | 
    
         
            +
                return if check_config
         
     | 
| 
      
 26 
     | 
    
         
            +
             
     | 
| 
      
 27 
     | 
    
         
            +
                uri = URI("#{CACHE_SERVICE_URL}/typhoeus_responses")
         
     | 
| 
      
 28 
     | 
    
         
            +
             
     | 
| 
      
 29 
     | 
    
         
            +
                header = { 'Content-Type': 'application/json' }
         
     | 
| 
      
 30 
     | 
    
         
            +
                req = Net::HTTP::Post.new(uri, header)
         
     | 
| 
      
 31 
     | 
    
         
            +
                processed_data = Base64.encode64(Marshal.dump(data))
         
     | 
| 
      
 32 
     | 
    
         
            +
                req.body = { url: url, options: options, encrypted_response: processed_data }.to_json
         
     | 
| 
      
 33 
     | 
    
         
            +
                Net::HTTP.start(uri.hostname, uri.port) do |http|
         
     | 
| 
      
 34 
     | 
    
         
            +
                  http.request(req)
         
     | 
| 
      
 35 
     | 
    
         
            +
                end
         
     | 
| 
      
 36 
     | 
    
         
            +
              end
         
     | 
| 
      
 37 
     | 
    
         
            +
             
     | 
| 
      
 38 
     | 
    
         
            +
              def check_config
         
     | 
| 
      
 39 
     | 
    
         
            +
                CACHE_SERVICE_URL.nil?
         
     | 
| 
      
 40 
     | 
    
         
            +
              end
         
     | 
| 
      
 41 
     | 
    
         
            +
            end
         
     | 
    
        data/lib/arachnid2/version.rb
    CHANGED
    
    
    
        data/lib/arachnid2.rb
    CHANGED
    
    | 
         @@ -1,4 +1,5 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            require "arachnid2/version"
         
     | 
| 
      
 2 
     | 
    
         
            +
            require "arachnid2/cashed_arachnid_responses"
         
     | 
| 
       2 
3 
     | 
    
         | 
| 
       3 
4 
     | 
    
         
             
            require 'tempfile'
         
     | 
| 
       4 
5 
     | 
    
         
             
            require "typhoeus"
         
     | 
| 
         @@ -6,9 +7,10 @@ require "bloomfilter-rb" 
     | 
|
| 
       6 
7 
     | 
    
         
             
            require "adomain"
         
     | 
| 
       7 
8 
     | 
    
         
             
            require "addressable/uri"
         
     | 
| 
       8 
9 
     | 
    
         
             
            require "nokogiri"
         
     | 
| 
      
 10 
     | 
    
         
            +
            require "base64"
         
     | 
| 
       9 
11 
     | 
    
         | 
| 
       10 
12 
     | 
    
         
             
            class Arachnid2
         
     | 
| 
       11 
     | 
    
         
            -
             
     | 
| 
      
 13 
     | 
    
         
            +
              include CashedArachnidResponses
         
     | 
| 
       12 
14 
     | 
    
         
             
              # META:
         
     | 
| 
       13 
15 
     | 
    
         
             
              #   About the origins of this crawling approach
         
     | 
| 
       14 
16 
     | 
    
         
             
              # The Crawler is heavily borrowed from by Arachnid.
         
     | 
| 
         @@ -57,6 +59,7 @@ class Arachnid2 
     | 
|
| 
       57 
59 
     | 
    
         
             
              def initialize(url)
         
     | 
| 
       58 
60 
     | 
    
         
             
                @url = url
         
     | 
| 
       59 
61 
     | 
    
         
             
                @domain = Adomain[@url]
         
     | 
| 
      
 62 
     | 
    
         
            +
                @cached_data = []
         
     | 
| 
       60 
63 
     | 
    
         
             
              end
         
     | 
| 
       61 
64 
     | 
    
         | 
| 
       62 
65 
     | 
    
         
             
              #
         
     | 
| 
         @@ -113,7 +116,15 @@ class Arachnid2 
     | 
|
| 
       113 
116 
     | 
    
         | 
| 
       114 
117 
     | 
    
         
             
                    request = Typhoeus::Request.new(q, request_options)
         
     | 
| 
       115 
118 
     | 
    
         | 
| 
      
 119 
     | 
    
         
            +
                    data = load_data(@url, opts)
         
     | 
| 
      
 120 
     | 
    
         
            +
                    unless data.nil?
         
     | 
| 
      
 121 
     | 
    
         
            +
                      data.each do |response|
         
     | 
| 
      
 122 
     | 
    
         
            +
                        yield response
         
     | 
| 
      
 123 
     | 
    
         
            +
                      end
         
     | 
| 
      
 124 
     | 
    
         
            +
                      return
         
     | 
| 
      
 125 
     | 
    
         
            +
                    end
         
     | 
| 
       116 
126 
     | 
    
         
             
                    request.on_complete do |response|
         
     | 
| 
      
 127 
     | 
    
         
            +
                      @cached_data.push(response)
         
     | 
| 
       117 
128 
     | 
    
         
             
                      links = process(response)
         
     | 
| 
       118 
129 
     | 
    
         
             
                      next unless links
         
     | 
| 
       119 
130 
     | 
    
         | 
| 
         @@ -126,10 +137,13 @@ class Arachnid2 
     | 
|
| 
       126 
137 
     | 
    
         
             
                  end # @max_concurrency.times do
         
     | 
| 
       127 
138 
     | 
    
         | 
| 
       128 
139 
     | 
    
         
             
                  @hydra.run
         
     | 
| 
       129 
     | 
    
         
            -
                end # until @global_queue.empty?
         
     | 
| 
       130 
140 
     | 
    
         | 
| 
      
 141 
     | 
    
         
            +
                end # until @global_queue.empty?
         
     | 
| 
      
 142 
     | 
    
         
            +
                put_cached_data(@url, opts, @cached_data) unless @cached_data.empty?
         
     | 
| 
       131 
143 
     | 
    
         
             
              ensure
         
     | 
| 
       132 
144 
     | 
    
         
             
                @cookie_file.close! if @cookie_file
         
     | 
| 
      
 145 
     | 
    
         
            +
             
     | 
| 
      
 146 
     | 
    
         
            +
             
     | 
| 
       133 
147 
     | 
    
         
             
              end # def crawl(opts = {})
         
     | 
| 
       134 
148 
     | 
    
         | 
| 
       135 
149 
     | 
    
         
             
              private
         
     | 
    
        metadata
    CHANGED
    
    | 
         @@ -1,14 +1,14 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            --- !ruby/object:Gem::Specification
         
     | 
| 
       2 
2 
     | 
    
         
             
            name: arachnid2
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version
         
     | 
| 
       4 
     | 
    
         
            -
              version: 0.1.4
     | 
| 
      
 4 
     | 
    
         
            +
              version: 0.2.0
         
     | 
| 
       5 
5 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       6 
6 
     | 
    
         
             
            authors:
         
     | 
| 
       7 
7 
     | 
    
         
             
            - Sam Nissen
         
     | 
| 
       8 
8 
     | 
    
         
             
            autorequire: 
         
     | 
| 
       9 
9 
     | 
    
         
             
            bindir: exe
         
     | 
| 
       10 
10 
     | 
    
         
             
            cert_chain: []
         
     | 
| 
       11 
     | 
    
         
            -
            date:  
     | 
| 
      
 11 
     | 
    
         
            +
            date: 2019-01-25 00:00:00.000000000 Z
         
     | 
| 
       12 
12 
     | 
    
         
             
            dependencies:
         
     | 
| 
       13 
13 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       14 
14 
     | 
    
         
             
              name: bundler
         
     | 
| 
         @@ -142,6 +142,7 @@ files: 
     | 
|
| 
       142 
142 
     | 
    
         
             
            - bin/console
         
     | 
| 
       143 
143 
     | 
    
         
             
            - bin/setup
         
     | 
| 
       144 
144 
     | 
    
         
             
            - lib/arachnid2.rb
         
     | 
| 
      
 145 
     | 
    
         
            +
            - lib/arachnid2/cashed_arachnid_responses.rb
         
     | 
| 
       145 
146 
     | 
    
         
             
            - lib/arachnid2/version.rb
         
     | 
| 
       146 
147 
     | 
    
         
             
            homepage: https://github.com/samnissen/arachnid2
         
     | 
| 
       147 
148 
     | 
    
         
             
            licenses:
         
     |