medusa-crawler 1.0.0.pre.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,15 @@
1
+ module Medusa
2
+ module Storage
3
+
4
+ class GenericError < Error; end;
5
+
6
+ class ConnectionError < Error; end
7
+
8
+ class RetrievalError < Error; end
9
+
10
+ class InsertionError < Error; end
11
+
12
+ class CloseError < Error; end
13
+
14
+ end
15
+ end
@@ -0,0 +1,42 @@
1
+ require 'moneta'
2
+ require 'forwardable'
3
+
4
+ module Medusa
5
+ module Storage
6
+ class Moneta
7
+ extend Forwardable
8
+
9
+ def_delegators :@moneta, :[], :[]=, :delete, :key?, :clear, :close
10
+
11
+ alias has_key? key?
12
+
13
+ def initialize(name, options = {})
14
+ default_options = { threadsafe: true, prefix: 'medusa' }
15
+ @moneta = ::Moneta.new(name, default_options.merge(options))
16
+ end
17
+
18
+ def each
19
+ @moneta.each_key do |k|
20
+ yield k, @moneta.fetch(k)
21
+ end
22
+ self
23
+ end
24
+
25
+ def size
26
+ current_size = @moneta.each_key.size
27
+
28
+ return @moneta.each_key.reduce(0) { |size, k| size + 1 } if current_size.nil?
29
+ return current_size
30
+ end
31
+
32
+ def keys
33
+ @moneta.each_key.to_a.sort
34
+ end
35
+
36
+ def merge!(hash)
37
+ @moneta.merge!(hash) unless hash.empty?
38
+ self
39
+ end
40
+ end
41
+ end
42
+ end
@@ -0,0 +1,39 @@
1
+ require 'medusa/http'
2
+
3
+ module Medusa
4
+ class Tentacle
5
+
6
+ #
7
+ # Create a new Tentacle
8
+ #
9
+ def initialize(link_queue, page_queue, opts = {})
10
+ @link_queue = link_queue
11
+ @page_queue = page_queue
12
+ @http = Medusa::HTTP.new(opts)
13
+ @opts = opts
14
+ end
15
+
16
+ #
17
+ # Gets links from @link_queue, and returns the fetched
18
+ # Page objects into @page_queue
19
+ #
20
+ def run
21
+ loop do
22
+ link, referer, depth = @link_queue.deq
23
+
24
+ break if link == :END
25
+
26
+ @http.fetch_pages(link, referer, depth).each { |page| @page_queue << page }
27
+
28
+ delay
29
+ end
30
+ end
31
+
32
+ private
33
+
34
+ def delay
35
+ sleep @opts[:delay] if @opts[:delay] > 0
36
+ end
37
+
38
+ end
39
+ end
@@ -0,0 +1,3 @@
1
+ module Medusa
2
+ VERSION = '1.0.0.pre.1' # gem version string; same value as `version` in the gemspec metadata
3
+ end
@@ -0,0 +1,85 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'webmock/rspec'
4
+
5
+ WebMock.disable_net_connect!
6
+
7
+ module Medusa
8
+ AUTH = ['user', 'pass']
9
+ SPEC_DOMAIN = "http://www.example.com/"
10
+ AUTH_SPEC_DOMAIN = "http://#{AUTH.join(':')}@#{URI.parse(SPEC_DOMAIN).host}/"
11
+
12
+ class FakePage
13
+ attr_accessor :links
14
+ attr_accessor :hrefs
15
+ attr_accessor :body
16
+
17
+ def initialize(name = '', options = {})
18
+ @name = name
19
+ @links = []
20
+ @hrefs = []
21
+ @redirect = nil
22
+ @auth = false
23
+ @base = ''
24
+
25
+ @links = [options[:links]].flatten if options.has_key?(:links)
26
+ @hrefs = [options[:hrefs]].flatten if options.has_key?(:hrefs)
27
+ @redirect = options[:redirect] if options.has_key?(:redirect)
28
+ @auth = options[:auth] if options.has_key?(:auth)
29
+ @base = options[:base] if options.has_key?(:base)
30
+ @content_type = options[:content_type] || "text/html"
31
+ @body = options[:body]
32
+
33
+ create_body unless @body
34
+ add_to_fakeweb
35
+ end
36
+
37
+ def url
38
+ SPEC_DOMAIN + @name
39
+ end
40
+
41
+ def auth_url
42
+ AUTH_SPEC_DOMAIN + @name
43
+ end
44
+
45
+ private
46
+
47
+ def create_body
48
+ if @base
49
+ @body = "<html><head><base href=\"#{@base}\"></head><body>"
50
+ else
51
+ @body = "<html><body>"
52
+ end
53
+ @links.each{|l| @body += "<a href=\"#{SPEC_DOMAIN}#{l}\"></a>"} if @links
54
+ @hrefs.each{|h| @body += "<a href=\"#{h}\"></a>"} if @hrefs
55
+ @body += "</body></html>"
56
+ end
57
+
58
+ def add_to_fakeweb
59
+ options = {body: @body, status: [200, 'OK'], headers: {'Content-Type' => @content_type}}
60
+
61
+ if @redirect
62
+ options[:status] = [301, 'Moved Permanently']
63
+
64
+ # only prepend SPEC_DOMAIN if a relative url (without an http scheme) was specified
65
+ redirect_url = (@redirect =~ /http/) ? @redirect : SPEC_DOMAIN + @redirect
66
+ options[:headers]['Location'] = redirect_url
67
+
68
+ # register the page this one redirects to
69
+ WebMock.stub_request(:get, redirect_url).to_return(body: '', status: [200, 'OK'], headers: {'Content-Type' => @content_type})
70
+ end
71
+
72
+ if @auth
73
+ unautorized_options = {body: 'Unauthorized', status: [401, 'Unauthorized']}
74
+
75
+ WebMock.stub_request(:get, url).to_return(unautorized_options)
76
+ WebMock.stub_request(:get, url).with(basic_auth: AUTH).to_return(options)
77
+ else
78
+ WebMock.stub_request(:get, url).to_return(options)
79
+ end
80
+ end
81
+ end
82
+ end
83
+
84
+ # Default root page: an unnamed FakePage, whose url is SPEC_DOMAIN itself, registered as a WebMock stub when this file loads.
85
+ Medusa::FakePage.new
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'medusa'
4
+ # Top-level base URL constant for the specs; same value as Medusa::SPEC_DOMAIN in spec/fakeweb_helper.rb.
5
+ SPEC_DOMAIN = 'http://www.example.com/'
@@ -0,0 +1,14 @@
1
+ # NOTE(review): the crawl example below fetches SPEC_DOMAIN; the WebMock stubs live in spec/fakeweb_helper.rb, which none of the helper code shown requires — confirm it is loaded elsewhere.
2
+ RSpec.describe Medusa do
3
+
4
+ it "should have a version" do
5
+ expect(Medusa.const_defined?('VERSION')).to be true
6
+ end
7
+
8
+ it "should return a Medusa::Core from the crawl, which has a PageStore" do
9
+ result = Medusa.crawl(SPEC_DOMAIN)
10
+ expect(result).to be_an_instance_of(Medusa::Core)
11
+ expect(result.pages).to be_an_instance_of(Medusa::PageStore)
12
+ end
13
+
14
+ end
@@ -0,0 +1,104 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'medusa_helper'
4
+
5
+ # This file was generated by the `rspec --init` command. Conventionally, all
6
+ # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
7
+ # The generated `.rspec` file contains `--require spec_helper` which will cause
8
+ # this file to always be loaded, without a need to explicitly require it in any
9
+ # files.
10
+ #
11
+ # Given that it is always loaded, you are encouraged to keep this file as
12
+ # light-weight as possible. Requiring heavyweight dependencies from this file
13
+ # will add to the boot time of your test suite on EVERY test run, even for an
14
+ # individual file that may not need all of that loaded. Instead, consider making
15
+ # a separate helper file that requires the additional dependencies and performs
16
+ # the additional setup, and require it from the spec files that actually need
17
+ # it.
18
+ #
19
+ # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
20
+ RSpec.configure do |config|
21
+ # rspec-expectations config goes here. You can use an alternate
22
+ # assertion/expectation library such as wrong or the stdlib/minitest
23
+ # assertions if you prefer.
24
+ config.expect_with :rspec do |expectations|
25
+ # This option will default to `true` in RSpec 4. It makes the `description`
26
+ # and `failure_message` of custom matchers include text for helper methods
27
+ # defined using `chain`, e.g.:
28
+ # be_bigger_than(2).and_smaller_than(4).description
29
+ # # => "be bigger than 2 and smaller than 4"
30
+ # ...rather than:
31
+ # # => "be bigger than 2"
32
+ expectations.include_chain_clauses_in_custom_matcher_descriptions = true
33
+ end
34
+
35
+ # rspec-mocks config goes here. You can use an alternate test double
36
+ # library (such as bogus or mocha) by changing the `mock_with` option here.
37
+ config.mock_with :rspec do |mocks|
38
+ # Prevents you from mocking or stubbing a method that does not exist on
39
+ # a real object. This is generally recommended, and will default to
40
+ # `true` in RSpec 4.
41
+ mocks.verify_partial_doubles = true
42
+ end
43
+
44
+ # This option will default to `:apply_to_host_groups` in RSpec 4 (and will
45
+ # have no way to turn it off -- the option exists only for backwards
46
+ # compatibility in RSpec 3). It causes shared context metadata to be
47
+ # inherited by the metadata hash of host groups and examples, rather than
48
+ # triggering implicit auto-inclusion in groups with matching metadata.
49
+ config.shared_context_metadata_behavior = :apply_to_host_groups
50
+
51
+ # The settings below are suggested to provide a good initial experience
52
+ # with RSpec, but feel free to customize to your heart's content.
53
+
54
+ # This allows you to limit a spec run to individual examples or groups
55
+ # you care about by tagging them with `:focus` metadata. When nothing
56
+ # is tagged with `:focus`, all examples get run. RSpec also provides
57
+ # aliases for `it`, `describe`, and `context` that include `:focus`
58
+ # metadata: `fit`, `fdescribe` and `fcontext`, respectively.
59
+ config.filter_run_when_matching :focus
60
+
61
+ # Allows RSpec to persist some state between runs in order to support
62
+ # the `--only-failures` and `--next-failure` CLI options. We recommend
63
+ # you configure your source control system to ignore this file.
64
+ config.example_status_persistence_file_path = "spec/examples.txt"
65
+
66
+ # Limits the available syntax to the non-monkey patched syntax that is
67
+ # recommended. For more details, see:
68
+ # - http://rspec.info/blog/2012/06/rspecs-new-expectation-syntax/
69
+ # - http://www.teaisaweso.me/blog/2013/05/27/rspecs-new-message-expectation-syntax/
70
+ # - http://rspec.info/blog/2014/05/notable-changes-in-rspec-3/#zero-monkey-patching-mode
71
+ config.disable_monkey_patching!
72
+
73
+ # This setting enables warnings. It's recommended, but in some cases may
74
+ # be too noisy due to issues in dependencies.
75
+ # config.warnings = true
76
+
77
+ # Many RSpec users commonly either run the entire suite or an individual
78
+ # file, and it's useful to allow more verbose output when running an
79
+ # individual spec file.
80
+ if config.files_to_run.one?
81
+ # Use the documentation formatter for detailed output,
82
+ # unless a formatter has already been configured
83
+ # (e.g. via a command-line flag).
84
+ config.default_formatter = "doc"
85
+ end
86
+
87
+ # Print the 10 slowest examples and example groups at the
88
+ # end of the spec run, to help surface which specs are running
89
+ # particularly slow.
90
+ config.profile_examples = 10
91
+
92
+ # Run specs in random order to surface order dependencies. If you find an
93
+ # order dependency and want to debug it, you can fix the order by providing
94
+ # the seed, which is printed after each run.
95
+ # --seed 1234
96
+ config.order = :random
97
+
98
+ # Seed global randomization in this process using the `--seed` CLI option.
99
+ # Setting this allows you to use `--seed` to deterministically reproduce
100
+ # test failures related to randomization by passing the same `--seed` value
101
+ # as the one that triggered the failure.
102
+ Kernel.srand config.seed
103
+
104
+ end
metadata ADDED
@@ -0,0 +1,187 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: medusa-crawler
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0.pre.1
5
+ platform: ruby
6
+ authors:
7
+ - Mauro Asprea
8
+ - Chris Kite
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain:
12
+ - |
13
+ -----BEGIN CERTIFICATE-----
14
+ MIIERDCCAqygAwIBAgIBATANBgkqhkiG9w0BAQsFADAmMSQwIgYDVQQDDBttYXVy
15
+ b2FzcHJlYS9EQz1nbWFpbC9EQz1jb20wHhcNMjAwODA2MTEwNDAzWhcNMjEwODA2
16
+ MTEwNDAzWjAmMSQwIgYDVQQDDBttYXVyb2FzcHJlYS9EQz1nbWFpbC9EQz1jb20w
17
+ ggGiMA0GCSqGSIb3DQEBAQUAA4IBjwAwggGKAoIBgQDNsvJN5orxdj+eEUiSttpC
18
+ 6N6IeTK/btk65TAbZlY7/9MVIHM/Ya3tUSWoQA6KURsksV+lUbJm0MC4pwm5y46T
19
+ p+Q6/eDz8gmofian4X+y3inyzYqmvbM2t2mPcO+Mj1NjSfGn56jqk0ey32xL7cz3
20
+ YZqx9LnkUUuqzn4QM+R8LfGUWXe9nlkMoLoNGw+X1XB8bPYoQo2YpofILEWcCLJl
21
+ NnFjcjIzQuE3NDaPIBX/cXPDpvuKNVXw51My35pQb5uW4Aa7sRZAz1MCDoFA1+XO
22
+ 9czVf+zE+YsMtuqruWOI08TEPD2kMeiDqMc1fbPkLN9Rhe2Nxj8fTK1Ptyz0mQJL
23
+ x5cQZT2XK+nmZKiU/z+91XU17BNd+LEERU6ZmjkGQKUH/Gj060kQWMnw2sOsZJS9
24
+ VVU/raletJxhODeQbShcrxUR6VSGDkgQa4G/rqtPjUQ9AXYeNeVBg2aE4DRTRFhy
25
+ gYsyE9DbAluV7BcJWO+GKntyXqhu14dnATCfSFI3WMsCAwEAAaN9MHswCQYDVR0T
26
+ BAIwADALBgNVHQ8EBAMCBLAwHQYDVR0OBBYEFAjiCg2cEkNDWhG4P1PgDwATcmgR
27
+ MCAGA1UdEQQZMBeBFW1hdXJvYXNwcmVhQGdtYWlsLmNvbTAgBgNVHRIEGTAXgRVt
28
+ YXVyb2FzcHJlYUBnbWFpbC5jb20wDQYJKoZIhvcNAQELBQADggGBAG4S5uDseGtN
29
+ EqxvogGfJ+h7Pg5pdDsMVFWXOPN9IztSHE+88Ypv9b97zNk/vBztj3nwkUuhEczv
30
+ GZeI9/F72j1+8ARWdNgHalsrTHcFWaGEYNkm06EGh21dtO939Uqjg9auv9thDD/C
31
+ 4jlEii1EvaGIaMhEdONAiRyOj0gEagw1AU5ItSxoWHFaaCEQSqXRskZMpw6fDZrE
32
+ jnJPYx5I8axku6D+/nlP3GeuMNwlqzTg1YblLDanzEdI7Yet41MgMwLcor6Z4BXb
33
+ xKDzHfGjkQfEpEvwzqSi0rAbrMv68FRybNxgB/3gwgSiEaeFeE4CFX0gm1F3m28m
34
+ Z9rgjNU1SA9dSTGLY14K3rjtn71PMlP64Ci+QJ0HA8V0+cpx1Lkn3WsywgILV16a
35
+ g4G6EZGbKCMwJDC0Wtmrygr7+THZVQlBs0ljTdrN8GXsuI9W52VlZctZQXEuoboH
36
+ mpXw1d3WewNciml1VaOG782DKqZvT0i19V5LnZzoGzmU2q3ZJw7jCw==
37
+ -----END CERTIFICATE-----
38
+ date: 2020-08-06 00:00:00.000000000 Z
39
+ dependencies:
40
+ - !ruby/object:Gem::Dependency
41
+ name: moneta
42
+ requirement: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - "~>"
45
+ - !ruby/object:Gem::Version
46
+ version: '1.3'
47
+ - - ">="
48
+ - !ruby/object:Gem::Version
49
+ version: 1.3.0
50
+ type: :runtime
51
+ prerelease: false
52
+ version_requirements: !ruby/object:Gem::Requirement
53
+ requirements:
54
+ - - "~>"
55
+ - !ruby/object:Gem::Version
56
+ version: '1.3'
57
+ - - ">="
58
+ - !ruby/object:Gem::Version
59
+ version: 1.3.0
60
+ - !ruby/object:Gem::Dependency
61
+ name: nokogiri
62
+ requirement: !ruby/object:Gem::Requirement
63
+ requirements:
64
+ - - "~>"
65
+ - !ruby/object:Gem::Version
66
+ version: '1.3'
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ version: 1.3.0
70
+ type: :runtime
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ requirements:
74
+ - - "~>"
75
+ - !ruby/object:Gem::Version
76
+ version: '1.3'
77
+ - - ">="
78
+ - !ruby/object:Gem::Version
79
+ version: 1.3.0
80
+ - !ruby/object:Gem::Dependency
81
+ name: robotex
82
+ requirement: !ruby/object:Gem::Requirement
83
+ requirements:
84
+ - - "~>"
85
+ - !ruby/object:Gem::Version
86
+ version: '1.0'
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: 1.0.0
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '1.0'
97
+ - - ">="
98
+ - !ruby/object:Gem::Version
99
+ version: 1.0.0
100
+ description: |-
101
+ ## Medusa: a Ruby crawler framework
102
+
103
+ Medusa is a ruby framework to crawl and collect useful information about the pages it visits.
104
+ It is versatile, allowing you to write your own specialized tasks quickly and easily.
105
+
106
+ #### Features
107
+
108
+ - Choose the links to follow on each page with `focus_crawl()`
109
+ - Multi-threaded design for high performance
110
+ - Tracks 301 HTTP redirects
111
+ - Allows exclusion of URLs based on regular expressions
112
+ - HTTPS support
113
+ - Records response time for each page
114
+ - Obeys robots.txt
115
+ - In-memory or persistent storage of pages during crawl using Moneta adapters.
116
+ - Inherits OpenURI behavior (redirects, automatic charset and encoding detection, proxy configuration options).
117
+ email:
118
+ executables:
119
+ - medusa
120
+ extensions: []
121
+ extra_rdoc_files:
122
+ - README.md
123
+ files:
124
+ - CHANGELOG.md
125
+ - CONTRIBUTORS.md
126
+ - LICENSE.txt
127
+ - README.md
128
+ - Rakefile
129
+ - VERSION
130
+ - bin/medusa
131
+ - lib/medusa.rb
132
+ - lib/medusa/cli.rb
133
+ - lib/medusa/cli/count.rb
134
+ - lib/medusa/cli/cron.rb
135
+ - lib/medusa/cli/pagedepth.rb
136
+ - lib/medusa/cli/serialize.rb
137
+ - lib/medusa/cli/url_list.rb
138
+ - lib/medusa/cookie_store.rb
139
+ - lib/medusa/core.rb
140
+ - lib/medusa/exceptions.rb
141
+ - lib/medusa/http.rb
142
+ - lib/medusa/page.rb
143
+ - lib/medusa/page_store.rb
144
+ - lib/medusa/storage.rb
145
+ - lib/medusa/storage/base.rb
146
+ - lib/medusa/storage/exceptions.rb
147
+ - lib/medusa/storage/moneta.rb
148
+ - lib/medusa/tentacle.rb
149
+ - lib/medusa/version.rb
150
+ - spec/fakeweb_helper.rb
151
+ - spec/medusa_helper.rb
152
+ - spec/medusa_spec.rb
153
+ - spec/spec_helper.rb
154
+ homepage: https://github.com/brutuscat/medusa-crawler
155
+ licenses:
156
+ - MIT
157
+ metadata:
158
+ bug_tracker_uri: https://github.com/brutuscat/medusa-crawler/issues
159
+ source_code_uri: https://github.com/brutuscat/medusa-crawler/tree/v1.0.0.pre.1
160
+ post_install_message:
161
+ rdoc_options:
162
+ - "-m"
163
+ - README.md
164
+ - "-t"
165
+ - Medusa
166
+ require_paths:
167
+ - lib
168
+ required_ruby_version: !ruby/object:Gem::Requirement
169
+ requirements:
170
+ - - ">="
171
+ - !ruby/object:Gem::Version
172
+ version: '0'
173
+ required_rubygems_version: !ruby/object:Gem::Requirement
174
+ requirements:
175
+ - - ">"
176
+ - !ruby/object:Gem::Version
177
+ version: 1.3.1
178
+ requirements: []
179
+ rubygems_version: 3.1.2
180
+ signing_key:
181
+ specification_version: 4
182
+ summary: Medusa is a ruby crawler framework
183
+ test_files:
184
+ - spec/spec_helper.rb
185
+ - spec/medusa_helper.rb
186
+ - spec/fakeweb_helper.rb
187
+ - spec/medusa_spec.rb