medusa-crawler 1.0.0.pre.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,15 @@
1
module Medusa
  module Storage
    # Exception classes for the storage layer.
    #
    # Every class below subclasses +Error+. That constant is not defined in
    # this file; it is looked up through the enclosing namespaces when the file
    # is loaded, so the file defining it must be required first.
    # NOTE(review): the per-class descriptions are inferred from the class
    # names only -- confirm against the code that raises them.

    # Presumably a catch-all storage failure.
    class GenericError < Error
    end

    # Presumably signals a problem connecting to the backing store.
    class ConnectionError < Error
    end

    # Presumably signals a failed read from the store.
    class RetrievalError < Error
    end

    # Presumably signals a failed write to the store.
    class InsertionError < Error
    end

    # Presumably signals a failure while closing the store.
    class CloseError < Error
    end
  end
end
@@ -0,0 +1,42 @@
1
+ require 'moneta'
2
+ require 'forwardable'
3
+
4
module Medusa
  module Storage
    # Hash-like wrapper around a Moneta store.
    #
    # Basic element access (+[]+, +[]=+, +delete+, +key?+, +clear+, +close+) is
    # delegated straight to the underlying store; iteration helpers are built
    # on top of the store's +each_key+.
    class Moneta
      extend Forwardable

      # Options passed to ::Moneta.new unless the caller overrides them.
      DEFAULT_OPTIONS = { threadsafe: true, prefix: 'medusa' }.freeze

      def_delegators :@moneta, :[], :[]=, :delete, :key?, :clear, :close

      alias has_key? key?

      # Open the underlying store.
      #
      # @param name [Object] adapter name, forwarded unchanged to ::Moneta.new
      # @param options [Hash] adapter options; merged over DEFAULT_OPTIONS, so
      #   caller-supplied keys win
      def initialize(name, options = {})
        @moneta = ::Moneta.new(name, DEFAULT_OPTIONS.merge(options))
      end

      # Yield every key together with its stored value.
      #
      # @yieldparam key [Object]
      # @yieldparam value [Object] result of +fetch(key)+ on the store
      # @return [self] when a block is given
      # @return [Enumerator] when no block is given (previously this raised
      #   LocalJumpError as soon as the store contained a key)
      def each
        return enum_for(:each) unless block_given?

        @moneta.each_key { |key| yield key, @moneta.fetch(key) }
        self
      end

      # Number of keys in the store.
      #
      # The key enumerator may not know its size up front (+size+ returns
      # +nil+); in that case the keys are counted by iterating them.
      #
      # @return [Integer]
      def size
        @moneta.each_key.size || @moneta.each_key.count
      end

      # @return [Array] all keys, sorted
      def keys
        @moneta.each_key.to_a.sort
      end

      # Store every pair of +hash+. An empty hash is skipped without touching
      # the underlying store.
      #
      # @param hash [Hash]
      # @return [self]
      def merge!(hash)
        @moneta.merge!(hash) unless hash.empty?
        self
      end
    end
  end
end
@@ -0,0 +1,39 @@
1
+ require 'medusa/http'
2
+
3
module Medusa
  # Worker that takes links from one queue, fetches them over HTTP and pushes
  # the resulting pages onto another queue.
  class Tentacle
    #
    # Create a new Tentacle
    #
    # link_queue:: queue the worker reads [link, referer, depth] entries from
    # page_queue:: queue the fetched pages are appended to
    # opts:: options hash; passed to Medusa::HTTP.new and read for :delay
    #
    def initialize(link_queue, page_queue, opts = {})
      @link_queue = link_queue
      @page_queue = page_queue
      @http = Medusa::HTTP.new(opts)
      @opts = opts
    end

    #
    # Gets links from @link_queue, and returns the fetched
    # Page objects into @page_queue
    #
    # Runs until the :END marker is dequeued.
    #
    def run
      loop do
        link, referer, depth = @link_queue.deq
        break if link == :END

        @http.fetch_pages(link, referer, depth).each { |page| @page_queue << page }
        delay
      end
    end

    private

    #
    # Sleep for @opts[:delay] seconds when it is a positive number.
    # A missing (nil) :delay means "no delay"; comparing nil with 0 used to
    # raise NoMethodError when the tentacle was built with the default opts.
    #
    def delay
      seconds = @opts[:delay]
      sleep(seconds) if seconds && seconds > 0
    end
  end
end
@@ -0,0 +1,3 @@
1
module Medusa
  # Gem version string. Frozen so the shared constant cannot be mutated in
  # place by code that merely reads it.
  VERSION = '1.0.0.pre.1'.freeze
end
@@ -0,0 +1,85 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'webmock/rspec'
4
+
5
+ WebMock.disable_net_connect!
6
+
7
module Medusa
  AUTH = ['user', 'pass']
  SPEC_DOMAIN = "http://www.example.com/"
  AUTH_SPEC_DOMAIN = "http://#{AUTH.join(':')}@#{URI.parse(SPEC_DOMAIN).host}/"

  # Spec helper that builds a small HTML page and registers it with WebMock
  # under SPEC_DOMAIN, so specs can request it without network access.
  class FakePage
    # Matches a redirect target that already carries an http(s) scheme.
    ABSOLUTE_URL = %r{\Ahttps?://}i.freeze

    attr_accessor :links
    attr_accessor :hrefs
    attr_accessor :body

    # Build the page and register its WebMock stub(s).
    #
    # @param name [String] path appended to SPEC_DOMAIN to form the page URL
    # @param options [Hash]
    # @option options [String, Array<String>] :links paths linked relative to
    #   SPEC_DOMAIN
    # @option options [String, Array<String>] :hrefs link targets used verbatim
    # @option options [String] :redirect redirect target; SPEC_DOMAIN is
    #   prepended unless it starts with http:// or https://
    # @option options [Boolean] :auth when truthy, the page answers 401 unless
    #   the request carries the AUTH basic-auth credentials
    # @option options [String] :base value for a <base href> tag
    # @option options [String] :content_type defaults to "text/html"
    # @option options [String] :body explicit body; generated when absent
    def initialize(name = '', options = {})
      @name = name
      @links = options.key?(:links) ? [options[:links]].flatten : []
      @hrefs = options.key?(:hrefs) ? [options[:hrefs]].flatten : []
      @redirect = options[:redirect]
      @auth = options.fetch(:auth, false)
      # nil (not '') when no base is given: an empty string is truthy in Ruby
      # and used to force an empty <base href=""> tag into every page.
      @base = options[:base]
      @content_type = options[:content_type] || "text/html"
      @body = options[:body]

      create_body unless @body
      add_to_fakeweb
    end

    # @return [String] the page URL under SPEC_DOMAIN
    def url
      SPEC_DOMAIN + @name
    end

    # @return [String] the page URL with the AUTH credentials embedded
    def auth_url
      AUTH_SPEC_DOMAIN + @name
    end

    private

    # Generate the HTML body: an optional <base> tag followed by one anchor per
    # entry in @links (prefixed with SPEC_DOMAIN) and @hrefs (verbatim).
    def create_body
      head = @base ? "<head><base href=\"#{@base}\"></head>" : ''
      anchors = Array(@links).map { |link| "<a href=\"#{SPEC_DOMAIN}#{link}\"></a>" } +
                Array(@hrefs).map { |href| "<a href=\"#{href}\"></a>" }

      @body = "<html>#{head}<body>#{anchors.join}</body></html>"
    end

    # Register the page (and, for redirects, its target) with WebMock.
    def add_to_fakeweb
      options = { body: @body, status: [200, 'OK'], headers: { 'Content-Type' => @content_type } }

      if @redirect
        options[:status] = [301, 'Moved Permanently']

        # only prepend SPEC_DOMAIN if a relative url (without an http scheme) was specified
        redirect_url = @redirect.match?(ABSOLUTE_URL) ? @redirect : SPEC_DOMAIN + @redirect
        options[:headers]['Location'] = redirect_url

        # register the page this one redirects to
        WebMock.stub_request(:get, redirect_url)
               .to_return(body: '', status: [200, 'OK'], headers: { 'Content-Type' => @content_type })
      end

      if @auth
        unauthorized_options = { body: 'Unauthorized', status: [401, 'Unauthorized'] }

        # Both stubs are registered for the same URL; the second one only
        # applies to requests carrying the AUTH credentials.
        WebMock.stub_request(:get, url).to_return(unauthorized_options)
        WebMock.stub_request(:get, url).with(basic_auth: AUTH).to_return(options)
      else
        WebMock.stub_request(:get, url).to_return(options)
      end
    end
  end
end
83
+
84
# Default root page: built with the default empty name and no options, so the
# stub it registers answers for SPEC_DOMAIN itself.
Medusa::FakePage.new
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'medusa'
4
+
5
+ SPEC_DOMAIN = 'http://www.example.com/'
@@ -0,0 +1,14 @@
1
+
2
RSpec.describe Medusa do
  # The top-level module must expose a VERSION constant.
  it "should have a version" do
    version_defined = Medusa.const_defined?('VERSION')

    expect(version_defined).to be true
  end

  # Crawling SPEC_DOMAIN returns the crawler core, whose page collection is a
  # PageStore.
  it "should return a Medusa::Core from the crawl, which has a PageStore" do
    core = Medusa.crawl(SPEC_DOMAIN)

    expect(core).to be_an_instance_of(Medusa::Core)
    expect(core.pages).to be_an_instance_of(Medusa::PageStore)
  end
end
@@ -0,0 +1,104 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'medusa_helper'
4
+
5
+ # This file was generated by the `rspec --init` command. Conventionally, all
6
+ # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
7
+ # The generated `.rspec` file contains `--require spec_helper` which will cause
8
+ # this file to always be loaded, without a need to explicitly require it in any
9
+ # files.
10
+ #
11
+ # Given that it is always loaded, you are encouraged to keep this file as
12
+ # light-weight as possible. Requiring heavyweight dependencies from this file
13
+ # will add to the boot time of your test suite on EVERY test run, even for an
14
+ # individual file that may not need all of that loaded. Instead, consider making
15
+ # a separate helper file that requires the additional dependencies and performs
16
+ # the additional setup, and require it from the spec files that actually need
17
+ # it.
18
+ #
19
+ # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
20
RSpec.configure do |config|
  # rspec-expectations settings. An alternate assertion library (wrong,
  # stdlib/minitest assertions, ...) could be selected here instead.
  config.expect_with :rspec do |expectations|
    # Include clauses defined with `chain` in the `description` and
    # `failure_message` of custom matchers, e.g.
    #     be_bigger_than(2).and_smaller_than(4).description
    #     # => "be bigger than 2 and smaller than 4"
    # rather than just "be bigger than 2". Defaults to `true` in RSpec 4.
    expectations.include_chain_clauses_in_custom_matcher_descriptions = true
  end

  # rspec-mocks settings. An alternate test double library (bogus, mocha, ...)
  # could be selected by changing the `mock_with` option.
  config.mock_with :rspec do |mocks|
    # Refuse to mock or stub a method that does not exist on the real object.
    # Defaults to `true` in RSpec 4.
    mocks.verify_partial_doubles = true
  end

  # Shared context metadata is inherited by the metadata hash of host groups
  # and examples instead of triggering implicit auto-inclusion in groups with
  # matching metadata. This is the only behavior in RSpec 4; the option exists
  # in RSpec 3 purely for backwards compatibility.
  config.shared_context_metadata_behavior = :apply_to_host_groups

  # Everything below is the suggested starting configuration; adjust freely.

  # Run only examples/groups tagged `:focus` when any are tagged; otherwise run
  # everything. `fit`, `fdescribe` and `fcontext` add the tag implicitly.
  config.filter_run_when_matching :focus

  # State persisted between runs to support the `--only-failures` and
  # `--next-failure` CLI options. Keep this file out of source control.
  config.example_status_persistence_file_path = "spec/examples.txt"

  # Restrict the DSL to the recommended non-monkey-patched syntax. See:
  #   - http://rspec.info/blog/2012/06/rspecs-new-expectation-syntax/
  #   - http://www.teaisaweso.me/blog/2013/05/27/rspecs-new-message-expectation-syntax/
  #   - http://rspec.info/blog/2014/05/notable-changes-in-rspec-3/#zero-monkey-patching-mode
  config.disable_monkey_patching!

  # Enabling warnings is recommended, but can be too noisy because of issues
  # in dependencies.
  # config.warnings = true

  # When a single spec file is run, default to the more verbose documentation
  # formatter (a formatter configured elsewhere, e.g. via a command-line flag,
  # still takes precedence).
  config.default_formatter = "doc" if config.files_to_run.one?

  # Report the 10 slowest examples and example groups after the run.
  config.profile_examples = 10

  # Randomize spec order to surface order dependencies. To debug one, rerun
  # with the seed printed after each run:
  #     --seed 1234
  config.order = :random

  # Seed global randomization from the `--seed` CLI option so that failures
  # related to randomization can be reproduced deterministically.
  Kernel.srand config.seed
end
metadata ADDED
@@ -0,0 +1,187 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: medusa-crawler
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0.pre.1
5
+ platform: ruby
6
+ authors:
7
+ - Mauro Asprea
8
+ - Chris Kite
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain:
12
+ - |
13
+ -----BEGIN CERTIFICATE-----
14
+ MIIERDCCAqygAwIBAgIBATANBgkqhkiG9w0BAQsFADAmMSQwIgYDVQQDDBttYXVy
15
+ b2FzcHJlYS9EQz1nbWFpbC9EQz1jb20wHhcNMjAwODA2MTEwNDAzWhcNMjEwODA2
16
+ MTEwNDAzWjAmMSQwIgYDVQQDDBttYXVyb2FzcHJlYS9EQz1nbWFpbC9EQz1jb20w
17
+ ggGiMA0GCSqGSIb3DQEBAQUAA4IBjwAwggGKAoIBgQDNsvJN5orxdj+eEUiSttpC
18
+ 6N6IeTK/btk65TAbZlY7/9MVIHM/Ya3tUSWoQA6KURsksV+lUbJm0MC4pwm5y46T
19
+ p+Q6/eDz8gmofian4X+y3inyzYqmvbM2t2mPcO+Mj1NjSfGn56jqk0ey32xL7cz3
20
+ YZqx9LnkUUuqzn4QM+R8LfGUWXe9nlkMoLoNGw+X1XB8bPYoQo2YpofILEWcCLJl
21
+ NnFjcjIzQuE3NDaPIBX/cXPDpvuKNVXw51My35pQb5uW4Aa7sRZAz1MCDoFA1+XO
22
+ 9czVf+zE+YsMtuqruWOI08TEPD2kMeiDqMc1fbPkLN9Rhe2Nxj8fTK1Ptyz0mQJL
23
+ x5cQZT2XK+nmZKiU/z+91XU17BNd+LEERU6ZmjkGQKUH/Gj060kQWMnw2sOsZJS9
24
+ VVU/raletJxhODeQbShcrxUR6VSGDkgQa4G/rqtPjUQ9AXYeNeVBg2aE4DRTRFhy
25
+ gYsyE9DbAluV7BcJWO+GKntyXqhu14dnATCfSFI3WMsCAwEAAaN9MHswCQYDVR0T
26
+ BAIwADALBgNVHQ8EBAMCBLAwHQYDVR0OBBYEFAjiCg2cEkNDWhG4P1PgDwATcmgR
27
+ MCAGA1UdEQQZMBeBFW1hdXJvYXNwcmVhQGdtYWlsLmNvbTAgBgNVHRIEGTAXgRVt
28
+ YXVyb2FzcHJlYUBnbWFpbC5jb20wDQYJKoZIhvcNAQELBQADggGBAG4S5uDseGtN
29
+ EqxvogGfJ+h7Pg5pdDsMVFWXOPN9IztSHE+88Ypv9b97zNk/vBztj3nwkUuhEczv
30
+ GZeI9/F72j1+8ARWdNgHalsrTHcFWaGEYNkm06EGh21dtO939Uqjg9auv9thDD/C
31
+ 4jlEii1EvaGIaMhEdONAiRyOj0gEagw1AU5ItSxoWHFaaCEQSqXRskZMpw6fDZrE
32
+ jnJPYx5I8axku6D+/nlP3GeuMNwlqzTg1YblLDanzEdI7Yet41MgMwLcor6Z4BXb
33
+ xKDzHfGjkQfEpEvwzqSi0rAbrMv68FRybNxgB/3gwgSiEaeFeE4CFX0gm1F3m28m
34
+ Z9rgjNU1SA9dSTGLY14K3rjtn71PMlP64Ci+QJ0HA8V0+cpx1Lkn3WsywgILV16a
35
+ g4G6EZGbKCMwJDC0Wtmrygr7+THZVQlBs0ljTdrN8GXsuI9W52VlZctZQXEuoboH
36
+ mpXw1d3WewNciml1VaOG782DKqZvT0i19V5LnZzoGzmU2q3ZJw7jCw==
37
+ -----END CERTIFICATE-----
38
+ date: 2020-08-06 00:00:00.000000000 Z
39
+ dependencies:
40
+ - !ruby/object:Gem::Dependency
41
+ name: moneta
42
+ requirement: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - "~>"
45
+ - !ruby/object:Gem::Version
46
+ version: '1.3'
47
+ - - ">="
48
+ - !ruby/object:Gem::Version
49
+ version: 1.3.0
50
+ type: :runtime
51
+ prerelease: false
52
+ version_requirements: !ruby/object:Gem::Requirement
53
+ requirements:
54
+ - - "~>"
55
+ - !ruby/object:Gem::Version
56
+ version: '1.3'
57
+ - - ">="
58
+ - !ruby/object:Gem::Version
59
+ version: 1.3.0
60
+ - !ruby/object:Gem::Dependency
61
+ name: nokogiri
62
+ requirement: !ruby/object:Gem::Requirement
63
+ requirements:
64
+ - - "~>"
65
+ - !ruby/object:Gem::Version
66
+ version: '1.3'
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ version: 1.3.0
70
+ type: :runtime
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ requirements:
74
+ - - "~>"
75
+ - !ruby/object:Gem::Version
76
+ version: '1.3'
77
+ - - ">="
78
+ - !ruby/object:Gem::Version
79
+ version: 1.3.0
80
+ - !ruby/object:Gem::Dependency
81
+ name: robotex
82
+ requirement: !ruby/object:Gem::Requirement
83
+ requirements:
84
+ - - "~>"
85
+ - !ruby/object:Gem::Version
86
+ version: '1.0'
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: 1.0.0
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '1.0'
97
+ - - ">="
98
+ - !ruby/object:Gem::Version
99
+ version: 1.0.0
100
+ description: |-
101
+ == Medusa: a ruby crawler framework
102
+
103
+ Medusa is a ruby framework to crawl and collect useful information about the pages it visits.
104
+ It is versatile, allowing you to write your own specialized tasks quickly and easily.
105
+
106
+ #### Features
107
+
108
+ - Choose the links to follow on each page with `focus_crawl()`
109
+ - Multi-threaded design for high performance
110
+ - Tracks 301 HTTP redirects
111
+ - Allows exclusion of URLs based on regular expressions
112
+ - HTTPS support
113
+ - Records response time for each page
114
+ - Obey robots.txt
115
+ - In-memory or persistent storage of pages during crawl using Moneta adapters.
116
+ - Inherits OpenURI behavior (redirects, automatic charset and encoding detection, proxy configuration options).
117
+ email:
118
+ executables:
119
+ - medusa
120
+ extensions: []
121
+ extra_rdoc_files:
122
+ - README.md
123
+ files:
124
+ - CHANGELOG.md
125
+ - CONTRIBUTORS.md
126
+ - LICENSE.txt
127
+ - README.md
128
+ - Rakefile
129
+ - VERSION
130
+ - bin/medusa
131
+ - lib/medusa.rb
132
+ - lib/medusa/cli.rb
133
+ - lib/medusa/cli/count.rb
134
+ - lib/medusa/cli/cron.rb
135
+ - lib/medusa/cli/pagedepth.rb
136
+ - lib/medusa/cli/serialize.rb
137
+ - lib/medusa/cli/url_list.rb
138
+ - lib/medusa/cookie_store.rb
139
+ - lib/medusa/core.rb
140
+ - lib/medusa/exceptions.rb
141
+ - lib/medusa/http.rb
142
+ - lib/medusa/page.rb
143
+ - lib/medusa/page_store.rb
144
+ - lib/medusa/storage.rb
145
+ - lib/medusa/storage/base.rb
146
+ - lib/medusa/storage/exceptions.rb
147
+ - lib/medusa/storage/moneta.rb
148
+ - lib/medusa/tentacle.rb
149
+ - lib/medusa/version.rb
150
+ - spec/fakeweb_helper.rb
151
+ - spec/medusa_helper.rb
152
+ - spec/medusa_spec.rb
153
+ - spec/spec_helper.rb
154
+ homepage: https://github.com/brutuscat/medusa-crawler
155
+ licenses:
156
+ - MIT
157
+ metadata:
158
+ bug_tracker_uri: https://github.com/brutuscat/medusa-crawler/issues
159
+ source_code_uri: https://github.com/brutuscat/medusa-crawler/tree/v1.0.0.pre.1
160
+ post_install_message:
161
+ rdoc_options:
162
+ - "-m"
163
+ - README.md
164
+ - "-t"
165
+ - Medusa
166
+ require_paths:
167
+ - lib
168
+ required_ruby_version: !ruby/object:Gem::Requirement
169
+ requirements:
170
+ - - ">="
171
+ - !ruby/object:Gem::Version
172
+ version: '0'
173
+ required_rubygems_version: !ruby/object:Gem::Requirement
174
+ requirements:
175
+ - - ">"
176
+ - !ruby/object:Gem::Version
177
+ version: 1.3.1
178
+ requirements: []
179
+ rubygems_version: 3.1.2
180
+ signing_key:
181
+ specification_version: 4
182
+ summary: Medusa is a ruby crawler framework
183
+ test_files:
184
+ - spec/spec_helper.rb
185
+ - spec/medusa_helper.rb
186
+ - spec/fakeweb_helper.rb
187
+ - spec/medusa_spec.rb