medusa-crawler 1.0.0.pre.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- checksums.yaml.gz.sig +0 -0
- data.tar.gz.sig +0 -0
- data/CHANGELOG.md +20 -0
- data/CONTRIBUTORS.md +22 -0
- data/LICENSE.txt +20 -0
- data/README.md +48 -0
- data/Rakefile +24 -0
- data/VERSION +1 -0
- data/bin/medusa +4 -0
- data/lib/medusa.rb +2 -0
- data/lib/medusa/cli.rb +24 -0
- data/lib/medusa/cli/count.rb +22 -0
- data/lib/medusa/cli/cron.rb +90 -0
- data/lib/medusa/cli/pagedepth.rb +32 -0
- data/lib/medusa/cli/serialize.rb +35 -0
- data/lib/medusa/cli/url_list.rb +41 -0
- data/lib/medusa/cookie_store.rb +35 -0
- data/lib/medusa/core.rb +305 -0
- data/lib/medusa/exceptions.rb +5 -0
- data/lib/medusa/http.rb +202 -0
- data/lib/medusa/page.rb +229 -0
- data/lib/medusa/page_store.rb +160 -0
- data/lib/medusa/storage.rb +8 -0
- data/lib/medusa/storage/base.rb +81 -0
- data/lib/medusa/storage/exceptions.rb +15 -0
- data/lib/medusa/storage/moneta.rb +42 -0
- data/lib/medusa/tentacle.rb +39 -0
- data/lib/medusa/version.rb +3 -0
- data/spec/fakeweb_helper.rb +85 -0
- data/spec/medusa_helper.rb +5 -0
- data/spec/medusa_spec.rb +14 -0
- data/spec/spec_helper.rb +104 -0
- metadata +187 -0
- metadata.gz.sig +0 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: 2ad0d3a02d8345991481a05954b6776801c51c20b61b38b37b61ab91659ea420
+  data.tar.gz: 84bc2865f48f987e60bde0fcf60fd6cef2b0b019e374ed64183a8f9cae7c3ee5
+SHA512:
+  metadata.gz: ca1ccfad54b1337fa5c6cd5975888efe8ed237db0e5b3eecd3d993fdcf0a0afc5d9e18c24f54f87918dfdce9e92bb7304d73c1ee276bee7c6a2622ab0b3560e4
+  data.tar.gz: 64f0701dfed21963879edc810ee8eb2d444fccb19543566774dde59f4973866b9bbdbbe9418eb766b6b6fd8f317539347da4b26edcfb56b098d0eb56f6132779
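These are the standard RubyGems per-gem checksums: a `.gem` file is a tar archive whose `metadata.gz` and `data.tar.gz` members are hashed with SHA256 and SHA512. A sketch of verifying one digest by hand, assuming the gem has been unpacked with `tar` into the current directory:

```ruby
require 'digest'

# Compare the SHA256 recorded above against a locally extracted metadata.gz.
expected = '2ad0d3a02d8345991481a05954b6776801c51c20b61b38b37b61ab91659ea420'
actual   = Digest::SHA256.file('metadata.gz').hexdigest
puts(actual == expected ? 'metadata.gz: OK' : 'metadata.gz: MISMATCH')
```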
checksums.yaml.gz.sig
ADDED
Binary file
data.tar.gz.sig
ADDED
Binary file
data/CHANGELOG.md
ADDED
@@ -0,0 +1,20 @@
+
+## Anemone forked into Medusa (2014-12-13)
+
+Features:
+
+- Switch to use `OpenURI` instead of `net/http`, gaining out-of-the-box support for:
+  - HTTP basic auth options
+  - Proxy configuration options
+  - Automatic string encoding detection based on charset
+  - Connection read timeout option
+  - Ability to control the RETRY_LIMIT upon connection errors
+
+Changes:
+
+- Renamed Anemone to Medusa
+- Revamped the [README](https://github.com/brutuscat/medusa/blob/master/README.md) file
+- Revamped the [CHANGELOG](https://github.com/brutuscat/medusa/blob/master/CHANGELOG.md) file
+- Revamped the [CONTRIBUTORS](https://github.com/brutuscat/medusa/blob/master/CONTRIBUTORS.md) file
+
+> Refer to the [Anemone changelog](https://github.com/chriskite/anemone/blob/next/CHANGELOG.rdoc) for a trip back in time.
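Several of the options listed above surface in this gem's own CLI scripts further down in this diff; `read_timeout` and `obey_robots_txt` appear verbatim in `pagedepth.rb`, so a minimal crawl passing them might look like this sketch:

```ruby
require 'medusa'

# Sketch only: read_timeout and obey_robots_txt are taken from the bundled
# pagedepth.rb script below; other OpenURI-derived options are not shown here.
Medusa.crawl('https://example.com/', read_timeout: 3, obey_robots_txt: true) do |medusa|
  medusa.after_crawl { |pages| puts "Crawled #{pages.size} pages" }
end
```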
data/CONTRIBUTORS.md
ADDED
@@ -0,0 +1,22 @@
+# Contributors
+
+Many thanks to the following folks who have contributed code to Medusa (a fork of Anemone).
+
+In no particular order:
+
+
+| Person | GitHub | Twitter |
+| ------------- |:-------------:| --------:|
+| Chris Kite | [chriskite](https://github.com/chriskite) | |
+| Marc Seeger | | |
+| Joost Baaij | | |
+| Laurent Arnoud | | |
+| Cheng Huang | [zzzhc](https://github.com/zzzhc) | |
+| Mauro Asprea | [brutuscat](https://github.com/brutuscat) | [@brutuscat](https://twitter.com/brutuscat) |
+| Alex Pooley | | |
+| Luca Pradovera | [polysics](https://github.com/polysics) | |
+| Sergey Kojin | | |
+| Richard Paul | | |
+
+
+> If you are submitting a [PR](https://help.github.com/articles/using-pull-requests/), feel free to add yourself to this table.
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
+Copyright (c) 2009 Vertive, Inc.
+Copyright (c) 2020 Mauro Asprea <mauroasprea@gmail.com>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,48 @@
+# Medusa: a ruby crawler framework
+
+Medusa is a framework to crawl and collect useful information about the pages it visits. It is versatile, allowing you to write your own specialized tasks quickly and easily.
+
+
+## Features
+
+- Choose the links to follow on each page with `focus_crawl()`
+- Multi-threaded design for high performance
+- Tracks 301 HTTP redirects
+- Allows exclusion of URLs based on regular expressions
+- HTTPS support
+- Records response time for each page
+- Obeys robots.txt
+- In-memory or persistent storage of pages during crawl, using [Moneta](https://github.com/moneta-rb/moneta) adapters
+- Inherits OpenURI behavior (redirects, automatic charset and encoding detection, proxy configuration options)
+
+## Examples
+
+See the scripts under the <tt>lib/medusa/cli</tt> directory for examples of several useful Medusa tasks.
+
+## TODO
+
+- [x] Simplify the storage module using [Moneta](https://github.com/minad/moneta)
+- [x] Test against a multiverse of Ruby versions and runtimes
+- [ ] Solve memory issues with a persistent Queue
+- [ ] Improve docs & examples
+- [ ] Allow control of the crawler, e.g. "stop", "resume"
+- [ ] Improve logging facilities to collect stats, catch errors & failures
+- [ ] Add the concept of "bots" or drivers to interact with pages (e.g. capybara)
+
+**Do you have an idea? [Open an issue so we can discuss it](https://github.com/brutuscat/medusa-crawler/issues/new)**
+
+## Requirements
+
+- moneta
+- nokogiri
+- robotex
+
+## Development
+
+To test and develop this gem, the additional requirements are:
+- rspec
+- webmock
+
+## Disclaimer
+
+Medusa is a revamped version of the defunct Anemone gem.
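To make the README's feature list concrete, here is a minimal sketch combining the hooks it names: `focus_crawl` comes from the Features list, while `skip_links_like` and `on_every_page` appear in the CLI scripts below. `page.links` is assumed to return the URIs found on a page, as the Anemone lineage suggests:

```ruby
require 'medusa'

# Hedged sketch of the crawl hooks named in the README.
Medusa.crawl('https://example.com/') do |medusa|
  medusa.skip_links_like %r{/login}, %r{\.pdf$}       # exclude URLs by regexp
  medusa.focus_crawl { |page| page.links.first(10) }  # pick which links to follow
  medusa.on_every_page { |page| puts page.url }       # runs for each visited page
end
```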
data/Rakefile
ADDED
@@ -0,0 +1,24 @@
+require "bundler/gem_tasks"
+require "rspec/core/rake_task"
+require 'rdoc/task'
+
+desc "Run all specs"
+RSpec::Core::RakeTask.new(:rspec) do |spec|
+  spec.pattern = 'spec/**/*_spec.rb'
+end
+
+RSpec::Core::RakeTask.new(:rcov) do |spec|
+  spec.pattern = 'spec/**/*_spec.rb'
+  spec.rcov = true
+end
+
+task :default => :rspec
+
+Rake::RDocTask.new(:rdoc) do |rdoc|
+  version = File.exist?('VERSION') ? File.read('VERSION') : ""
+
+  rdoc.rdoc_dir = 'rdoc'
+  rdoc.title = "medusa #{version}"
+  rdoc.rdoc_files.include('README*')
+  rdoc.rdoc_files.include('lib/**/*.rb')
+end
data/VERSION
ADDED
@@ -0,0 +1 @@
+1.0.0.pre.1
data/bin/medusa
ADDED
data/lib/medusa.rb
ADDED
data/lib/medusa/cli.rb
ADDED
@@ -0,0 +1,24 @@
+module Medusa
+  module CLI
+    COMMANDS = %w[count cron pagedepth serialize url-list]
+
+    def self.run
+      command = ARGV.shift
+
+      if COMMANDS.include? command
+        load "medusa/cli/#{command.tr('-', '_')}.rb"
+      else
+        puts <<-INFO
+Medusa is a web spider framework that can collect
+useful information about pages it visits.
+
+Usage:
+  medusa <command> [arguments]
+
+Commands:
+  #{COMMANDS.join(', ')}
+        INFO
+      end
+    end
+  end
+end
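`Medusa::CLI.run` shifts the command name off `ARGV` and `load`s the matching script, translating dashes to underscores (so `url-list` loads `medusa/cli/url_list.rb`). A hypothetical driver, assuming the `bin/medusa` executable (whose body is not shown in this diff) simply invokes this method:

```ruby
require 'medusa/cli'

# Hypothetical invocation: equivalent to running `medusa count https://example.com/`
# from the shell. The loaded command script then picks the URL up from ARGV.
ARGV.replace(['count', 'https://example.com/'])
Medusa::CLI.run
```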
data/lib/medusa/cli/count.rb
ADDED
@@ -0,0 +1,22 @@
+require 'medusa'
+
+begin
+  # make sure that the first option is a URL we can crawl
+  url = URI(ARGV[0])
+rescue
+  puts <<-INFO
+Usage:
+  medusa count <url>
+
+Synopsis:
+  Crawls a site starting at the given URL and outputs the total number
+  of unique pages on the site.
+INFO
+  exit(0)
+end
+
+Medusa.crawl(url) do |medusa|
+  medusa.after_crawl do |pages|
+    puts pages.uniq!.size
+  end
+end
data/lib/medusa/cli/cron.rb
ADDED
@@ -0,0 +1,90 @@
+require 'medusa'
+require 'optparse'
+require 'ostruct'
+
+options = OpenStruct.new
+options.relative = false
+options.output_file = 'urls.txt'
+
+begin
+  # make sure that the last argument is a URL we can crawl
+  root = URI(ARGV.last)
+rescue
+  puts <<-INFO
+Usage:
+  medusa cron [options] <url>
+
+Synopsis:
+  Combination of `count`, `pagedepth` and `url-list` commands.
+  Performs pagedepth, url list, and count functionality.
+  Outputs results to STDOUT and link list to file (urls.txt).
+  Meant to be run daily as a cron job.
+
+Options:
+  -r, --relative         Output relative URLs (rather than absolute)
+  -o, --output filename  Filename to save URL list to. Defaults to urls.txt.
+INFO
+  exit(0)
+end
+
+# parse command-line options
+opts = OptionParser.new
+opts.on('-r', '--relative') { options.relative = true }
+opts.on('-o', '--output filename') {|o| options.output_file = o }
+opts.parse!(ARGV)
+
+Medusa.crawl(root, {:discard_page_bodies => true}) do |medusa|
+
+  medusa.after_crawl do |pages|
+    puts "Crawl results for #{root}\n"
+
+    # print a list of 404's
+    not_found = []
+    pages.each_value do |page|
+      url = page.url.to_s
+      not_found << url if page.not_found?
+    end
+    unless not_found.empty?
+      puts "\n404's:"
+
+      missing_links = pages.urls_linking_to(not_found)
+      missing_links.each do |url, links|
+        if options.relative
+          puts URI(url).path.to_s
+        else
+          puts url
+        end
+        links.slice(0..10).each do |u|
+          u = u.path if options.relative
+          puts " linked from #{u}"
+        end
+
+        puts " ..." if links.size > 10
+      end
+
+      print "\n"
+    end
+
+    # remove redirect aliases, and calculate pagedepths
+    pages = pages.shortest_paths!(root).uniq
+    depths = pages.values.inject({}) do |depths, page|
+      depths[page.depth] ||= 0
+      depths[page.depth] += 1
+      depths
+    end
+
+    # print the page count
+    puts "Total pages: #{pages.size}\n"
+
+    # print a list of depths
+    depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+
+    # output a list of urls to file
+    file = open(options.output_file, 'w')
+    pages.each_key do |url|
+      url = options.relative ? url.path.to_s : url.to_s
+      file.puts url
+    end
+  end
+
+end
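A side note on the depth histogram that the `inject` block above builds: on modern Rubies (2.7+), `Enumerable#tally` expresses the same thing more directly. A sketch, assuming each page responds to `#depth` as in the script:

```ruby
# Equivalent to the inject block above: count pages per crawl depth.
depths = pages.values.map(&:depth).tally
depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
```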
data/lib/medusa/cli/pagedepth.rb
ADDED
@@ -0,0 +1,32 @@
+require 'medusa'
+
+begin
+  # make sure that the first option is a URL we can crawl
+  root = URI(ARGV[0])
+rescue
+  puts <<-INFO
+Usage:
+  medusa pagedepth <url>
+
+Synopsis:
+  Crawls a site starting at the given URL and outputs a count of
+  the number of pages at each depth of the crawl.
+INFO
+  exit(0)
+end
+
+Medusa.crawl(root, read_timeout: 3, discard_page_bodies: true, obey_robots_txt: true) do |medusa|
+  medusa.skip_links_like %r{^/c/$}, %r{^/stores/$}
+
+  medusa.after_crawl do |pages|
+    pages = pages.shortest_paths!(root).uniq!
+
+    depths = pages.values.inject({}) do |depths, page|
+      depths[page.depth] ||= 0
+      depths[page.depth] += 1
+      depths
+    end
+
+    depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+  end
+end
data/lib/medusa/cli/serialize.rb
ADDED
@@ -0,0 +1,35 @@
+require 'medusa'
+require 'optparse'
+require 'ostruct'
+
+begin
+  # make sure that the first option is a URL we can crawl
+  root = URI(ARGV[0])
+rescue
+  puts <<-INFO
+Usage:
+  medusa serialize [options] <url>
+
+Synopsis:
+  Crawls a site starting at the given URL and saves the resulting
+  PageStore object to a file using Marshal serialization.
+
+Options:
+  -o, --output filename  Filename to save PageStore to. Defaults to crawl.{Time.now}
+INFO
+  exit(0)
+end
+
+options = OpenStruct.new
+options.output_file = "crawl.#{Time.now.to_i}"
+
+# parse command-line options
+opts = OptionParser.new
+opts.on('-o', '--output filename') {|o| options.output_file = o }
+opts.parse!(ARGV)
+
+Medusa.crawl(root) do |medusa|
+  medusa.after_crawl do |pages|
+    open(options.output_file, 'w') {|f| Marshal.dump(pages, f)}
+  end
+end
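Reading the dump back is the inverse `Marshal` call; a sketch, where `crawl.1600000000` stands in for whatever output filename the run produced:

```ruby
# Reload a PageStore saved by `medusa serialize -o crawl.1600000000 <url>`.
pages = File.open('crawl.1600000000', 'rb') { |f| Marshal.load(f) }
puts pages.size
```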
data/lib/medusa/cli/url_list.rb
ADDED
@@ -0,0 +1,41 @@
+require 'medusa'
+require 'optparse'
+require 'ostruct'
+
+options = OpenStruct.new
+options.relative = false
+
+begin
+  # make sure that the last option is a URL we can crawl
+  root = URI(ARGV.last)
+rescue
+  puts <<-INFO
+Usage:
+  medusa url-list [options] <url>
+
+Synopsis:
+  Crawls a site starting at the given URL, and outputs the URL of each page
+  in the domain as they are encountered.
+
+Options:
+  -r, --relative  Output relative URLs (rather than absolute)
+INFO
+  exit(0)
+end
+
+# parse command-line options
+opts = OptionParser.new
+opts.on('-r', '--relative') { options.relative = true }
+opts.parse!(ARGV)
+
+Medusa.crawl(root, :discard_page_bodies => true) do |medusa|
+
+  medusa.on_every_page do |page|
+    if options.relative
+      puts page.url.path
+    else
+      puts page.url
+    end
+  end
+
+end
data/lib/medusa/cookie_store.rb
ADDED
@@ -0,0 +1,35 @@
+require 'delegate'
+require 'webrick/cookie'
+
+class WEBrick::Cookie
+  def expired?
+    !!expires && expires < Time.now
+  end
+end
+
+module Medusa
+  class CookieStore < DelegateClass(Hash)
+
+    def initialize(cookies = nil)
+      @cookies = {}
+      cookies.each { |name, value| @cookies[name] = WEBrick::Cookie.new(name, value) } if cookies
+      super(@cookies)
+    end
+
+    def merge!(set_cookie_str)
+      begin
+        cookie_hash = WEBrick::Cookie.parse_set_cookies(set_cookie_str).inject({}) do |hash, cookie|
+          hash[cookie.name] = cookie if !!cookie
+          hash
+        end
+        @cookies.merge! cookie_hash
+      rescue
+      end
+    end
+
+    def to_s
+      @cookies.values.reject { |cookie| cookie.expired? }.map { |cookie| "#{cookie.name}=#{cookie.value}" }.join(';')
+    end
+
+  end
+end
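A short usage sketch built directly on the class above; the `Set-Cookie` string is illustrative:

```ruby
require 'medusa/cookie_store'

store = Medusa::CookieStore.new('session' => 'abc123')  # seed from a name/value hash
store.merge!('lang=en; path=/')                         # parse a Set-Cookie header value
puts store.to_s   # serializes unexpired cookies, e.g. "session=abc123;lang=en"
```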