speed_spider 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +65 -0
- data/Rakefile +1 -0
- data/bin/speed_spider +6 -0
- data/lib/speed_spider/anemone_hack.rb +19 -0
- data/lib/speed_spider/cli.rb +123 -0
- data/lib/speed_spider/crawler.rb +82 -0
- data/lib/speed_spider/version.rb +3 -0
- data/lib/speed_spider.rb +17 -0
- data/speed_spider.gemspec +24 -0
- metadata +100 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: abdb6ebdea8dbe7f00e7c9e487641a45d0d47b49
+  data.tar.gz: 3b06cf74adb37f274516b16215ae71627a36ec69
+SHA512:
+  metadata.gz: fb6e3f517125ab47b511abaf4a1e09d3ecfeab330557c7fe60f165f3350de3434c0d57f3534c1134cefb97a340869afc2abafe8ec5a687e061340bbeccf36cc4
+  data.tar.gz: 5bc260f898cf66898073fd547696b237a6ad89f45fd2650e5518983bcefa440bc8e4d5cf31da1ad4110ac3c0614e96aecf05389b19dde4f5e2e0c902bae48613
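The two digest families above are computed over the gem's inner archives, metadata.gz and data.tar.gz. A minimal verification sketch in Ruby, assuming a locally downloaded speed_spider-0.0.1.gem (the file name and path are assumptions, not part of this diff):

    require 'digest'
    require 'rubygems/package'

    # A .gem file is a tar archive; checksums.yaml records SHA1/SHA512 digests
    # of two of its members, metadata.gz and data.tar.gz.
    File.open('speed_spider-0.0.1.gem', 'rb') do |gem_file|
      Gem::Package::TarReader.new(gem_file).each do |entry|
        next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)
        body = entry.read
        puts "#{entry.full_name}:"
        puts "  SHA1:   #{Digest::SHA1.hexdigest(body)}"
        puts "  SHA512: #{Digest::SHA512.hexdigest(body)}"
      end
    end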
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
+Copyright (c) 2013 Ryan Wang
+
+MIT License
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,65 @@
+# SpeedSpider
+
+A simple and speedy web spider for downloading pages.
+
+SpeedSpider is based on the Ruby spider framework [Anemone][1]. It is easy to use and very fast, since it uses threads for page fetching.
+
+## What kind of files will be downloaded
+
+### Links in HTML pages
+
+* link, xpath: `//a[@href]`
+* stylesheet, xpath: `//link[@href]`
+* javascript, xpath: `//script[@src]`
+* iframe file, xpath: `//iframe[@src]`
+* image file, xpath: `//img[@src]`
+
+### URLs in stylesheet files
+
+* URLs matching the pattern `url\((.*)\)`
+
+## Installation
+
+Install it with RubyGems:
+
+    gem install speed_spider
+
+## Usage
+    Usage: speed_spider [options] start_url
+
+    options:
+      -S, --slient                    slient output
+      -D, --dir String                directory for download files to save to. "download" by default
+      -b, --base_url String           any url not starts with base_url will not be saved
+      -t, --threads Integer           threads to run for fetching pages, 4 by default
+      -u, --user_agent String         words for request header USER_AGENT
+      -d, --delay Integer             delay between requests
+      -o, --obey_robots_text          obey robots exclustion protocol
+      -l, --depth_limit               limit the depth of the crawl
+      -r, --redirect_limit Integer    number of times HTTP redirects will be followed
+      -a, --accept_cookies            accept cookies from the server and send them back?
+      -s, --skip_query_strings        skip any link with a query string? e.g. http://foo.com/?u=user
+      -H, --proxy_host String         proxy server hostname
+      -P, --proxy_port Integer        proxy server port number
+      -T, --read_timeout Integer      HTTP read timeout in seconds
+      -V, --version                   Show version
+
+## Example
+
+    speed_spider http://twitter.github.io/bootstrap/
+
+It will download all files within the same domain as `twitter.github.io` and save them to `download/twitter.github.io/`.
+
+    speed_spider -b http://ruby-doc.org/core-2.0/ http://ruby-doc.org/core-2.0/
+
+It will only download URLs starting with http://ruby-doc.org/core-2.0/. Note that `assets` files such as images, CSS, JS and fonts do not obey the `base_url` rule.
+
+## Contributing
+
+1. Fork it
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Add some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create a new Pull Request
+
+[1]: http://anemone.rubyforge.org/
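The README drives the spider through the command line only, but the same crawl can be started from Ruby via the Crawler class shown later in this diff. A rough sketch, assuming the gem and the libraries it requires (including the debugger gem referenced in lib/speed_spider.rb) are installed; the option keys mirror the defaults in lib/speed_spider/cli.rb:

    require 'speed_spider'

    # Roughly equivalent to:
    #   speed_spider -b http://ruby-doc.org/core-2.0/ http://ruby-doc.org/core-2.0/
    options = {
      :base_url => 'http://ruby-doc.org/core-2.0/',  # only URLs under this prefix are saved
      :dir      => 'download',                       # where fetched files are written
      :threads  => 4,                                # Anemone Tentacle threads
      :verbose  => true                              # print each saved path
    }

    SpeedSpider::Crawler.new('http://ruby-doc.org/core-2.0/', options).crawl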
data/Rakefile
ADDED
@@ -0,0 +1 @@
+require "bundler/gem_tasks"
data/bin/speed_spider
ADDED
data/lib/speed_spider/anemone_hack.rb
ADDED
@@ -0,0 +1,19 @@
+module Anemone
+  class Core
+    def assets?(link)
+      %w(js css jpg jpeg png bmp gif svg ttf woff eot).any? do |e|
+        /#{e}/i =~ File.extname(link.path).split('.').pop
+      end
+    end
+    #
+    # Returns +true+ if *link* should not be visited because
+    # its URL matches a skip_link pattern.
+    #
+    def skip_link_with_hack?(link)
+      skip_link_without_hack?(link) or !assets?(link) and !link.to_s.start_with? @opts[:base_url]
+    end
+
+    alias_method :skip_link_without_hack?, :skip_link?
+    alias_method :skip_link?, :skip_link_with_hack?
+  end
+end
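The monkey patch above relies on the classic around-alias idiom: the original Anemone::Core#skip_link? is preserved under a new name, and the public name is re-pointed at a wrapper that calls it. A self-contained illustration of the same pattern, using a hypothetical class that is not part of this gem:

    class Greeter
      def greet(name)
        "hello #{name}"
      end

      # Wrapper that extends the original behaviour, mirroring skip_link_with_hack?
      def greet_with_shout(name)
        greet_without_shout(name).upcase
      end

      alias_method :greet_without_shout, :greet   # keep the original reachable
      alias_method :greet, :greet_with_shout      # re-point the public name at the wrapper
    end

    puts Greeter.new.greet('anemone')   # => "HELLO ANEMONE"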
data/lib/speed_spider/cli.rb
ADDED
@@ -0,0 +1,123 @@
+require 'speed_spider/crawler'
+require 'optparse'
+require 'ostruct'
+
+module SpeedSpider
+  class Cli
+    attr_reader :options, :option_parser
+
+    def initialize
+      @options = {
+        # only url start with base_url will save to local
+        :base_url => '',
+        # directory for downloaded files to save to
+        :dir => 'download',
+        # run 4 Tentacle threads to fetch pages
+        :threads => 4,
+        # verbose output
+        :verbose => true,
+        # don't throw away the page response body after scanning it for links
+        :discard_page_bodies => false,
+        # identify self as WebCrawler/VERSION
+        :user_agent => "SpeedSpider/#{SpeedSpider::VERSION}",
+        # no delay between requests
+        :delay => 0,
+        # don't obey the robots exclusion protocol
+        :obey_robots_txt => false,
+        # by default, don't limit the depth of the crawl
+        :depth_limit => false,
+        # number of times HTTP redirects will be followed
+        :redirect_limit => 5,
+        # storage engine defaults to Hash in +process_options+ if none specified
+        :storage => nil,
+        # Hash of cookie name => value to send with HTTP requests
+        :cookies => nil,
+        # accept cookies from the server and send them back?
+        :accept_cookies => false,
+        # skip any link with a query string? e.g. http://foo.com/?u=user
+        :skip_query_strings => false,
+        # proxy server hostname
+        :proxy_host => nil,
+        # proxy server port number
+        :proxy_port => false,
+        # HTTP read timeout in seconds
+        :read_timeout => nil
+      }
+    end
+
+    def parse!
+      @option_parser = OptionParser.new do |opts|
+        opts.banner = "Usage: speed_spider [options] start_url"
+        opts.separator ""
+        opts.separator "options:"
+
+        opts.on('-S', '--slient', 'slient output') do
+          @options[:verbose] = false
+        end
+
+        opts.on('-D', '--dir String', 'directory for download files to save to. "download" by default') do |value|
+          options[:dir] = value
+        end
+
+        opts.on('-b', '--base_url String', 'any url not starts with base_url will not be saved') do |value|
+          value += '/' unless value.end_with? '/'
+          options[:base_url] = value
+        end
+
+        opts.on('-t', '--threads Integer', Integer, 'threads to run for fetching pages, 4 by default') do |value|
+          @options[:threads] = value
+        end
+
+        opts.on('-u', '--user_agent String', 'words for request header USER_AGENT') do |value|
+          @options[:user_agent] = value
+        end
+
+        opts.on('-d', '--delay Integer', Integer, 'delay between requests in seconds') do |value|
+          @options[:delay] = value
+        end
+
+        opts.on('-o', '--obey_robots_text', 'obey robots exclustion protocol') do
+          @options[:obey_robots_txt] = true
+        end
+
+        opts.on('-l', '--depth_limit', 'limit the depth of the crawl') do
+          @options[:delay] = true
+        end
+
+        opts.on('-r', '--redirect_limit Integer', Integer, 'number of times HTTP redirects will be followed') do |value|
+          @options[:redirect_limit] = value
+        end
+
+        opts.on('-a', '--accept_cookies', 'accept cookies from the server and send them back?') do
+          @options[:accept_cookies] = true
+        end
+
+        opts.on('-s', '--skip_query_strings', 'skip any link with a query string? e.g. http://foo.com/?u=user') do
+          @options[:skip_query_strings] = true
+        end
+
+        opts.on('-H', '--proxy_host String', 'proxy server hostname') do |value|
+          @options[:proxy_host] = value
+        end
+
+        opts.on('-P', '--proxy_port Integer', Integer, 'proxy server port number') do |value|
+          @options[:proxy_port] = value
+        end
+
+        opts.on('-T', '--read_timeout Integer', Integer, 'HTTP read timeout in seconds') do |value|
+          @options[:read_timeout] = value
+        end
+
+        # print the version.
+        opts.on_tail("-V", "--version", "Show version") do
+          puts SpeedSpider::VERSION
+          exit
+        end
+      end
+
+      @option_parser.parse!
+
+      self
+    end
+  end
+end
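Cli#parse! consumes the recognised flags from ARGV and leaves the start URL behind, which is how SpeedSpider.crawl (shown further down) picks it up. A small sketch of the parsing behaviour, simulating a command line in-process; it assumes the gem and the libraries it requires are installed:

    require 'speed_spider'

    # Simulate: speed_spider -t 8 -b http://example.com -D site http://example.com/
    ARGV.replace %w[-t 8 -b http://example.com -D site http://example.com/]

    cli = SpeedSpider::Cli.new.parse!
    cli.options[:threads]    # => 8
    cli.options[:base_url]   # => "http://example.com/"  (a trailing slash is appended)
    cli.options[:dir]        # => "site"
    ARGV                     # => ["http://example.com/"] -- the remaining start_url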
data/lib/speed_spider/crawler.rb
ADDED
@@ -0,0 +1,82 @@
+require 'anemone'
+require 'speed_spider/anemone_hack'
+require 'fileutils'
+require 'uri'
+
+module SpeedSpider
+  class Crawler
+    def initialize(start_url, options)
+      @start_url = start_url
+      @base_url = options[:base_url]
+      @options = options
+    end
+
+    # return urls from css file contents
+    def get_urls_from_css data, pos = 0
+      if m = data.match(/url\((.*?)\)/i, pos)
+        [ m[1] ] + get_urls_from_css(data, m.end(1) + 1)
+      else
+        []
+      end
+    end
+
+    def focus_crawl
+      lambda { |page|
+        links = []
+        if page.doc
+          # include javascripts and img files as target links
+          page.doc.search('//script[@src]', '//img[@src]', '//iframe[@src]').each do |s|
+            u = s['src']
+            next if u.nil? or u.empty?
+            abs = page.to_absolute u rescue next
+            links << abs if page.in_domain? abs
+          end
+
+          # include css files as target links
+          page.doc.search('//link[@href]').each do |s|
+            u = s['href']
+            next if u.nil? or u.empty?
+            abs = page.to_absolute u rescue next
+            links << abs if page.in_domain? abs
+
+          end
+        elsif page.url.to_s.end_with? '.css'
+          get_urls_from_css(page.body).each do |s|
+            u = s.gsub('"', '').gsub("'", '')
+            next if u.nil? or u.empty?
+            abs = page.to_absolute u rescue next
+            links << abs if page.in_domain? abs
+          end
+        end
+
+        page.links + links.uniq
+      }
+    end
+
+    def after_crawl
+      lambda { |pages|
+        pages.each do |url, page|
+          path = page.url.path
+          path += 'index.html' if path.end_with? '/' or path.empty?
+
+          path = "#{@options[:dir]}/#{page.url.host}#{path}"
+          dir = File.dirname path
+
+          FileUtils.mkdir_p dir unless dir.empty?
+          File.open path, 'w' do |f|
+            f.write page.body
+          end
+
+          puts "save file #{path}" if @options[:verbose]
+        end
+      }
+    end
+
+    def crawl
+      Anemone.crawl @start_url, @options do |spider|
+        spider.focus_crawl &focus_crawl
+        spider.after_crawl &after_crawl
+      end
+    end
+  end
+end
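Crawler#get_urls_from_css walks a stylesheet recursively with /url\((.*?)\)/i and returns every parenthesised argument, quotes included (they are stripped later in focus_crawl). A behaviour sketch with a throwaway CSS string; it assumes the anemone gem is installed, since requiring the crawler loads it:

    require 'speed_spider/crawler'

    css = 'body { background: url("img/bg.png") } @font-face { src: url(fonts/icons.woff) }'

    # The second constructor argument only needs a :base_url key for this sketch.
    crawler = SpeedSpider::Crawler.new('http://example.com/', :base_url => '')
    crawler.get_urls_from_css(css)
    # => ["\"img/bg.png\"", "fonts/icons.woff"]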
data/lib/speed_spider.rb
ADDED
@@ -0,0 +1,17 @@
+require "speed_spider/version"
+require 'speed_spider/cli'
+require 'speed_spider/crawler'
+require 'debugger'
+
+module SpeedSpider
+  def self.crawl
+    cli = Cli.new.parse!
+
+    start_url = ARGV[0]
+    (puts cli.option_parser.help; exit 1) if start_url.nil?
+
+    crawler = Crawler.new start_url, cli.options
+    crawler.crawl
+  end
+
+end
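The six-line bin/speed_spider executable listed at the top of this diff is not reproduced here. Since SpeedSpider.crawl above does all the work, a typical stub for this layout would look roughly like the following; this is a hypothetical reconstruction, not the published file:

    #!/usr/bin/env ruby
    # Hypothetical bin/speed_spider stub (the actual 6-line file is not shown in this diff).
    $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)

    require 'speed_spider'

    SpeedSpider.crawl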
data/speed_spider.gemspec
ADDED
@@ -0,0 +1,24 @@
+# coding: utf-8
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'speed_spider/version'
+
+Gem::Specification.new do |spec|
+  spec.name          = "speed_spider"
+  spec.version       = SpeedSpider::VERSION
+  spec.authors       = ["Ryan Wang"]
+  spec.email         = ["wongyouth@gmail.com"]
+  spec.description   = %q{A simple web spider tool for crawling pages to local based on a url}
+  spec.summary       = %q{A simple web spider tool for download pages from a base url including css js html and iframe source files}
+  spec.homepage      = ""
+  spec.license       = "MIT"
+
+  spec.files         = `git ls-files`.split($/)
+  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+  spec.require_paths = ["lib"]
+
+  spec.add_dependency "anemone", "~> 0.7.2"
+  spec.add_development_dependency "bundler", "~> 1.3"
+  spec.add_development_dependency "rake"
+end
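The gemspec derives its file list and executables from `git ls-files`, so it only resolves fully inside a checkout of the repository. A quick inspection sketch under that assumption:

    require 'rubygems'

    # Load the spec standalone and check what would be packaged.
    spec = Gem::Specification.load('speed_spider.gemspec')
    spec.name                      # => "speed_spider"
    spec.executables               # => ["speed_spider"]
    spec.dependencies.map(&:name)  # => ["anemone", "bundler", "rake"]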
metadata
ADDED
@@ -0,0 +1,100 @@
+--- !ruby/object:Gem::Specification
+name: speed_spider
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+platform: ruby
+authors:
+- Ryan Wang
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2013-06-01 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: anemone
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.7.2
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.7.2
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.3'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.3'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+description: A simple web spider tool for crawling pages to local based on a url
+email:
+- wongyouth@gmail.com
+executables:
+- speed_spider
+extensions: []
+extra_rdoc_files: []
+files:
+- .gitignore
+- Gemfile
+- LICENSE.txt
+- README.md
+- Rakefile
+- bin/speed_spider
+- lib/speed_spider.rb
+- lib/speed_spider/anemone_hack.rb
+- lib/speed_spider/cli.rb
+- lib/speed_spider/crawler.rb
+- lib/speed_spider/version.rb
+- speed_spider.gemspec
+homepage: ''
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.0.3
+signing_key:
+specification_version: 4
+summary: A simple web spider tool for download pages from a base url including css
+  js html and iframe source files
+test_files: []