speed_spider 0.0.1
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +65 -0
- data/Rakefile +1 -0
- data/bin/speed_spider +6 -0
- data/lib/speed_spider/anemone_hack.rb +19 -0
- data/lib/speed_spider/cli.rb +123 -0
- data/lib/speed_spider/crawler.rb +82 -0
- data/lib/speed_spider/version.rb +3 -0
- data/lib/speed_spider.rb +17 -0
- data/speed_spider.gemspec +24 -0
- metadata +100 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: abdb6ebdea8dbe7f00e7c9e487641a45d0d47b49
+  data.tar.gz: 3b06cf74adb37f274516b16215ae71627a36ec69
+SHA512:
+  metadata.gz: fb6e3f517125ab47b511abaf4a1e09d3ecfeab330557c7fe60f165f3350de3434c0d57f3534c1134cefb97a340869afc2abafe8ec5a687e061340bbeccf36cc4
+  data.tar.gz: 5bc260f898cf66898073fd547696b237a6ad89f45fd2650e5518983bcefa440bc8e4d5cf31da1ad4110ac3c0614e96aecf05389b19dde4f5e2e0c902bae48613
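RubyGems uses these digests to detect tampering: each recorded SHA1/SHA512 value is compared against the actual digest of the corresponding archive inside the .gem package. A minimal sketch of that comparison, assuming the archives have been extracted next to checksums.yaml:

    require 'digest'
    require 'yaml'

    # Illustrative sketch: compare each recorded SHA512 against the actual
    # digest of the corresponding file extracted from the .gem archive.
    sums = YAML.load_file('checksums.yaml')
    sums['SHA512'].each do |file, recorded|
      actual = Digest::SHA512.file(file).hexdigest
      puts "#{file}: #{actual == recorded ? 'ok' : 'MISMATCH'}"
    end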
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
+Copyright (c) 2013 Ryan Wang
+
+MIT License
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,65 @@
+# SpeedSpider
+
+A simple and speedy web spider for downloading pages.
+
+SpeedSpider is based on the Ruby spider framework [Anemone][1]. It is easy to use and very fast, since it uses multiple threads for page fetching.
+
+## What kinds of files will be downloaded
+
+### Links in HTML pages
+
+* link, xpath: `//a[@href]`
+* stylesheet, xpath: `//link[@href]`
+* javascript, xpath: `//script[@src]`
+* iframe file, xpath: `//iframe[@src]`
+* image file, xpath: `//img[@src]`
+
+### URLs in stylesheet files
+
+* urls matching the pattern `url\((.*?)\)`
+
+## Installation
+
+Install it with RubyGems:
+
+    gem install speed_spider
+
+## Usage
+
+    Usage: speed_spider [options] start_url
+
+    options:
+      -S, --silent                  silent output
+      -D, --dir String              directory for downloaded files to save to, "download" by default
+      -b, --base_url String         any url that does not start with base_url will not be saved
+      -t, --threads Integer         threads to run for fetching pages, 4 by default
+      -u, --user_agent String       value for the USER_AGENT request header
+      -d, --delay Integer           delay between requests in seconds
+      -o, --obey_robots_text        obey the robots exclusion protocol
+      -l, --depth_limit Integer     limit the depth of the crawl
+      -r, --redirect_limit Integer  number of times HTTP redirects will be followed
+      -a, --accept_cookies          accept cookies from the server and send them back?
+      -s, --skip_query_strings      skip any link with a query string? e.g. http://foo.com/?u=user
+      -H, --proxy_host String       proxy server hostname
+      -P, --proxy_port Integer      proxy server port number
+      -T, --read_timeout Integer    HTTP read timeout in seconds
+      -V, --version                 Show version
+
+## Example
+
+    speed_spider http://twitter.github.io/bootstrap/
+
+It will download all files within the same domain as `twitter.github.io` and save them to `download/twitter.github.io/`.
+
+    speed_spider -b http://ruby-doc.org/core-2.0/ http://ruby-doc.org/core-2.0/
+
+It will only download URLs that start with `http://ruby-doc.org/core-2.0/`. Note that asset files such as images, CSS, JS and fonts do not obey the `base_url` rule.
+
+## Contributing
+
+1. Fork it
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Add some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create new Pull Request
+
+[1]: http://anemone.rubyforge.org/
data/Rakefile
ADDED
@@ -0,0 +1 @@
+require "bundler/gem_tasks"
data/bin/speed_spider
ADDED
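The 6-line executable body is not shown in this listing. Given that lib/speed_spider.rb (below) defines SpeedSpider.crawl, which parses ARGV and starts the crawl, a plausible minimal launcher would look like this — a hypothetical sketch, not the actual file:

    #!/usr/bin/env ruby
    # Hypothetical launcher sketch; the real 6-line bin/speed_spider is
    # elided from this diff. SpeedSpider.crawl handles option parsing.
    require 'speed_spider'

    SpeedSpider.crawl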
data/lib/speed_spider/anemone_hack.rb
ADDED
@@ -0,0 +1,19 @@
+module Anemone
+  class Core
+    def assets?(link)
+      %w(js css jpg jpeg png bmp gif svg ttf woff eot).any? do |e|
+        /#{e}/i =~ File.extname(link.path).split('.').pop
+      end
+    end
+
+    #
+    # Returns +true+ if *link* should not be visited because its URL
+    # matches a skip_link pattern, or because it is not an asset and
+    # does not start with the configured base_url.
+    #
+    def skip_link_with_hack?(link)
+      skip_link_without_hack?(link) or (!assets?(link) and !link.to_s.start_with? @opts[:base_url])
+    end
+
+    alias_method :skip_link_without_hack?, :skip_link?
+    alias_method :skip_link?, :skip_link_with_hack?
+  end
+end
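The two alias_method calls implement the classic alias-method-chain idiom: keep the original method reachable under a `_without_` name, then point the public name at the wrapper so every existing caller picks up the new behavior. A self-contained sketch of the same pattern, with made-up names:

    # Illustrative class; not part of the gem.
    class Greeter
      def greet(name)
        "hello, #{name}"
      end

      # Wrapper that calls through to the original via the _without_ alias.
      def greet_with_shout(name)
        greet_without_shout(name).upcase
      end

      # Keep a handle to the original, then point the public name at the wrapper.
      alias_method :greet_without_shout, :greet
      alias_method :greet, :greet_with_shout
    end

    puts Greeter.new.greet('anemone')   # => "HELLO, ANEMONE"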
data/lib/speed_spider/cli.rb
ADDED
@@ -0,0 +1,123 @@
+require 'speed_spider/crawler'
+require 'optparse'
+require 'ostruct'
+
+module SpeedSpider
+  class Cli
+    attr_reader :options, :option_parser
+
+    def initialize
+      @options = {
+        # only urls starting with base_url will be saved locally
+        :base_url => '',
+        # directory for downloaded files to save to
+        :dir => 'download',
+        # run 4 Tentacle threads to fetch pages
+        :threads => 4,
+        # verbose output
+        :verbose => true,
+        # don't throw away the page response body after scanning it for links
+        :discard_page_bodies => false,
+        # identify self as SpeedSpider/VERSION
+        :user_agent => "SpeedSpider/#{SpeedSpider::VERSION}",
+        # no delay between requests
+        :delay => 0,
+        # don't obey the robots exclusion protocol
+        :obey_robots_txt => false,
+        # by default, don't limit the depth of the crawl
+        :depth_limit => false,
+        # number of times HTTP redirects will be followed
+        :redirect_limit => 5,
+        # storage engine defaults to Hash in +process_options+ if none specified
+        :storage => nil,
+        # Hash of cookie name => value to send with HTTP requests
+        :cookies => nil,
+        # accept cookies from the server and send them back?
+        :accept_cookies => false,
+        # skip any link with a query string? e.g. http://foo.com/?u=user
+        :skip_query_strings => false,
+        # proxy server hostname
+        :proxy_host => nil,
+        # proxy server port number
+        :proxy_port => false,
+        # HTTP read timeout in seconds
+        :read_timeout => nil
+      }
+    end
+
+    def parse!
+      @option_parser = OptionParser.new do |opts|
+        opts.banner = "Usage: speed_spider [options] start_url"
+        opts.separator ""
+        opts.separator "options:"
+
+        opts.on('-S', '--silent', 'silent output') do
+          @options[:verbose] = false
+        end
+
+        opts.on('-D', '--dir String', 'directory for downloaded files to save to, "download" by default') do |value|
+          @options[:dir] = value
+        end
+
+        opts.on('-b', '--base_url String', 'any url that does not start with base_url will not be saved') do |value|
+          value += '/' unless value.end_with? '/'
+          @options[:base_url] = value
+        end
+
+        opts.on('-t', '--threads Integer', Integer, 'threads to run for fetching pages, 4 by default') do |value|
+          @options[:threads] = value
+        end
+
+        opts.on('-u', '--user_agent String', 'value for the USER_AGENT request header') do |value|
+          @options[:user_agent] = value
+        end
+
+        opts.on('-d', '--delay Integer', Integer, 'delay between requests in seconds') do |value|
+          @options[:delay] = value
+        end
+
+        opts.on('-o', '--obey_robots_text', 'obey the robots exclusion protocol') do
+          @options[:obey_robots_txt] = true
+        end
+
+        opts.on('-l', '--depth_limit Integer', Integer, 'limit the depth of the crawl') do |value|
+          @options[:depth_limit] = value
+        end
+
+        opts.on('-r', '--redirect_limit Integer', Integer, 'number of times HTTP redirects will be followed') do |value|
+          @options[:redirect_limit] = value
+        end
+
+        opts.on('-a', '--accept_cookies', 'accept cookies from the server and send them back?') do
+          @options[:accept_cookies] = true
+        end
+
+        opts.on('-s', '--skip_query_strings', 'skip any link with a query string? e.g. http://foo.com/?u=user') do
+          @options[:skip_query_strings] = true
+        end
+
+        opts.on('-H', '--proxy_host String', 'proxy server hostname') do |value|
+          @options[:proxy_host] = value
+        end
+
+        opts.on('-P', '--proxy_port Integer', Integer, 'proxy server port number') do |value|
+          @options[:proxy_port] = value
+        end
+
+        opts.on('-T', '--read_timeout Integer', Integer, 'HTTP read timeout in seconds') do |value|
+          @options[:read_timeout] = value
+        end
+
+        # print the version.
+        opts.on_tail("-V", "--version", "Show version") do
+          puts SpeedSpider::VERSION
+          exit
+        end
+      end
+
+      @option_parser.parse!
+
+      self
+    end
+  end
+end
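A quick sanity check of the parser above — illustrative values, assuming the gem and its anemone dependency are installed. OptionParser#parse! consumes ARGV destructively, leaving the positional start_url behind:

    require 'speed_spider'

    # Simulate: speed_spider -t 8 -b http://example.com/docs/ http://example.com/docs/
    ARGV.replace %w[-t 8 -b http://example.com/docs/ http://example.com/docs/]

    cli = SpeedSpider::Cli.new.parse!
    cli.options[:threads]    # => 8
    cli.options[:base_url]   # => "http://example.com/docs/"
    ARGV                     # => ["http://example.com/docs/"]  (start_url is left behind)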
data/lib/speed_spider/crawler.rb
ADDED
@@ -0,0 +1,82 @@
+require 'anemone'
+require 'speed_spider/anemone_hack'
+require 'fileutils'
+require 'uri'
+
+module SpeedSpider
+  class Crawler
+    def initialize(start_url, options)
+      @start_url = start_url
+      @base_url = options[:base_url]
+      @options = options
+    end
+
+    # return urls from css file contents
+    def get_urls_from_css(data, pos = 0)
+      if m = data.match(/url\((.*?)\)/i, pos)
+        [ m[1] ] + get_urls_from_css(data, m.end(1) + 1)
+      else
+        []
+      end
+    end
+
+    def focus_crawl
+      lambda { |page|
+        links = []
+        if page.doc
+          # include javascript, img and iframe files as target links
+          page.doc.search('//script[@src]', '//img[@src]', '//iframe[@src]').each do |s|
+            u = s['src']
+            next if u.nil? or u.empty?
+            abs = page.to_absolute u rescue next
+            links << abs if page.in_domain? abs
+          end
+
+          # include css files as target links
+          page.doc.search('//link[@href]').each do |s|
+            u = s['href']
+            next if u.nil? or u.empty?
+            abs = page.to_absolute u rescue next
+            links << abs if page.in_domain? abs
+          end
+        elsif page.url.to_s.end_with? '.css'
+          get_urls_from_css(page.body).each do |s|
+            u = s.gsub('"', '').gsub("'", '')
+            next if u.nil? or u.empty?
+            abs = page.to_absolute u rescue next
+            links << abs if page.in_domain? abs
+          end
+        end
+
+        page.links + links.uniq
+      }
+    end
+
+    def after_crawl
+      lambda { |pages|
+        pages.each do |url, page|
+          path = page.url.path
+          path += 'index.html' if path.end_with? '/' or path.empty?
+
+          path = "#{@options[:dir]}/#{page.url.host}#{path}"
+          dir = File.dirname path
+
+          FileUtils.mkdir_p dir unless dir.empty?
+          File.open path, 'w' do |f|
+            f.write page.body
+          end
+
+          puts "save file #{path}" if @options[:verbose]
+        end
+      }
+    end
+
+    def crawl
+      Anemone.crawl @start_url, @options do |spider|
+        spider.focus_crawl &focus_crawl
+        spider.after_crawl &after_crawl
+      end
+    end
+  end
+end
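The recursive url() scan above can be exercised on its own; a small illustrative call, with a made-up stylesheet string. Note the captures keep their surrounding quotes — focus_crawl strips them before resolving each URL:

    require 'speed_spider'

    crawler = SpeedSpider::Crawler.new('http://example.com/', :base_url => '')

    css = %q{body { background: url("img/bg.png") } @font-face { src: url('fonts/icons.woff') }}
    crawler.get_urls_from_css(css)
    # => ["\"img/bg.png\"", "'fonts/icons.woff'"]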
data/lib/speed_spider/version.rb
ADDED
data/lib/speed_spider.rb
ADDED
@@ -0,0 +1,17 @@
+require 'speed_spider/version'
+require 'speed_spider/cli'
+require 'speed_spider/crawler'
+
+module SpeedSpider
+  def self.crawl
+    cli = Cli.new.parse!
+
+    start_url = ARGV[0]
+    (puts cli.option_parser.help; exit 1) if start_url.nil?
+
+    crawler = Crawler.new start_url, cli.options
+    crawler.crawl
+  end
+
+end
data/speed_spider.gemspec
ADDED
@@ -0,0 +1,24 @@
+# coding: utf-8
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'speed_spider/version'
+
+Gem::Specification.new do |spec|
+  spec.name          = "speed_spider"
+  spec.version       = SpeedSpider::VERSION
+  spec.authors       = ["Ryan Wang"]
+  spec.email         = ["wongyouth@gmail.com"]
+  spec.description   = %q{A simple web spider tool that crawls pages to your local disk, starting from a given URL}
+  spec.summary       = %q{A simple web spider tool for downloading pages from a base URL, including CSS, JS, HTML and iframe source files}
+  spec.homepage      = ""
+  spec.license       = "MIT"
+
+  spec.files         = `git ls-files`.split($/)
+  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+  spec.require_paths = ["lib"]
+
+  spec.add_dependency "anemone", "~> 0.7.2"
+  spec.add_development_dependency "bundler", "~> 1.3"
+  spec.add_development_dependency "rake"
+end
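The file list above comes straight from `git ls-files`, and the executables line then filters it with a block-form grep, which maps each match through the block. A quick illustration with a made-up subset of files:

    files = ["bin/speed_spider", "lib/speed_spider.rb", "README.md"]  # illustrative subset
    files.grep(%r{^bin/}) { |f| File.basename(f) }
    # => ["speed_spider"]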
metadata
ADDED
@@ -0,0 +1,100 @@
+--- !ruby/object:Gem::Specification
+name: speed_spider
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+platform: ruby
+authors:
+- Ryan Wang
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2013-06-01 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: anemone
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.7.2
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.7.2
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.3'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.3'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+description: A simple web spider tool that crawls pages to your local disk, starting
+  from a given URL
+email:
+- wongyouth@gmail.com
+executables:
+- speed_spider
+extensions: []
+extra_rdoc_files: []
+files:
+- .gitignore
+- Gemfile
+- LICENSE.txt
+- README.md
+- Rakefile
+- bin/speed_spider
+- lib/speed_spider.rb
+- lib/speed_spider/anemone_hack.rb
+- lib/speed_spider/cli.rb
+- lib/speed_spider/crawler.rb
+- lib/speed_spider/version.rb
+- speed_spider.gemspec
+homepage: ''
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.0.3
+signing_key:
+specification_version: 4
+summary: A simple web spider tool for downloading pages from a base URL, including
+  CSS, JS, HTML and iframe source files
+test_files: []