speed_spider 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: abdb6ebdea8dbe7f00e7c9e487641a45d0d47b49
+   data.tar.gz: 3b06cf74adb37f274516b16215ae71627a36ec69
+ SHA512:
+   metadata.gz: fb6e3f517125ab47b511abaf4a1e09d3ecfeab330557c7fe60f165f3350de3434c0d57f3534c1134cefb97a340869afc2abafe8ec5a687e061340bbeccf36cc4
+   data.tar.gz: 5bc260f898cf66898073fd547696b237a6ad89f45fd2650e5518983bcefa440bc8e4d5cf31da1ad4110ac3c0614e96aecf05389b19dde4f5e2e0c902bae48613
data/.gitignore ADDED
@@ -0,0 +1,17 @@
+ *.gem
+ *.rbc
+ .bundle
+ .config
+ .yardoc
+ Gemfile.lock
+ InstalledFiles
+ _yardoc
+ coverage
+ doc/
+ lib/bundler/man
+ pkg
+ rdoc
+ spec/reports
+ test/tmp
+ test/version_tmp
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
+ source 'https://rubygems.org'
+
+ # Specify your gem's dependencies in speed_spider.gemspec
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
+ Copyright (c) 2013 Ryan Wang
+
+ MIT License
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,65 @@
+ # SpeedSpider
+
+ A simple and speedy web spider for downloading pages.
+
+ SpeedSpider is based on the Ruby spider framework [Anemone][1]. It is easy to use and very fast, since it uses threads to fetch pages.
+
+ ## What kinds of files will be downloaded
+
+ ### Links in HTML pages
+
+ * links, xpath: `//a[@href]`
+ * stylesheets, xpath: `//link[@href]`
+ * javascript, xpath: `//script[@src]`
+ * iframe files, xpath: `//iframe[@src]`
+ * image files, xpath: `//img[@src]`
+
+ ### URLs in stylesheet files
+
+ * URLs matching the pattern `url\((.*)\)`
+
+ ## Installation
+
+ Install it with RubyGems:
+
+     gem install speed_spider
+
+ ## Usage
+     Usage: speed_spider [options] start_url
+
+     options:
+       -S, --silent                     silent output
+       -D, --dir String                 directory for downloaded files to save to, "download" by default
+       -b, --base_url String            any url that does not start with base_url will not be saved
+       -t, --threads Integer            threads to run for fetching pages, 4 by default
+       -u, --user_agent String          value for the USER_AGENT request header
+       -d, --delay Integer              delay between requests in seconds
+       -o, --obey_robots_txt            obey the robots exclusion protocol
+       -l, --depth_limit Integer        limit the depth of the crawl
+       -r, --redirect_limit Integer     number of times HTTP redirects will be followed
+       -a, --accept_cookies             accept cookies from the server and send them back
+       -s, --skip_query_strings         skip any link with a query string, e.g. http://foo.com/?u=user
+       -H, --proxy_host String          proxy server hostname
+       -P, --proxy_port Integer         proxy server port number
+       -T, --read_timeout Integer       HTTP read timeout in seconds
+       -V, --version                    Show version
+
+ ## Example
+
+     speed_spider http://twitter.github.io/bootstrap/
+
+ It will download all files within the same domain as `twitter.github.io` and save them to `download/twitter.github.io/`.
+
+     speed_spider -b http://ruby-doc.org/core-2.0/ http://ruby-doc.org/core-2.0/
+
+ It will only download URLs that start with http://ruby-doc.org/core-2.0/. Note that asset files such as images, CSS, JS, and fonts do not obey the `base_url` rule.
+
+ ## Contributing
+
+ 1. Fork it
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
+ 4. Push to the branch (`git push origin my-new-feature`)
+ 5. Create a new Pull Request
+
+ [1]: http://anemone.rubyforge.org/
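
The `url()` extraction mentioned in the README's "URLs in stylesheet files" section is implemented in `lib/speed_spider/crawler.rb` (shown later in this diff). A minimal sketch of the same idea, using `String#scan` instead of the gem's recursive `match` helper, and assuming quotes should be stripped from the captured values:

    css  = "body { background: url('bg.png') } .logo { background: url(logo.svg) }"
    # capture everything between url( and ), then drop surrounding quotes
    urls = css.scan(/url\((.*?)\)/i).flatten.map { |u| u.delete(%q{"'}) }
    # => ["bg.png", "logo.svg"]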
data/Rakefile ADDED
@@ -0,0 +1 @@
+ require "bundler/gem_tasks"
data/bin/speed_spider ADDED
@@ -0,0 +1,6 @@
+ #!/usr/bin/env ruby
+
+ $: << File.expand_path('../../lib', __FILE__)
+ require 'speed_spider'
+
+ SpeedSpider.crawl
data/lib/speed_spider/anemone_hack.rb ADDED
@@ -0,0 +1,19 @@
+ module Anemone
+   class Core
+     def assets?(link)
+       %w(js css jpg jpeg png bmp gif svg ttf woff eot).any? do |e|
+         /#{e}/i =~ File.extname(link.path).split('.').pop
+       end
+     end
+     #
+     # Returns +true+ if *link* should not be visited because
+     # its URL matches a skip_link pattern.
+     #
+     def skip_link_with_hack?(link)
+       skip_link_without_hack?(link) || (!assets?(link) && !link.to_s.start_with?(@opts[:base_url]))
+     end
+
+     alias_method :skip_link_without_hack?, :skip_link?
+     alias_method :skip_link?, :skip_link_with_hack?
+   end
+ end
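
The file above monkey-patches Anemone's `skip_link?` with the classic alias-around pattern: the original method is preserved under a new name, then the public name is re-pointed at a wrapper that calls it. A stand-alone sketch of that pattern, with hypothetical class and method names that are not part of the gem:

    class LinkFilter
      def skip?(link)
        link.start_with?('#')                    # stand-in for the original rule
      end

      def skip_with_hack?(link)
        skip_without_hack?(link) || link.empty?  # extend the original rule
      end

      # keep the original under a new name, then take over the public name
      alias_method :skip_without_hack?, :skip?
      alias_method :skip?, :skip_with_hack?
    end

    LinkFilter.new.skip?('')   # => true, answered by the wrapper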
data/lib/speed_spider/cli.rb ADDED
@@ -0,0 +1,124 @@
+ require 'speed_spider/version'
+ require 'speed_spider/crawler'
+ require 'optparse'
+ require 'ostruct'
+
+ module SpeedSpider
+   class Cli
+     attr_reader :options, :option_parser
+
+     def initialize
+       @options = {
+         # only urls that start with base_url will be saved locally
+         :base_url => '',
+         # directory for downloaded files to save to
+         :dir => 'download',
+         # run 4 Tentacle threads to fetch pages
+         :threads => 4,
+         # verbose output
+         :verbose => true,
+         # don't throw away the page response body after scanning it for links
+         :discard_page_bodies => false,
+         # identify self as SpeedSpider/VERSION
+         :user_agent => "SpeedSpider/#{SpeedSpider::VERSION}",
+         # no delay between requests
+         :delay => 0,
+         # don't obey the robots exclusion protocol
+         :obey_robots_txt => false,
+         # by default, don't limit the depth of the crawl
+         :depth_limit => false,
+         # number of times HTTP redirects will be followed
+         :redirect_limit => 5,
+         # storage engine defaults to Hash in +process_options+ if none specified
+         :storage => nil,
+         # Hash of cookie name => value to send with HTTP requests
+         :cookies => nil,
+         # accept cookies from the server and send them back?
+         :accept_cookies => false,
+         # skip any link with a query string? e.g. http://foo.com/?u=user
+         :skip_query_strings => false,
+         # proxy server hostname
+         :proxy_host => nil,
+         # proxy server port number
+         :proxy_port => false,
+         # HTTP read timeout in seconds
+         :read_timeout => nil
+       }
+     end
+
+     def parse!
+       @option_parser = OptionParser.new do |opts|
+         opts.banner = "Usage: speed_spider [options] start_url"
+         opts.separator ""
+         opts.separator "options:"
+
+         opts.on('-S', '--silent', 'silent output') do
+           @options[:verbose] = false
+         end
+
+         opts.on('-D', '--dir String', 'directory for downloaded files to save to, "download" by default') do |value|
+           @options[:dir] = value
+         end
+
+         opts.on('-b', '--base_url String', 'any url that does not start with base_url will not be saved') do |value|
+           value += '/' unless value.end_with? '/'
+           @options[:base_url] = value
+         end
+
+         opts.on('-t', '--threads Integer', Integer, 'threads to run for fetching pages, 4 by default') do |value|
+           @options[:threads] = value
+         end
+
+         opts.on('-u', '--user_agent String', 'value for the USER_AGENT request header') do |value|
+           @options[:user_agent] = value
+         end
+
+         opts.on('-d', '--delay Integer', Integer, 'delay between requests in seconds') do |value|
+           @options[:delay] = value
+         end
+
+         opts.on('-o', '--obey_robots_txt', 'obey the robots exclusion protocol') do
+           @options[:obey_robots_txt] = true
+         end
+
+         opts.on('-l', '--depth_limit Integer', Integer, 'limit the depth of the crawl') do |value|
+           @options[:depth_limit] = value
+         end
+
+         opts.on('-r', '--redirect_limit Integer', Integer, 'number of times HTTP redirects will be followed') do |value|
+           @options[:redirect_limit] = value
+         end
+
+         opts.on('-a', '--accept_cookies', 'accept cookies from the server and send them back') do
+           @options[:accept_cookies] = true
+         end
+
+         opts.on('-s', '--skip_query_strings', 'skip any link with a query string, e.g. http://foo.com/?u=user') do
+           @options[:skip_query_strings] = true
+         end
+
+         opts.on('-H', '--proxy_host String', 'proxy server hostname') do |value|
+           @options[:proxy_host] = value
+         end
+
+         opts.on('-P', '--proxy_port Integer', Integer, 'proxy server port number') do |value|
+           @options[:proxy_port] = value
+         end
+
+         opts.on('-T', '--read_timeout Integer', Integer, 'HTTP read timeout in seconds') do |value|
+           @options[:read_timeout] = value
+         end
+
+         # print the version.
+         opts.on_tail("-V", "--version", "Show version") do
+           puts SpeedSpider::VERSION
+           exit
+         end
+       end
+
+       @option_parser.parse!
+
+       self
+     end
+   end
+ end
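
`Cli#parse!` relies on `OptionParser#parse!`, which removes recognized switches from `ARGV` in place, so whatever remains is taken as the start URL (see `lib/speed_spider.rb` later in this diff). A rough sketch of that flow, with a made-up `ARGV` purely for illustration:

    require 'speed_spider'

    ARGV.replace %w[-t 8 -D mirror http://example.com/]

    cli = SpeedSpider::Cli.new.parse!
    cli.options[:threads]   # => 8
    cli.options[:dir]       # => "mirror"
    ARGV                    # => ["http://example.com/"], left over as the start_url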
data/lib/speed_spider/crawler.rb ADDED
@@ -0,0 +1,81 @@
+ require 'anemone'
+ require 'speed_spider/anemone_hack'
+ require 'fileutils'
+ require 'uri'
+
+ module SpeedSpider
+   class Crawler
+     def initialize(start_url, options)
+       @start_url = start_url
+       @base_url = options[:base_url]
+       @options = options
+     end
+
+     # return urls from css file contents
+     def get_urls_from_css data, pos = 0
+       if m = data.match(/url\((.*?)\)/i, pos)
+         [ m[1] ] + get_urls_from_css(data, m.end(1) + 1)
+       else
+         []
+       end
+     end
+
+     def focus_crawl
+       lambda { |page|
+         links = []
+         if page.doc
+           # include javascripts and img files as target links
+           page.doc.search('//script[@src]', '//img[@src]', '//iframe[@src]').each do |s|
+             u = s['src']
+             next if u.nil? or u.empty?
+             abs = page.to_absolute u rescue next
+             links << abs if page.in_domain? abs
+           end
+
+           # include css files as target links
+           page.doc.search('//link[@href]').each do |s|
+             u = s['href']
+             next if u.nil? or u.empty?
+             abs = page.to_absolute u rescue next
+             links << abs if page.in_domain? abs
+           end
+         elsif page.url.to_s.end_with? '.css'
+           get_urls_from_css(page.body).each do |s|
+             u = s.gsub('"', '').gsub("'", '')
+             next if u.nil? or u.empty?
+             abs = page.to_absolute u rescue next
+             links << abs if page.in_domain? abs
+           end
+         end
+
+         page.links + links.uniq
+       }
+     end
+
+     def after_crawl
+       lambda { |pages|
+         pages.each do |url, page|
+           path = page.url.path
+           path += 'index.html' if path.end_with? '/' or path.empty?
+
+           path = "#{@options[:dir]}/#{page.url.host}#{path}"
+           dir = File.dirname path
+
+           FileUtils.mkdir_p dir unless dir.empty?
+           File.open path, 'wb' do |f|
+             f.write page.body
+           end
+
+           puts "save file #{path}" if @options[:verbose]
+         end
+       }
+     end
+
+     def crawl
+       Anemone.crawl @start_url, @options do |spider|
+         spider.focus_crawl(&focus_crawl)
+         spider.after_crawl(&after_crawl)
+       end
+     end
+   end
+ end
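
The crawler can also be driven without the CLI by passing an options hash directly; the keys below are the defaults from `lib/speed_spider/cli.rb`, trimmed to the ones `Crawler` and Anemone actually read. A minimal sketch, not an officially documented API:

    require 'speed_spider'

    options = {
      :base_url       => '',           # non-asset URLs must start with this to be saved
      :dir            => 'download',   # local directory for saved files
      :threads        => 4,            # Anemone Tentacle threads
      :verbose        => true,         # print each saved file
      :delay          => 0,
      :depth_limit    => false,
      :redirect_limit => 5,
      :user_agent     => "SpeedSpider/#{SpeedSpider::VERSION}"
    }

    SpeedSpider::Crawler.new('http://twitter.github.io/bootstrap/', options).crawl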
data/lib/speed_spider/version.rb ADDED
@@ -0,0 +1,3 @@
+ module SpeedSpider
+   VERSION = "0.0.1"
+ end
data/lib/speed_spider.rb ADDED
@@ -0,0 +1,15 @@
+ require 'speed_spider/version'
+ require 'speed_spider/cli'
+ require 'speed_spider/crawler'
+
+ module SpeedSpider
+   def self.crawl
+     cli = Cli.new.parse!
+
+     start_url = ARGV[0]
+     (puts cli.option_parser.help; exit 1) if start_url.nil?
+
+     crawler = Crawler.new start_url, cli.options
+     crawler.crawl
+   end
+ end
data/speed_spider.gemspec ADDED
@@ -0,0 +1,24 @@
+ # coding: utf-8
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'speed_spider/version'
+
+ Gem::Specification.new do |spec|
+   spec.name          = "speed_spider"
+   spec.version       = SpeedSpider::VERSION
+   spec.authors       = ["Ryan Wang"]
+   spec.email         = ["wongyouth@gmail.com"]
+   spec.description   = %q{A simple web spider tool that crawls pages from a start URL and saves them locally}
+   spec.summary       = %q{A simple web spider tool that downloads pages from a base URL, including CSS, JS, HTML and iframe source files}
+   spec.homepage      = ""
+   spec.license       = "MIT"
+
+   spec.files         = `git ls-files`.split($/)
+   spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+   spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+   spec.require_paths = ["lib"]
+
+   spec.add_dependency "anemone", "~> 0.7.2"
+   spec.add_development_dependency "bundler", "~> 1.3"
+   spec.add_development_dependency "rake"
+ end
metadata ADDED
@@ -0,0 +1,100 @@
+ --- !ruby/object:Gem::Specification
+ name: speed_spider
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - Ryan Wang
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2013-06-01 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: anemone
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 0.7.2
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 0.7.2
+ - !ruby/object:Gem::Dependency
+   name: bundler
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: '1.3'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: '1.3'
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ description: A simple web spider tool that crawls pages from a start URL and saves them locally
+ email:
+ - wongyouth@gmail.com
+ executables:
+ - speed_spider
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - .gitignore
+ - Gemfile
+ - LICENSE.txt
+ - README.md
+ - Rakefile
+ - bin/speed_spider
+ - lib/speed_spider.rb
+ - lib/speed_spider/anemone_hack.rb
+ - lib/speed_spider/cli.rb
+ - lib/speed_spider/crawler.rb
+ - lib/speed_spider/version.rb
+ - speed_spider.gemspec
+ homepage: ''
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.0.3
+ signing_key:
+ specification_version: 4
+ summary: A simple web spider tool that downloads pages from a base URL, including
+   CSS, JS, HTML and iframe source files
+ test_files: []