speed_spider 0.0.1

checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: abdb6ebdea8dbe7f00e7c9e487641a45d0d47b49
+   data.tar.gz: 3b06cf74adb37f274516b16215ae71627a36ec69
+ SHA512:
+   metadata.gz: fb6e3f517125ab47b511abaf4a1e09d3ecfeab330557c7fe60f165f3350de3434c0d57f3534c1134cefb97a340869afc2abafe8ec5a687e061340bbeccf36cc4
+   data.tar.gz: 5bc260f898cf66898073fd547696b237a6ad89f45fd2650e5518983bcefa440bc8e4d5cf31da1ad4110ac3c0614e96aecf05389b19dde4f5e2e0c902bae48613
data/.gitignore ADDED
@@ -0,0 +1,17 @@
+ *.gem
+ *.rbc
+ .bundle
+ .config
+ .yardoc
+ Gemfile.lock
+ InstalledFiles
+ _yardoc
+ coverage
+ doc/
+ lib/bundler/man
+ pkg
+ rdoc
+ spec/reports
+ test/tmp
+ test/version_tmp
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
+ source 'https://rubygems.org'
+
+ # Specify your gem's dependencies in speed_spider.gemspec
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
+ Copyright (c) 2013 Ryan Wang
+
+ MIT License
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,65 @@
+ # SpeedSpider
+
+ A simple and speedy web spider for downloading pages.
+
+ SpeedSpider is built on the Ruby spider framework [Anemone][1]. It is easy to use and very fast, since it fetches pages in multiple threads.
+
+ ## What kind of files will be downloaded
+
+ ### Links in HTML pages
+
+ * link, xpath: `//a[@href]`
+ * stylesheet, xpath: `//link[@href]`
+ * javascript, xpath: `//script[@src]`
+ * iframe file, xpath: `//iframe[@src]`
+ * image file, xpath: `//img[@src]`
+
+ ### URLs in stylesheet files
+
+ * urls matching the pattern `url\((.*)\)`
+
+ ## Installation
+
+ Install it with RubyGems:
+
+     gem install speed_spider
+
+ ## Usage
+     Usage: speed_spider [options] start_url
+
+     options:
+         -S, --silent                     silent output
+         -D, --dir String                 directory for downloaded files to save to, "download" by default
+         -b, --base_url String            any url that does not start with base_url will not be saved
+         -t, --threads Integer            threads to run for fetching pages, 4 by default
+         -u, --user_agent String          words for request header USER_AGENT
+         -d, --delay Integer              delay between requests in seconds
+         -o, --obey_robots_txt            obey the robots exclusion protocol
+         -l, --depth_limit Integer        limit the depth of the crawl
+         -r, --redirect_limit Integer     number of times HTTP redirects will be followed
+         -a, --accept_cookies             accept cookies from the server and send them back
+         -s, --skip_query_strings         skip any link with a query string, e.g. http://foo.com/?u=user
+         -H, --proxy_host String          proxy server hostname
+         -P, --proxy_port Integer         proxy server port number
+         -T, --read_timeout Integer       HTTP read timeout in seconds
+         -V, --version                    show version
+
+ ## Example
+
+     speed_spider http://twitter.github.io/bootstrap/
+
+ It will download all files within the same domain as `twitter.github.io` and save them to `download/twitter.github.io/`.
+
+     speed_spider -b http://ruby-doc.org/core-2.0/ http://ruby-doc.org/core-2.0/
+
+ It will only download URLs that start with `http://ruby-doc.org/core-2.0/`. Note that asset files such as images, CSS, JS and fonts do not obey the `base_url` rule.
+
+ ## Contributing
+
+ 1. Fork it
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
+ 4. Push to the branch (`git push origin my-new-feature`)
+ 5. Create a new Pull Request
+
+ [1]: http://anemone.rubyforge.org/
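
The stylesheet scan described above comes down to a single regex pass. A minimal standalone sketch (not part of the gem; the CSS string is illustrative) of pulling `url(...)` references out of a stylesheet, the same job `get_urls_from_css` does in the crawler further down:

    css = %q{
      body       { background: url("images/bg.png"); }
      @font-face { src: url('fonts/icons.woff'); }
    }

    # non-greedy match on url(...), then strip surrounding quotes
    urls = css.scan(/url\((.*?)\)/i).flatten.map { |u| u.delete(%q{"'}) }
    p urls  # => ["images/bg.png", "fonts/icons.woff"]
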
data/Rakefile ADDED
@@ -0,0 +1 @@
+ require "bundler/gem_tasks"
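
`bundler/gem_tasks` is all the Rakefile needs: it defines the standard packaging tasks for the gemspec in the project root. For example:

    rake build    # packages speed_spider-0.0.1.gem into pkg/
    rake install  # builds the gem and installs it into the local gem set
    rake release  # tags v0.0.1 and pushes the gem to rubygems.org
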
data/bin/speed_spider ADDED
@@ -0,0 +1,6 @@
+ #!/usr/bin/env ruby
+
+ $: << File.expand_path('../../lib', __FILE__)
+ require 'speed_spider'
+
+ SpeedSpider.crawl
data/lib/speed_spider/anemone_hack.rb ADDED
@@ -0,0 +1,19 @@
+ module Anemone
+   class Core
+     def assets?(link)
+       %w(js css jpg jpeg png bmp gif svg ttf woff eot).any? do |e|
+         /#{e}/i =~ File.extname(link.path).split('.').pop
+       end
+     end
+     #
+     # Returns +true+ if *link* should not be visited: its URL matches
+     # a skip_link pattern, or it is a non-asset link outside base_url.
+     #
+     def skip_link_with_hack?(link)
+       skip_link_without_hack?(link) || (!assets?(link) && !link.to_s.start_with?(@opts[:base_url]))
+     end
+
+     alias_method :skip_link_without_hack?, :skip_link?
+     alias_method :skip_link?, :skip_link_with_hack?
+   end
+ end
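
The patch above uses classic alias-method chaining: the original `skip_link?` stays reachable under a new name while the public name is pointed at the wrapper. A self-contained sketch of the same pattern (the class and method names are illustrative, not from the gem):

    class Greeter
      def greet(name)
        "hello #{name}"
      end

      # wrapper: call through to the preserved original, then decorate
      def greet_with_shout(name)
        greet_without_shout(name).upcase
      end

      alias_method :greet_without_shout, :greet   # preserve the original
      alias_method :greet, :greet_with_shout      # swap in the wrapper
    end

    Greeter.new.greet('world')  # => "HELLO WORLD"
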
data/lib/speed_spider/cli.rb ADDED
@@ -0,0 +1,123 @@
+ require 'speed_spider/crawler'
+ require 'optparse'
+ require 'ostruct'
+
+ module SpeedSpider
+   class Cli
+     attr_reader :options, :option_parser
+
+     def initialize
+       @options = {
+         # only urls starting with base_url will be saved locally
+         :base_url => '',
+         # directory for downloaded files to save to
+         :dir => 'download',
+         # run 4 Tentacle threads to fetch pages
+         :threads => 4,
+         # verbose output
+         :verbose => true,
+         # don't throw away the page response body after scanning it for links
+         :discard_page_bodies => false,
+         # identify self as SpeedSpider/VERSION
+         :user_agent => "SpeedSpider/#{SpeedSpider::VERSION}",
+         # no delay between requests
+         :delay => 0,
+         # don't obey the robots exclusion protocol
+         :obey_robots_txt => false,
+         # by default, don't limit the depth of the crawl
+         :depth_limit => false,
+         # storage engine defaults to Hash in +process_options+ if none specified
+         :storage => nil,
+         # Hash of cookie name => value to send with HTTP requests
+         :cookies => nil,
+         # accept cookies from the server and send them back?
+         :accept_cookies => false,
+         # skip any link with a query string? e.g. http://foo.com/?u=user
+         :skip_query_strings => false,
+         # proxy server hostname
+         :proxy_host => nil,
+         # proxy server port number
+         :proxy_port => false,
+         # HTTP read timeout in seconds
+         :read_timeout => nil
+       }
+     end
+
+     def parse!
+       @option_parser = OptionParser.new do |opts|
+         opts.banner = "Usage: speed_spider [options] start_url"
+         opts.separator ""
+         opts.separator "options:"
+
+         opts.on('-S', '--silent', 'silent output') do
+           @options[:verbose] = false
+         end
+
+         opts.on('-D', '--dir String', 'directory for downloaded files to save to, "download" by default') do |value|
+           @options[:dir] = value
+         end
+
+         opts.on('-b', '--base_url String', 'any url that does not start with base_url will not be saved') do |value|
+           value += '/' unless value.end_with? '/'
+           @options[:base_url] = value
+         end
+
+         opts.on('-t', '--threads Integer', Integer, 'threads to run for fetching pages, 4 by default') do |value|
+           @options[:threads] = value
+         end
+
+         opts.on('-u', '--user_agent String', 'words for request header USER_AGENT') do |value|
+           @options[:user_agent] = value
+         end
+
+         opts.on('-d', '--delay Integer', Integer, 'delay between requests in seconds') do |value|
+           @options[:delay] = value
+         end
+
+         opts.on('-o', '--obey_robots_txt', 'obey the robots exclusion protocol') do
+           @options[:obey_robots_txt] = true
+         end
+
+         opts.on('-l', '--depth_limit Integer', Integer, 'limit the depth of the crawl') do |value|
+           @options[:depth_limit] = value
+         end
+
+         opts.on('-r', '--redirect_limit Integer', Integer, 'number of times HTTP redirects will be followed') do |value|
+           @options[:redirect_limit] = value
+         end
+
+         opts.on('-a', '--accept_cookies', 'accept cookies from the server and send them back') do
+           @options[:accept_cookies] = true
+         end
+
+         opts.on('-s', '--skip_query_strings', 'skip any link with a query string, e.g. http://foo.com/?u=user') do
+           @options[:skip_query_strings] = true
+         end
+
+         opts.on('-H', '--proxy_host String', 'proxy server hostname') do |value|
+           @options[:proxy_host] = value
+         end
+
+         opts.on('-P', '--proxy_port Integer', Integer, 'proxy server port number') do |value|
+           @options[:proxy_port] = value
+         end
+
+         opts.on('-T', '--read_timeout Integer', Integer, 'HTTP read timeout in seconds') do |value|
+           @options[:read_timeout] = value
+         end
+
+         # print the version.
+         opts.on_tail("-V", "--version", "Show version") do
+           puts SpeedSpider::VERSION
+           exit
+         end
+       end
+
+       @option_parser.parse!
+
+       self
+     end
+   end
+ end
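
`OptionParser#parse!` consumes recognized switches and leaves positional arguments in `ARGV`, which is how the start URL survives for `SpeedSpider.crawl` to pick up. A quick sketch of driving the parser directly (the argument values are illustrative):

    require 'speed_spider'

    ARGV.replace %w[-t 8 -D mirror http://example.com/]

    cli = SpeedSpider::Cli.new.parse!
    cli.options[:threads]  # => 8
    cli.options[:dir]      # => "mirror"
    ARGV.first             # => "http://example.com/", left for the caller
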
data/lib/speed_spider/crawler.rb ADDED
@@ -0,0 +1,81 @@
+ require 'anemone'
+ require 'speed_spider/anemone_hack'
+ require 'fileutils'
+ require 'uri'
+
+ module SpeedSpider
+   class Crawler
+     def initialize(start_url, options)
+       @start_url = start_url
+       @base_url  = options[:base_url]
+       @options   = options
+     end
+
+     # return urls from css file contents
+     def get_urls_from_css(data, pos = 0)
+       if m = data.match(/url\((.*?)\)/i, pos)
+         [ m[1] ] + get_urls_from_css(data, m.end(1) + 1)
+       else
+         []
+       end
+     end
+
+     def focus_crawl
+       lambda { |page|
+         links = []
+         if page.doc
+           # include javascripts and img files as target links
+           page.doc.search('//script[@src]', '//img[@src]', '//iframe[@src]').each do |s|
+             u = s['src']
+             next if u.nil? or u.empty?
+             abs = page.to_absolute(u) rescue next
+             links << abs if page.in_domain? abs
+           end
+
+           # include css files as target links
+           page.doc.search('//link[@href]').each do |s|
+             u = s['href']
+             next if u.nil? or u.empty?
+             abs = page.to_absolute(u) rescue next
+             links << abs if page.in_domain? abs
+           end
+         elsif page.url.to_s.end_with? '.css'
+           get_urls_from_css(page.body).each do |s|
+             u = s.gsub('"', '').gsub("'", '')
+             next if u.nil? or u.empty?
+             abs = page.to_absolute(u) rescue next
+             links << abs if page.in_domain? abs
+           end
+         end
+
+         page.links + links.uniq
+       }
+     end
+
+     def after_crawl
+       lambda { |pages|
+         pages.each do |url, page|
+           path = page.url.path
+           path += 'index.html' if path.end_with? '/' or path.empty?
+
+           path = "#{@options[:dir]}/#{page.url.host}#{path}"
+           dir  = File.dirname path
+
+           FileUtils.mkdir_p dir unless dir.empty?
+           File.open path, 'wb' do |f|
+             f.write page.body
+           end
+
+           puts "save file #{path}" if @options[:verbose]
+         end
+       }
+     end
+
+     def crawl
+       Anemone.crawl @start_url, @options do |spider|
+         spider.focus_crawl(&focus_crawl)
+         spider.after_crawl(&after_crawl)
+       end
+     end
+   end
+ end
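
`Crawler` needs nothing beyond a start URL and an options hash, so it can also be driven from Ruby without the CLI. A minimal sketch, assuming any option not listed falls back to the defaults in `Cli#initialize` or in Anemone itself (the values shown are illustrative):

    require 'speed_spider'

    options = {
      :dir      => 'mirror',                              # where files are saved
      :threads  => 4,                                     # Anemone tentacles
      :verbose  => true,                                  # log each saved file
      :base_url => 'http://twitter.github.io/bootstrap/'  # save filter
    }

    SpeedSpider::Crawler.new('http://twitter.github.io/bootstrap/', options).crawl
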
data/lib/speed_spider/version.rb ADDED
@@ -0,0 +1,3 @@
+ module SpeedSpider
+   VERSION = "0.0.1"
+ end
data/lib/speed_spider.rb ADDED
@@ -0,0 +1,15 @@
+ require 'speed_spider/version'
+ require 'speed_spider/cli'
+ require 'speed_spider/crawler'
+
+ module SpeedSpider
+   def self.crawl
+     cli = Cli.new.parse!
+
+     start_url = ARGV[0]
+     (puts cli.option_parser.help; exit 1) if start_url.nil?
+
+     crawler = Crawler.new start_url, cli.options
+     crawler.crawl
+   end
+ end
data/speed_spider.gemspec ADDED
@@ -0,0 +1,24 @@
+ # coding: utf-8
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'speed_spider/version'
+
+ Gem::Specification.new do |spec|
+   spec.name          = "speed_spider"
+   spec.version       = SpeedSpider::VERSION
+   spec.authors       = ["Ryan Wang"]
+   spec.email         = ["wongyouth@gmail.com"]
+   spec.description   = %q{A simple web spider that crawls pages to a local directory, starting from a given URL}
+   spec.summary       = %q{A simple web spider for downloading pages from a base URL, including CSS, JS, HTML and iframe source files}
+   spec.homepage      = ""
+   spec.license       = "MIT"
+
+   spec.files         = `git ls-files`.split($/)
+   spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+   spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+   spec.require_paths = ["lib"]
+
+   spec.add_dependency "anemone", "~> 0.7.2"
+   spec.add_development_dependency "bundler", "~> 1.3"
+   spec.add_development_dependency "rake"
+ end
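
Since the manifest comes straight from `git ls-files`, the gem also builds cleanly from a checkout without going through the Rakefile:

    gem build speed_spider.gemspec        # => speed_spider-0.0.1.gem
    gem install ./speed_spider-0.0.1.gem  # install the local build
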
metadata ADDED
@@ -0,0 +1,100 @@
+ --- !ruby/object:Gem::Specification
+ name: speed_spider
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - Ryan Wang
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2013-06-01 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: anemone
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 0.7.2
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 0.7.2
+ - !ruby/object:Gem::Dependency
+   name: bundler
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: '1.3'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: '1.3'
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ description: A simple web spider that crawls pages to a local directory, starting from a given URL
+ email:
+ - wongyouth@gmail.com
+ executables:
+ - speed_spider
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - .gitignore
+ - Gemfile
+ - LICENSE.txt
+ - README.md
+ - Rakefile
+ - bin/speed_spider
+ - lib/speed_spider.rb
+ - lib/speed_spider/anemone_hack.rb
+ - lib/speed_spider/cli.rb
+ - lib/speed_spider/crawler.rb
+ - lib/speed_spider/version.rb
+ - speed_spider.gemspec
+ homepage: ''
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.0.3
+ signing_key:
+ specification_version: 4
+ summary: A simple web spider for downloading pages from a base URL, including CSS,
+   JS, HTML and iframe source files
+ test_files: []