mech_warrior 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.ruby-version +1 -0
- data/Gemfile +2 -0
- data/Gemfile.lock +61 -0
- data/README.md +58 -0
- data/Rakefile +6 -0
- data/bin/spider +12 -0
- data/lib/mech_warrior.rb +60 -0
- data/lib/mech_warrior/crawler.rb +75 -0
- data/lib/mech_warrior/mech_cell.rb +22 -0
- data/lib/mech_warrior/version.rb +3 -0
- data/mech_warrior.gemspec +32 -0
- data/spec/fakeweb_helper.rb +65 -0
- data/spec/mech_warrior_spec.rb +89 -0
- data/spec/spec_helper.rb +6 -0
- metadata +147 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 63e3f44169fe19d92e5c237ce58460e35522bc1d
|
4
|
+
data.tar.gz: 95bc44122c6b15c317c512866a1f1c00a769260f
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: eb826d2289cbed61494ef15285ee2ae0e2fad84cc40a6434895f0eedd3430706fe6539e912af94faf98a49307b288146c19be23b8febc7dce15d91d5bba7e473
|
7
|
+
data.tar.gz: 849c53b351db05f04375bf73e10e6c76f284b62d0664cab8697926792377f1468119c5bd0fb7afbbed14c930b4464109c1342758a8341f4586bb4e0f59a3c40f
|
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
2.1.1
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
mech_warrior (0.0.1)
|
5
|
+
celluloid (~> 0)
|
6
|
+
mechanize (~> 2.7)
|
7
|
+
xml-sitemap (~> 1.3)
|
8
|
+
|
9
|
+
GEM
|
10
|
+
remote: https://rubygems.org/
|
11
|
+
specs:
|
12
|
+
builder (3.2.2)
|
13
|
+
celluloid (0.15.2)
|
14
|
+
timers (~> 1.1.0)
|
15
|
+
diff-lcs (1.2.5)
|
16
|
+
domain_name (0.5.18)
|
17
|
+
unf (>= 0.0.5, < 1.0.0)
|
18
|
+
fakeweb (1.3.0)
|
19
|
+
http-cookie (1.0.2)
|
20
|
+
domain_name (~> 0.5)
|
21
|
+
mechanize (2.7.3)
|
22
|
+
domain_name (~> 0.5, >= 0.5.1)
|
23
|
+
http-cookie (~> 1.0)
|
24
|
+
mime-types (~> 2.0)
|
25
|
+
net-http-digest_auth (~> 1.1, >= 1.1.1)
|
26
|
+
net-http-persistent (~> 2.5, >= 2.5.2)
|
27
|
+
nokogiri (~> 1.4)
|
28
|
+
ntlm-http (~> 0.1, >= 0.1.1)
|
29
|
+
webrobots (>= 0.0.9, < 0.2)
|
30
|
+
mime-types (2.2)
|
31
|
+
mini_portile (0.5.3)
|
32
|
+
net-http-digest_auth (1.4)
|
33
|
+
net-http-persistent (2.9.4)
|
34
|
+
nokogiri (1.6.1)
|
35
|
+
mini_portile (~> 0.5.0)
|
36
|
+
ntlm-http (0.1.1)
|
37
|
+
rake (0.9.6)
|
38
|
+
rspec (2.14.1)
|
39
|
+
rspec-core (~> 2.14.0)
|
40
|
+
rspec-expectations (~> 2.14.0)
|
41
|
+
rspec-mocks (~> 2.14.0)
|
42
|
+
rspec-core (2.14.8)
|
43
|
+
rspec-expectations (2.14.5)
|
44
|
+
diff-lcs (>= 1.1.3, < 2.0)
|
45
|
+
rspec-mocks (2.14.6)
|
46
|
+
timers (1.1.0)
|
47
|
+
unf (0.1.4)
|
48
|
+
unf_ext
|
49
|
+
unf_ext (0.0.6)
|
50
|
+
webrobots (0.1.1)
|
51
|
+
xml-sitemap (1.3.3)
|
52
|
+
builder (>= 2.0)
|
53
|
+
|
54
|
+
PLATFORMS
|
55
|
+
ruby
|
56
|
+
|
57
|
+
DEPENDENCIES
|
58
|
+
fakeweb (~> 1.3)
|
59
|
+
mech_warrior!
|
60
|
+
rake (~> 0)
|
61
|
+
rspec (~> 2.14)
|
data/README.md
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
MechWarrior
|
2
|
+
=========
|
3
|
+
|
4
|
+
MechWarrior is a Mechanize and Celluloid powered site crawler that generates a
|
5
|
+
JSON file of all pages, links on pages, and assets those pages rely upon
|
6
|
+
as well as optionally generating an XML sitemap compliant with sitemaps 0.9
|
7
|
+
protocol.
|
8
|
+
|
9
|
+
|
10
|
+
Version
|
11
|
+
----
|
12
|
+
|
13
|
+
0.0.1
|
14
|
+
|
15
|
+
Tech
|
16
|
+
-----------
|
17
|
+
|
18
|
+
MechWarrior relies on several excellent RubyGems
|
19
|
+
|
20
|
+
* [Mechanize] - a ruby library that makes automated web interaction easy.
|
21
|
+
* [Celluloid] - an Actor-based concurrent object framework for Ruby
|
22
|
+
* [XML-Sitemap] - provides easy XML sitemap generation for Ruby/Rails/Merb/Sinatra applications
|
23
|
+
|
24
|
+
|
25
|
+
Installation
|
26
|
+
--------------
|
27
|
+
|
28
|
+
```sh
|
29
|
+
gem install mech_warrior-0.0.1.gem
|
30
|
+
```
|
31
|
+
|
32
|
+
Crawling a site
|
33
|
+
---------------
|
34
|
+
|
35
|
+
```sh
|
36
|
+
bin/spider
|
37
|
+
```
|
38
|
+
and enter a host name, followed by any additional options you wish to pass in
|
39
|
+
to override default options in `lib/mech_warrior.rb`
|
40
|
+
|
41
|
+
|
42
|
+
Todo
|
43
|
+
----
|
44
|
+
Some of the functionality, including XML Sitemaps, is untested.
|
45
|
+
Support for multiple hosts in a single spider is currently incomplete,
|
46
|
+
despite the 'allowed_domains' array, unless all but default host have
|
47
|
+
only absolute links to follow.
|
48
|
+
|
49
|
+
License
|
50
|
+
----
|
51
|
+
|
52
|
+
MIT
|
53
|
+
|
54
|
+
[mechanize]:https://github.com/sparklemotion/mechanize
|
55
|
+
[celluloid]:http://celluloid.io/
|
56
|
+
[xml-sitemap]:https://github.com/sosedoff/xml-sitemap
|
57
|
+
|
58
|
+
|
data/Rakefile
ADDED
data/bin/spider
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Interactive entry point: prompts for a host name plus optional crawl
# options (Ruby hash-literal syntax, e.g. `pool_size: 5`), then crawls the
# host via MechWarrior.crawl.
require_relative "../lib/mech_warrior"
require 'json' # to_json below; don't rely on it being loaded transitively

puts "Host:"
host = gets.chomp
puts
puts "Other options:"
opts = gets.chomp
# SECURITY: eval of user-typed text executes arbitrary Ruby. This is only
# acceptable because bin/spider is a local, interactive developer tool —
# never feed it untrusted input.
opts_hash = eval("{" + opts + "}")
puts opts_hash.to_json
MechWarrior.crawl(opts_hash.merge(default_host: host))
|
data/lib/mech_warrior.rb
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
require 'mechanize'
require 'xml-sitemap'
require 'logger'
require 'celluloid/autostart'
require_relative 'mech_warrior/mech_cell'
require_relative 'mech_warrior/crawler'

module MechWarrior
  # The sitemaps 0.9 protocol caps each sitemap file at 50,000 URLs.
  SITEMAP_MAX_LINKS = 50000
  DEFAULTS = {
    allowed_domains: [],
    default_protocol: 'http://',
    default_host: 'www.example.com',
    # this is less 'default_host' at the moment than 'only', though links to other domains will work as long
    # as all links on other domains' pages are absolute. To support multiple domains while supporting
    # relative links, some new state would have to be introduced to track 'current_host'
    max_depth_divisor: 256, # this results in max depth of 4096 on my machine, seems deep enough
    pool_size: 20,
    logger_class: Logger,
    log_file_name: "mech_warrior_errors.txt"
  }

  # Crawl a site and optionally persist the results.
  #
  # opts (merged over DEFAULTS by Crawler):
  #   :skip_asset_json  - when truthy, skip writing the per-page JSON dump
  #   :generate_sitemap - truthy to emit XML sitemap(s); pass a Hash to
  #                       forward per-URL options to XmlSitemap::Map#add
  # Returns the Crawler instance so callers can inspect #pages.
  def self.crawl(opts={})
    crawl_results = Crawler.new(opts)
    # terminate the Celluloid pool asynchronously; the crawl itself has
    # already completed inside Crawler#initialize
    crawl_results.agent_pool.future.terminate
    unless opts[:skip_asset_json]
      File.open("#{crawl_results.default_host}_crawl_#{Time.now.gmtime}", 'w') do |file|
        file.write(JSON.pretty_generate(crawl_results.pages))
      end
    end

    if sitemap_opts = opts[:generate_sitemap]
      generate_sitemap(crawl_results.default_host,
                       crawl_results.pages,
                       sitemap_opts.respond_to?(:keys) ? sitemap_opts : {}
                      )
    end

    crawl_results
  end


  #generate_sitemap is untested and NOT production ready, but is functional
  #and probably a better output format if asset/link data is not needed
  #
  # pages may be a Hash of url => page-data (the normal entry point) or an
  # Array of URL strings (the recursive overflow chunks below).
  def self.generate_sitemap(default_host, pages, opts, sitemap_file_num=1)
    # BUG FIX: the recursive call passes the *remaining keys* as an Array,
    # but this method previously always called `pages.keys`, raising
    # NoMethodError on the second chunk of any 50k+ page crawl. Normalize
    # both shapes here.
    page_keys = pages.respond_to?(:keys) ? pages.keys : pages
    current_page_keys = page_keys.slice(0...SITEMAP_MAX_LINKS)

    site_map = XmlSitemap::Map.new(default_host) do |map|
      current_page_keys.each do |page|
        map.add URI(page).path, opts if URI(page).path.length > 0
      end
    end
    site_map.render_to("./site_map_#{default_host}_#{sitemap_file_num}")

    # spill the remainder into additional numbered sitemap files
    if page_keys.count > SITEMAP_MAX_LINKS
      generate_sitemap(default_host, page_keys.slice(SITEMAP_MAX_LINKS..-1), opts, sitemap_file_num + 1)
    end
  end
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
module MechWarrior
  # Crawler walks a single host breadth-ish via a pool of MechCell actors,
  # recording every indexed page's links and asset references in #pages
  # (a Hash keyed by absolute URL). The entire crawl runs inside
  # #initialize; by the time `new` returns, #pages is complete.
  class Crawler
    attr_reader :agent_pool, :pages, :opts, :default_host, :default_protocol, :logger, :output_file

    # override_opts is merged over MechWarrior::DEFAULTS. Notable keys:
    #   :output_file  - IO-like sink for the logger (defaults to appending
    #                   to opts[:log_file_name])
    #   :start_url    - first URL to fetch (defaults to host root)
    #   :no_index     - skip crawling entirely (build the object only)
    def initialize(override_opts={})
      @opts = DEFAULTS.merge(override_opts)
      @default_host = opts[:default_host]
      @default_protocol = opts[:default_protocol]
      # the default host is always an allowed crawl target
      opts[:allowed_domains] << default_host
      @output_file = opts[:output_file] || File.open(opts[:log_file_name], 'a')
      @logger = opts[:logger_class].new(output_file)
      @agent_pool = MechCell.pool(size: opts[:pool_size], args: [logger])
      @pages = {}
      start_url = opts[:start_url] || "#{default_protocol}#{default_host}/"
      # seed pages so follow_link? treats the start URL as already indexed
      pages[normalize_url(start_url)] = {}
      index_url(start_url) unless opts[:no_index]
      self
    ensure
      # the crawl has finished by now (index_url above is synchronous), so
      # the log file can be closed safely
      output_file.close if output_file.respond_to?(:close)
    end

    # Fetch href (absolutized first) and recursively index everything
    # reachable from it.
    def index_url(href)
      schemed_url = normalize_url(href)
      future = page_future(schemed_url)
      process_page(future, schemed_url)
    end

    private

    # Resolve the future for url, record the page's links/assets, then
    # schedule and recurse into every followable link. Recursion depth is
    # bounded by the VM thread stack size to avoid SystemStackError.
    def process_page(page_future, url, depth=0)
      return if depth > RubyVM::DEFAULT_PARAMS[:thread_vm_stack_size]/opts[:max_depth_divisor]
      page = page_future.value
      # MechCell#get returns nil (logged) on fetch errors; non-HTML
      # responses (e.g. Mechanize::File) don't respond to :links
      if page && page.respond_to?(:links)
        pages[url] = {}
        pages[url][:links] = page.respond_to?(:links) ? page.links.map(&:href) : []
        pages[url][:assets] = {
          images: page.image_urls,
          scripts: page.search('script'),
          asset_links: page.search('link'), #css, icons
          iframes: page.iframes
        }
        # kick off all child fetches before recursing so the pool works on
        # them concurrently while we process each result in turn
        urls = links_to_follow(page).map {|link| normalize_url(link.href)}
        futures = urls.map {|url| page_future(url)}
        pairs = futures.zip(urls)
        pairs.each {|future, url| process_page(future, url, depth +1)}
      end
    rescue URI::InvalidURIError => e
      logger << "InvalidURIError processing links on page at URL: #{url} -- #{e}\n"
    end

    # Asynchronous fetch via the actor pool; returns a Celluloid future.
    def page_future(url)
      agent_pool.future.get(url)
    end

    # Synchronous fetch (currently unused by the crawl path).
    def get_page(url)
      agent_pool.get(url)
    end

    # Absolutize scheme-less hrefs against the default protocol and host.
    def normalize_url(href)
      URI(href).scheme ? href : "#{default_protocol}#{default_host}#{href}"
    end

    def follow_link?(link) #follow only pages not indexed and relative links or whitelisted link hosts
      if link.href && URI(link.href)
        pages[normalize_url(link.href)].nil? && (link.uri.host.nil? || opts[:allowed_domains].include?(link.uri.host))
      end
    rescue URI::InvalidURIError => e
      # unparseable href: log it and return nil (falsy => not followed)
      logger << "InvalidURIError on link with href: #{link.href} -- #{e}\n"
    end

    def links_to_follow(page)
      page.links.select { |link| follow_link?(link) }
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module MechWarrior
  # A single crawl worker: one Celluloid actor wrapping one Mechanize
  # agent. Instances are pooled by Crawler via MechCell.pool.
  class MechCell
    include Celluloid

    attr_reader :agent, :logger

    # Network/protocol failures treated as expected during a crawl; they
    # are logged rather than crashing the actor.
    MECH_ERRORS = [SocketError, Mechanize::ResponseCodeError,
                   Mechanize::ResponseReadError, Mechanize::UnsupportedSchemeError]

    def initialize(logger)
      @logger = logger
      @agent = Mechanize.new
    end

    # Fetch url with this worker's agent. On any known fetch error the
    # exception is logged and nil is returned (the `<<` result).
    def get(url)
      begin
        agent.get(url)
      rescue *MECH_ERRORS => e
        logger << "Caught Exception getting URL: #{url} -- #{e}\n"
      end
    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
# Gem specification for mech_warrior.
lib = File.expand_path('../lib/', __FILE__)
$:.unshift lib unless $:.include?(lib)
require "mech_warrior/version"

Gem::Specification.new do |s|
  s.name        = "mech_warrior"
  s.version     = MechWarrior::VERSION
  s.platform    = Gem::Platform::RUBY
  s.authors     = ["Brian Glusman"]
  s.email       = ["brian@glusman.me"]
  s.summary     = "Crawler and asset list/sitemap generator"
  s.licenses    = ["MIT", "BSD"]
  # NOTE(review): declaring the Rakefile as an extension makes `gem install`
  # run rake at install time on the user's machine. Extensions are normally
  # reserved for native-code builds — confirm this is intentional.
  s.extensions  = ["Rakefile"]

  s.description = <<-DESC
  Spider a web host with many mechanize agents concurrently, and generate an asset JSON
  and/or an XML sitemap of the result
  DESC


  s.files         = `git ls-files`.split("\n")
  # BUG FIX: the pathspec was "{spec}/*", which matches only a literal
  # directory named "{spec}" — the released gem shipped with test_files: []
  # as a result. Use the real spec/ directory.
  s.test_files    = `git ls-files -- spec/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.add_runtime_dependency "mechanize", '~> 2.7'
  s.add_runtime_dependency "xml-sitemap", '~> 1.3'
  s.add_runtime_dependency "celluloid", '~> 0'
  s.add_development_dependency "rake", '~> 0'
  s.add_development_dependency "rspec", '~> 2.14'
  s.add_development_dependency "fakeweb", '~> 1.3'
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
# Test support: block all real HTTP and register fake pages with FakeWeb.
FakeWeb.allow_net_connect = false
module MechWarrior
  require_relative '../lib/mech_warrior'
  DEFAULT_HOST = DEFAULTS[:default_host]
  SPEC_DOMAIN = "http://#{DEFAULT_HOST}/"

  # Builds a synthetic HTML page and registers it (plus any redirect
  # target) with FakeWeb under SPEC_DOMAIN + name.
  class FakePage
    attr_accessor :links   # relative link names rendered as on-domain <a> tags
    attr_accessor :hrefs   # raw hrefs rendered verbatim as <a> tags
    attr_accessor :body    # full HTML body (generated unless supplied)

    # options: :links, :hrefs, :redirect, :base, :content_type, :body
    def initialize(name = '', options = {})
      @name = name
      @links = [options[:links]].flatten if options.has_key?(:links)
      @hrefs = [options[:hrefs]].flatten if options.has_key?(:hrefs)
      @redirect = options[:redirect] if options.has_key?(:redirect)
      @base = options[:base] if options.has_key?(:base)
      @content_type = options[:content_type] || "text/html"
      @body = options[:body]

      create_body unless @body
      add_to_fakeweb
    end

    # Absolute URL this fake page is registered under.
    def url
      SPEC_DOMAIN + @name
    end

    private

    # Assemble a minimal HTML document containing the requested anchors.
    def create_body
      if @base
        @body = "<html><head><base href=\"#{@base}\"></head><body>"
      else
        @body = "<html><body>"
      end
      @links.each{|l| @body += "<a href=\"#{SPEC_DOMAIN}#{l}\"></a>"} if @links
      @hrefs.each{|h| @body += "<a href=\"#{h}\"></a>"} if @hrefs
      @body += "</body></html>"
    end

    # Register this page (and, for redirects, the target page) with FakeWeb.
    def add_to_fakeweb
      options = {:body => @body, :content_type => @content_type, :status => [200, "OK"]}

      if @redirect
        options[:status] = [301, "Permanently Moved"]

        # only prepend SPEC_DOMAIN if a relative url (without an http scheme) was specified
        # BUG FIX: `@redirect =~ /http/` matched "http" *anywhere* in the
        # string, so a relative path like "http-faq" was wrongly treated as
        # absolute. Anchor the check to the start of the string.
        redirect_url = @redirect.start_with?('http') ? @redirect : SPEC_DOMAIN + @redirect
        options[:location] = redirect_url

        # register the page this one redirects to
        FakeWeb.register_uri(:get, redirect_url, {:body => '',
                                                  :content_type => @content_type,
                                                  :status => [200, "OK"]})
      end


      FakeWeb.register_uri(:get, SPEC_DOMAIN + @name, options)
    end
  end
end

#default root
MechWarrior::FakePage.new
|
@@ -0,0 +1,89 @@
|
|
1
|
+
require 'spec_helper'

module MechWarrior
  describe Crawler do

    # Each example registers its own FakePage fixtures, so start from a
    # clean FakeWeb registry (fakeweb_helper re-registers the root page).
    before(:each) do
      FakeWeb.clean_registry
    end

    describe "crawl" do

      # All examples use logger_class: String with output_file: "" so the
      # crawl "log" is an in-memory string we can assert emptiness on.
      context "crawl all the html pages in a domain by following <a> href's" do
        # 0 -> {1, 2}, 1 -> {3}: four reachable pages in total.
        let(:pages) do
          pages = []
          pages << FakePage.new('0', links: ['1', '2'])
          pages << FakePage.new('1', links: ['3'])
          pages << FakePage.new('2')
          pages << FakePage.new('3')
          pages
        end

        subject { Crawler.new(default_host: MechWarrior::DEFAULTS[:default_host],
                              start_url: pages[0].url,
                              logger_class: String,
                              output_file: "")
        }

        it {should have(4).pages }
        its(:logger) {should be_empty }
      end

      context "should not follow links that leave the original domain" do
        # page 0 links on-domain to 1 and off-domain to www.other.com
        let(:pages) do
          pages = []
          pages << FakePage.new('0', links: ['1'], :hrefs => 'http://www.other.com/')
          pages << FakePage.new('1')
          pages
        end

        subject { Crawler.new(default_host: MechWarrior::DEFAULTS[:default_host],
                              start_url: pages[0].url,
                              logger_class: String,
                              output_file: "")
        }
        it { should have(2).pages }
        its("pages.keys") { should_not include('http://www.other.com/') }
        its(:logger) {should be_empty }
      end

      context "should not index non-html links" do
        # pages 1 and 2 are fetched but, being non-HTML, never indexed
        let(:pages) do
          pages = []
          pages << FakePage.new('0', links: ['1', '2'])
          pages << FakePage.new('1', content_type: 'application/pdf')
          pages << FakePage.new('2', content_type: 'text/csv')
          pages
        end

        subject { Crawler.new(default_host: MechWarrior::DEFAULTS[:default_host],
                              start_url: pages[0].url,
                              logger_class: String,
                              output_file: "")
        }
        it { should have(1).pages }
        its(:logger) {should be_empty }
      end

      context "should ignore invalid URLs" do
        # the bad href is skipped and the URI error is written to the logger
        let(:pages) do
          pages = []
          pages << FakePage.new('0', links: ['1', '2'])
          pages << FakePage.new('1', links: ['not a valid url'])
          pages << FakePage.new('2')
          pages << FakePage.new('not_a_valid_url')
          pages
        end

        subject { Crawler.new(default_host: MechWarrior::DEFAULTS[:default_host],
                              start_url: pages[0].url,
                              logger_class: String,
                              output_file: "")
        }
        it { should have(3).pages }
        its(:logger) {should_not be_empty }
      end

    end
  end
end
|
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,147 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: mech_warrior
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Brian Glusman
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-05-17 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: mechanize
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '2.7'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '2.7'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: xml-sitemap
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '1.3'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '1.3'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: celluloid
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rake
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rspec
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '2.14'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '2.14'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: fakeweb
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '1.3'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '1.3'
|
97
|
+
description: |2
|
98
|
+
Spider a web host with many mechanize agents concurrently, and generate an asset JSON
|
99
|
+
and/or an XML sitemap of the result
|
100
|
+
email:
|
101
|
+
- brian@glusman.me
|
102
|
+
executables:
|
103
|
+
- spider
|
104
|
+
extensions:
|
105
|
+
- Rakefile
|
106
|
+
extra_rdoc_files: []
|
107
|
+
files:
|
108
|
+
- ".ruby-version"
|
109
|
+
- Gemfile
|
110
|
+
- Gemfile.lock
|
111
|
+
- README.md
|
112
|
+
- Rakefile
|
113
|
+
- bin/spider
|
114
|
+
- lib/mech_warrior.rb
|
115
|
+
- lib/mech_warrior/crawler.rb
|
116
|
+
- lib/mech_warrior/mech_cell.rb
|
117
|
+
- lib/mech_warrior/version.rb
|
118
|
+
- mech_warrior.gemspec
|
119
|
+
- spec/fakeweb_helper.rb
|
120
|
+
- spec/mech_warrior_spec.rb
|
121
|
+
- spec/spec_helper.rb
|
122
|
+
homepage:
|
123
|
+
licenses:
|
124
|
+
- MIT
|
125
|
+
- BSD
|
126
|
+
metadata: {}
|
127
|
+
post_install_message:
|
128
|
+
rdoc_options: []
|
129
|
+
require_paths:
|
130
|
+
- lib
|
131
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
132
|
+
requirements:
|
133
|
+
- - ">="
|
134
|
+
- !ruby/object:Gem::Version
|
135
|
+
version: '0'
|
136
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
137
|
+
requirements:
|
138
|
+
- - ">="
|
139
|
+
- !ruby/object:Gem::Version
|
140
|
+
version: '0'
|
141
|
+
requirements: []
|
142
|
+
rubyforge_project:
|
143
|
+
rubygems_version: 2.2.2
|
144
|
+
signing_key:
|
145
|
+
specification_version: 4
|
146
|
+
summary: Crawler and asset list/sitemap generator
|
147
|
+
test_files: []
|