anemone 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE.txt +19 -0
- data/lib/anemone/anemone.rb +14 -6
- data/lib/anemone/http.rb +1 -1
- data/lib/anemone/tentacle.rb +2 -0
- data/spec/anemone_spec.rb +36 -0
- data/spec/core_spec.rb +128 -0
- data/spec/fakeweb_helper.rb +55 -0
- data/spec/page_spec.rb +49 -0
- data/spec/spec_helper.rb +7 -0
- metadata +20 -17
data/LICENSE.txt
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
Copyright (c) 2009 Vertive, Inc.
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
of this software and associated documentation files (the "Software"), to deal
|
5
|
+
in the Software without restriction, including without limitation the rights
|
6
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
copies of the Software, and to permit persons to whom the Software is
|
8
|
+
furnished to do so, subject to the following conditions:
|
9
|
+
|
10
|
+
The above copyright notice and this permission notice shall be included in
|
11
|
+
all copies or substantial portions of the Software.
|
12
|
+
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
THE SOFTWARE.
|
data/lib/anemone/anemone.rb
CHANGED
@@ -3,11 +3,8 @@ require 'anemone/core'
|
|
3
3
|
|
4
4
|
module Anemone
|
5
5
|
# Version number
|
6
|
-
VERSION = '0.1.1'
|
7
|
-
|
8
|
-
# User-Agent string used for HTTP requests
|
9
|
-
USER_AGENT = "Anemone/#{self::VERSION}"
|
10
|
-
|
6
|
+
VERSION = '0.1.2'
|
7
|
+
|
11
8
|
#module-wide options
|
12
9
|
def Anemone.options=(options)
|
13
10
|
@options = options
|
@@ -31,7 +28,18 @@ module Anemone
|
|
31
28
|
|
32
29
|
#by default, don't throw away the page response body after scanning it for links
|
33
30
|
Anemone.options.discard_page_bodies ||= false
|
34
|
-
|
31
|
+
|
32
|
+
#by default, identify self as Anemone/VERSION
|
33
|
+
Anemone.options.user_agent ||= "Anemone/#{self::VERSION}"
|
34
|
+
|
35
|
+
#no delay between requests by default
|
36
|
+
Anemone.options.delay ||= 0
|
37
|
+
|
38
|
+
#use a single thread if a delay was requested
|
39
|
+
if(Anemone.options.delay != 0)
|
40
|
+
Anemone.options.threads = 1
|
41
|
+
end
|
42
|
+
|
35
43
|
Core.crawl(urls, &block)
|
36
44
|
end
|
37
45
|
end
|
data/lib/anemone/http.rb
CHANGED
@@ -31,7 +31,7 @@ module Anemone
|
|
31
31
|
def self.get_response(url)
|
32
32
|
full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
|
33
33
|
Net::HTTP.start(url.host, url.port) do |http|
|
34
|
-
return http.get(full_path, {'User-Agent' => Anemone::USER_AGENT })
|
34
|
+
return http.get(full_path, {'User-Agent' => Anemone.options.user_agent })
|
35
35
|
end
|
36
36
|
end
|
37
37
|
end
|
data/lib/anemone/tentacle.rb
CHANGED
data/spec/anemone_spec.rb
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
|
3
|
+
describe Anemone do
|
4
|
+
|
5
|
+
it "should have a version" do
|
6
|
+
Anemone.const_defined?('VERSION').should == true
|
7
|
+
end
|
8
|
+
|
9
|
+
it "should have options" do
|
10
|
+
Anemone.should respond_to(:options)
|
11
|
+
end
|
12
|
+
|
13
|
+
it "should accept options for the crawl" do
|
14
|
+
Anemone.crawl(SPEC_DOMAIN, :verbose => false,
|
15
|
+
:threads => 2,
|
16
|
+
:discard_page_bodies => true,
|
17
|
+
:user_agent => 'test')
|
18
|
+
Anemone.options.verbose.should == false
|
19
|
+
Anemone.options.threads.should == 2
|
20
|
+
Anemone.options.discard_page_bodies.should == true
|
21
|
+
Anemone.options.delay.should == 0
|
22
|
+
Anemone.options.user_agent.should == 'test'
|
23
|
+
end
|
24
|
+
|
25
|
+
it "should use 1 thread if a delay is requested" do
|
26
|
+
Anemone.crawl(SPEC_DOMAIN, :delay => 0.01, :threads => 2)
|
27
|
+
Anemone.options.threads.should == 1
|
28
|
+
end
|
29
|
+
|
30
|
+
it "should return a Anemone::Core from the crawl, which has a PageHash" do
|
31
|
+
result = Anemone.crawl(SPEC_DOMAIN)
|
32
|
+
result.should be_an_instance_of(Anemone::Core)
|
33
|
+
result.pages.should be_an_instance_of(Anemone::PageHash)
|
34
|
+
end
|
35
|
+
|
36
|
+
end
|
data/spec/core_spec.rb
ADDED
@@ -0,0 +1,128 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
|
3
|
+
module Anemone
|
4
|
+
describe Core do
|
5
|
+
|
6
|
+
before(:each) do
|
7
|
+
FakeWeb.clean_registry
|
8
|
+
end
|
9
|
+
|
10
|
+
it "should crawl all the html pages in a domain by following <a> href's" do
|
11
|
+
pages = []
|
12
|
+
pages << FakePage.new('0', :links => ['1', '2'])
|
13
|
+
pages << FakePage.new('1', :links => ['3'])
|
14
|
+
pages << FakePage.new('2')
|
15
|
+
pages << FakePage.new('3')
|
16
|
+
|
17
|
+
Anemone.crawl(pages[0].url).should have(4).pages
|
18
|
+
end
|
19
|
+
|
20
|
+
it "should not leave the original domain" do
|
21
|
+
pages = []
|
22
|
+
pages << FakePage.new('0', :links => ['1'], :hrefs => 'http://www.other.com/')
|
23
|
+
pages << FakePage.new('1')
|
24
|
+
|
25
|
+
core = Anemone.crawl(pages[0].url)
|
26
|
+
|
27
|
+
core.should have(2).pages
|
28
|
+
core.pages.keys.map{|k| k.to_s}.should_not include('http://www.other.com/')
|
29
|
+
end
|
30
|
+
|
31
|
+
it "should follow http redirects" do
|
32
|
+
pages = []
|
33
|
+
pages << FakePage.new('0', :links => ['1'])
|
34
|
+
pages << FakePage.new('1', :redirect => '2')
|
35
|
+
pages << FakePage.new('2')
|
36
|
+
|
37
|
+
Anemone.crawl(pages[0].url).should have(3).pages
|
38
|
+
end
|
39
|
+
|
40
|
+
it "should accept multiple starting URLs" do
|
41
|
+
pages = []
|
42
|
+
pages << FakePage.new('0', :links => ['1'])
|
43
|
+
pages << FakePage.new('1')
|
44
|
+
pages << FakePage.new('2', :links => ['3'])
|
45
|
+
pages << FakePage.new('3')
|
46
|
+
|
47
|
+
Anemone.crawl([pages[0].url, pages[2].url]).should have(4).pages
|
48
|
+
end
|
49
|
+
|
50
|
+
it "should include the query string when following links" do
|
51
|
+
pages = []
|
52
|
+
pages << FakePage.new('0', :links => ['1?foo=1'])
|
53
|
+
pages << FakePage.new('1?foo=1')
|
54
|
+
pages << FakePage.new('1')
|
55
|
+
|
56
|
+
core = Anemone.crawl(pages[0].url)
|
57
|
+
|
58
|
+
core.should have(2).pages
|
59
|
+
core.pages.keys.map{|k| k.to_s}.should_not include(pages[2].url)
|
60
|
+
end
|
61
|
+
|
62
|
+
it "should be able to skip links based on a RegEx" do
|
63
|
+
pages = []
|
64
|
+
pages << FakePage.new('0', :links => ['1', '2'])
|
65
|
+
pages << FakePage.new('1')
|
66
|
+
pages << FakePage.new('2')
|
67
|
+
|
68
|
+
core = Anemone.crawl(pages[0].url) do |a|
|
69
|
+
a.skip_links_like /1/
|
70
|
+
end
|
71
|
+
|
72
|
+
core.should have(2).pages
|
73
|
+
core.pages.keys.map{|k| k.to_s}.should_not include(pages[1].url)
|
74
|
+
end
|
75
|
+
|
76
|
+
it "should be able to call a block on every page" do
|
77
|
+
pages = []
|
78
|
+
pages << FakePage.new('0', :links => ['1', '2'])
|
79
|
+
pages << FakePage.new('1')
|
80
|
+
pages << FakePage.new('2')
|
81
|
+
|
82
|
+
count = 0
|
83
|
+
Anemone.crawl(pages[0].url) do |a|
|
84
|
+
a.on_every_page { count += 1 }
|
85
|
+
end
|
86
|
+
|
87
|
+
count.should == 3
|
88
|
+
end
|
89
|
+
|
90
|
+
it "should not discard page bodies by default" do
|
91
|
+
Anemone.crawl(FakePage.new('0').url).pages.values.first.doc.should_not be_nil
|
92
|
+
end
|
93
|
+
|
94
|
+
it "should optionally discard page bodies to conserve memory" do
|
95
|
+
core = Anemone.crawl(FakePage.new('0').url, :discard_page_bodies => true)
|
96
|
+
core.pages.values.first.doc.should be_nil
|
97
|
+
end
|
98
|
+
|
99
|
+
it "should provide a focus_crawl method to select the links on each page to follow" do
|
100
|
+
pages = []
|
101
|
+
pages << FakePage.new('0', :links => ['1', '2'])
|
102
|
+
pages << FakePage.new('1')
|
103
|
+
pages << FakePage.new('2')
|
104
|
+
|
105
|
+
core = Anemone.crawl(pages[0].url) do |a|
|
106
|
+
a.focus_crawl {|p| p.links.reject{|l| l.to_s =~ /1/}}
|
107
|
+
end
|
108
|
+
|
109
|
+
core.should have(2).pages
|
110
|
+
core.pages.keys.map{|k| k.to_s}.should_not include(pages[1].url)
|
111
|
+
end
|
112
|
+
|
113
|
+
it "should optionally delay between page requests" do
|
114
|
+
delay = 0.25
|
115
|
+
|
116
|
+
pages = []
|
117
|
+
pages << FakePage.new('0', :links => '1')
|
118
|
+
pages << FakePage.new('1')
|
119
|
+
|
120
|
+
start = Time.now
|
121
|
+
Anemone.crawl(pages[0].url, :delay => delay)
|
122
|
+
finish = Time.now
|
123
|
+
|
124
|
+
(finish - start).should satisfy {|t| t > delay * 2}
|
125
|
+
end
|
126
|
+
|
127
|
+
end
|
128
|
+
end
|
data/spec/fakeweb_helper.rb
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
begin
|
2
|
+
require 'fakeweb'
|
3
|
+
rescue LoadError
|
4
|
+
warn "You need the 'fakeweb' gem installed to test Anemone"
|
5
|
+
exit
|
6
|
+
end
|
7
|
+
|
8
|
+
FakeWeb.allow_net_connect = false
|
9
|
+
|
10
|
+
module Anemone
|
11
|
+
SPEC_DOMAIN = "http://www.example.com/"
|
12
|
+
|
13
|
+
class FakePage
|
14
|
+
attr_accessor :links
|
15
|
+
attr_accessor :hrefs
|
16
|
+
|
17
|
+
def initialize(name = '', options = {})
|
18
|
+
@name = name
|
19
|
+
@links = [options[:links]].flatten if options.has_key?(:links)
|
20
|
+
@hrefs = [options[:hrefs]].flatten if options.has_key?(:hrefs)
|
21
|
+
@redirect = options[:redirect] if options.has_key?(:redirect)
|
22
|
+
|
23
|
+
create_body
|
24
|
+
add_to_fakeweb
|
25
|
+
end
|
26
|
+
|
27
|
+
def url
|
28
|
+
SPEC_DOMAIN + @name
|
29
|
+
end
|
30
|
+
|
31
|
+
private
|
32
|
+
|
33
|
+
def create_body
|
34
|
+
@body = "<html><body>"
|
35
|
+
@links.each{|l| @body += "<a href=\"#{SPEC_DOMAIN}#{l}\"></a>"} if @links
|
36
|
+
@hrefs.each{|h| @body += "<a href=\"#{h}\"></a>"} if @hrefs
|
37
|
+
@body += "</body></html>"
|
38
|
+
end
|
39
|
+
|
40
|
+
def add_to_fakeweb
|
41
|
+
options = {:body => @body, :content_type => "text/html", :status => [200, "OK"]}
|
42
|
+
|
43
|
+
if @redirect
|
44
|
+
options[:status] = [301, "Permanently Moved"]
|
45
|
+
options[:location] = SPEC_DOMAIN + @redirect
|
46
|
+
end
|
47
|
+
|
48
|
+
FakeWeb.register_uri(:get, SPEC_DOMAIN + @name, options)
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
#default root
|
54
|
+
Anemone::FakePage.new
|
55
|
+
|
data/spec/page_spec.rb
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
|
3
|
+
module Anemone
|
4
|
+
describe Page do
|
5
|
+
|
6
|
+
before(:each) do
|
7
|
+
@page = Page.fetch(FakePage.new('home').url)
|
8
|
+
end
|
9
|
+
|
10
|
+
it "should be able to fetch a page" do
|
11
|
+
@page.should_not be_nil
|
12
|
+
@page.url.to_s.should include('home')
|
13
|
+
end
|
14
|
+
|
15
|
+
it "should store the response headers when fetching a page" do
|
16
|
+
@page.headers.should_not be_nil
|
17
|
+
@page.headers.should have_key('content-type')
|
18
|
+
end
|
19
|
+
|
20
|
+
it "should have an OpenStruct attribute for the developer to store data in" do
|
21
|
+
@page.data.should_not be_nil
|
22
|
+
@page.data.should be_an_instance_of(OpenStruct)
|
23
|
+
|
24
|
+
@page.data.test = 'test'
|
25
|
+
@page.data.test.should == 'test'
|
26
|
+
end
|
27
|
+
|
28
|
+
it "should have a Nokogori::HTML::Document attribute for the page body" do
|
29
|
+
@page.doc.should_not be_nil
|
30
|
+
@page.doc.should be_an_instance_of(Nokogiri::HTML::Document)
|
31
|
+
end
|
32
|
+
|
33
|
+
it "should indicate whether it was fetched after an HTTP redirect" do
|
34
|
+
@page.should respond_to(:redirect?)
|
35
|
+
|
36
|
+
@page.redirect?.should == false
|
37
|
+
|
38
|
+
Page.fetch(FakePage.new('redir', :redirect => 'home').url).redirect?.should == true
|
39
|
+
end
|
40
|
+
|
41
|
+
it "should have a method to tell if a URI is in the same domain as the page" do
|
42
|
+
@page.should respond_to(:in_domain?)
|
43
|
+
|
44
|
+
@page.in_domain?(URI(FakePage.new('test').url)).should == true
|
45
|
+
@page.in_domain?(URI('http://www.other.com/')).should == false
|
46
|
+
end
|
47
|
+
|
48
|
+
end
|
49
|
+
end
|
data/spec/spec_helper.rb
ADDED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: anemone
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Kite
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-
|
12
|
+
date: 2009-08-10 00:00:00 -05:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -35,23 +35,22 @@ extensions: []
|
|
35
35
|
extra_rdoc_files:
|
36
36
|
- README.rdoc
|
37
37
|
files:
|
38
|
-
-
|
39
|
-
-
|
40
|
-
- bin/anemone_cron.rb
|
38
|
+
- LICENSE.txt
|
39
|
+
- README.rdoc
|
41
40
|
- bin/anemone_count.rb
|
41
|
+
- bin/anemone_cron.rb
|
42
|
+
- bin/anemone_pagedepth.rb
|
42
43
|
- bin/anemone_serialize.rb
|
43
|
-
-
|
44
|
-
- lib/anemone
|
45
|
-
- lib/anemone/
|
44
|
+
- bin/anemone_url_list.rb
|
45
|
+
- lib/anemone.rb
|
46
|
+
- lib/anemone/anemone.rb
|
46
47
|
- lib/anemone/core.rb
|
47
48
|
- lib/anemone/http.rb
|
48
|
-
- lib/anemone/
|
49
|
-
- lib/anemone.rb
|
50
|
-
-
|
49
|
+
- lib/anemone/page.rb
|
50
|
+
- lib/anemone/page_hash.rb
|
51
|
+
- lib/anemone/tentacle.rb
|
51
52
|
has_rdoc: true
|
52
53
|
homepage: http://anemone.rubyforge.org
|
53
|
-
licenses: []
|
54
|
-
|
55
54
|
post_install_message:
|
56
55
|
rdoc_options:
|
57
56
|
- -m
|
@@ -75,9 +74,13 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
75
74
|
requirements: []
|
76
75
|
|
77
76
|
rubyforge_project: anemone
|
78
|
-
rubygems_version: 1.3.
|
77
|
+
rubygems_version: 1.3.1
|
79
78
|
signing_key:
|
80
|
-
specification_version:
|
79
|
+
specification_version: 2
|
81
80
|
summary: Anemone web-spider framework
|
82
|
-
test_files:
|
83
|
-
|
81
|
+
test_files:
|
82
|
+
- spec/anemone_spec.rb
|
83
|
+
- spec/core_spec.rb
|
84
|
+
- spec/page_spec.rb
|
85
|
+
- spec/fakeweb_helper.rb
|
86
|
+
- spec/spec_helper.rb
|