anemone 0.0.6 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/anemone/anemone.rb +3 -3
- data/lib/anemone/core.rb +36 -14
- data/lib/anemone/page.rb +14 -7
- metadata +12 -13
data/lib/anemone/anemone.rb
CHANGED
@@ -3,7 +3,7 @@ require 'anemone/core'
|
|
3
3
|
|
4
4
|
module Anemone
|
5
5
|
# Version number
|
6
|
-
VERSION = '0.0.6'
|
6
|
+
VERSION = '0.1.0'
|
7
7
|
|
8
8
|
# User-Agent string used for HTTP requests
|
9
9
|
USER_AGENT = "Anemone/#{self::VERSION}"
|
@@ -20,7 +20,7 @@ module Anemone
|
|
20
20
|
#
|
21
21
|
# Convenience method to start a crawl using Core
|
22
22
|
#
|
23
|
-
def Anemone.crawl(url, options = {}, &block)
|
23
|
+
def Anemone.crawl(urls, options = {}, &block)
|
24
24
|
Anemone.options = OpenStruct.new(options)
|
25
25
|
|
26
26
|
#by default, run 4 Tentacle threads to fetch pages
|
@@ -32,6 +32,6 @@ module Anemone
|
|
32
32
|
#by default, don't throw away the page response body after scanning it for links
|
33
33
|
Anemone.options.discard_page_bodies ||= false
|
34
34
|
|
35
|
-
Core.crawl(
|
35
|
+
Core.crawl(urls, &block)
|
36
36
|
end
|
37
37
|
end
|
data/lib/anemone/core.rb
CHANGED
@@ -9,12 +9,13 @@ module Anemone
|
|
9
9
|
attr_reader :pages
|
10
10
|
|
11
11
|
#
|
12
|
-
# Initialize the crawl with
|
12
|
+
# Initialize the crawl with starting *urls* (single URL or Array of URLs)
|
13
|
+
# and optional *block*
|
13
14
|
#
|
14
|
-
def initialize(
|
15
|
-
|
16
|
-
@url = url
|
17
|
-
|
15
|
+
def initialize(urls, &block)
|
16
|
+
@urls = [urls].flatten.map{ |url| URI(url) if url.is_a?(String) }
|
17
|
+
@urls.each{ |url| url.path = '/' if url.path.empty? }
|
18
|
+
|
18
19
|
@tentacles = []
|
19
20
|
@pages = PageHash.new
|
20
21
|
@on_every_page_blocks = []
|
@@ -80,10 +81,22 @@ module Anemone
|
|
80
81
|
self
|
81
82
|
end
|
82
83
|
|
84
|
+
#
|
85
|
+
# Specify a block which will select which links to follow on each page.
|
86
|
+
# The block should return an Array of URI objects.
|
87
|
+
#
|
88
|
+
def focus_crawl(&block)
|
89
|
+
@focus_crawl_block = block
|
90
|
+
self
|
91
|
+
end
|
92
|
+
|
83
93
|
#
|
84
94
|
# Perform the crawl
|
85
95
|
#
|
86
96
|
def run
|
97
|
+
@urls.delete_if { |url| !visit_link?(url) }
|
98
|
+
return if @urls.empty?
|
99
|
+
|
87
100
|
link_queue = Queue.new
|
88
101
|
page_queue = Queue.new
|
89
102
|
|
@@ -91,28 +104,27 @@ module Anemone
|
|
91
104
|
@tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
|
92
105
|
end
|
93
106
|
|
94
|
-
|
95
|
-
|
96
|
-
link_queue.enq(@url)
|
107
|
+
@urls.each{ |url| link_queue.enq(url) }
|
97
108
|
|
98
|
-
|
109
|
+
loop do
|
99
110
|
page = page_queue.deq
|
100
111
|
|
101
112
|
@pages[page.url] = page
|
102
113
|
|
103
114
|
puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose
|
104
115
|
|
116
|
+
#perform the on_every_page blocks for this page
|
105
117
|
do_page_blocks(page)
|
106
118
|
|
107
119
|
page.doc = nil if Anemone.options.discard_page_bodies
|
108
120
|
|
109
|
-
page.
|
110
|
-
|
111
|
-
|
112
|
-
@pages[link] = nil
|
113
|
-
end
|
121
|
+
links_to_follow(page).each do |link|
|
122
|
+
link_queue.enq(link)
|
123
|
+
@pages[link] = nil
|
114
124
|
end
|
115
125
|
|
126
|
+
#create an entry in the page hash for each alias of this page,
|
127
|
+
#i.e. all the pages that redirected to this page
|
116
128
|
page.aliases.each do |aka|
|
117
129
|
if !@pages.has_key?(aka) or @pages[aka].nil?
|
118
130
|
@pages[aka] = page.alias_clone(aka)
|
@@ -165,6 +177,16 @@ module Anemone
|
|
165
177
|
end
|
166
178
|
end
|
167
179
|
|
180
|
+
#
|
181
|
+
# Return an Array of links to follow from the given page.
|
182
|
+
# Based on whether or not the link has already been crawled,
|
183
|
+
# and the block given to focus_crawl()
|
184
|
+
#
|
185
|
+
def links_to_follow(page)
|
186
|
+
links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
|
187
|
+
links.find_all { |link| visit_link?(link) }
|
188
|
+
end
|
189
|
+
|
168
190
|
#
|
169
191
|
# Returns +true+ if *link* has not been visited already,
|
170
192
|
# and is not excluded by a skip_link pattern. Returns
|
data/lib/anemone/page.rb
CHANGED
@@ -9,12 +9,12 @@ module Anemone
|
|
9
9
|
attr_reader :url
|
10
10
|
# Array of distinct A tag HREFs from the page
|
11
11
|
attr_reader :links
|
12
|
-
#
|
13
|
-
attr_reader :
|
12
|
+
# Headers of the HTTP response
|
13
|
+
attr_reader :headers
|
14
14
|
|
15
|
-
#OpenStruct for user-stored data
|
15
|
+
# OpenStruct for user-stored data
|
16
16
|
attr_accessor :data
|
17
|
-
#Nokogiri document for the HTML body
|
17
|
+
# Nokogiri document for the HTML body
|
18
18
|
attr_accessor :doc
|
19
19
|
# Integer response code of the page
|
20
20
|
attr_accessor :code
|
@@ -39,7 +39,7 @@ module Anemone
|
|
39
39
|
aka = location
|
40
40
|
end
|
41
41
|
|
42
|
-
return Page.new(url, response.body, code, response
|
42
|
+
return Page.new(url, response.body, code, response.to_hash, aka)
|
43
43
|
rescue
|
44
44
|
return Page.new(url)
|
45
45
|
end
|
@@ -48,10 +48,10 @@ module Anemone
|
|
48
48
|
#
|
49
49
|
# Create a new page
|
50
50
|
#
|
51
|
-
def initialize(url, body = nil, code = nil,
|
51
|
+
def initialize(url, body = nil, code = nil, headers = nil, aka = nil)
|
52
52
|
@url = url
|
53
53
|
@code = code
|
54
|
-
@
|
54
|
+
@headers = headers
|
55
55
|
@links = []
|
56
56
|
@aliases = []
|
57
57
|
@data = OpenStruct.new
|
@@ -119,6 +119,13 @@ module Anemone
|
|
119
119
|
end
|
120
120
|
end
|
121
121
|
|
122
|
+
#
|
123
|
+
# The content-type returned by the HTTP request for this page
|
124
|
+
#
|
125
|
+
def content_type
|
126
|
+
@headers['content-type'][0] rescue nil
|
127
|
+
end
|
128
|
+
|
122
129
|
#
|
123
130
|
# Returns +true+ if the page is a HTML document, returns +false+
|
124
131
|
# otherwise.
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: anemone
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.6
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Kite
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-07-
|
12
|
+
date: 2009-07-11 00:00:00 -05:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -35,23 +35,22 @@ extensions: []
|
|
35
35
|
extra_rdoc_files:
|
36
36
|
- README.rdoc
|
37
37
|
files:
|
38
|
+
- bin/anemone_url_list.rb
|
39
|
+
- bin/anemone_serialize.rb
|
40
|
+
- bin/anemone_pagedepth.rb
|
38
41
|
- bin/anemone_count.rb
|
39
42
|
- bin/anemone_cron.rb
|
40
|
-
-
|
41
|
-
-
|
42
|
-
-
|
43
|
-
- lib/anemone/anemone.rb
|
43
|
+
- lib/anemone.rb
|
44
|
+
- lib/anemone
|
45
|
+
- lib/anemone/page.rb
|
44
46
|
- lib/anemone/core.rb
|
47
|
+
- lib/anemone/anemone.rb
|
45
48
|
- lib/anemone/http.rb
|
46
|
-
- lib/anemone/page.rb
|
47
|
-
- lib/anemone/page_hash.rb
|
48
49
|
- lib/anemone/tentacle.rb
|
49
|
-
- lib/anemone.rb
|
50
|
+
- lib/anemone/page_hash.rb
|
50
51
|
- README.rdoc
|
51
52
|
has_rdoc: true
|
52
53
|
homepage: http://anemone.rubyforge.org
|
53
|
-
licenses: []
|
54
|
-
|
55
54
|
post_install_message:
|
56
55
|
rdoc_options:
|
57
56
|
- -m
|
@@ -75,9 +74,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
75
74
|
requirements: []
|
76
75
|
|
77
76
|
rubyforge_project: anemone
|
78
|
-
rubygems_version: 1.3.
|
77
|
+
rubygems_version: 1.3.1
|
79
78
|
signing_key:
|
80
|
-
specification_version:
|
79
|
+
specification_version: 2
|
81
80
|
summary: Anemone web-spider framework
|
82
81
|
test_files: []
|
83
82
|
|