anemone 0.0.6 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/anemone/anemone.rb +3 -3
- data/lib/anemone/core.rb +36 -14
- data/lib/anemone/page.rb +14 -7
- metadata +12 -13
data/lib/anemone/anemone.rb
CHANGED
@@ -3,7 +3,7 @@ require 'anemone/core'
|
|
3
3
|
|
4
4
|
module Anemone
|
5
5
|
# Version number
|
6
|
-
VERSION = '0.0.6'
|
6
|
+
VERSION = '0.1.0'
|
7
7
|
|
8
8
|
# User-Agent string used for HTTP requests
|
9
9
|
USER_AGENT = "Anemone/#{self::VERSION}"
|
@@ -20,7 +20,7 @@ module Anemone
|
|
20
20
|
#
|
21
21
|
# Convenience method to start a crawl using Core
|
22
22
|
#
|
23
|
-
def Anemone.crawl(
|
23
|
+
def Anemone.crawl(urls, options = {}, &block)
|
24
24
|
Anemone.options = OpenStruct.new(options)
|
25
25
|
|
26
26
|
#by default, run 4 Tentacle threads to fetch pages
|
@@ -32,6 +32,6 @@ module Anemone
|
|
32
32
|
#by default, don't throw away the page response body after scanning it for links
|
33
33
|
Anemone.options.discard_page_bodies ||= false
|
34
34
|
|
35
|
-
Core.crawl(
|
35
|
+
Core.crawl(urls, &block)
|
36
36
|
end
|
37
37
|
end
|
data/lib/anemone/core.rb
CHANGED
@@ -9,12 +9,13 @@ module Anemone
|
|
9
9
|
attr_reader :pages
|
10
10
|
|
11
11
|
#
|
12
|
-
# Initialize the crawl with
|
12
|
+
# Initialize the crawl with starting *urls* (single URL or Array of URLs)
|
13
|
+
# and optional *block*
|
13
14
|
#
|
14
|
-
def initialize(
|
15
|
-
|
16
|
-
@url = url
|
17
|
-
|
15
|
+
def initialize(urls, &block)
|
16
|
+
@urls = [urls].flatten.map{ |url| URI(url) if url.is_a?(String) }
|
17
|
+
@urls.each{ |url| url.path = '/' if url.path.empty? }
|
18
|
+
|
18
19
|
@tentacles = []
|
19
20
|
@pages = PageHash.new
|
20
21
|
@on_every_page_blocks = []
|
@@ -80,10 +81,22 @@ module Anemone
|
|
80
81
|
self
|
81
82
|
end
|
82
83
|
|
84
|
+
#
|
85
|
+
# Specify a block which will select which links to follow on each page.
|
86
|
+
# The block should return an Array of URI objects.
|
87
|
+
#
|
88
|
+
def focus_crawl(&block)
|
89
|
+
@focus_crawl_block = block
|
90
|
+
self
|
91
|
+
end
|
92
|
+
|
83
93
|
#
|
84
94
|
# Perform the crawl
|
85
95
|
#
|
86
96
|
def run
|
97
|
+
@urls.delete_if { |url| !visit_link?(url) }
|
98
|
+
return if @urls.empty?
|
99
|
+
|
87
100
|
link_queue = Queue.new
|
88
101
|
page_queue = Queue.new
|
89
102
|
|
@@ -91,28 +104,27 @@ module Anemone
|
|
91
104
|
@tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
|
92
105
|
end
|
93
106
|
|
94
|
-
|
95
|
-
|
96
|
-
link_queue.enq(@url)
|
107
|
+
@urls.each{ |url| link_queue.enq(url) }
|
97
108
|
|
98
|
-
|
109
|
+
loop do
|
99
110
|
page = page_queue.deq
|
100
111
|
|
101
112
|
@pages[page.url] = page
|
102
113
|
|
103
114
|
puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose
|
104
115
|
|
116
|
+
#perform the on_every_page blocks for this page
|
105
117
|
do_page_blocks(page)
|
106
118
|
|
107
119
|
page.doc = nil if Anemone.options.discard_page_bodies
|
108
120
|
|
109
|
-
page.
|
110
|
-
|
111
|
-
|
112
|
-
@pages[link] = nil
|
113
|
-
end
|
121
|
+
links_to_follow(page).each do |link|
|
122
|
+
link_queue.enq(link)
|
123
|
+
@pages[link] = nil
|
114
124
|
end
|
115
125
|
|
126
|
+
#create an entry in the page hash for each alias of this page,
|
127
|
+
#i.e. all the pages that redirected to this page
|
116
128
|
page.aliases.each do |aka|
|
117
129
|
if !@pages.has_key?(aka) or @pages[aka].nil?
|
118
130
|
@pages[aka] = page.alias_clone(aka)
|
@@ -165,6 +177,16 @@ module Anemone
|
|
165
177
|
end
|
166
178
|
end
|
167
179
|
|
180
|
+
#
|
181
|
+
# Return an Array of links to follow from the given page.
|
182
|
+
# Based on whether or not the link has already been crawled,
|
183
|
+
# and the block given to focus_crawl()
|
184
|
+
#
|
185
|
+
def links_to_follow(page)
|
186
|
+
links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
|
187
|
+
links.find_all { |link| visit_link?(link) }
|
188
|
+
end
|
189
|
+
|
168
190
|
#
|
169
191
|
# Returns +true+ if *link* has not been visited already,
|
170
192
|
# and is not excluded by a skip_link pattern. Returns
|
data/lib/anemone/page.rb
CHANGED
@@ -9,12 +9,12 @@ module Anemone
|
|
9
9
|
attr_reader :url
|
10
10
|
# Array of distinct A tag HREFs from the page
|
11
11
|
attr_reader :links
|
12
|
-
#
|
13
|
-
attr_reader :
|
12
|
+
# Headers of the HTTP response
|
13
|
+
attr_reader :headers
|
14
14
|
|
15
|
-
#OpenStruct for user-stored data
|
15
|
+
# OpenStruct for user-stored data
|
16
16
|
attr_accessor :data
|
17
|
-
#Nokogiri document for the HTML body
|
17
|
+
# Nokogiri document for the HTML body
|
18
18
|
attr_accessor :doc
|
19
19
|
# Integer response code of the page
|
20
20
|
attr_accessor :code
|
@@ -39,7 +39,7 @@ module Anemone
|
|
39
39
|
aka = location
|
40
40
|
end
|
41
41
|
|
42
|
-
return Page.new(url, response.body, code, response
|
42
|
+
return Page.new(url, response.body, code, response.to_hash, aka)
|
43
43
|
rescue
|
44
44
|
return Page.new(url)
|
45
45
|
end
|
@@ -48,10 +48,10 @@ module Anemone
|
|
48
48
|
#
|
49
49
|
# Create a new page
|
50
50
|
#
|
51
|
-
def initialize(url, body = nil, code = nil,
|
51
|
+
def initialize(url, body = nil, code = nil, headers = nil, aka = nil)
|
52
52
|
@url = url
|
53
53
|
@code = code
|
54
|
-
@
|
54
|
+
@headers = headers
|
55
55
|
@links = []
|
56
56
|
@aliases = []
|
57
57
|
@data = OpenStruct.new
|
@@ -119,6 +119,13 @@ module Anemone
|
|
119
119
|
end
|
120
120
|
end
|
121
121
|
|
122
|
+
#
|
123
|
+
# The content-type returned by the HTTP request for this page
|
124
|
+
#
|
125
|
+
def content_type
|
126
|
+
@headers['content-type'][0] rescue nil
|
127
|
+
end
|
128
|
+
|
122
129
|
#
|
123
130
|
# Returns +true+ if the page is a HTML document, returns +false+
|
124
131
|
# otherwise.
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: anemone
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.6
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Kite
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-07-
|
12
|
+
date: 2009-07-11 00:00:00 -05:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -35,23 +35,22 @@ extensions: []
|
|
35
35
|
extra_rdoc_files:
|
36
36
|
- README.rdoc
|
37
37
|
files:
|
38
|
+
- bin/anemone_url_list.rb
|
39
|
+
- bin/anemone_serialize.rb
|
40
|
+
- bin/anemone_pagedepth.rb
|
38
41
|
- bin/anemone_count.rb
|
39
42
|
- bin/anemone_cron.rb
|
40
|
-
-
|
41
|
-
-
|
42
|
-
-
|
43
|
-
- lib/anemone/anemone.rb
|
43
|
+
- lib/anemone.rb
|
44
|
+
- lib/anemone
|
45
|
+
- lib/anemone/page.rb
|
44
46
|
- lib/anemone/core.rb
|
47
|
+
- lib/anemone/anemone.rb
|
45
48
|
- lib/anemone/http.rb
|
46
|
-
- lib/anemone/page.rb
|
47
|
-
- lib/anemone/page_hash.rb
|
48
49
|
- lib/anemone/tentacle.rb
|
49
|
-
- lib/anemone.rb
|
50
|
+
- lib/anemone/page_hash.rb
|
50
51
|
- README.rdoc
|
51
52
|
has_rdoc: true
|
52
53
|
homepage: http://anemone.rubyforge.org
|
53
|
-
licenses: []
|
54
|
-
|
55
54
|
post_install_message:
|
56
55
|
rdoc_options:
|
57
56
|
- -m
|
@@ -75,9 +74,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
75
74
|
requirements: []
|
76
75
|
|
77
76
|
rubyforge_project: anemone
|
78
|
-
rubygems_version: 1.3.
|
77
|
+
rubygems_version: 1.3.1
|
79
78
|
signing_key:
|
80
|
-
specification_version:
|
79
|
+
specification_version: 2
|
81
80
|
summary: Anemone web-spider framework
|
82
81
|
test_files: []
|
83
82
|
|