anemone 0.0.6 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -3,7 +3,7 @@ require 'anemone/core'
3
3
 
4
4
  module Anemone
5
5
  # Version number
6
- VERSION = '0.0.6'
6
+ VERSION = '0.1.0'
7
7
 
8
8
  # User-Agent string used for HTTP requests
9
9
  USER_AGENT = "Anemone/#{self::VERSION}"
@@ -20,7 +20,7 @@ module Anemone
20
20
  #
21
21
  # Convenience method to start a crawl using Core
22
22
  #
23
- def Anemone.crawl(url, options = {}, &block)
23
+ def Anemone.crawl(urls, options = {}, &block)
24
24
  Anemone.options = OpenStruct.new(options)
25
25
 
26
26
  #by default, run 4 Tentacle threads to fetch pages
@@ -32,6 +32,6 @@ module Anemone
32
32
  #by default, don't throw away the page response body after scanning it for links
33
33
  Anemone.options.discard_page_bodies ||= false
34
34
 
35
- Core.crawl(url, &block)
35
+ Core.crawl(urls, &block)
36
36
  end
37
37
  end
data/lib/anemone/core.rb CHANGED
@@ -9,12 +9,13 @@ module Anemone
9
9
  attr_reader :pages
10
10
 
11
11
  #
12
- # Initialize the crawl with a starting *url*, *options*, and optional *block*
12
+ # Initialize the crawl with starting *urls* (single URL or Array of URLs)
13
+ # and optional *block*
13
14
  #
14
- def initialize(url, &block)
15
- url = URI(url) if url.is_a?(String)
16
- @url = url
17
- @url.path = "/" if @url.path.empty?
15
+ def initialize(urls, &block)
16
+ @urls = [urls].flatten.map{ |url| URI(url) if url.is_a?(String) }
17
+ @urls.each{ |url| url.path = '/' if url.path.empty? }
18
+
18
19
  @tentacles = []
19
20
  @pages = PageHash.new
20
21
  @on_every_page_blocks = []
@@ -80,10 +81,22 @@ module Anemone
80
81
  self
81
82
  end
82
83
 
84
+ #
85
+ # Specify a block which will select which links to follow on each page.
86
+ # The block should return an Array of URI objects.
87
+ #
88
+ def focus_crawl(&block)
89
+ @focus_crawl_block = block
90
+ self
91
+ end
92
+
83
93
  #
84
94
  # Perform the crawl
85
95
  #
86
96
  def run
97
+ @urls.delete_if { |url| !visit_link?(url) }
98
+ return if @urls.empty?
99
+
87
100
  link_queue = Queue.new
88
101
  page_queue = Queue.new
89
102
 
@@ -91,28 +104,27 @@ module Anemone
91
104
  @tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
92
105
  end
93
106
 
94
- return if !visit_link?(@url)
95
-
96
- link_queue.enq(@url)
107
+ @urls.each{ |url| link_queue.enq(url) }
97
108
 
98
- while true do
109
+ loop do
99
110
  page = page_queue.deq
100
111
 
101
112
  @pages[page.url] = page
102
113
 
103
114
  puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose
104
115
 
116
+ #perform the on_every_page blocks for this page
105
117
  do_page_blocks(page)
106
118
 
107
119
  page.doc = nil if Anemone.options.discard_page_bodies
108
120
 
109
- page.links.each do |link|
110
- if visit_link?(link)
111
- link_queue.enq(link)
112
- @pages[link] = nil
113
- end
121
+ links_to_follow(page).each do |link|
122
+ link_queue.enq(link)
123
+ @pages[link] = nil
114
124
  end
115
125
 
126
+ #create an entry in the page hash for each alias of this page,
127
+ #i.e. all the pages that redirected to this page
116
128
  page.aliases.each do |aka|
117
129
  if !@pages.has_key?(aka) or @pages[aka].nil?
118
130
  @pages[aka] = page.alias_clone(aka)
@@ -165,6 +177,16 @@ module Anemone
165
177
  end
166
178
  end
167
179
 
180
+ #
181
+ # Return an Array of links to follow from the given page.
182
+ # Based on whether or not the link has already been crawled,
183
+ # and the block given to focus_crawl()
184
+ #
185
+ def links_to_follow(page)
186
+ links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
187
+ links.find_all { |link| visit_link?(link) }
188
+ end
189
+
168
190
  #
169
191
  # Returns +true+ if *link* has not been visited already,
170
192
  # and is not excluded by a skip_link pattern. Returns
data/lib/anemone/page.rb CHANGED
@@ -9,12 +9,12 @@ module Anemone
9
9
  attr_reader :url
10
10
  # Array of distinct A tag HREFs from the page
11
11
  attr_reader :links
12
- #Content-type of the HTTP response
13
- attr_reader :content_type
12
+ # Headers of the HTTP response
13
+ attr_reader :headers
14
14
 
15
- #OpenStruct for user-stored data
15
+ # OpenStruct for user-stored data
16
16
  attr_accessor :data
17
- #Nokogiri document for the HTML body
17
+ # Nokogiri document for the HTML body
18
18
  attr_accessor :doc
19
19
  # Integer response code of the page
20
20
  attr_accessor :code
@@ -39,7 +39,7 @@ module Anemone
39
39
  aka = location
40
40
  end
41
41
 
42
- return Page.new(url, response.body, code, response['Content-Type'], aka)
42
+ return Page.new(url, response.body, code, response.to_hash, aka)
43
43
  rescue
44
44
  return Page.new(url)
45
45
  end
@@ -48,10 +48,10 @@ module Anemone
48
48
  #
49
49
  # Create a new page
50
50
  #
51
- def initialize(url, body = nil, code = nil, content_type = nil, aka = nil)
51
+ def initialize(url, body = nil, code = nil, headers = nil, aka = nil)
52
52
  @url = url
53
53
  @code = code
54
- @content_type = content_type
54
+ @headers = headers
55
55
  @links = []
56
56
  @aliases = []
57
57
  @data = OpenStruct.new
@@ -119,6 +119,13 @@ module Anemone
119
119
  end
120
120
  end
121
121
 
122
+ #
123
+ # The content-type returned by the HTTP request for this page
124
+ #
125
+ def content_type
126
+ @headers['content-type'][0] rescue nil
127
+ end
128
+
122
129
  #
123
130
  # Returns +true+ if the page is a HTML document, returns +false+
124
131
  # otherwise.
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: anemone
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.6
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Kite
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-07-04 00:00:00 -05:00
12
+ date: 2009-07-11 00:00:00 -05:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -35,23 +35,22 @@ extensions: []
35
35
  extra_rdoc_files:
36
36
  - README.rdoc
37
37
  files:
38
+ - bin/anemone_url_list.rb
39
+ - bin/anemone_serialize.rb
40
+ - bin/anemone_pagedepth.rb
38
41
  - bin/anemone_count.rb
39
42
  - bin/anemone_cron.rb
40
- - bin/anemone_pagedepth.rb
41
- - bin/anemone_serialize.rb
42
- - bin/anemone_url_list.rb
43
- - lib/anemone/anemone.rb
43
+ - lib/anemone.rb
44
+ - lib/anemone
45
+ - lib/anemone/page.rb
44
46
  - lib/anemone/core.rb
47
+ - lib/anemone/anemone.rb
45
48
  - lib/anemone/http.rb
46
- - lib/anemone/page.rb
47
- - lib/anemone/page_hash.rb
48
49
  - lib/anemone/tentacle.rb
49
- - lib/anemone.rb
50
+ - lib/anemone/page_hash.rb
50
51
  - README.rdoc
51
52
  has_rdoc: true
52
53
  homepage: http://anemone.rubyforge.org
53
- licenses: []
54
-
55
54
  post_install_message:
56
55
  rdoc_options:
57
56
  - -m
@@ -75,9 +74,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
75
74
  requirements: []
76
75
 
77
76
  rubyforge_project: anemone
78
- rubygems_version: 1.3.4
77
+ rubygems_version: 1.3.1
79
78
  signing_key:
80
- specification_version: 3
79
+ specification_version: 2
81
80
  summary: Anemone web-spider framework
82
81
  test_files: []
83
82