anemone 0.0.6 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,7 +3,7 @@ require 'anemone/core'
3
3
 
4
4
  module Anemone
5
5
  # Version number
6
- VERSION = '0.0.6'
6
+ VERSION = '0.1.0'
7
7
 
8
8
  # User-Agent string used for HTTP requests
9
9
  USER_AGENT = "Anemone/#{self::VERSION}"
@@ -20,7 +20,7 @@ module Anemone
20
20
  #
21
21
  # Convenience method to start a crawl using Core
22
22
  #
23
- def Anemone.crawl(url, options = {}, &block)
23
+ def Anemone.crawl(urls, options = {}, &block)
24
24
  Anemone.options = OpenStruct.new(options)
25
25
 
26
26
  #by default, run 4 Tentacle threads to fetch pages
@@ -32,6 +32,6 @@ module Anemone
32
32
  #by default, don't throw away the page response body after scanning it for links
33
33
  Anemone.options.discard_page_bodies ||= false
34
34
 
35
- Core.crawl(url, &block)
35
+ Core.crawl(urls, &block)
36
36
  end
37
37
  end
data/lib/anemone/core.rb CHANGED
@@ -9,12 +9,13 @@ module Anemone
9
9
  attr_reader :pages
10
10
 
11
11
  #
12
- # Initialize the crawl with a starting *url*, *options*, and optional *block*
12
+ # Initialize the crawl with starting *urls* (single URL or Array of URLs)
13
+ # and optional *block*
13
14
  #
14
- def initialize(url, &block)
15
- url = URI(url) if url.is_a?(String)
16
- @url = url
17
- @url.path = "/" if @url.path.empty?
15
+ def initialize(urls, &block)
16
+ @urls = [urls].flatten.map{ |url| URI(url) if url.is_a?(String) }
17
+ @urls.each{ |url| url.path = '/' if url.path.empty? }
18
+
18
19
  @tentacles = []
19
20
  @pages = PageHash.new
20
21
  @on_every_page_blocks = []
@@ -80,10 +81,22 @@ module Anemone
80
81
  self
81
82
  end
82
83
 
84
+ #
85
+ # Register a block that selects which links to follow on each page; the stored block is later called with the page being processed.
86
+ # The block should return an Array of URI objects. Returns self.
87
+ #
88
+ def focus_crawl(&block)
89
+ @focus_crawl_block = block
90
+ self
91
+ end
92
+
83
93
  #
84
94
  # Perform the crawl
85
95
  #
86
96
  def run
97
+ @urls.delete_if { |url| !visit_link?(url) }
98
+ return if @urls.empty?
99
+
87
100
  link_queue = Queue.new
88
101
  page_queue = Queue.new
89
102
 
@@ -91,28 +104,27 @@ module Anemone
91
104
  @tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
92
105
  end
93
106
 
94
- return if !visit_link?(@url)
95
-
96
- link_queue.enq(@url)
107
+ @urls.each{ |url| link_queue.enq(url) }
97
108
 
98
- while true do
109
+ loop do
99
110
  page = page_queue.deq
100
111
 
101
112
  @pages[page.url] = page
102
113
 
103
114
  puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose
104
115
 
116
+ #perform the on_every_page blocks for this page
105
117
  do_page_blocks(page)
106
118
 
107
119
  page.doc = nil if Anemone.options.discard_page_bodies
108
120
 
109
- page.links.each do |link|
110
- if visit_link?(link)
111
- link_queue.enq(link)
112
- @pages[link] = nil
113
- end
121
+ links_to_follow(page).each do |link|
122
+ link_queue.enq(link)
123
+ @pages[link] = nil
114
124
  end
115
125
 
126
+ #create an entry in the page hash for each alias of this page,
127
+ #i.e. all the pages that redirected to this page
116
128
  page.aliases.each do |aka|
117
129
  if !@pages.has_key?(aka) or @pages[aka].nil?
118
130
  @pages[aka] = page.alias_clone(aka)
@@ -165,6 +177,16 @@ module Anemone
165
177
  end
166
178
  end
167
179
 
180
#
# Return an Array of links to follow from the given *page*.
#
# Candidate links come from the block given to focus_crawl() when one
# has been registered (it is called with *page*), otherwise from
# page.links. Each candidate is then kept only if visit_link? accepts it.
#
# A focus block that returns nil is treated as "follow nothing", and a
# single link is treated as a one-element list. Duplicates are removed
# because visit_link? cannot see links selected earlier in the same
# batch, so a repeated link would otherwise be queued more than once.
#
def links_to_follow(page)
  candidates = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
  Array(candidates).uniq.select { |link| visit_link?(link) }
end
189
+
168
190
  #
169
191
  # Returns +true+ if *link* has not been visited already,
170
192
  # and is not excluded by a skip_link pattern. Returns
data/lib/anemone/page.rb CHANGED
@@ -9,12 +9,12 @@ module Anemone
9
9
  attr_reader :url
10
10
  # Array of distinct A tag HREFs from the page
11
11
  attr_reader :links
12
- #Content-type of the HTTP response
13
- attr_reader :content_type
12
+ # Headers of the HTTP response
13
+ attr_reader :headers
14
14
 
15
- #OpenStruct for user-stored data
15
+ # OpenStruct for user-stored data
16
16
  attr_accessor :data
17
- #Nokogiri document for the HTML body
17
+ # Nokogiri document for the HTML body
18
18
  attr_accessor :doc
19
19
  # Integer response code of the page
20
20
  attr_accessor :code
@@ -39,7 +39,7 @@ module Anemone
39
39
  aka = location
40
40
  end
41
41
 
42
- return Page.new(url, response.body, code, response['Content-Type'], aka)
42
+ return Page.new(url, response.body, code, response.to_hash, aka)
43
43
  rescue
44
44
  return Page.new(url)
45
45
  end
@@ -48,10 +48,10 @@ module Anemone
48
48
  #
49
49
  # Create a new page
50
50
  #
51
- def initialize(url, body = nil, code = nil, content_type = nil, aka = nil)
51
+ def initialize(url, body = nil, code = nil, headers = nil, aka = nil)
52
52
  @url = url
53
53
  @code = code
54
- @content_type = content_type
54
+ @headers = headers
55
55
  @links = []
56
56
  @aliases = []
57
57
  @data = OpenStruct.new
@@ -119,6 +119,13 @@ module Anemone
119
119
  end
120
120
  end
121
121
 
122
#
# The content-type returned by the HTTP request for this page.
#
# Reads the 'content-type' entry of the stored response headers and
# returns its first value. Returns nil when no headers were stored
# (for example when the page was created without a response) or when
# the header is absent. A header stored as a plain String is returned
# whole rather than indexed by character.
#
def content_type
  # Guard explicitly instead of a blanket `rescue nil`, so unrelated
  # errors are not silently swallowed.
  Array(@headers['content-type']).first if @headers.respond_to?(:key?)
end
128
+
122
129
  #
123
130
  # Returns +true+ if the page is a HTML document, returns +false+
124
131
  # otherwise.
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: anemone
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.6
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Kite
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-07-04 00:00:00 -05:00
12
+ date: 2009-07-11 00:00:00 -05:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -35,23 +35,22 @@ extensions: []
35
35
  extra_rdoc_files:
36
36
  - README.rdoc
37
37
  files:
38
+ - bin/anemone_url_list.rb
39
+ - bin/anemone_serialize.rb
40
+ - bin/anemone_pagedepth.rb
38
41
  - bin/anemone_count.rb
39
42
  - bin/anemone_cron.rb
40
- - bin/anemone_pagedepth.rb
41
- - bin/anemone_serialize.rb
42
- - bin/anemone_url_list.rb
43
- - lib/anemone/anemone.rb
43
+ - lib/anemone.rb
44
+ - lib/anemone
45
+ - lib/anemone/page.rb
44
46
  - lib/anemone/core.rb
47
+ - lib/anemone/anemone.rb
45
48
  - lib/anemone/http.rb
46
- - lib/anemone/page.rb
47
- - lib/anemone/page_hash.rb
48
49
  - lib/anemone/tentacle.rb
49
- - lib/anemone.rb
50
+ - lib/anemone/page_hash.rb
50
51
  - README.rdoc
51
52
  has_rdoc: true
52
53
  homepage: http://anemone.rubyforge.org
53
- licenses: []
54
-
55
54
  post_install_message:
56
55
  rdoc_options:
57
56
  - -m
@@ -75,9 +74,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
75
74
  requirements: []
76
75
 
77
76
  rubyforge_project: anemone
78
- rubygems_version: 1.3.4
77
+ rubygems_version: 1.3.1
79
78
  signing_key:
80
- specification_version: 3
79
+ specification_version: 2
81
80
  summary: Anemone web-spider framework
82
81
  test_files: []
83
82