anemone 0.0.1 → 0.0.2

This diff compares the contents of two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
@@ -1,16 +1,37 @@
1
+ require 'ostruct'
1
2
  require 'anemone/core'
2
3
 
3
4
  module Anemone
4
5
  # Version number
5
- VERSION = '0.0.1'
6
+ VERSION = '0.0.2'
6
7
 
7
8
  # User-Agent string used for HTTP requests
8
9
  USER_AGENT = "Anemone/#{self::VERSION}"
9
10
 
11
+ #module-wide options
12
+ def Anemone.options=(options)
13
+ @options = options
14
+ end
15
+
16
+ def Anemone.options
17
+ @options
18
+ end
19
+
10
20
  #
11
21
  # Convenience method to start a crawl using Core
12
22
  #
13
23
  def Anemone.crawl(url, options = {}, &block)
14
- Core.crawl(url, options, &block)
24
+ Anemone.options = OpenStruct.new(options)
25
+
26
+ #by default, run 4 Tentacle threads to fetch pages
27
+ Anemone.options.threads ||= 4
28
+
29
+ #disable verbose output by default
30
+ Anemone.options.verbose ||= false
31
+
32
+ #by default, throw away the page response body after scanning it for links, to save memory
33
+ Anemone.options.discard_page_bodies ||= true
34
+
35
+ Core.crawl(url, &block)
15
36
  end
16
37
  end
data/lib/anemone/core.rb CHANGED
@@ -11,10 +11,9 @@ module Anemone
11
11
  #
12
12
  # Initialize the crawl with a starting *url*, *options*, and optional *block*
13
13
  #
14
- def initialize(url, options={}, &block)
14
+ def initialize(url, &block)
15
15
  url = URI(url) if url.is_a?(String)
16
16
  @url = url
17
- @options = options
18
17
  @tentacles = []
19
18
  @pages = PageHash.new
20
19
  @on_every_page_blocks = []
@@ -22,17 +21,14 @@ module Anemone
22
21
  @skip_link_patterns = []
23
22
  @after_crawl_blocks = []
24
23
 
25
- @options[:threads] ||= 4
26
- @options[:verbose] ||= false
27
-
28
24
  block.call(self) if block
29
25
  end
30
26
 
31
27
  #
32
28
  # Convenience method to start a new crawl
33
29
  #
34
- def self.crawl(root, options={}, &block)
35
- self.new(root, options) do |core|
30
+ def self.crawl(root, &block)
31
+ self.new(root) do |core|
36
32
  block.call(core) if block
37
33
  core.run
38
34
  core.do_after_crawl_blocks
@@ -91,7 +87,7 @@ module Anemone
91
87
  link_queue = Queue.new
92
88
  page_queue = Queue.new
93
89
 
94
- @options[:threads].times do |id|
90
+ Anemone.options.threads.times do |id|
95
91
  @tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
96
92
  end
97
93
 
@@ -104,7 +100,7 @@ module Anemone
104
100
 
105
101
  @pages[page.url] = page
106
102
 
107
- puts "#{page.url} Queue: #{link_queue.size}" if @options[:verbose]
103
+ puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose
108
104
 
109
105
  do_page_blocks(page)
110
106
 
data/lib/anemone/page.rb CHANGED
@@ -7,9 +7,13 @@ module Anemone
7
7
  attr_reader :url
8
8
  # Array of distinct A tag HREFs from the page
9
9
  attr_reader :links
10
- # Integer response code of the page
11
- attr_reader :code
10
+ #Body of the HTTP response
11
+ attr_reader :body
12
+ #Content-type of the HTTP response
13
+ attr_reader :content_type
12
14
 
15
+ # Integer response code of the page
16
+ attr_accessor :code
13
17
  # Array of redirect-aliases for the page
14
18
  attr_accessor :aliases
15
19
  # Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
@@ -31,7 +35,7 @@ module Anemone
31
35
  aka = location
32
36
  end
33
37
 
34
- return Page.new(url, response, code, aka)
38
+ return Page.new(url, response.body, code, response['Content-Type'], aka)
35
39
  rescue
36
40
  return Page.new(url)
37
41
  end
@@ -40,18 +44,19 @@ module Anemone
40
44
  #
41
45
  # Create a new page
42
46
  #
43
- def initialize(url, response = nil, code = nil, aka = nil)
47
+ def initialize(url, body = nil, code = nil, content_type = nil, aka = nil)
44
48
  @url = url
45
- @response = response
49
+ @body = body unless Anemone.options.discard_page_bodies
46
50
  @code = code
51
+ @content_type = content_type
47
52
  @links = []
48
53
  @aliases = []
49
54
 
50
55
  @aliases << aka if !aka.nil?
51
56
 
52
57
  #get a list of distinct links on the page, in absolute url form
53
- if @response and @response.body
54
- Hpricot(@response.body).search('a').each do |a|
58
+ if body
59
+ Hpricot(body).search('a').each do |a|
55
60
  u = a['href']
56
61
  next if u.nil?
57
62
 
@@ -75,7 +80,10 @@ module Anemone
75
80
  # with a 200 response code
76
81
  #
77
82
  def alias_clone(url)
78
- Page.new(url, @response, 200, @url)
83
+ p = clone
84
+ p.add_alias!(@aka) if !@aka.nil?
85
+ p.code = 200
86
+ p
79
87
  end
80
88
 
81
89
  #
@@ -99,27 +107,13 @@ module Anemone
99
107
  results.concat([link].concat(page_hash[link].aliases))
100
108
  end
101
109
  end
102
-
103
- #
104
- # Returns the response body for the page
105
- #
106
- def body
107
- @response.body
108
- end
109
-
110
- #
111
- # Returns the +Content-Type+ header for the page
112
- #
113
- def content_type
114
- @response['Content-Type']
115
- end
116
110
 
117
111
  #
118
112
  # Returns +true+ if the page is a HTML document, returns +false+
119
113
  # otherwise.
120
114
  #
121
115
  def html?
122
- (content_type =~ /text\/html/) == 0
116
+ (@content_type =~ /text\/html/) == 0
123
117
  end
124
118
 
125
119
  #
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: anemone
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Kite
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-04-14 00:00:00 -05:00
12
+ date: 2009-04-30 00:00:00 -05:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency