anemone 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,16 +1,37 @@
1
+ require 'ostruct'
1
2
  require 'anemone/core'
2
3
 
3
4
  module Anemone
4
5
  # Version number
5
- VERSION = '0.0.1'
6
+ VERSION = '0.0.2'
6
7
 
7
8
  # User-Agent string used for HTTP requests
8
9
  USER_AGENT = "Anemone/#{self::VERSION}"
9
10
 
11
+ #module-wide options
12
+ def Anemone.options=(options)
13
+ @options = options
14
+ end
15
+
16
+ def Anemone.options
17
+ @options
18
+ end
19
+
10
20
  #
11
21
  # Convenience method to start a crawl using Core
12
22
  #
13
23
  def Anemone.crawl(url, options = {}, &block)
14
- Core.crawl(url, options, &block)
24
+ Anemone.options = OpenStruct.new(options)
25
+
26
+ #by default, run 4 Tentacle threads to fetch pages
27
+ Anemone.options.threads ||= 4
28
+
29
+ #disable verbose output by default
30
+ Anemone.options.verbose ||= false
31
+
32
+ #by default, throw away the page response body after scanning it for links, to save memory
33
+ Anemone.options.discard_page_bodies ||= true
34
+
35
+ Core.crawl(url, &block)
15
36
  end
16
37
  end
data/lib/anemone/core.rb CHANGED
@@ -11,10 +11,9 @@ module Anemone
11
11
  #
12
12
  # Initialize the crawl with a starting *url*, *options*, and optional *block*
13
13
  #
14
- def initialize(url, options={}, &block)
14
+ def initialize(url, &block)
15
15
  url = URI(url) if url.is_a?(String)
16
16
  @url = url
17
- @options = options
18
17
  @tentacles = []
19
18
  @pages = PageHash.new
20
19
  @on_every_page_blocks = []
@@ -22,17 +21,14 @@ module Anemone
22
21
  @skip_link_patterns = []
23
22
  @after_crawl_blocks = []
24
23
 
25
- @options[:threads] ||= 4
26
- @options[:verbose] ||= false
27
-
28
24
  block.call(self) if block
29
25
  end
30
26
 
31
27
  #
32
28
  # Convenience method to start a new crawl
33
29
  #
34
- def self.crawl(root, options={}, &block)
35
- self.new(root, options) do |core|
30
+ def self.crawl(root, &block)
31
+ self.new(root) do |core|
36
32
  block.call(core) if block
37
33
  core.run
38
34
  core.do_after_crawl_blocks
@@ -91,7 +87,7 @@ module Anemone
91
87
  link_queue = Queue.new
92
88
  page_queue = Queue.new
93
89
 
94
- @options[:threads].times do |id|
90
+ Anemone.options.threads.times do |id|
95
91
  @tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
96
92
  end
97
93
 
@@ -104,7 +100,7 @@ module Anemone
104
100
 
105
101
  @pages[page.url] = page
106
102
 
107
- puts "#{page.url} Queue: #{link_queue.size}" if @options[:verbose]
103
+ puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose
108
104
 
109
105
  do_page_blocks(page)
110
106
 
data/lib/anemone/page.rb CHANGED
@@ -7,9 +7,13 @@ module Anemone
7
7
  attr_reader :url
8
8
  # Array of distinct A tag HREFs from the page
9
9
  attr_reader :links
10
- # Integer response code of the page
11
- attr_reader :code
10
+ #Body of the HTTP response
11
+ attr_reader :body
12
+ #Content-type of the HTTP response
13
+ attr_reader :content_type
12
14
 
15
+ # Integer response code of the page
16
+ attr_accessor :code
13
17
  # Array of redirect-aliases for the page
14
18
  attr_accessor :aliases
15
19
  # Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
@@ -31,7 +35,7 @@ module Anemone
31
35
  aka = location
32
36
  end
33
37
 
34
- return Page.new(url, response, code, aka)
38
+ return Page.new(url, response.body, code, response['Content-Type'], aka)
35
39
  rescue
36
40
  return Page.new(url)
37
41
  end
@@ -40,18 +44,19 @@ module Anemone
40
44
  #
41
45
  # Create a new page
42
46
  #
43
- def initialize(url, response = nil, code = nil, aka = nil)
47
+ def initialize(url, body = nil, code = nil, content_type = nil, aka = nil)
44
48
  @url = url
45
- @response = response
49
+ @body = body unless Anemone.options.discard_page_bodies
46
50
  @code = code
51
+ @content_type = content_type
47
52
  @links = []
48
53
  @aliases = []
49
54
 
50
55
  @aliases << aka if !aka.nil?
51
56
 
52
57
  #get a list of distinct links on the page, in absolute url form
53
- if @response and @response.body
54
- Hpricot(@response.body).search('a').each do |a|
58
+ if body
59
+ Hpricot(body).search('a').each do |a|
55
60
  u = a['href']
56
61
  next if u.nil?
57
62
 
@@ -75,7 +80,10 @@ module Anemone
75
80
  # with a 200 response code
76
81
  #
77
82
  def alias_clone(url)
78
- Page.new(url, @response, 200, @url)
83
+ p = clone
84
+ p.add_alias!(@aka) if !@aka.nil?
85
+ p.code = 200
86
+ p
79
87
  end
80
88
 
81
89
  #
@@ -99,27 +107,13 @@ module Anemone
99
107
  results.concat([link].concat(page_hash[link].aliases))
100
108
  end
101
109
  end
102
-
103
- #
104
- # Returns the response body for the page
105
- #
106
- def body
107
- @response.body
108
- end
109
-
110
- #
111
- # Returns the +Content-Type+ header for the page
112
- #
113
- def content_type
114
- @response['Content-Type']
115
- end
116
110
 
117
111
  #
118
112
  # Returns +true+ if the page is a HTML document, returns +false+
119
113
  # otherwise.
120
114
  #
121
115
  def html?
122
- (content_type =~ /text\/html/) == 0
116
+ (@content_type =~ /text\/html/) == 0
123
117
  end
124
118
 
125
119
  #
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: anemone
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Kite
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-04-14 00:00:00 -05:00
12
+ date: 2009-04-30 00:00:00 -05:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency