anemone 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/anemone/anemone.rb +23 -2
- data/lib/anemone/core.rb +5 -9
- data/lib/anemone/page.rb +17 -23
- metadata +2 -2
data/lib/anemone/anemone.rb
CHANGED
@@ -1,16 +1,37 @@
|
|
1
|
+
require 'ostruct'
|
1
2
|
require 'anemone/core'
|
2
3
|
|
3
4
|
module Anemone
|
4
5
|
# Version number
|
5
|
-
VERSION = '0.0.1'
|
6
|
+
VERSION = '0.0.2'
|
6
7
|
|
7
8
|
# User-Agent string used for HTTP requests
|
8
9
|
USER_AGENT = "Anemone/#{self::VERSION}"
|
9
10
|
|
11
|
+
#module-wide options
|
12
|
+
def Anemone.options=(options)
|
13
|
+
@options = options
|
14
|
+
end
|
15
|
+
|
16
|
+
def Anemone.options
|
17
|
+
@options
|
18
|
+
end
|
19
|
+
|
10
20
|
#
|
11
21
|
# Convenience method to start a crawl using Core
|
12
22
|
#
|
13
23
|
def Anemone.crawl(url, options = {}, &block)
|
14
|
-
|
24
|
+
Anemone.options = OpenStruct.new(options)
|
25
|
+
|
26
|
+
#by default, run 4 Tentacle threads to fetch pages
|
27
|
+
Anemone.options.threads ||= 4
|
28
|
+
|
29
|
+
#disable verbose output by default
|
30
|
+
Anemone.options.verbose ||= false
|
31
|
+
|
32
|
+
#by default, throw away the page response body after scanning it for links, to save memory
|
33
|
+
Anemone.options.discard_page_bodies ||= true
|
34
|
+
|
35
|
+
Core.crawl(url, &block)
|
15
36
|
end
|
16
37
|
end
|
data/lib/anemone/core.rb
CHANGED
@@ -11,10 +11,9 @@ module Anemone
|
|
11
11
|
#
|
12
12
|
# Initialize the crawl with a starting *url*, *options*, and optional *block*
|
13
13
|
#
|
14
|
-
def initialize(url,
|
14
|
+
def initialize(url, &block)
|
15
15
|
url = URI(url) if url.is_a?(String)
|
16
16
|
@url = url
|
17
|
-
@options = options
|
18
17
|
@tentacles = []
|
19
18
|
@pages = PageHash.new
|
20
19
|
@on_every_page_blocks = []
|
@@ -22,17 +21,14 @@ module Anemone
|
|
22
21
|
@skip_link_patterns = []
|
23
22
|
@after_crawl_blocks = []
|
24
23
|
|
25
|
-
@options[:threads] ||= 4
|
26
|
-
@options[:verbose] ||= false
|
27
|
-
|
28
24
|
block.call(self) if block
|
29
25
|
end
|
30
26
|
|
31
27
|
#
|
32
28
|
# Convenience method to start a new crawl
|
33
29
|
#
|
34
|
-
def self.crawl(root,
|
35
|
-
self.new(root
|
30
|
+
def self.crawl(root, &block)
|
31
|
+
self.new(root) do |core|
|
36
32
|
block.call(core) if block
|
37
33
|
core.run
|
38
34
|
core.do_after_crawl_blocks
|
@@ -91,7 +87,7 @@ module Anemone
|
|
91
87
|
link_queue = Queue.new
|
92
88
|
page_queue = Queue.new
|
93
89
|
|
94
|
-
|
90
|
+
Anemone.options.threads.times do |id|
|
95
91
|
@tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
|
96
92
|
end
|
97
93
|
|
@@ -104,7 +100,7 @@ module Anemone
|
|
104
100
|
|
105
101
|
@pages[page.url] = page
|
106
102
|
|
107
|
-
puts "#{page.url} Queue: #{link_queue.size}" if
|
103
|
+
puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose
|
108
104
|
|
109
105
|
do_page_blocks(page)
|
110
106
|
|
data/lib/anemone/page.rb
CHANGED
@@ -7,9 +7,13 @@ module Anemone
|
|
7
7
|
attr_reader :url
|
8
8
|
# Array of distinct A tag HREFs from the page
|
9
9
|
attr_reader :links
|
10
|
-
|
11
|
-
|
10
|
+
#Body of the HTTP response
|
11
|
+
attr_reader :body
|
12
|
+
#Content-type of the HTTP response
|
13
|
+
attr_reader :content_type
|
12
14
|
|
15
|
+
# Integer response code of the page
|
16
|
+
attr_accessor :code
|
13
17
|
# Array of redirect-aliases for the page
|
14
18
|
attr_accessor :aliases
|
15
19
|
# Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
|
@@ -31,7 +35,7 @@ module Anemone
|
|
31
35
|
aka = location
|
32
36
|
end
|
33
37
|
|
34
|
-
return Page.new(url, response, code, aka)
|
38
|
+
return Page.new(url, response.body, code, response['Content-Type'], aka)
|
35
39
|
rescue
|
36
40
|
return Page.new(url)
|
37
41
|
end
|
@@ -40,18 +44,19 @@ module Anemone
|
|
40
44
|
#
|
41
45
|
# Create a new page
|
42
46
|
#
|
43
|
-
def initialize(url,
|
47
|
+
def initialize(url, body = nil, code = nil, content_type = nil, aka = nil)
|
44
48
|
@url = url
|
45
|
-
|
49
|
+
@body = body unless Anemone.options.discard_page_bodies
|
46
50
|
@code = code
|
51
|
+
@content_type = content_type
|
47
52
|
@links = []
|
48
53
|
@aliases = []
|
49
54
|
|
50
55
|
@aliases << aka if !aka.nil?
|
51
56
|
|
52
57
|
#get a list of distinct links on the page, in absolute url form
|
53
|
-
if
|
54
|
-
Hpricot(
|
58
|
+
if body
|
59
|
+
Hpricot(body).search('a').each do |a|
|
55
60
|
u = a['href']
|
56
61
|
next if u.nil?
|
57
62
|
|
@@ -75,7 +80,10 @@ module Anemone
|
|
75
80
|
# with a 200 response code
|
76
81
|
#
|
77
82
|
def alias_clone(url)
|
78
|
-
|
83
|
+
p = clone
|
84
|
+
p.add_alias!(@aka) if !@aka.nil?
|
85
|
+
p.code = 200
|
86
|
+
p
|
79
87
|
end
|
80
88
|
|
81
89
|
#
|
@@ -99,27 +107,13 @@ module Anemone
|
|
99
107
|
results.concat([link].concat(page_hash[link].aliases))
|
100
108
|
end
|
101
109
|
end
|
102
|
-
|
103
|
-
#
|
104
|
-
# Returns the response body for the page
|
105
|
-
#
|
106
|
-
def body
|
107
|
-
@response.body
|
108
|
-
end
|
109
|
-
|
110
|
-
#
|
111
|
-
# Returns the +Content-Type+ header for the page
|
112
|
-
#
|
113
|
-
def content_type
|
114
|
-
@response['Content-Type']
|
115
|
-
end
|
116
110
|
|
117
111
|
#
|
118
112
|
# Returns +true+ if the page is a HTML document, returns +false+
|
119
113
|
# otherwise.
|
120
114
|
#
|
121
115
|
def html?
|
122
|
-
(content_type =~ /text\/html/) == 0
|
116
|
+
(@content_type =~ /text\/html/) == 0
|
123
117
|
end
|
124
118
|
|
125
119
|
#
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: anemone
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Kite
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-04-
|
12
|
+
date: 2009-04-30 00:00:00 -05:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|