anemone 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/anemone/anemone.rb +23 -2
- data/lib/anemone/core.rb +5 -9
- data/lib/anemone/page.rb +17 -23
- metadata +2 -2
data/lib/anemone/anemone.rb
CHANGED
@@ -1,16 +1,37 @@
|
|
1
|
+
require 'ostruct'
|
1
2
|
require 'anemone/core'
|
2
3
|
|
3
4
|
module Anemone
|
4
5
|
# Version number
|
5
|
-
VERSION = '0.0.1'
|
6
|
+
VERSION = '0.0.2'
|
6
7
|
|
7
8
|
# User-Agent string used for HTTP requests
|
8
9
|
USER_AGENT = "Anemone/#{self::VERSION}"
|
9
10
|
|
11
|
+
#module-wide options
|
12
|
+
def Anemone.options=(options)
|
13
|
+
@options = options
|
14
|
+
end
|
15
|
+
|
16
|
+
def Anemone.options
|
17
|
+
@options
|
18
|
+
end
|
19
|
+
|
10
20
|
#
|
11
21
|
# Convenience method to start a crawl using Core
|
12
22
|
#
|
13
23
|
def Anemone.crawl(url, options = {}, &block)
|
14
|
-
|
24
|
+
Anemone.options = OpenStruct.new(options)
|
25
|
+
|
26
|
+
#by default, run 4 Tentacle threads to fetch pages
|
27
|
+
Anemone.options.threads ||= 4
|
28
|
+
|
29
|
+
#disable verbose output by default
|
30
|
+
Anemone.options.verbose ||= false
|
31
|
+
|
32
|
+
#by default, throw away the page response body after scanning it for links, to save memory
|
33
|
+
Anemone.options.discard_page_bodies ||= true
|
34
|
+
|
35
|
+
Core.crawl(url, &block)
|
15
36
|
end
|
16
37
|
end
|
data/lib/anemone/core.rb
CHANGED
@@ -11,10 +11,9 @@ module Anemone
|
|
11
11
|
#
|
12
12
|
# Initialize the crawl with a starting *url*, *options*, and optional *block*
|
13
13
|
#
|
14
|
-
def initialize(url, options = {}, &block)
|
14
|
+
def initialize(url, &block)
|
15
15
|
url = URI(url) if url.is_a?(String)
|
16
16
|
@url = url
|
17
|
-
@options = options
|
18
17
|
@tentacles = []
|
19
18
|
@pages = PageHash.new
|
20
19
|
@on_every_page_blocks = []
|
@@ -22,17 +21,14 @@ module Anemone
|
|
22
21
|
@skip_link_patterns = []
|
23
22
|
@after_crawl_blocks = []
|
24
23
|
|
25
|
-
@options[:threads] ||= 4
|
26
|
-
@options[:verbose] ||= false
|
27
|
-
|
28
24
|
block.call(self) if block
|
29
25
|
end
|
30
26
|
|
31
27
|
#
|
32
28
|
# Convenience method to start a new crawl
|
33
29
|
#
|
34
|
-
def self.crawl(root, options = {}, &block)
|
35
|
-
self.new(root, options) do |core|
|
30
|
+
def self.crawl(root, &block)
|
31
|
+
self.new(root) do |core|
|
36
32
|
block.call(core) if block
|
37
33
|
core.run
|
38
34
|
core.do_after_crawl_blocks
|
@@ -91,7 +87,7 @@ module Anemone
|
|
91
87
|
link_queue = Queue.new
|
92
88
|
page_queue = Queue.new
|
93
89
|
|
94
|
-
|
90
|
+
Anemone.options.threads.times do |id|
|
95
91
|
@tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
|
96
92
|
end
|
97
93
|
|
@@ -104,7 +100,7 @@ module Anemone
|
|
104
100
|
|
105
101
|
@pages[page.url] = page
|
106
102
|
|
107
|
-
puts "#{page.url} Queue: #{link_queue.size}" if @options[:verbose]
|
103
|
+
puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose
|
108
104
|
|
109
105
|
do_page_blocks(page)
|
110
106
|
|
data/lib/anemone/page.rb
CHANGED
@@ -7,9 +7,13 @@ module Anemone
|
|
7
7
|
attr_reader :url
|
8
8
|
# Array of distinct A tag HREFs from the page
|
9
9
|
attr_reader :links
|
10
|
-
|
11
|
-
|
10
|
+
#Body of the HTTP response
|
11
|
+
attr_reader :body
|
12
|
+
#Content-type of the HTTP response
|
13
|
+
attr_reader :content_type
|
12
14
|
|
15
|
+
# Integer response code of the page
|
16
|
+
attr_accessor :code
|
13
17
|
# Array of redirect-aliases for the page
|
14
18
|
attr_accessor :aliases
|
15
19
|
# Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
|
@@ -31,7 +35,7 @@ module Anemone
|
|
31
35
|
aka = location
|
32
36
|
end
|
33
37
|
|
34
|
-
return Page.new(url, response, code, aka)
|
38
|
+
return Page.new(url, response.body, code, response['Content-Type'], aka)
|
35
39
|
rescue
|
36
40
|
return Page.new(url)
|
37
41
|
end
|
@@ -40,18 +44,19 @@ module Anemone
|
|
40
44
|
#
|
41
45
|
# Create a new page
|
42
46
|
#
|
43
|
-
def initialize(url, response = nil, code = nil, aka = nil)
|
47
|
+
def initialize(url, body = nil, code = nil, content_type = nil, aka = nil)
|
44
48
|
@url = url
|
45
|
-
|
49
|
+
@body = body unless Anemone.options.discard_page_bodies
|
46
50
|
@code = code
|
51
|
+
@content_type = content_type
|
47
52
|
@links = []
|
48
53
|
@aliases = []
|
49
54
|
|
50
55
|
@aliases << aka if !aka.nil?
|
51
56
|
|
52
57
|
#get a list of distinct links on the page, in absolute url form
|
53
|
-
if
|
54
|
-
Hpricot(
|
58
|
+
if body
|
59
|
+
Hpricot(body).search('a').each do |a|
|
55
60
|
u = a['href']
|
56
61
|
next if u.nil?
|
57
62
|
|
@@ -75,7 +80,10 @@ module Anemone
|
|
75
80
|
# with a 200 response code
|
76
81
|
#
|
77
82
|
def alias_clone(url)
|
78
|
-
|
83
|
+
p = clone
|
84
|
+
p.add_alias!(@aka) if !@aka.nil?
|
85
|
+
p.code = 200
|
86
|
+
p
|
79
87
|
end
|
80
88
|
|
81
89
|
#
|
@@ -99,27 +107,13 @@ module Anemone
|
|
99
107
|
results.concat([link].concat(page_hash[link].aliases))
|
100
108
|
end
|
101
109
|
end
|
102
|
-
|
103
|
-
#
|
104
|
-
# Returns the response body for the page
|
105
|
-
#
|
106
|
-
def body
|
107
|
-
@response.body
|
108
|
-
end
|
109
|
-
|
110
|
-
#
|
111
|
-
# Returns the +Content-Type+ header for the page
|
112
|
-
#
|
113
|
-
def content_type
|
114
|
-
@response['Content-Type']
|
115
|
-
end
|
116
110
|
|
117
111
|
#
|
118
112
|
# Returns +true+ if the page is a HTML document, returns +false+
|
119
113
|
# otherwise.
|
120
114
|
#
|
121
115
|
def html?
|
122
|
-
(content_type =~ /text\/html/) == 0
|
116
|
+
(@content_type =~ /text\/html/) == 0
|
123
117
|
end
|
124
118
|
|
125
119
|
#
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: anemone
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Kite
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-04-
|
12
|
+
date: 2009-04-30 00:00:00 -05:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|