anemone 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/README.txt +3 -2
- data/bin/anemone_url_list.rb +1 -1
- data/lib/anemone/anemone.rb +6 -6
- data/lib/anemone/core.rb +3 -1
- data/lib/anemone/page.rb +21 -34
- metadata +25 -15
- data/bin/anemone_url_list.rb~ +0 -58
data/README.txt
CHANGED
@@ -12,7 +12,8 @@ write your own specialized spider tasks quickly and easily.
|
|
12
12
|
* Allows exclusion of URLs based on regular expressions
|
13
13
|
|
14
14
|
== REQUIREMENTS
|
15
|
-
*
|
15
|
+
* nokogiri
|
16
|
+
* facets
|
16
17
|
|
17
18
|
== EXAMPLES
|
18
|
-
See the +bin+ directory for several examples of useful Anemone tasks.
|
19
|
+
See the +bin+ directory for several examples of useful Anemone tasks.
|
data/bin/anemone_url_list.rb
CHANGED
@@ -43,7 +43,7 @@ opts = OptionParser.new
|
|
43
43
|
opts.on('-r', '--relative') { options.relative = true }
|
44
44
|
opts.parse!(ARGV)
|
45
45
|
|
46
|
-
Anemone.crawl(ARGV.last) do |anemone|
|
46
|
+
Anemone.crawl(ARGV.last, :discard_page_bodies => true) do |anemone|
|
47
47
|
anemone.on_every_page do |page|
|
48
48
|
if options.relative
|
49
49
|
puts page.url.path
|
data/lib/anemone/anemone.rb
CHANGED
@@ -3,7 +3,7 @@ require 'anemone/core'
|
|
3
3
|
|
4
4
|
module Anemone
|
5
5
|
# Version number
|
6
|
-
VERSION = '0.0.3'
|
6
|
+
VERSION = '0.0.4'
|
7
7
|
|
8
8
|
# User-Agent string used for HTTP requests
|
9
9
|
USER_AGENT = "Anemone/#{self::VERSION}"
|
@@ -23,15 +23,15 @@ module Anemone
|
|
23
23
|
def Anemone.crawl(url, options = {}, &block)
|
24
24
|
Anemone.options = OpenStruct.new(options)
|
25
25
|
|
26
|
-
|
26
|
+
#by default, run 4 Tentacle threads to fetch pages
|
27
27
|
Anemone.options.threads ||= 4
|
28
28
|
|
29
|
-
|
29
|
+
#disable verbose output by default
|
30
30
|
Anemone.options.verbose ||= false
|
31
31
|
|
32
|
-
|
33
|
-
|
32
|
+
#by default, don't throw away the page response body after scanning it for links
|
33
|
+
Anemone.options.discard_page_bodies ||= false
|
34
34
|
|
35
35
|
Core.crawl(url, &block)
|
36
36
|
end
|
37
|
-
end
|
37
|
+
end
|
data/lib/anemone/core.rb
CHANGED
@@ -103,6 +103,8 @@ module Anemone
|
|
103
103
|
puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose
|
104
104
|
|
105
105
|
do_page_blocks(page)
|
106
|
+
|
107
|
+
page.doc = nil if Anemone.options.discard_page_bodies
|
106
108
|
|
107
109
|
page.links.each do |link|
|
108
110
|
if visit_link?(link)
|
@@ -131,7 +133,7 @@ module Anemone
|
|
131
133
|
end
|
132
134
|
|
133
135
|
end
|
134
|
-
|
136
|
+
|
135
137
|
@tentacles.each { |t| t.join }
|
136
138
|
|
137
139
|
self
|
data/lib/anemone/page.rb
CHANGED
@@ -1,25 +1,20 @@
|
|
1
1
|
require 'anemone/http'
|
2
|
-
require '
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'facets/ostructable'
|
3
4
|
|
4
5
|
module Anemone
|
5
6
|
class Page
|
7
|
+
include OpenStructable
|
8
|
+
|
6
9
|
# The URL of the page
|
7
10
|
attr_reader :url
|
8
11
|
# Array of distinct A tag HREFs from the page
|
9
12
|
attr_reader :links
|
10
|
-
#Body of the HTTP response
|
11
|
-
attr_reader :body
|
12
13
|
#Content-type of the HTTP response
|
13
14
|
attr_reader :content_type
|
14
|
-
#title of the page if it is an HTML document
|
15
|
-
attr_reader :title
|
16
|
-
#first h1 on the page, if present
|
17
|
-
attr_reader :h1
|
18
|
-
#first h2 on the page, if present
|
19
|
-
attr_reader :h2
|
20
|
-
#meta-description of the page, if present
|
21
|
-
attr_reader :description
|
22
15
|
|
16
|
+
#Nokogiri document for the HTML body
|
17
|
+
attr_accessor :doc
|
23
18
|
# Integer response code of the page
|
24
19
|
attr_accessor :code
|
25
20
|
# Array of redirect-aliases for the page
|
@@ -54,36 +49,28 @@ module Anemone
|
|
54
49
|
#
|
55
50
|
def initialize(url, body = nil, code = nil, content_type = nil, aka = nil)
|
56
51
|
@url = url
|
57
|
-
@body = body unless Anemone.options.discard_page_bodies
|
58
52
|
@code = code
|
59
53
|
@content_type = content_type
|
60
54
|
@links = []
|
61
55
|
@aliases = []
|
62
|
-
|
56
|
+
|
57
|
+
#create empty storage for OpenStructable
|
58
|
+
update({})
|
59
|
+
|
63
60
|
@aliases << aka if !aka.nil?
|
64
61
|
|
65
62
|
if body
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
#save page h1
|
73
|
-
h1_elem = h.at('h1')
|
74
|
-
@h1 = h1_elem.inner_html if !h1_elem.nil?
|
75
|
-
|
76
|
-
#save page h2
|
77
|
-
h2_elem = h.at('h2')
|
78
|
-
@h2 = h2_elem.inner_html if !h2_elem.nil?
|
63
|
+
begin
|
64
|
+
@doc = Nokogiri::HTML(body)
|
65
|
+
rescue
|
66
|
+
return
|
67
|
+
end
|
79
68
|
|
80
|
-
|
81
|
-
description_elem = h.at('meta[@name=description]')
|
82
|
-
@description = description_elem['content'] if !description_elem.nil?
|
69
|
+
return if @doc.nil?
|
83
70
|
|
84
71
|
#get a list of distinct links on the page, in absolute url form
|
85
|
-
|
86
|
-
u = a
|
72
|
+
@doc.css('a').each do |a|
|
73
|
+
u = a.attribute('href')
|
87
74
|
next if u.nil?
|
88
75
|
|
89
76
|
begin
|
@@ -106,9 +93,9 @@ module Anemone
|
|
106
93
|
#
|
107
94
|
def alias_clone(url)
|
108
95
|
p = clone
|
109
|
-
|
110
|
-
|
111
|
-
|
96
|
+
p.add_alias!(@aka) if !@aka.nil?
|
97
|
+
p.code = 200
|
98
|
+
p
|
112
99
|
end
|
113
100
|
|
114
101
|
#
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: anemone
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Kite
|
@@ -9,18 +9,28 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-
|
12
|
+
date: 2009-06-12 00:00:00 -05:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
|
-
name:
|
16
|
+
name: nokogiri
|
17
17
|
type: :runtime
|
18
18
|
version_requirement:
|
19
19
|
version_requirements: !ruby/object:Gem::Requirement
|
20
20
|
requirements:
|
21
21
|
- - ">="
|
22
22
|
- !ruby/object:Gem::Version
|
23
|
-
version:
|
23
|
+
version: 1.3.0
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: facets
|
27
|
+
type: :runtime
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 2.5.0
|
24
34
|
version:
|
25
35
|
description:
|
26
36
|
email:
|
@@ -35,23 +45,23 @@ extensions: []
|
|
35
45
|
extra_rdoc_files:
|
36
46
|
- README.txt
|
37
47
|
files:
|
38
|
-
- bin/anemone_url_list.rb~
|
39
|
-
- bin/anemone_url_list.rb
|
40
|
-
- bin/anemone_serialize.rb
|
41
48
|
- bin/anemone_pagedepth.rb
|
42
|
-
- bin/
|
49
|
+
- bin/anemone_url_list.rb
|
43
50
|
- bin/anemone_cron.rb
|
44
|
-
-
|
45
|
-
-
|
51
|
+
- bin/anemone_count.rb
|
52
|
+
- bin/anemone_serialize.rb
|
53
|
+
- lib/anemone/tentacle.rb
|
46
54
|
- lib/anemone/page.rb
|
55
|
+
- lib/anemone/page_hash.rb
|
47
56
|
- lib/anemone/core.rb
|
48
|
-
- lib/anemone/anemone.rb
|
49
57
|
- lib/anemone/http.rb
|
50
|
-
- lib/anemone/
|
51
|
-
- lib/anemone
|
58
|
+
- lib/anemone/anemone.rb
|
59
|
+
- lib/anemone.rb
|
52
60
|
- README.txt
|
53
61
|
has_rdoc: true
|
54
62
|
homepage: http://anemone.rubyforge.org
|
63
|
+
licenses: []
|
64
|
+
|
55
65
|
post_install_message:
|
56
66
|
rdoc_options:
|
57
67
|
- -m
|
@@ -75,9 +85,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
75
85
|
requirements: []
|
76
86
|
|
77
87
|
rubyforge_project: anemone
|
78
|
-
rubygems_version: 1.3.
|
88
|
+
rubygems_version: 1.3.4
|
79
89
|
signing_key:
|
80
|
-
specification_version:
|
90
|
+
specification_version: 3
|
81
91
|
summary: Anemone web-spider framework
|
82
92
|
test_files: []
|
83
93
|
|
data/bin/anemone_url_list.rb~
DELETED
@@ -1,58 +0,0 @@
|
|
1
|
-
#! /usr/bin/env ruby
|
2
|
-
# == Synopsis
|
3
|
-
# Crawls a site starting at the given URL, and outputs the URL of each page
|
4
|
-
# in the domain as they are encountered.
|
5
|
-
#
|
6
|
-
# == Usage
|
7
|
-
# anemone_url_list.rb [options] url
|
8
|
-
#
|
9
|
-
# == Options
|
10
|
-
# -r, --relative Output relative URLs (rather than absolute)
|
11
|
-
#
|
12
|
-
# == Author
|
13
|
-
# Chris Kite
|
14
|
-
|
15
|
-
$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
|
16
|
-
|
17
|
-
require 'anemone'
|
18
|
-
require 'optparse'
|
19
|
-
require 'ostruct'
|
20
|
-
|
21
|
-
def usage
|
22
|
-
puts <<END
|
23
|
-
Usage: anemone_url_list.rb [options] url
|
24
|
-
|
25
|
-
Options:
|
26
|
-
-r, --relative Output relative URLs (rather than absolute)
|
27
|
-
END
|
28
|
-
end
|
29
|
-
|
30
|
-
options = OpenStruct.new
|
31
|
-
options.relative = false
|
32
|
-
|
33
|
-
# make sure that the last option is a URL we can crawl
|
34
|
-
begin
|
35
|
-
URI(ARGV.last)
|
36
|
-
rescue
|
37
|
-
usage
|
38
|
-
Process.exit
|
39
|
-
end
|
40
|
-
|
41
|
-
# parse command-line options
|
42
|
-
opts = OptionParser.new
|
43
|
-
opts.on('-r', '--relative') { options.relative = true }
|
44
|
-
opts.parse!(ARGV)
|
45
|
-
|
46
|
-
Anemone.crawl(ARGV.last) do |anemone|
|
47
|
-
anemone.on_pages_like(/\/about\//, /\/experience\//) do |page|
|
48
|
-
puts "WOOZLE #{page.url}"
|
49
|
-
end
|
50
|
-
|
51
|
-
anemone.on_every_page do |page|
|
52
|
-
if options.relative
|
53
|
-
puts page.url.path
|
54
|
-
else
|
55
|
-
puts page.url
|
56
|
-
end
|
57
|
-
end
|
58
|
-
end
|