anemone 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.txt +3 -2
- data/bin/anemone_url_list.rb +1 -1
- data/lib/anemone/anemone.rb +6 -6
- data/lib/anemone/core.rb +3 -1
- data/lib/anemone/page.rb +21 -34
- metadata +25 -15
- data/bin/anemone_url_list.rb~ +0 -58
data/README.txt
CHANGED
@@ -12,7 +12,8 @@ write your own specialized spider tasks quickly and easily.
|
|
12
12
|
* Allows exclusion of URLs based on regular expressions
|
13
13
|
|
14
14
|
== REQUIREMENTS
|
15
|
-
*
|
15
|
+
* nokogiri
|
16
|
+
* facets
|
16
17
|
|
17
18
|
== EXAMPLES
|
18
|
-
See the +bin+ directory for several examples of useful Anemone tasks.
|
19
|
+
See the +bin+ directory for several examples of useful Anemone tasks.
|
data/bin/anemone_url_list.rb
CHANGED
@@ -43,7 +43,7 @@ opts = OptionParser.new
|
|
43
43
|
opts.on('-r', '--relative') { options.relative = true }
|
44
44
|
opts.parse!(ARGV)
|
45
45
|
|
46
|
-
Anemone.crawl(ARGV.last) do |anemone|
|
46
|
+
Anemone.crawl(ARGV.last, :discard_page_bodies => true) do |anemone|
|
47
47
|
anemone.on_every_page do |page|
|
48
48
|
if options.relative
|
49
49
|
puts page.url.path
|
data/lib/anemone/anemone.rb
CHANGED
@@ -3,7 +3,7 @@ require 'anemone/core'
|
|
3
3
|
|
4
4
|
module Anemone
|
5
5
|
# Version number
|
6
|
-
VERSION = '0.0.3'
|
6
|
+
VERSION = '0.0.4'
|
7
7
|
|
8
8
|
# User-Agent string used for HTTP requests
|
9
9
|
USER_AGENT = "Anemone/#{self::VERSION}"
|
@@ -23,15 +23,15 @@ module Anemone
|
|
23
23
|
def Anemone.crawl(url, options = {}, &block)
|
24
24
|
Anemone.options = OpenStruct.new(options)
|
25
25
|
|
26
|
-
|
26
|
+
#by default, run 4 Tentacle threads to fetch pages
|
27
27
|
Anemone.options.threads ||= 4
|
28
28
|
|
29
|
-
|
29
|
+
#disable verbose output by default
|
30
30
|
Anemone.options.verbose ||= false
|
31
31
|
|
32
|
-
|
33
|
-
|
32
|
+
#by default, don't throw away the page response body after scanning it for links
|
33
|
+
Anemone.options.discard_page_bodies ||= false
|
34
34
|
|
35
35
|
Core.crawl(url, &block)
|
36
36
|
end
|
37
|
-
end
|
37
|
+
end
|
data/lib/anemone/core.rb
CHANGED
@@ -103,6 +103,8 @@ module Anemone
|
|
103
103
|
puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose
|
104
104
|
|
105
105
|
do_page_blocks(page)
|
106
|
+
|
107
|
+
page.doc = nil if Anemone.options.discard_page_bodies
|
106
108
|
|
107
109
|
page.links.each do |link|
|
108
110
|
if visit_link?(link)
|
@@ -131,7 +133,7 @@ module Anemone
|
|
131
133
|
end
|
132
134
|
|
133
135
|
end
|
134
|
-
|
136
|
+
|
135
137
|
@tentacles.each { |t| t.join }
|
136
138
|
|
137
139
|
self
|
data/lib/anemone/page.rb
CHANGED
@@ -1,25 +1,20 @@
|
|
1
1
|
require 'anemone/http'
|
2
|
-
require '
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'facets/ostructable'
|
3
4
|
|
4
5
|
module Anemone
|
5
6
|
class Page
|
7
|
+
include OpenStructable
|
8
|
+
|
6
9
|
# The URL of the page
|
7
10
|
attr_reader :url
|
8
11
|
# Array of distinct A tag HREFs from the page
|
9
12
|
attr_reader :links
|
10
|
-
#Body of the HTTP response
|
11
|
-
attr_reader :body
|
12
13
|
#Content-type of the HTTP response
|
13
14
|
attr_reader :content_type
|
14
|
-
#title of the page if it is an HTML document
|
15
|
-
attr_reader :title
|
16
|
-
#first h1 on the page, if present
|
17
|
-
attr_reader :h1
|
18
|
-
#first h2 on the page, if present
|
19
|
-
attr_reader :h2
|
20
|
-
#meta-description of the page, if present
|
21
|
-
attr_reader :description
|
22
15
|
|
16
|
+
#Nokogiri document for the HTML body
|
17
|
+
attr_accessor :doc
|
23
18
|
# Integer response code of the page
|
24
19
|
attr_accessor :code
|
25
20
|
# Array of redirect-aliases for the page
|
@@ -54,36 +49,28 @@ module Anemone
|
|
54
49
|
#
|
55
50
|
def initialize(url, body = nil, code = nil, content_type = nil, aka = nil)
|
56
51
|
@url = url
|
57
|
-
@body = body unless Anemone.options.discard_page_bodies
|
58
52
|
@code = code
|
59
53
|
@content_type = content_type
|
60
54
|
@links = []
|
61
55
|
@aliases = []
|
62
|
-
|
56
|
+
|
57
|
+
#create empty storage for OpenStructable
|
58
|
+
update({})
|
59
|
+
|
63
60
|
@aliases << aka if !aka.nil?
|
64
61
|
|
65
62
|
if body
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
#save page h1
|
73
|
-
h1_elem = h.at('h1')
|
74
|
-
@h1 = h1_elem.inner_html if !h1_elem.nil?
|
75
|
-
|
76
|
-
#save page h2
|
77
|
-
h2_elem = h.at('h2')
|
78
|
-
@h2 = h2_elem.inner_html if !h2_elem.nil?
|
63
|
+
begin
|
64
|
+
@doc = Nokogiri::HTML(body)
|
65
|
+
rescue
|
66
|
+
return
|
67
|
+
end
|
79
68
|
|
80
|
-
|
81
|
-
description_elem = h.at('meta[@name=description]')
|
82
|
-
@description = description_elem['content'] if !description_elem.nil?
|
69
|
+
return if @doc.nil?
|
83
70
|
|
84
71
|
#get a list of distinct links on the page, in absolute url form
|
85
|
-
|
86
|
-
u = a
|
72
|
+
@doc.css('a').each do |a|
|
73
|
+
u = a.attribute('href')
|
87
74
|
next if u.nil?
|
88
75
|
|
89
76
|
begin
|
@@ -106,9 +93,9 @@ module Anemone
|
|
106
93
|
#
|
107
94
|
def alias_clone(url)
|
108
95
|
p = clone
|
109
|
-
|
110
|
-
|
111
|
-
|
96
|
+
p.add_alias!(@aka) if !@aka.nil?
|
97
|
+
p.code = 200
|
98
|
+
p
|
112
99
|
end
|
113
100
|
|
114
101
|
#
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: anemone
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Kite
|
@@ -9,18 +9,28 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-
|
12
|
+
date: 2009-06-12 00:00:00 -05:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
|
-
name:
|
16
|
+
name: nokogiri
|
17
17
|
type: :runtime
|
18
18
|
version_requirement:
|
19
19
|
version_requirements: !ruby/object:Gem::Requirement
|
20
20
|
requirements:
|
21
21
|
- - ">="
|
22
22
|
- !ruby/object:Gem::Version
|
23
|
-
version:
|
23
|
+
version: 1.3.0
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: facets
|
27
|
+
type: :runtime
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 2.5.0
|
24
34
|
version:
|
25
35
|
description:
|
26
36
|
email:
|
@@ -35,23 +45,23 @@ extensions: []
|
|
35
45
|
extra_rdoc_files:
|
36
46
|
- README.txt
|
37
47
|
files:
|
38
|
-
- bin/anemone_url_list.rb~
|
39
|
-
- bin/anemone_url_list.rb
|
40
|
-
- bin/anemone_serialize.rb
|
41
48
|
- bin/anemone_pagedepth.rb
|
42
|
-
- bin/
|
49
|
+
- bin/anemone_url_list.rb
|
43
50
|
- bin/anemone_cron.rb
|
44
|
-
-
|
45
|
-
-
|
51
|
+
- bin/anemone_count.rb
|
52
|
+
- bin/anemone_serialize.rb
|
53
|
+
- lib/anemone/tentacle.rb
|
46
54
|
- lib/anemone/page.rb
|
55
|
+
- lib/anemone/page_hash.rb
|
47
56
|
- lib/anemone/core.rb
|
48
|
-
- lib/anemone/anemone.rb
|
49
57
|
- lib/anemone/http.rb
|
50
|
-
- lib/anemone/
|
51
|
-
- lib/anemone
|
58
|
+
- lib/anemone/anemone.rb
|
59
|
+
- lib/anemone.rb
|
52
60
|
- README.txt
|
53
61
|
has_rdoc: true
|
54
62
|
homepage: http://anemone.rubyforge.org
|
63
|
+
licenses: []
|
64
|
+
|
55
65
|
post_install_message:
|
56
66
|
rdoc_options:
|
57
67
|
- -m
|
@@ -75,9 +85,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
75
85
|
requirements: []
|
76
86
|
|
77
87
|
rubyforge_project: anemone
|
78
|
-
rubygems_version: 1.3.
|
88
|
+
rubygems_version: 1.3.4
|
79
89
|
signing_key:
|
80
|
-
specification_version:
|
90
|
+
specification_version: 3
|
81
91
|
summary: Anemone web-spider framework
|
82
92
|
test_files: []
|
83
93
|
|
data/bin/anemone_url_list.rb~
DELETED
@@ -1,58 +0,0 @@
|
|
1
|
-
#! /usr/bin/env ruby
|
2
|
-
# == Synopsis
|
3
|
-
# Crawls a site starting at the given URL, and outputs the URL of each page
|
4
|
-
# in the domain as they are encountered.
|
5
|
-
#
|
6
|
-
# == Usage
|
7
|
-
# anemone_url_list.rb [options] url
|
8
|
-
#
|
9
|
-
# == Options
|
10
|
-
# -r, --relative Output relative URLs (rather than absolute)
|
11
|
-
#
|
12
|
-
# == Author
|
13
|
-
# Chris Kite
|
14
|
-
|
15
|
-
$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
|
16
|
-
|
17
|
-
require 'anemone'
|
18
|
-
require 'optparse'
|
19
|
-
require 'ostruct'
|
20
|
-
|
21
|
-
def usage
|
22
|
-
puts <<END
|
23
|
-
Usage: anemone_url_list.rb [options] url
|
24
|
-
|
25
|
-
Options:
|
26
|
-
-r, --relative Output relative URLs (rather than absolute)
|
27
|
-
END
|
28
|
-
end
|
29
|
-
|
30
|
-
options = OpenStruct.new
|
31
|
-
options.relative = false
|
32
|
-
|
33
|
-
# make sure that the last option is a URL we can crawl
|
34
|
-
begin
|
35
|
-
URI(ARGV.last)
|
36
|
-
rescue
|
37
|
-
usage
|
38
|
-
Process.exit
|
39
|
-
end
|
40
|
-
|
41
|
-
# parse command-line options
|
42
|
-
opts = OptionParser.new
|
43
|
-
opts.on('-r', '--relative') { options.relative = true }
|
44
|
-
opts.parse!(ARGV)
|
45
|
-
|
46
|
-
Anemone.crawl(ARGV.last) do |anemone|
|
47
|
-
anemone.on_pages_like(/\/about\//, /\/experience\//) do |page|
|
48
|
-
puts "WOOZLE #{page.url}"
|
49
|
-
end
|
50
|
-
|
51
|
-
anemone.on_every_page do |page|
|
52
|
-
if options.relative
|
53
|
-
puts page.url.path
|
54
|
-
else
|
55
|
-
puts page.url
|
56
|
-
end
|
57
|
-
end
|
58
|
-
end
|