anemone 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.txt CHANGED
@@ -12,7 +12,8 @@ write your own specialized spider tasks quickly and easily.
12
12
  * Allows exclusion of URLs based on regular expressions
13
13
 
14
14
  == REQUIREMENTS
15
- * hpricot
15
+ * nokogiri
16
+ * facets
16
17
 
17
18
  == EXAMPLES
18
- See the +bin+ directory for several examples of useful Anemone tasks.
19
+ See the +bin+ directory for several examples of useful Anemone tasks.
@@ -43,7 +43,7 @@ opts = OptionParser.new
43
43
  opts.on('-r', '--relative') { options.relative = true }
44
44
  opts.parse!(ARGV)
45
45
 
46
- Anemone.crawl(ARGV.last) do |anemone|
46
+ Anemone.crawl(ARGV.last, :discard_page_bodies => true) do |anemone|
47
47
  anemone.on_every_page do |page|
48
48
  if options.relative
49
49
  puts page.url.path
@@ -3,7 +3,7 @@ require 'anemone/core'
3
3
 
4
4
  module Anemone
5
5
  # Version number
6
- VERSION = '0.0.2'
6
+ VERSION = '0.0.4'
7
7
 
8
8
  # User-Agent string used for HTTP requests
9
9
  USER_AGENT = "Anemone/#{self::VERSION}"
@@ -23,15 +23,15 @@ module Anemone
23
23
  def Anemone.crawl(url, options = {}, &block)
24
24
  Anemone.options = OpenStruct.new(options)
25
25
 
26
- #by default, run 4 Tentacle threads to fetch pages
26
+ #by default, run 4 Tentacle threads to fetch pages
27
27
  Anemone.options.threads ||= 4
28
28
 
29
- #disable verbose output by default
29
+ #disable verbose output by default
30
30
  Anemone.options.verbose ||= false
31
31
 
32
- #by default, don't throw away the page response body after scanning it for links
33
- Anemone.options.discard_page_bodies ||= false
32
+ #by default, don't throw away the page response body after scanning it for links
33
+ Anemone.options.discard_page_bodies ||= false
34
34
 
35
35
  Core.crawl(url, &block)
36
36
  end
37
- end
37
+ end
data/lib/anemone/core.rb CHANGED
@@ -103,6 +103,8 @@ module Anemone
103
103
  puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose
104
104
 
105
105
  do_page_blocks(page)
106
+
107
+ page.doc = nil if Anemone.options.discard_page_bodies
106
108
 
107
109
  page.links.each do |link|
108
110
  if visit_link?(link)
@@ -131,7 +133,7 @@ module Anemone
131
133
  end
132
134
 
133
135
  end
134
-
136
+
135
137
  @tentacles.each { |t| t.join }
136
138
 
137
139
  self
data/lib/anemone/page.rb CHANGED
@@ -1,25 +1,20 @@
1
1
  require 'anemone/http'
2
- require 'hpricot'
2
+ require 'nokogiri'
3
+ require 'facets/ostructable'
3
4
 
4
5
  module Anemone
5
6
  class Page
7
+ include OpenStructable
8
+
6
9
  # The URL of the page
7
10
  attr_reader :url
8
11
  # Array of distinct A tag HREFs from the page
9
12
  attr_reader :links
10
- #Body of the HTTP response
11
- attr_reader :body
12
13
  #Content-type of the HTTP response
13
14
  attr_reader :content_type
14
- #title of the page if it is an HTML document
15
- attr_reader :title
16
- #first h1 on the page, if present
17
- attr_reader :h1
18
- #first h2 on the page, if present
19
- attr_reader :h2
20
- #meta-description of the page, if present
21
- attr_reader :description
22
15
 
16
+ #Nokogiri document for the HTML body
17
+ attr_accessor :doc
23
18
  # Integer response code of the page
24
19
  attr_accessor :code
25
20
  # Array of redirect-aliases for the page
@@ -54,36 +49,28 @@ module Anemone
54
49
  #
55
50
  def initialize(url, body = nil, code = nil, content_type = nil, aka = nil)
56
51
  @url = url
57
- @body = body unless Anemone.options.discard_page_bodies
58
52
  @code = code
59
53
  @content_type = content_type
60
54
  @links = []
61
55
  @aliases = []
62
-
56
+
57
+ #create empty storage for OpenStructable
58
+ update({})
59
+
63
60
  @aliases << aka if !aka.nil?
64
61
 
65
62
  if body
66
- h = Hpricot(body)
67
-
68
- #save page title
69
- title_elem = h.at('title')
70
- @title = title_elem.inner_html if !title_elem.nil?
71
-
72
- #save page h1
73
- h1_elem = h.at('h1')
74
- @h1 = h1_elem.inner_html if !h1_elem.nil?
75
-
76
- #save page h2
77
- h2_elem = h.at('h2')
78
- @h2 = h2_elem.inner_html if !h2_elem.nil?
63
+ begin
64
+ @doc = Nokogiri::HTML(body)
65
+ rescue
66
+ return
67
+ end
79
68
 
80
- #save page meta-description
81
- description_elem = h.at('meta[@name=description]')
82
- @description = description_elem['content'] if !description_elem.nil?
69
+ return if @doc.nil?
83
70
 
84
71
  #get a list of distinct links on the page, in absolute url form
85
- h.search('a').each do |a|
86
- u = a['href']
72
+ @doc.css('a').each do |a|
73
+ u = a.attribute('href')
87
74
  next if u.nil?
88
75
 
89
76
  begin
@@ -106,9 +93,9 @@ module Anemone
106
93
  #
107
94
  def alias_clone(url)
108
95
  p = clone
109
- p.add_alias!(@aka) if !@aka.nil?
110
- p.code = 200
111
- p
96
+ p.add_alias!(@aka) if !@aka.nil?
97
+ p.code = 200
98
+ p
112
99
  end
113
100
 
114
101
  #
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: anemone
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Kite
@@ -9,18 +9,28 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-05-31 00:00:00 -05:00
12
+ date: 2009-06-12 00:00:00 -05:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
- name: hpricot
16
+ name: nokogiri
17
17
  type: :runtime
18
18
  version_requirement:
19
19
  version_requirements: !ruby/object:Gem::Requirement
20
20
  requirements:
21
21
  - - ">="
22
22
  - !ruby/object:Gem::Version
23
- version: 0.7.0
23
+ version: 1.3.0
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: facets
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 2.5.0
24
34
  version:
25
35
  description:
26
36
  email:
@@ -35,23 +45,23 @@ extensions: []
35
45
  extra_rdoc_files:
36
46
  - README.txt
37
47
  files:
38
- - bin/anemone_url_list.rb~
39
- - bin/anemone_url_list.rb
40
- - bin/anemone_serialize.rb
41
48
  - bin/anemone_pagedepth.rb
42
- - bin/anemone_count.rb
49
+ - bin/anemone_url_list.rb
43
50
  - bin/anemone_cron.rb
44
- - lib/anemone.rb
45
- - lib/anemone
51
+ - bin/anemone_count.rb
52
+ - bin/anemone_serialize.rb
53
+ - lib/anemone/tentacle.rb
46
54
  - lib/anemone/page.rb
55
+ - lib/anemone/page_hash.rb
47
56
  - lib/anemone/core.rb
48
- - lib/anemone/anemone.rb
49
57
  - lib/anemone/http.rb
50
- - lib/anemone/tentacle.rb
51
- - lib/anemone/page_hash.rb
58
+ - lib/anemone/anemone.rb
59
+ - lib/anemone.rb
52
60
  - README.txt
53
61
  has_rdoc: true
54
62
  homepage: http://anemone.rubyforge.org
63
+ licenses: []
64
+
55
65
  post_install_message:
56
66
  rdoc_options:
57
67
  - -m
@@ -75,9 +85,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
75
85
  requirements: []
76
86
 
77
87
  rubyforge_project: anemone
78
- rubygems_version: 1.3.1
88
+ rubygems_version: 1.3.4
79
89
  signing_key:
80
- specification_version: 2
90
+ specification_version: 3
81
91
  summary: Anemone web-spider framework
82
92
  test_files: []
83
93
 
@@ -1,58 +0,0 @@
1
- #! /usr/bin/env ruby
2
- # == Synopsis
3
- # Crawls a site starting at the given URL, and outputs the URL of each page
4
- # in the domain as they are encountered.
5
- #
6
- # == Usage
7
- # anemone_url_list.rb [options] url
8
- #
9
- # == Options
10
- # -r, --relative Output relative URLs (rather than absolute)
11
- #
12
- # == Author
13
- # Chris Kite
14
-
15
- $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
16
-
17
- require 'anemone'
18
- require 'optparse'
19
- require 'ostruct'
20
-
21
- def usage
22
- puts <<END
23
- Usage: anemone_url_list.rb [options] url
24
-
25
- Options:
26
- -r, --relative Output relative URLs (rather than absolute)
27
- END
28
- end
29
-
30
- options = OpenStruct.new
31
- options.relative = false
32
-
33
- # make sure that the last option is a URL we can crawl
34
- begin
35
- URI(ARGV.last)
36
- rescue
37
- usage
38
- Process.exit
39
- end
40
-
41
- # parse command-line options
42
- opts = OptionParser.new
43
- opts.on('-r', '--relative') { options.relative = true }
44
- opts.parse!(ARGV)
45
-
46
- Anemone.crawl(ARGV.last) do |anemone|
47
- anemone.on_pages_like(/\/about\//, /\/experience\//) do |page|
48
- puts "WOOZLE #{page.url}"
49
- end
50
-
51
- anemone.on_every_page do |page|
52
- if options.relative
53
- puts page.url.path
54
- else
55
- puts page.url
56
- end
57
- end
58
- end