anemone 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
data/README.txt CHANGED
@@ -12,7 +12,8 @@ write your own specialized spider tasks quickly and easily.
12
12
  * Allows exclusion of URLs based on regular expressions
13
13
 
14
14
  == REQUIREMENTS
15
- * hpricot
15
+ * nokogiri
16
+ * facets
16
17
 
17
18
  == EXAMPLES
18
- See the +bin+ directory for several examples of useful Anemone tasks.
19
+ See the +bin+ directory for several examples of useful Anemone tasks.
@@ -43,7 +43,7 @@ opts = OptionParser.new
43
43
  opts.on('-r', '--relative') { options.relative = true }
44
44
  opts.parse!(ARGV)
45
45
 
46
- Anemone.crawl(ARGV.last) do |anemone|
46
+ Anemone.crawl(ARGV.last, :discard_page_bodies => true) do |anemone|
47
47
  anemone.on_every_page do |page|
48
48
  if options.relative
49
49
  puts page.url.path
@@ -3,7 +3,7 @@ require 'anemone/core'
3
3
 
4
4
  module Anemone
5
5
  # Version number
6
- VERSION = '0.0.2'
6
+ VERSION = '0.0.4'
7
7
 
8
8
  # User-Agent string used for HTTP requests
9
9
  USER_AGENT = "Anemone/#{self::VERSION}"
@@ -23,15 +23,15 @@ module Anemone
23
23
  def Anemone.crawl(url, options = {}, &block)
24
24
  Anemone.options = OpenStruct.new(options)
25
25
 
26
- #by default, run 4 Tentacle threads to fetch pages
26
+ #by default, run 4 Tentacle threads to fetch pages
27
27
  Anemone.options.threads ||= 4
28
28
 
29
- #disable verbose output by default
29
+ #disable verbose output by default
30
30
  Anemone.options.verbose ||= false
31
31
 
32
- #by default, don't throw away the page response body after scanning it for links
33
- Anemone.options.discard_page_bodies ||= false
32
+ #by default, don't throw away the page response body after scanning it for links
33
+ Anemone.options.discard_page_bodies ||= false
34
34
 
35
35
  Core.crawl(url, &block)
36
36
  end
37
- end
37
+ end
data/lib/anemone/core.rb CHANGED
@@ -103,6 +103,8 @@ module Anemone
103
103
  puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose
104
104
 
105
105
  do_page_blocks(page)
106
+
107
+ page.doc = nil if Anemone.options.discard_page_bodies
106
108
 
107
109
  page.links.each do |link|
108
110
  if visit_link?(link)
@@ -131,7 +133,7 @@ module Anemone
131
133
  end
132
134
 
133
135
  end
134
-
136
+
135
137
  @tentacles.each { |t| t.join }
136
138
 
137
139
  self
data/lib/anemone/page.rb CHANGED
@@ -1,25 +1,20 @@
1
1
  require 'anemone/http'
2
- require 'hpricot'
2
+ require 'nokogiri'
3
+ require 'facets/ostructable'
3
4
 
4
5
  module Anemone
5
6
  class Page
7
+ include OpenStructable
8
+
6
9
  # The URL of the page
7
10
  attr_reader :url
8
11
  # Array of distinct A tag HREFs from the page
9
12
  attr_reader :links
10
- #Body of the HTTP response
11
- attr_reader :body
12
13
  #Content-type of the HTTP response
13
14
  attr_reader :content_type
14
- #title of the page if it is an HTML document
15
- attr_reader :title
16
- #first h1 on the page, if present
17
- attr_reader :h1
18
- #first h2 on the page, if present
19
- attr_reader :h2
20
- #meta-description of the page, if present
21
- attr_reader :description
22
15
 
16
+ #Nokogiri document for the HTML body
17
+ attr_accessor :doc
23
18
  # Integer response code of the page
24
19
  attr_accessor :code
25
20
  # Array of redirect-aliases for the page
@@ -54,36 +49,28 @@ module Anemone
54
49
  #
55
50
  def initialize(url, body = nil, code = nil, content_type = nil, aka = nil)
56
51
  @url = url
57
- @body = body unless Anemone.options.discard_page_bodies
58
52
  @code = code
59
53
  @content_type = content_type
60
54
  @links = []
61
55
  @aliases = []
62
-
56
+
57
+ #create empty storage for OpenStructable
58
+ update({})
59
+
63
60
  @aliases << aka if !aka.nil?
64
61
 
65
62
  if body
66
- h = Hpricot(body)
67
-
68
- #save page title
69
- title_elem = h.at('title')
70
- @title = title_elem.inner_html if !title_elem.nil?
71
-
72
- #save page h1
73
- h1_elem = h.at('h1')
74
- @h1 = h1_elem.inner_html if !h1_elem.nil?
75
-
76
- #save page h2
77
- h2_elem = h.at('h2')
78
- @h2 = h2_elem.inner_html if !h2_elem.nil?
63
+ begin
64
+ @doc = Nokogiri::HTML(body)
65
+ rescue
66
+ return
67
+ end
79
68
 
80
- #save page meta-description
81
- description_elem = h.at('meta[@name=description]')
82
- @description = description_elem['content'] if !description_elem.nil?
69
+ return if @doc.nil?
83
70
 
84
71
  #get a list of distinct links on the page, in absolute url form
85
- h.search('a').each do |a|
86
- u = a['href']
72
+ @doc.css('a').each do |a|
73
+ u = a.attribute('href')
87
74
  next if u.nil?
88
75
 
89
76
  begin
@@ -106,9 +93,9 @@ module Anemone
106
93
  #
107
94
  def alias_clone(url)
108
95
  p = clone
109
- p.add_alias!(@aka) if !@aka.nil?
110
- p.code = 200
111
- p
96
+ p.add_alias!(@aka) if !@aka.nil?
97
+ p.code = 200
98
+ p
112
99
  end
113
100
 
114
101
  #
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: anemone
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Kite
@@ -9,18 +9,28 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-05-31 00:00:00 -05:00
12
+ date: 2009-06-12 00:00:00 -05:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
- name: hpricot
16
+ name: nokogiri
17
17
  type: :runtime
18
18
  version_requirement:
19
19
  version_requirements: !ruby/object:Gem::Requirement
20
20
  requirements:
21
21
  - - ">="
22
22
  - !ruby/object:Gem::Version
23
- version: 0.7.0
23
+ version: 1.3.0
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: facets
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 2.5.0
24
34
  version:
25
35
  description:
26
36
  email:
@@ -35,23 +45,23 @@ extensions: []
35
45
  extra_rdoc_files:
36
46
  - README.txt
37
47
  files:
38
- - bin/anemone_url_list.rb~
39
- - bin/anemone_url_list.rb
40
- - bin/anemone_serialize.rb
41
48
  - bin/anemone_pagedepth.rb
42
- - bin/anemone_count.rb
49
+ - bin/anemone_url_list.rb
43
50
  - bin/anemone_cron.rb
44
- - lib/anemone.rb
45
- - lib/anemone
51
+ - bin/anemone_count.rb
52
+ - bin/anemone_serialize.rb
53
+ - lib/anemone/tentacle.rb
46
54
  - lib/anemone/page.rb
55
+ - lib/anemone/page_hash.rb
47
56
  - lib/anemone/core.rb
48
- - lib/anemone/anemone.rb
49
57
  - lib/anemone/http.rb
50
- - lib/anemone/tentacle.rb
51
- - lib/anemone/page_hash.rb
58
+ - lib/anemone/anemone.rb
59
+ - lib/anemone.rb
52
60
  - README.txt
53
61
  has_rdoc: true
54
62
  homepage: http://anemone.rubyforge.org
63
+ licenses: []
64
+
55
65
  post_install_message:
56
66
  rdoc_options:
57
67
  - -m
@@ -75,9 +85,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
75
85
  requirements: []
76
86
 
77
87
  rubyforge_project: anemone
78
- rubygems_version: 1.3.1
88
+ rubygems_version: 1.3.4
79
89
  signing_key:
80
- specification_version: 2
90
+ specification_version: 3
81
91
  summary: Anemone web-spider framework
82
92
  test_files: []
83
93
 
@@ -1,58 +0,0 @@
1
- #! /usr/bin/env ruby
2
- # == Synopsis
3
- # Crawls a site starting at the given URL, and outputs the URL of each page
4
- # in the domain as they are encountered.
5
- #
6
- # == Usage
7
- # anemone_url_list.rb [options] url
8
- #
9
- # == Options
10
- # -r, --relative Output relative URLs (rather than absolute)
11
- #
12
- # == Author
13
- # Chris Kite
14
-
15
- $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
16
-
17
- require 'anemone'
18
- require 'optparse'
19
- require 'ostruct'
20
-
21
- def usage
22
- puts <<END
23
- Usage: anemone_url_list.rb [options] url
24
-
25
- Options:
26
- -r, --relative Output relative URLs (rather than absolute)
27
- END
28
- end
29
-
30
- options = OpenStruct.new
31
- options.relative = false
32
-
33
- # make sure that the last option is a URL we can crawl
34
- begin
35
- URI(ARGV.last)
36
- rescue
37
- usage
38
- Process.exit
39
- end
40
-
41
- # parse command-line options
42
- opts = OptionParser.new
43
- opts.on('-r', '--relative') { options.relative = true }
44
- opts.parse!(ARGV)
45
-
46
- Anemone.crawl(ARGV.last) do |anemone|
47
- anemone.on_pages_like(/\/about\//, /\/experience\//) do |page|
48
- puts "WOOZLE #{page.url}"
49
- end
50
-
51
- anemone.on_every_page do |page|
52
- if options.relative
53
- puts page.url.path
54
- else
55
- puts page.url
56
- end
57
- end
58
- end