anemone 0.0.4 → 0.0.5

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
File without changes
data/bin/anemone_cron.rb CHANGED
@@ -59,29 +59,27 @@ Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|
59
59
  pages.each_value do |page|
60
60
  url = page.url.to_s
61
61
  not_found << url if page.not_found?
62
- end
63
- if !not_found.empty?
62
+ end
63
+ unless not_found.empty?
64
64
  puts "\n404's:"
65
- not_found.each do |url|
65
+
66
+ missing_links = pages.urls_linking_to(not_found)
67
+ missing_links.each do |url, links|
66
68
  if options.relative
67
69
  puts URI(url).path.to_s
68
- else
70
+ else
69
71
  puts url
70
72
  end
71
- num_linked_from = 0
72
- pages.urls_linking_to(url).each do |u|
73
+ links.slice(0..10).each do |u|
73
74
  u = u.path if options.relative
74
- num_linked_from += 1
75
75
  puts " linked from #{u}"
76
- if num_linked_from > 10
77
- puts " ..."
78
- break
79
- end
80
76
  end
77
+
78
+ puts " ..." if missing_links.size > 10
81
79
  end
82
-
80
+
83
81
  print "\n"
84
- end
82
+ end
85
83
 
86
84
  # remove redirect aliases, and calculate pagedepths
87
85
  pages = pages.shortest_paths!(root).uniq
@@ -3,7 +3,7 @@ require 'anemone/core'
3
3
 
4
4
  module Anemone
5
5
  # Version number
6
- VERSION = '0.0.4'
6
+ VERSION = '0.0.5'
7
7
 
8
8
  # User-Agent string used for HTTP requests
9
9
  USER_AGENT = "Anemone/#{self::VERSION}"
data/lib/anemone/core.rb CHANGED
@@ -31,7 +31,6 @@ module Anemone
31
31
  self.new(root) do |core|
32
32
  block.call(core) if block
33
33
  core.run
34
- core.do_after_crawl_blocks
35
34
  return core
36
35
  end
37
36
  end
@@ -136,9 +135,13 @@ module Anemone
136
135
 
137
136
  @tentacles.each { |t| t.join }
138
137
 
138
+ do_after_crawl_blocks()
139
+
139
140
  self
140
141
  end
141
142
 
143
+ private
144
+
142
145
  #
143
146
  # Execute the after_crawl blocks
144
147
  #
data/lib/anemone/page.rb CHANGED
@@ -54,7 +54,7 @@ module Anemone
54
54
  @links = []
55
55
  @aliases = []
56
56
 
57
- #create empty storage for OpenStructable
57
+ #create empty storage for OpenStructable
58
58
  update({})
59
59
 
60
60
  @aliases << aka if !aka.nil?
@@ -70,7 +70,7 @@ module Anemone
70
70
 
71
71
  #get a list of distinct links on the page, in absolute url form
72
72
  @doc.css('a').each do |a|
73
- u = a.attribute('href')
73
+ u = a.attributes['href'].content if a.attributes['href']
74
74
  next if u.nil?
75
75
 
76
76
  begin
@@ -61,23 +61,56 @@ module Anemone
61
61
  end
62
62
 
63
63
  #
64
- # Return an Array of Page objects which link to the given url
64
+ # If given a single URL (as a String or URI), returns an Array of Pages which link to that URL
65
+ # If given an Array of URLs, returns a Hash (URI => [Page, Page...]) of Pages linking to those URLs
65
66
  #
66
- def pages_linking_to url
67
- begin
68
- url = URI(url) if url.is_a?(String)
69
- rescue
70
- return []
67
+ def pages_linking_to(urls)
68
+ unless urls.is_a?(Array)
69
+ urls = [urls] unless urls.is_a?(Array)
70
+ single = true
71
+ end
72
+
73
+ urls.map! do |url|
74
+ if url.is_a?(String)
75
+ URI(url) rescue nil
76
+ else
77
+ url
78
+ end
79
+ end
80
+ urls.compact
81
+
82
+ links = {}
83
+ urls.each { |url| links[url] = [] }
84
+ values.each do |page|
85
+ urls.each { |url| links[url] << page if page.links.include?(url) }
86
+ end
87
+
88
+ if single and !links.empty?
89
+ return links.first
90
+ else
91
+ return links
71
92
  end
72
-
73
- values.delete_if { |p| !p.links.include?(url) }
74
93
  end
75
-
94
+
95
+ #
96
+ # If given a single URL (as a String or URI), returns an Array of URLs which link to that URL
97
+ # If given an Array of URLs, returns a Hash (URI => [URI, URI...]) of URLs linking to those URLs
76
98
  #
77
- # Return an Array of URI objects of Pages linking to the given url
78
- def urls_linking_to url
79
- pages_linking_to(url).map{|p| p.url}
99
+ def urls_linking_to(urls)
100
+ unless urls.is_a?(Array)
101
+ urls = [urls] unless urls.is_a?(Array)
102
+ single = true
103
+ end
104
+
105
+ links = pages_linking_to(urls)
106
+ links.each { |url, pages| links[url] = pages.map{|p| p.url} }
107
+
108
+ if single and !links.empty?
109
+ return links.first
110
+ else
111
+ return links
112
+ end
80
113
  end
81
-
114
+
82
115
  end
83
116
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: anemone
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Kite
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-06-12 00:00:00 -05:00
12
+ date: 2009-06-16 00:00:00 -05:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -43,7 +43,7 @@ executables:
43
43
  extensions: []
44
44
 
45
45
  extra_rdoc_files:
46
- - README.txt
46
+ - README.rdoc
47
47
  files:
48
48
  - bin/anemone_pagedepth.rb
49
49
  - bin/anemone_url_list.rb
@@ -57,7 +57,7 @@ files:
57
57
  - lib/anemone/http.rb
58
58
  - lib/anemone/anemone.rb
59
59
  - lib/anemone.rb
60
- - README.txt
60
+ - README.rdoc
61
61
  has_rdoc: true
62
62
  homepage: http://anemone.rubyforge.org
63
63
  licenses: []
@@ -65,7 +65,7 @@ licenses: []
65
65
  post_install_message:
66
66
  rdoc_options:
67
67
  - -m
68
- - README.txt
68
+ - README.rdoc
69
69
  - -t
70
70
  - Anemone
71
71
  require_paths: