anemone 0.0.4 → 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
File without changes
data/bin/anemone_cron.rb CHANGED
@@ -59,29 +59,27 @@ Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|
59
59
  pages.each_value do |page|
60
60
  url = page.url.to_s
61
61
  not_found << url if page.not_found?
62
- end
63
- if !not_found.empty?
62
+ end
63
+ unless not_found.empty?
64
64
  puts "\n404's:"
65
- not_found.each do |url|
65
+
66
+ missing_links = pages.urls_linking_to(not_found)
67
+ missing_links.each do |url, links|
66
68
  if options.relative
67
69
  puts URI(url).path.to_s
68
- else
70
+ else
69
71
  puts url
70
72
  end
71
- num_linked_from = 0
72
- pages.urls_linking_to(url).each do |u|
73
+ links.slice(0..10).each do |u|
73
74
  u = u.path if options.relative
74
- num_linked_from += 1
75
75
  puts " linked from #{u}"
76
- if num_linked_from > 10
77
- puts " ..."
78
- break
79
- end
80
76
  end
77
+
78
+ puts " ..." if missing_links.size > 10
81
79
  end
82
-
80
+
83
81
  print "\n"
84
- end
82
+ end
85
83
 
86
84
  # remove redirect aliases, and calculate pagedepths
87
85
  pages = pages.shortest_paths!(root).uniq
@@ -3,7 +3,7 @@ require 'anemone/core'
3
3
 
4
4
  module Anemone
5
5
  # Version number
6
- VERSION = '0.0.4'
6
+ VERSION = '0.0.5'
7
7
 
8
8
  # User-Agent string used for HTTP requests
9
9
  USER_AGENT = "Anemone/#{self::VERSION}"
data/lib/anemone/core.rb CHANGED
@@ -31,7 +31,6 @@ module Anemone
31
31
  self.new(root) do |core|
32
32
  block.call(core) if block
33
33
  core.run
34
- core.do_after_crawl_blocks
35
34
  return core
36
35
  end
37
36
  end
@@ -136,9 +135,13 @@ module Anemone
136
135
 
137
136
  @tentacles.each { |t| t.join }
138
137
 
138
+ do_after_crawl_blocks()
139
+
139
140
  self
140
141
  end
141
142
 
143
+ private
144
+
142
145
  #
143
146
  # Execute the after_crawl blocks
144
147
  #
data/lib/anemone/page.rb CHANGED
@@ -54,7 +54,7 @@ module Anemone
54
54
  @links = []
55
55
  @aliases = []
56
56
 
57
- #create empty storage for OpenStructable
57
+ #create empty storage for OpenStructable
58
58
  update({})
59
59
 
60
60
  @aliases << aka if !aka.nil?
@@ -70,7 +70,7 @@ module Anemone
70
70
 
71
71
  #get a list of distinct links on the page, in absolute url form
72
72
  @doc.css('a').each do |a|
73
- u = a.attribute('href')
73
+ u = a.attributes['href'].content if a.attributes['href']
74
74
  next if u.nil?
75
75
 
76
76
  begin
@@ -61,23 +61,56 @@ module Anemone
61
61
  end
62
62
 
63
63
  #
64
- # Return an Array of Page objects which link to the given url
64
+ # If given a single URL (as a String or URI), returns an Array of Pages which link to that URL
65
+ # If given an Array of URLs, returns a Hash (URI => [Page, Page...]) of Pages linking to those URLs
65
66
  #
66
- def pages_linking_to url
67
- begin
68
- url = URI(url) if url.is_a?(String)
69
- rescue
70
- return []
67
+ def pages_linking_to(urls)
68
+ unless urls.is_a?(Array)
69
+ urls = [urls] unless urls.is_a?(Array)
70
+ single = true
71
+ end
72
+
73
+ urls.map! do |url|
74
+ if url.is_a?(String)
75
+ URI(url) rescue nil
76
+ else
77
+ url
78
+ end
79
+ end
80
+ urls.compact
81
+
82
+ links = {}
83
+ urls.each { |url| links[url] = [] }
84
+ values.each do |page|
85
+ urls.each { |url| links[url] << page if page.links.include?(url) }
86
+ end
87
+
88
+ if single and !links.empty?
89
+ return links.first
90
+ else
91
+ return links
71
92
  end
72
-
73
- values.delete_if { |p| !p.links.include?(url) }
74
93
  end
75
-
94
+
95
+ #
96
+ # If given a single URL (as a String or URI), returns an Array of URLs which link to that URL
97
+ # If given an Array of URLs, returns a Hash (URI => [URI, URI...]) of URLs linking to those URLs
76
98
  #
77
- # Return an Array of URI objects of Pages linking to the given url
78
- def urls_linking_to url
79
- pages_linking_to(url).map{|p| p.url}
99
+ def urls_linking_to(urls)
100
+ unless urls.is_a?(Array)
101
+ urls = [urls] unless urls.is_a?(Array)
102
+ single = true
103
+ end
104
+
105
+ links = pages_linking_to(urls)
106
+ links.each { |url, pages| links[url] = pages.map{|p| p.url} }
107
+
108
+ if single and !links.empty?
109
+ return links.first
110
+ else
111
+ return links
112
+ end
80
113
  end
81
-
114
+
82
115
  end
83
116
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: anemone
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Kite
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-06-12 00:00:00 -05:00
12
+ date: 2009-06-16 00:00:00 -05:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -43,7 +43,7 @@ executables:
43
43
  extensions: []
44
44
 
45
45
  extra_rdoc_files:
46
- - README.txt
46
+ - README.rdoc
47
47
  files:
48
48
  - bin/anemone_pagedepth.rb
49
49
  - bin/anemone_url_list.rb
@@ -57,7 +57,7 @@ files:
57
57
  - lib/anemone/http.rb
58
58
  - lib/anemone/anemone.rb
59
59
  - lib/anemone.rb
60
- - README.txt
60
+ - README.rdoc
61
61
  has_rdoc: true
62
62
  homepage: http://anemone.rubyforge.org
63
63
  licenses: []
@@ -65,7 +65,7 @@ licenses: []
65
65
  post_install_message:
66
66
  rdoc_options:
67
67
  - -m
68
- - README.txt
68
+ - README.rdoc
69
69
  - -t
70
70
  - Anemone
71
71
  require_paths: