anemone 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/{README.txt → README.rdoc} +0 -0
- data/bin/anemone_cron.rb +11 -13
- data/lib/anemone/anemone.rb +1 -1
- data/lib/anemone/core.rb +4 -1
- data/lib/anemone/page.rb +2 -2
- data/lib/anemone/page_hash.rb +46 -13
- metadata +5 -5
data/{README.txt → README.rdoc}
RENAMED
File without changes
|
data/bin/anemone_cron.rb
CHANGED
@@ -59,29 +59,27 @@ Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|
|
|
59
59
|
pages.each_value do |page|
|
60
60
|
url = page.url.to_s
|
61
61
|
not_found << url if page.not_found?
|
62
|
-
end
|
63
|
-
|
62
|
+
end
|
63
|
+
unless not_found.empty?
|
64
64
|
puts "\n404's:"
|
65
|
-
|
65
|
+
|
66
|
+
missing_links = pages.urls_linking_to(not_found)
|
67
|
+
missing_links.each do |url, links|
|
66
68
|
if options.relative
|
67
69
|
puts URI(url).path.to_s
|
68
|
-
else
|
70
|
+
else
|
69
71
|
puts url
|
70
72
|
end
|
71
|
-
|
72
|
-
pages.urls_linking_to(url).each do |u|
|
73
|
+
links.slice(0..10).each do |u|
|
73
74
|
u = u.path if options.relative
|
74
|
-
num_linked_from += 1
|
75
75
|
puts " linked from #{u}"
|
76
|
-
if num_linked_from > 10
|
77
|
-
puts " ..."
|
78
|
-
break
|
79
|
-
end
|
80
76
|
end
|
77
|
+
|
78
|
+
puts " ..." if missing_links.size > 10
|
81
79
|
end
|
82
|
-
|
80
|
+
|
83
81
|
print "\n"
|
84
|
-
end
|
82
|
+
end
|
85
83
|
|
86
84
|
# remove redirect aliases, and calculate pagedepths
|
87
85
|
pages = pages.shortest_paths!(root).uniq
|
data/lib/anemone/anemone.rb
CHANGED
data/lib/anemone/core.rb
CHANGED
@@ -31,7 +31,6 @@ module Anemone
|
|
31
31
|
self.new(root) do |core|
|
32
32
|
block.call(core) if block
|
33
33
|
core.run
|
34
|
-
core.do_after_crawl_blocks
|
35
34
|
return core
|
36
35
|
end
|
37
36
|
end
|
@@ -136,9 +135,13 @@ module Anemone
|
|
136
135
|
|
137
136
|
@tentacles.each { |t| t.join }
|
138
137
|
|
138
|
+
do_after_crawl_blocks()
|
139
|
+
|
139
140
|
self
|
140
141
|
end
|
141
142
|
|
143
|
+
private
|
144
|
+
|
142
145
|
#
|
143
146
|
# Execute the after_crawl blocks
|
144
147
|
#
|
data/lib/anemone/page.rb
CHANGED
@@ -54,7 +54,7 @@ module Anemone
|
|
54
54
|
@links = []
|
55
55
|
@aliases = []
|
56
56
|
|
57
|
-
|
57
|
+
#create empty storage for OpenStructable
|
58
58
|
update({})
|
59
59
|
|
60
60
|
@aliases << aka if !aka.nil?
|
@@ -70,7 +70,7 @@ module Anemone
|
|
70
70
|
|
71
71
|
#get a list of distinct links on the page, in absolute url form
|
72
72
|
@doc.css('a').each do |a|
|
73
|
-
u = a.
|
73
|
+
u = a.attributes['href'].content if a.attributes['href']
|
74
74
|
next if u.nil?
|
75
75
|
|
76
76
|
begin
|
data/lib/anemone/page_hash.rb
CHANGED
@@ -61,23 +61,56 @@ module Anemone
|
|
61
61
|
end
|
62
62
|
|
63
63
|
#
|
64
|
-
#
|
64
|
+
# If given a single URL (as a String or URI), returns an Array of Pages which link to that URL
|
65
|
+
# If given an Array of URLs, returns a Hash (URI => [Page, Page...]) of Pages linking to those URLs
|
65
66
|
#
|
66
|
-
def pages_linking_to
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
67
|
+
def pages_linking_to(urls)
|
68
|
+
unless urls.is_a?(Array)
|
69
|
+
urls = [urls] unless urls.is_a?(Array)
|
70
|
+
single = true
|
71
|
+
end
|
72
|
+
|
73
|
+
urls.map! do |url|
|
74
|
+
if url.is_a?(String)
|
75
|
+
URI(url) rescue nil
|
76
|
+
else
|
77
|
+
url
|
78
|
+
end
|
79
|
+
end
|
80
|
+
urls.compact
|
81
|
+
|
82
|
+
links = {}
|
83
|
+
urls.each { |url| links[url] = [] }
|
84
|
+
values.each do |page|
|
85
|
+
urls.each { |url| links[url] << page if page.links.include?(url) }
|
86
|
+
end
|
87
|
+
|
88
|
+
if single and !links.empty?
|
89
|
+
return links.first
|
90
|
+
else
|
91
|
+
return links
|
71
92
|
end
|
72
|
-
|
73
|
-
values.delete_if { |p| !p.links.include?(url) }
|
74
93
|
end
|
75
|
-
|
94
|
+
|
95
|
+
#
|
96
|
+
# If given a single URL (as a String or URI), returns an Array of URLs which link to that URL
|
97
|
+
# If given an Array of URLs, returns a Hash (URI => [URI, URI...]) of URLs linking to those URLs
|
76
98
|
#
|
77
|
-
|
78
|
-
|
79
|
-
|
99
|
+
def urls_linking_to(urls)
|
100
|
+
unless urls.is_a?(Array)
|
101
|
+
urls = [urls] unless urls.is_a?(Array)
|
102
|
+
single = true
|
103
|
+
end
|
104
|
+
|
105
|
+
links = pages_linking_to(urls)
|
106
|
+
links.each { |url, pages| links[url] = pages.map{|p| p.url} }
|
107
|
+
|
108
|
+
if single and !links.empty?
|
109
|
+
return links.first
|
110
|
+
else
|
111
|
+
return links
|
112
|
+
end
|
80
113
|
end
|
81
|
-
|
114
|
+
|
82
115
|
end
|
83
116
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: anemone
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.4
|
4
|
+
version: 0.0.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Kite
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-06-
|
12
|
+
date: 2009-06-16 00:00:00 -05:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -43,7 +43,7 @@ executables:
|
|
43
43
|
extensions: []
|
44
44
|
|
45
45
|
extra_rdoc_files:
|
46
|
-
- README.txt
|
46
|
+
- README.rdoc
|
47
47
|
files:
|
48
48
|
- bin/anemone_pagedepth.rb
|
49
49
|
- bin/anemone_url_list.rb
|
@@ -57,7 +57,7 @@ files:
|
|
57
57
|
- lib/anemone/http.rb
|
58
58
|
- lib/anemone/anemone.rb
|
59
59
|
- lib/anemone.rb
|
60
|
-
- README.txt
|
60
|
+
- README.rdoc
|
61
61
|
has_rdoc: true
|
62
62
|
homepage: http://anemone.rubyforge.org
|
63
63
|
licenses: []
|
@@ -65,7 +65,7 @@ licenses: []
|
|
65
65
|
post_install_message:
|
66
66
|
rdoc_options:
|
67
67
|
- -m
|
68
|
-
- README.txt
|
68
|
+
- README.rdoc
|
69
69
|
- -t
|
70
70
|
- Anemone
|
71
71
|
require_paths:
|