anemone 0.0.4 → 0.0.5
Sign up to get free protection for your applications and to get access to all the features.
- data/{README.txt → README.rdoc} +0 -0
- data/bin/anemone_cron.rb +11 -13
- data/lib/anemone/anemone.rb +1 -1
- data/lib/anemone/core.rb +4 -1
- data/lib/anemone/page.rb +2 -2
- data/lib/anemone/page_hash.rb +46 -13
- metadata +5 -5
data/{README.txt → README.rdoc}
RENAMED
File without changes
|
data/bin/anemone_cron.rb
CHANGED
@@ -59,29 +59,27 @@ Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|
|
|
59
59
|
pages.each_value do |page|
|
60
60
|
url = page.url.to_s
|
61
61
|
not_found << url if page.not_found?
|
62
|
-
end
|
63
|
-
|
62
|
+
end
|
63
|
+
unless not_found.empty?
|
64
64
|
puts "\n404's:"
|
65
|
-
|
65
|
+
|
66
|
+
missing_links = pages.urls_linking_to(not_found)
|
67
|
+
missing_links.each do |url, links|
|
66
68
|
if options.relative
|
67
69
|
puts URI(url).path.to_s
|
68
|
-
else
|
70
|
+
else
|
69
71
|
puts url
|
70
72
|
end
|
71
|
-
|
72
|
-
pages.urls_linking_to(url).each do |u|
|
73
|
+
links.slice(0..10).each do |u|
|
73
74
|
u = u.path if options.relative
|
74
|
-
num_linked_from += 1
|
75
75
|
puts " linked from #{u}"
|
76
|
-
if num_linked_from > 10
|
77
|
-
puts " ..."
|
78
|
-
break
|
79
|
-
end
|
80
76
|
end
|
77
|
+
|
78
|
+
puts " ..." if missing_links.size > 10
|
81
79
|
end
|
82
|
-
|
80
|
+
|
83
81
|
print "\n"
|
84
|
-
end
|
82
|
+
end
|
85
83
|
|
86
84
|
# remove redirect aliases, and calculate pagedepths
|
87
85
|
pages = pages.shortest_paths!(root).uniq
|
data/lib/anemone/anemone.rb
CHANGED
data/lib/anemone/core.rb
CHANGED
@@ -31,7 +31,6 @@ module Anemone
|
|
31
31
|
self.new(root) do |core|
|
32
32
|
block.call(core) if block
|
33
33
|
core.run
|
34
|
-
core.do_after_crawl_blocks
|
35
34
|
return core
|
36
35
|
end
|
37
36
|
end
|
@@ -136,9 +135,13 @@ module Anemone
|
|
136
135
|
|
137
136
|
@tentacles.each { |t| t.join }
|
138
137
|
|
138
|
+
do_after_crawl_blocks()
|
139
|
+
|
139
140
|
self
|
140
141
|
end
|
141
142
|
|
143
|
+
private
|
144
|
+
|
142
145
|
#
|
143
146
|
# Execute the after_crawl blocks
|
144
147
|
#
|
data/lib/anemone/page.rb
CHANGED
@@ -54,7 +54,7 @@ module Anemone
|
|
54
54
|
@links = []
|
55
55
|
@aliases = []
|
56
56
|
|
57
|
-
|
57
|
+
#create empty storage for OpenStructable
|
58
58
|
update({})
|
59
59
|
|
60
60
|
@aliases << aka if !aka.nil?
|
@@ -70,7 +70,7 @@ module Anemone
|
|
70
70
|
|
71
71
|
#get a list of distinct links on the page, in absolute url form
|
72
72
|
@doc.css('a').each do |a|
|
73
|
-
u = a.
|
73
|
+
u = a.attributes['href'].content if a.attributes['href']
|
74
74
|
next if u.nil?
|
75
75
|
|
76
76
|
begin
|
data/lib/anemone/page_hash.rb
CHANGED
@@ -61,23 +61,56 @@ module Anemone
|
|
61
61
|
end
|
62
62
|
|
63
63
|
#
|
64
|
-
#
|
64
|
+
# If given a single URL (as a String or URI), returns an Array of Pages which link to that URL
|
65
|
+
# If given an Array of URLs, returns a Hash (URI => [Page, Page...]) of Pages linking to those URLs
|
65
66
|
#
|
66
|
-
def pages_linking_to
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
67
|
+
def pages_linking_to(urls)
|
68
|
+
unless urls.is_a?(Array)
|
69
|
+
urls = [urls] unless urls.is_a?(Array)
|
70
|
+
single = true
|
71
|
+
end
|
72
|
+
|
73
|
+
urls.map! do |url|
|
74
|
+
if url.is_a?(String)
|
75
|
+
URI(url) rescue nil
|
76
|
+
else
|
77
|
+
url
|
78
|
+
end
|
79
|
+
end
|
80
|
+
urls.compact
|
81
|
+
|
82
|
+
links = {}
|
83
|
+
urls.each { |url| links[url] = [] }
|
84
|
+
values.each do |page|
|
85
|
+
urls.each { |url| links[url] << page if page.links.include?(url) }
|
86
|
+
end
|
87
|
+
|
88
|
+
if single and !links.empty?
|
89
|
+
return links.first
|
90
|
+
else
|
91
|
+
return links
|
71
92
|
end
|
72
|
-
|
73
|
-
values.delete_if { |p| !p.links.include?(url) }
|
74
93
|
end
|
75
|
-
|
94
|
+
|
95
|
+
#
|
96
|
+
# If given a single URL (as a String or URI), returns an Array of URLs which link to that URL
|
97
|
+
# If given an Array of URLs, returns a Hash (URI => [URI, URI...]) of URLs linking to those URLs
|
76
98
|
#
|
77
|
-
|
78
|
-
|
79
|
-
|
99
|
+
def urls_linking_to(urls)
|
100
|
+
unless urls.is_a?(Array)
|
101
|
+
urls = [urls] unless urls.is_a?(Array)
|
102
|
+
single = true
|
103
|
+
end
|
104
|
+
|
105
|
+
links = pages_linking_to(urls)
|
106
|
+
links.each { |url, pages| links[url] = pages.map{|p| p.url} }
|
107
|
+
|
108
|
+
if single and !links.empty?
|
109
|
+
return links.first
|
110
|
+
else
|
111
|
+
return links
|
112
|
+
end
|
80
113
|
end
|
81
|
-
|
114
|
+
|
82
115
|
end
|
83
116
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: anemone
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.4
|
4
|
+
version: 0.0.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Kite
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-06-
|
12
|
+
date: 2009-06-16 00:00:00 -05:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -43,7 +43,7 @@ executables:
|
|
43
43
|
extensions: []
|
44
44
|
|
45
45
|
extra_rdoc_files:
|
46
|
-
- README.txt
|
46
|
+
- README.rdoc
|
47
47
|
files:
|
48
48
|
- bin/anemone_pagedepth.rb
|
49
49
|
- bin/anemone_url_list.rb
|
@@ -57,7 +57,7 @@ files:
|
|
57
57
|
- lib/anemone/http.rb
|
58
58
|
- lib/anemone/anemone.rb
|
59
59
|
- lib/anemone.rb
|
60
|
-
- README.txt
|
60
|
+
- README.rdoc
|
61
61
|
has_rdoc: true
|
62
62
|
homepage: http://anemone.rubyforge.org
|
63
63
|
licenses: []
|
@@ -65,7 +65,7 @@ licenses: []
|
|
65
65
|
post_install_message:
|
66
66
|
rdoc_options:
|
67
67
|
- -m
|
68
|
-
- README.txt
|
68
|
+
- README.rdoc
|
69
69
|
- -t
|
70
70
|
- Anemone
|
71
71
|
require_paths:
|