chriskite-anemone 0.0.4 → 0.0.5
Sign up to get free protection for your applications and to get access to all the features.
- data/README.txt +3 -2
- data/bin/anemone_cron.rb +11 -13
- metadata +1 -1
data/README.txt
CHANGED
@@ -12,7 +12,8 @@ write your own specialized spider tasks quickly and easily.
|
|
12
12
|
* Allows exclusion of URLs based on regular expressions
|
13
13
|
|
14
14
|
== REQUIREMENTS
|
15
|
-
*
|
15
|
+
* nokogiri
|
16
|
+
* facets
|
16
17
|
|
17
18
|
== EXAMPLES
|
18
|
-
See the +bin+ directory for several examples of useful Anemone tasks.
|
19
|
+
See the +bin+ directory for several examples of useful Anemone tasks.
|
data/bin/anemone_cron.rb
CHANGED
@@ -59,29 +59,27 @@ Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|
|
|
59
59
|
pages.each_value do |page|
|
60
60
|
url = page.url.to_s
|
61
61
|
not_found << url if page.not_found?
|
62
|
-
end
|
63
|
-
|
62
|
+
end
|
63
|
+
unless not_found.empty?
|
64
64
|
puts "\n404's:"
|
65
|
-
|
65
|
+
|
66
|
+
missing_links = pages.urls_linking_to(not_found)
|
67
|
+
missing_links.each do |url, links|
|
66
68
|
if options.relative
|
67
69
|
puts URI(url).path.to_s
|
68
|
-
else
|
70
|
+
else
|
69
71
|
puts url
|
70
72
|
end
|
71
|
-
|
72
|
-
pages.urls_linking_to(url).each do |u|
|
73
|
+
links.slice(0..10).each do |u|
|
73
74
|
u = u.path if options.relative
|
74
|
-
num_linked_from += 1
|
75
75
|
puts " linked from #{u}"
|
76
|
-
if num_linked_from > 10
|
77
|
-
puts " ..."
|
78
|
-
break
|
79
|
-
end
|
80
76
|
end
|
77
|
+
|
78
|
+
puts " ..." if missing_links.size > 10
|
81
79
|
end
|
82
|
-
|
80
|
+
|
83
81
|
print "\n"
|
84
|
-
end
|
82
|
+
end
|
85
83
|
|
86
84
|
# remove redirect aliases, and calculate pagedepths
|
87
85
|
pages = pages.shortest_paths!(root).uniq
|