chriskite-anemone 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.txt +3 -2
- data/bin/anemone_cron.rb +11 -13
- metadata +1 -1
data/README.txt
CHANGED
@@ -12,7 +12,8 @@ write your own specialized spider tasks quickly and easily.
|
|
12
12
|
* Allows exclusion of URLs based on regular expressions
|
13
13
|
|
14
14
|
== REQUIREMENTS
|
15
|
-
*
|
15
|
+
* nokogiri
|
16
|
+
* facets
|
16
17
|
|
17
18
|
== EXAMPLES
|
18
|
-
See the +bin+ directory for several examples of useful Anemone tasks.
|
19
|
+
See the +bin+ directory for several examples of useful Anemone tasks.
|
data/bin/anemone_cron.rb
CHANGED
@@ -59,29 +59,27 @@ Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|
|
|
59
59
|
pages.each_value do |page|
|
60
60
|
url = page.url.to_s
|
61
61
|
not_found << url if page.not_found?
|
62
|
-
end
|
63
|
-
|
62
|
+
end
|
63
|
+
unless not_found.empty?
|
64
64
|
puts "\n404's:"
|
65
|
-
|
65
|
+
|
66
|
+
missing_links = pages.urls_linking_to(not_found)
|
67
|
+
missing_links.each do |url, links|
|
66
68
|
if options.relative
|
67
69
|
puts URI(url).path.to_s
|
68
|
-
else
|
70
|
+
else
|
69
71
|
puts url
|
70
72
|
end
|
71
|
-
|
72
|
-
pages.urls_linking_to(url).each do |u|
|
73
|
+
links.slice(0..10).each do |u|
|
73
74
|
u = u.path if options.relative
|
74
|
-
num_linked_from += 1
|
75
75
|
puts " linked from #{u}"
|
76
|
-
if num_linked_from > 10
|
77
|
-
puts " ..."
|
78
|
-
break
|
79
|
-
end
|
80
76
|
end
|
77
|
+
|
78
|
+
puts " ..." if missing_links.size > 10
|
81
79
|
end
|
82
|
-
|
80
|
+
|
83
81
|
print "\n"
|
84
|
-
end
|
82
|
+
end
|
85
83
|
|
86
84
|
# remove redirect aliases, and calculate pagedepths
|
87
85
|
pages = pages.shortest_paths!(root).uniq
|