rcrawl 0.4.7 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +1 -1
- data/TODO +0 -2
- data/lib/rcrawl/crawler.rb +7 -1
- data/lib/rcrawl/version.rb +1 -1
- metadata +3 -3
data/Rakefile
CHANGED
data/TODO
CHANGED
data/lib/rcrawl/crawler.rb
CHANGED
@@ -4,7 +4,7 @@ module Rcrawl
|
|
4
4
|
|
5
5
|
attr_accessor :links_to_visit, :site, :user_agent
|
6
6
|
attr_reader :visited_links, :external_links, :raw_html, :rules, :sites,
|
7
|
-
:errors
|
7
|
+
:errors, :meta
|
8
8
|
# Initializes various variables when a new Crawler object is instantiated
|
9
9
|
def initialize(site)
|
10
10
|
puts "Rcrawl Version #{VERSION} initializing..."
|
@@ -16,6 +16,7 @@ module Rcrawl
|
|
16
16
|
@user_agent = "Rcrawl/#{VERSION} (http://rubyforge.org/projects/rcrawl/)"
|
17
17
|
@sites = Hash.new
|
18
18
|
@errors = Hash.new
|
19
|
+
@meta = Hash.new
|
19
20
|
@site = URI.parse(site) || raise("You didn't give me a site to crawl")
|
20
21
|
@links_to_visit << site
|
21
22
|
puts "Ready to crawl #{site}"
|
@@ -78,6 +79,7 @@ module Rcrawl
|
|
78
79
|
when "text/html"
|
79
80
|
link_extractor(document)
|
80
81
|
process_html(document)
|
82
|
+
page_meta(document)
|
81
83
|
else
|
82
84
|
print "... not HTML, skipping..."
|
83
85
|
end
|
@@ -133,6 +135,10 @@ module Rcrawl
|
|
133
135
|
|
134
136
|
end
|
135
137
|
|
138
|
+
def page_meta(document)
|
139
|
+
@meta[@url] = document.meta
|
140
|
+
end
|
141
|
+
|
136
142
|
# robots.txt parsing
|
137
143
|
def robot_safe?(url)
|
138
144
|
uri = URI.parse(url)
|
data/lib/rcrawl/version.rb
CHANGED
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
|
|
3
3
|
specification_version: 1
|
4
4
|
name: rcrawl
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.4.7
|
7
|
-
date: 2006-
|
6
|
+
version: 0.5.0
|
7
|
+
date: 2006-10-02 00:00:00 -05:00
|
8
8
|
summary: A web crawler written in ruby
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -29,8 +29,8 @@ post_install_message:
|
|
29
29
|
authors:
|
30
30
|
- Digital Duckies
|
31
31
|
files:
|
32
|
-
- lib/rcrawl.rb
|
33
32
|
- lib/rcrawl
|
33
|
+
- lib/rcrawl.rb
|
34
34
|
- lib/rcrawl/robot_rules.rb
|
35
35
|
- lib/rcrawl/crawler.rb
|
36
36
|
- lib/rcrawl/version.rb
|