rcrawl 0.4.7 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +1 -1
- data/TODO +0 -2
- data/lib/rcrawl/crawler.rb +7 -1
- data/lib/rcrawl/version.rb +1 -1
- metadata +3 -3
data/Rakefile
CHANGED
data/TODO
CHANGED
data/lib/rcrawl/crawler.rb
CHANGED
@@ -4,7 +4,7 @@ module Rcrawl
|
|
4
4
|
|
5
5
|
attr_accessor :links_to_visit, :site, :user_agent
|
6
6
|
attr_reader :visited_links, :external_links, :raw_html, :rules, :sites,
|
7
|
-
:errors
|
7
|
+
:errors, :meta
|
8
8
|
# Initializes various variables when a new Crawler object is instantiated
|
9
9
|
def initialize(site)
|
10
10
|
puts "Rcrawl Version #{VERSION} initializing..."
|
@@ -16,6 +16,7 @@ module Rcrawl
|
|
16
16
|
@user_agent = "Rcrawl/#{VERSION} (http://rubyforge.org/projects/rcrawl/)"
|
17
17
|
@sites = Hash.new
|
18
18
|
@errors = Hash.new
|
19
|
+
@meta = Hash.new
|
19
20
|
@site = URI.parse(site) || raise("You didn't give me a site to crawl")
|
20
21
|
@links_to_visit << site
|
21
22
|
puts "Ready to crawl #{site}"
|
@@ -78,6 +79,7 @@ module Rcrawl
|
|
78
79
|
when "text/html"
|
79
80
|
link_extractor(document)
|
80
81
|
process_html(document)
|
82
|
+
page_meta(document)
|
81
83
|
else
|
82
84
|
print "... not HTML, skipping..."
|
83
85
|
end
|
@@ -133,6 +135,10 @@ module Rcrawl
|
|
133
135
|
|
134
136
|
end
|
135
137
|
|
138
|
+
def page_meta(document)
|
139
|
+
@meta[@url] = document.meta
|
140
|
+
end
|
141
|
+
|
136
142
|
# robots.txt parsing
|
137
143
|
def robot_safe?(url)
|
138
144
|
uri = URI.parse(url)
|
data/lib/rcrawl/version.rb
CHANGED
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
|
|
3
3
|
specification_version: 1
|
4
4
|
name: rcrawl
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.4.7
|
7
|
-
date: 2006-
|
6
|
+
version: 0.5.0
|
7
|
+
date: 2006-10-02 00:00:00 -05:00
|
8
8
|
summary: A web crawler written in ruby
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -29,8 +29,8 @@ post_install_message:
|
|
29
29
|
authors:
|
30
30
|
- Digital Duckies
|
31
31
|
files:
|
32
|
-
- lib/rcrawl.rb
|
33
32
|
- lib/rcrawl
|
33
|
+
- lib/rcrawl.rb
|
34
34
|
- lib/rcrawl/robot_rules.rb
|
35
35
|
- lib/rcrawl/crawler.rb
|
36
36
|
- lib/rcrawl/version.rb
|