rcrawl 0.4.7 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile CHANGED
@@ -18,7 +18,7 @@ end
18
18
 
19
19
  spec = Gem::Specification.new do |s|
20
20
  s.name = "rcrawl"
21
- s.version = "0.4.7"
21
+ s.version = "0.5.0"
22
22
  s.author = "Digital Duckies"
23
23
  s.email = "rcrawl@digitalduckies.net"
24
24
  s.homepage = "http://digitalduckies.net"
data/TODO CHANGED
@@ -3,5 +3,3 @@ Add max connections and max connections/second code
3
3
  Add referer code
4
4
  Add proxy code? Is this high up on anyone's list, or can it be put off for now?
5
5
  Logging code
6
- Store page headers and page metadata
7
-
data/lib/rcrawl/crawler.rb CHANGED
@@ -4,7 +4,7 @@ module Rcrawl
4
4
 
5
5
  attr_accessor :links_to_visit, :site, :user_agent
6
6
  attr_reader :visited_links, :external_links, :raw_html, :rules, :sites,
7
- :errors
7
+ :errors, :meta
8
8
  # Initializes various variables when a new Crawler object is instantiated
9
9
  def initialize(site)
10
10
  puts "Rcrawl Version #{VERSION} initializing..."
@@ -16,6 +16,7 @@ module Rcrawl
16
16
  @user_agent = "Rcrawl/#{VERSION} (http://rubyforge.org/projects/rcrawl/)"
17
17
  @sites = Hash.new
18
18
  @errors = Hash.new
19
+ @meta = Hash.new
19
20
  @site = URI.parse(site) || raise("You didn't give me a site to crawl")
20
21
  @links_to_visit << site
21
22
  puts "Ready to crawl #{site}"
@@ -78,6 +79,7 @@ module Rcrawl
78
79
  when "text/html"
79
80
  link_extractor(document)
80
81
  process_html(document)
82
+ page_meta(document)
81
83
  else
82
84
  print "... not HTML, skipping..."
83
85
  end
@@ -133,6 +135,10 @@ module Rcrawl
133
135
 
134
136
  end
135
137
 
138
+ def page_meta(document)
139
+ @meta[@url] = document.meta
140
+ end
141
+
136
142
  # robots.txt parsing
137
143
  def robot_safe?(url)
138
144
  uri = URI.parse(url)
data/lib/rcrawl/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  module Rcrawl
2
2
  class Crawler
3
- VERSION = "0.4.6"
3
+ VERSION = "0.5.0"
4
4
  end
5
5
  end
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
3
3
  specification_version: 1
4
4
  name: rcrawl
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.4.7
7
- date: 2006-09-27 00:00:00 -05:00
6
+ version: 0.5.0
7
+ date: 2006-10-02 00:00:00 -05:00
8
8
  summary: A web crawler written in ruby
9
9
  require_paths:
10
10
  - lib
@@ -29,8 +29,8 @@ post_install_message:
29
29
  authors:
30
30
  - Digital Duckies
31
31
  files:
32
- - lib/rcrawl.rb
33
32
  - lib/rcrawl
33
+ - lib/rcrawl.rb
34
34
  - lib/rcrawl/robot_rules.rb
35
35
  - lib/rcrawl/crawler.rb
36
36
  - lib/rcrawl/version.rb