rcrawl 0.4.7 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile CHANGED
@@ -18,7 +18,7 @@ end
18
18
 
19
19
  spec = Gem::Specification.new do |s|
20
20
  s.name = "rcrawl"
21
- s.version = "0.4.7"
21
+ s.version = "0.5.0"
22
22
  s.author = "Digital Duckies"
23
23
  s.email = "rcrawl@digitalduckies.net"
24
24
  s.homepage = "http://digitalduckies.net"
data/TODO CHANGED
@@ -3,5 +3,3 @@ Add max connections and max connections/second code
3
3
  Add referer code
4
4
  Add proxy code? Is this high up on anyone's list, or can it be put off for now?
5
5
  Logging code
6
- Store page headers and page metadata
7
-
data/lib/rcrawl/crawler.rb CHANGED
@@ -4,7 +4,7 @@ module Rcrawl
4
4
 
5
5
  attr_accessor :links_to_visit, :site, :user_agent
6
6
  attr_reader :visited_links, :external_links, :raw_html, :rules, :sites,
7
- :errors
7
+ :errors, :meta
8
8
  # Initializes various variables when a new Crawler object is instantiated
9
9
  def initialize(site)
10
10
  puts "Rcrawl Version #{VERSION} initializing..."
@@ -16,6 +16,7 @@ module Rcrawl
16
16
  @user_agent = "Rcrawl/#{VERSION} (http://rubyforge.org/projects/rcrawl/)"
17
17
  @sites = Hash.new
18
18
  @errors = Hash.new
19
+ @meta = Hash.new
19
20
  @site = URI.parse(site) || raise("You didn't give me a site to crawl")
20
21
  @links_to_visit << site
21
22
  puts "Ready to crawl #{site}"
@@ -78,6 +79,7 @@ module Rcrawl
78
79
  when "text/html"
79
80
  link_extractor(document)
80
81
  process_html(document)
82
+ page_meta(document)
81
83
  else
82
84
  print "... not HTML, skipping..."
83
85
  end
@@ -133,6 +135,10 @@ module Rcrawl
133
135
 
134
136
  end
135
137
 
138
+ def page_meta(document)
139
+ @meta[@url] = document.meta
140
+ end
141
+
136
142
  # robots.txt parsing
137
143
  def robot_safe?(url)
138
144
  uri = URI.parse(url)
data/lib/rcrawl/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  module Rcrawl
2
2
  class Crawler
3
- VERSION = "0.4.6"
3
+ VERSION = "0.5.0"
4
4
  end
5
5
  end
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
3
3
  specification_version: 1
4
4
  name: rcrawl
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.4.7
7
- date: 2006-09-27 00:00:00 -05:00
6
+ version: 0.5.0
7
+ date: 2006-10-02 00:00:00 -05:00
8
8
  summary: A web crawler written in ruby
9
9
  require_paths:
10
10
  - lib
@@ -29,8 +29,8 @@ post_install_message:
29
29
  authors:
30
30
  - Digital Duckies
31
31
  files:
32
- - lib/rcrawl.rb
33
32
  - lib/rcrawl
33
+ - lib/rcrawl.rb
34
34
  - lib/rcrawl/robot_rules.rb
35
35
  - lib/rcrawl/crawler.rb
36
36
  - lib/rcrawl/version.rb