simplecrawler 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. data/lib/document.rb +1 -0
  2. data/lib/simplecrawler.rb +3 -1
  3. metadata +49 -42
@@ -6,6 +6,7 @@ module SimpleCrawler
6
6
  puts "Document"
7
7
  puts " .uri:\t\t#{uri}"
8
8
  puts " .fetched_at:\t#{fetched_at}"
9
+ puts " .http_status:\t#{http_status}"
9
10
  puts " .headers:"
10
11
  for header in headers
11
12
  puts " #{header[0]}: #{header[1]}"
@@ -23,7 +23,7 @@ module SimpleCrawler
23
23
  require File.dirname(__FILE__) + '/document'
24
24
 
25
25
  MARKUP_MIME_TYPES = ["text/html", "text/xml", "application/xml", "application/xhtml+xml"]
26
- VERSION = "0.1.1"
26
+ VERSION = "0.1.3"
27
27
 
28
28
  class Crawler
29
29
 
@@ -119,6 +119,7 @@ module SimpleCrawler
119
119
  end
120
120
 
121
121
  doc.headers = file.meta
122
+ doc.http_status = file.status
122
123
  doc.fetched_at = Time.now
123
124
  rescue Exception
124
125
  log("Error fetching [#{uri}]: #{$!}")
@@ -130,6 +131,7 @@ module SimpleCrawler
130
131
 
131
132
  def queue_local_links(doc)
132
133
  return if doc.data == nil
134
+ log("Queuing links for #{doc.uri}")
133
135
  Hpricot.buffer_size = 262144 #Allow for asp.net bastard-sized viewstate attributes...
134
136
  doc = Hpricot(doc.data)
135
137
  links = doc.search("a[@href]")
metadata CHANGED
@@ -1,33 +1,34 @@
1
1
  --- !ruby/object:Gem::Specification
2
- rubygems_version: 0.9.2
3
- specification_version: 1
4
2
  name: simplecrawler
5
3
  version: !ruby/object:Gem::Version
6
- version: 0.1.1
7
- date: 2007-08-30 00:00:00 +02:00
8
- summary: A generic library for web crawling.
9
- require_paths:
10
- - lib
11
- email: peter.krantzNODAMNSPAM@gmail.com
12
- homepage: http://www.peterkrantz.com/simplecrawler/wiki/
13
- rubyforge_project: simplecrawler
14
- description:
15
- autorequire: simplecrawler
16
- default_executable:
17
- bindir: bin
18
- has_rdoc: true
19
- required_ruby_version: !ruby/object:Gem::Version::Requirement
20
- requirements:
21
- - - ">="
22
- - !ruby/object:Gem::Version
23
- version: 1.8.2
24
- version:
4
+ version: 0.1.3
25
5
  platform: ruby
26
- signing_key:
27
- cert_chain:
28
- post_install_message:
29
6
  authors:
30
7
  - Peter Krantz
8
+ autorequire: simplecrawler
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2008-01-26 00:00:00 +01:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: hpricot
17
+ version_requirement:
18
+ version_requirements: !ruby/object:Gem::Requirement
19
+ requirements:
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: "0.5"
23
+ version:
24
+ description:
25
+ email: peter.krantzNODAMNSPAM@gmail.com
26
+ executables: []
27
+
28
+ extensions: []
29
+
30
+ extra_rdoc_files: []
31
+
31
32
  files:
32
33
  - README
33
34
  - lib/document.rb
@@ -37,25 +38,31 @@ files:
37
38
  - examples/crawl.rb
38
39
  - examples/find_pdfs.rb
39
40
  - examples/list_site_links.rb
40
- test_files:
41
- - tests/simplecrawler_test.rb
41
+ has_rdoc: true
42
+ homepage: http://www.peterkrantz.com/simplecrawler/wiki/
43
+ post_install_message:
42
44
  rdoc_options: []
43
45
 
44
- extra_rdoc_files: []
45
-
46
- executables: []
47
-
48
- extensions: []
49
-
46
+ require_paths:
47
+ - lib
48
+ required_ruby_version: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: 1.8.2
53
+ version:
54
+ required_rubygems_version: !ruby/object:Gem::Requirement
55
+ requirements:
56
+ - - ">="
57
+ - !ruby/object:Gem::Version
58
+ version: "0"
59
+ version:
50
60
  requirements: []
51
61
 
52
- dependencies:
53
- - !ruby/object:Gem::Dependency
54
- name: hpricot
55
- version_requirement:
56
- version_requirements: !ruby/object:Gem::Version::Requirement
57
- requirements:
58
- - - ">="
59
- - !ruby/object:Gem::Version
60
- version: "0.5"
61
- version:
62
+ rubyforge_project: simplecrawler
63
+ rubygems_version: 1.0.0
64
+ signing_key:
65
+ specification_version: 2
66
+ summary: A generic library for web crawling.
67
+ test_files:
68
+ - tests/simplecrawler_test.rb