simplecrawler 0.1.1 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3):
  1. data/lib/document.rb +1 -0
  2. data/lib/simplecrawler.rb +3 -1
  3. metadata +49 -42
@@ -6,6 +6,7 @@ module SimpleCrawler
6
6
  puts "Document"
7
7
  puts " .uri:\t\t#{uri}"
8
8
  puts " .fetched_at:\t#{fetched_at}"
9
+ puts " .http_status:\t#{http_status}"
9
10
  puts " .headers:"
10
11
  for header in headers
11
12
  puts " #{header[0]}: #{header[1]}"
@@ -23,7 +23,7 @@ module SimpleCrawler
23
23
  require File.dirname(__FILE__) + '/document'
24
24
 
25
25
  MARKUP_MIME_TYPES = ["text/html", "text/xml", "application/xml", "application/xhtml+xml"]
26
- VERSION = "0.1.1"
26
+ VERSION = "0.1.3"
27
27
 
28
28
  class Crawler
29
29
 
@@ -119,6 +119,7 @@ module SimpleCrawler
119
119
  end
120
120
 
121
121
  doc.headers = file.meta
122
+ doc.http_status = file.status
122
123
  doc.fetched_at = Time.now
123
124
  rescue Exception
124
125
  log("Error fetching [#{uri}]: #{$!}")
@@ -130,6 +131,7 @@ module SimpleCrawler
130
131
 
131
132
  def queue_local_links(doc)
132
133
  return if doc.data == nil
134
+ log("Queuing links for #{doc.uri}")
133
135
  Hpricot.buffer_size = 262144 #Allow for asp.net bastard-sized viewstate attributes...
134
136
  doc = Hpricot(doc.data)
135
137
  links = doc.search("a[@href]")
metadata CHANGED
@@ -1,33 +1,34 @@
1
1
  --- !ruby/object:Gem::Specification
2
- rubygems_version: 0.9.2
3
- specification_version: 1
4
2
  name: simplecrawler
5
3
  version: !ruby/object:Gem::Version
6
- version: 0.1.1
7
- date: 2007-08-30 00:00:00 +02:00
8
- summary: A generic library for web crawling.
9
- require_paths:
10
- - lib
11
- email: peter.krantzNODAMNSPAM@gmail.com
12
- homepage: http://www.peterkrantz.com/simplecrawler/wiki/
13
- rubyforge_project: simplecrawler
14
- description:
15
- autorequire: simplecrawler
16
- default_executable:
17
- bindir: bin
18
- has_rdoc: true
19
- required_ruby_version: !ruby/object:Gem::Version::Requirement
20
- requirements:
21
- - - ">="
22
- - !ruby/object:Gem::Version
23
- version: 1.8.2
24
- version:
4
+ version: 0.1.3
25
5
  platform: ruby
26
- signing_key:
27
- cert_chain:
28
- post_install_message:
29
6
  authors:
30
7
  - Peter Krantz
8
+ autorequire: simplecrawler
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2008-01-26 00:00:00 +01:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: hpricot
17
+ version_requirement:
18
+ version_requirements: !ruby/object:Gem::Requirement
19
+ requirements:
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: "0.5"
23
+ version:
24
+ description:
25
+ email: peter.krantzNODAMNSPAM@gmail.com
26
+ executables: []
27
+
28
+ extensions: []
29
+
30
+ extra_rdoc_files: []
31
+
31
32
  files:
32
33
  - README
33
34
  - lib/document.rb
@@ -37,25 +38,31 @@ files:
37
38
  - examples/crawl.rb
38
39
  - examples/find_pdfs.rb
39
40
  - examples/list_site_links.rb
40
- test_files:
41
- - tests/simplecrawler_test.rb
41
+ has_rdoc: true
42
+ homepage: http://www.peterkrantz.com/simplecrawler/wiki/
43
+ post_install_message:
42
44
  rdoc_options: []
43
45
 
44
- extra_rdoc_files: []
45
-
46
- executables: []
47
-
48
- extensions: []
49
-
46
+ require_paths:
47
+ - lib
48
+ required_ruby_version: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: 1.8.2
53
+ version:
54
+ required_rubygems_version: !ruby/object:Gem::Requirement
55
+ requirements:
56
+ - - ">="
57
+ - !ruby/object:Gem::Version
58
+ version: "0"
59
+ version:
50
60
  requirements: []
51
61
 
52
- dependencies:
53
- - !ruby/object:Gem::Dependency
54
- name: hpricot
55
- version_requirement:
56
- version_requirements: !ruby/object:Gem::Version::Requirement
57
- requirements:
58
- - - ">="
59
- - !ruby/object:Gem::Version
60
- version: "0.5"
61
- version:
62
+ rubyforge_project: simplecrawler
63
+ rubygems_version: 1.0.0
64
+ signing_key:
65
+ specification_version: 2
66
+ summary: A generic library for web crawling.
67
+ test_files:
68
+ - tests/simplecrawler_test.rb