simplecrawler 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/document.rb +1 -0
- data/lib/simplecrawler.rb +3 -1
- metadata +49 -42
data/lib/document.rb
CHANGED
data/lib/simplecrawler.rb
CHANGED
@@ -23,7 +23,7 @@ module SimpleCrawler
|
|
23
23
|
require File.dirname(__FILE__) + '/document'
|
24
24
|
|
25
25
|
MARKUP_MIME_TYPES = ["text/html", "text/xml", "application/xml", "application/xhtml+xml"]
|
26
|
-
VERSION = "0.1.1"
|
26
|
+
VERSION = "0.1.3"
|
27
27
|
|
28
28
|
class Crawler
|
29
29
|
|
@@ -119,6 +119,7 @@ module SimpleCrawler
|
|
119
119
|
end
|
120
120
|
|
121
121
|
doc.headers = file.meta
|
122
|
+
doc.http_status = file.status
|
122
123
|
doc.fetched_at = Time.now
|
123
124
|
rescue Exception
|
124
125
|
log("Error fetching [#{uri}]: #{$!}")
|
@@ -130,6 +131,7 @@ module SimpleCrawler
|
|
130
131
|
|
131
132
|
def queue_local_links(doc)
|
132
133
|
return if doc.data == nil
|
134
|
+
log("Queuing links for #{doc.uri}")
|
133
135
|
Hpricot.buffer_size = 262144 #Allow for asp.net bastard-sized viewstate attributes...
|
134
136
|
doc = Hpricot(doc.data)
|
135
137
|
links = doc.search("a[@href]")
|
metadata
CHANGED
@@ -1,33 +1,34 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
|
-
rubygems_version: 0.9.2
|
3
|
-
specification_version: 1
|
4
2
|
name: simplecrawler
|
5
3
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.1.1
|
7
|
-
date: 2007-08-30 00:00:00 +02:00
|
8
|
-
summary: A generic library for web crawling.
|
9
|
-
require_paths:
|
10
|
-
- lib
|
11
|
-
email: peter.krantzNODAMNSPAM@gmail.com
|
12
|
-
homepage: http://www.peterkrantz.com/simplecrawler/wiki/
|
13
|
-
rubyforge_project: simplecrawler
|
14
|
-
description:
|
15
|
-
autorequire: simplecrawler
|
16
|
-
default_executable:
|
17
|
-
bindir: bin
|
18
|
-
has_rdoc: true
|
19
|
-
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
-
requirements:
|
21
|
-
- - ">="
|
22
|
-
- !ruby/object:Gem::Version
|
23
|
-
version: 1.8.2
|
24
|
-
version:
|
4
|
+
version: 0.1.3
|
25
5
|
platform: ruby
|
26
|
-
signing_key:
|
27
|
-
cert_chain:
|
28
|
-
post_install_message:
|
29
6
|
authors:
|
30
7
|
- Peter Krantz
|
8
|
+
autorequire: simplecrawler
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2008-01-26 00:00:00 +01:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: hpricot
|
17
|
+
version_requirement:
|
18
|
+
version_requirements: !ruby/object:Gem::Requirement
|
19
|
+
requirements:
|
20
|
+
- - ">="
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: "0.5"
|
23
|
+
version:
|
24
|
+
description:
|
25
|
+
email: peter.krantzNODAMNSPAM@gmail.com
|
26
|
+
executables: []
|
27
|
+
|
28
|
+
extensions: []
|
29
|
+
|
30
|
+
extra_rdoc_files: []
|
31
|
+
|
31
32
|
files:
|
32
33
|
- README
|
33
34
|
- lib/document.rb
|
@@ -37,25 +38,31 @@ files:
|
|
37
38
|
- examples/crawl.rb
|
38
39
|
- examples/find_pdfs.rb
|
39
40
|
- examples/list_site_links.rb
|
40
|
-
|
41
|
-
|
41
|
+
has_rdoc: true
|
42
|
+
homepage: http://www.peterkrantz.com/simplecrawler/wiki/
|
43
|
+
post_install_message:
|
42
44
|
rdoc_options: []
|
43
45
|
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
46
|
+
require_paths:
|
47
|
+
- lib
|
48
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
49
|
+
requirements:
|
50
|
+
- - ">="
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: 1.8.2
|
53
|
+
version:
|
54
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
55
|
+
requirements:
|
56
|
+
- - ">="
|
57
|
+
- !ruby/object:Gem::Version
|
58
|
+
version: "0"
|
59
|
+
version:
|
50
60
|
requirements: []
|
51
61
|
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
- !ruby/object:Gem::Version
|
60
|
-
version: "0.5"
|
61
|
-
version:
|
62
|
+
rubyforge_project: simplecrawler
|
63
|
+
rubygems_version: 1.0.0
|
64
|
+
signing_key:
|
65
|
+
specification_version: 2
|
66
|
+
summary: A generic library for web crawling.
|
67
|
+
test_files:
|
68
|
+
- tests/simplecrawler_test.rb
|