simplecrawler 0.1.1 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/document.rb +1 -0
- data/lib/simplecrawler.rb +3 -1
- metadata +49 -42
data/lib/document.rb
CHANGED
data/lib/simplecrawler.rb
CHANGED
@@ -23,7 +23,7 @@ module SimpleCrawler
|
|
23
23
|
require File.dirname(__FILE__) + '/document'
|
24
24
|
|
25
25
|
MARKUP_MIME_TYPES = ["text/html", "text/xml", "application/xml", "application/xhtml+xml"]
|
26
|
-
VERSION = "0.1.1"
|
26
|
+
VERSION = "0.1.3"
|
27
27
|
|
28
28
|
class Crawler
|
29
29
|
|
@@ -119,6 +119,7 @@ module SimpleCrawler
|
|
119
119
|
end
|
120
120
|
|
121
121
|
doc.headers = file.meta
|
122
|
+
doc.http_status = file.status
|
122
123
|
doc.fetched_at = Time.now
|
123
124
|
rescue Exception
|
124
125
|
log("Error fetching [#{uri}]: #{$!}")
|
@@ -130,6 +131,7 @@ module SimpleCrawler
|
|
130
131
|
|
131
132
|
def queue_local_links(doc)
|
132
133
|
return if doc.data == nil
|
134
|
+
log("Queuing links for #{doc.uri}")
|
133
135
|
Hpricot.buffer_size = 262144 #Allow for asp.net bastard-sized viewstate attributes...
|
134
136
|
doc = Hpricot(doc.data)
|
135
137
|
links = doc.search("a[@href]")
|
metadata
CHANGED
@@ -1,33 +1,34 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
|
-
rubygems_version: 0.9.2
|
3
|
-
specification_version: 1
|
4
2
|
name: simplecrawler
|
5
3
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.1.1
|
7
|
-
date: 2007-08-30 00:00:00 +02:00
|
8
|
-
summary: A generic library for web crawling.
|
9
|
-
require_paths:
|
10
|
-
- lib
|
11
|
-
email: peter.krantzNODAMNSPAM@gmail.com
|
12
|
-
homepage: http://www.peterkrantz.com/simplecrawler/wiki/
|
13
|
-
rubyforge_project: simplecrawler
|
14
|
-
description:
|
15
|
-
autorequire: simplecrawler
|
16
|
-
default_executable:
|
17
|
-
bindir: bin
|
18
|
-
has_rdoc: true
|
19
|
-
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
-
requirements:
|
21
|
-
- - ">="
|
22
|
-
- !ruby/object:Gem::Version
|
23
|
-
version: 1.8.2
|
24
|
-
version:
|
4
|
+
version: 0.1.3
|
25
5
|
platform: ruby
|
26
|
-
signing_key:
|
27
|
-
cert_chain:
|
28
|
-
post_install_message:
|
29
6
|
authors:
|
30
7
|
- Peter Krantz
|
8
|
+
autorequire: simplecrawler
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2008-01-26 00:00:00 +01:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: hpricot
|
17
|
+
version_requirement:
|
18
|
+
version_requirements: !ruby/object:Gem::Requirement
|
19
|
+
requirements:
|
20
|
+
- - ">="
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: "0.5"
|
23
|
+
version:
|
24
|
+
description:
|
25
|
+
email: peter.krantzNODAMNSPAM@gmail.com
|
26
|
+
executables: []
|
27
|
+
|
28
|
+
extensions: []
|
29
|
+
|
30
|
+
extra_rdoc_files: []
|
31
|
+
|
31
32
|
files:
|
32
33
|
- README
|
33
34
|
- lib/document.rb
|
@@ -37,25 +38,31 @@ files:
|
|
37
38
|
- examples/crawl.rb
|
38
39
|
- examples/find_pdfs.rb
|
39
40
|
- examples/list_site_links.rb
|
40
|
-
|
41
|
-
|
41
|
+
has_rdoc: true
|
42
|
+
homepage: http://www.peterkrantz.com/simplecrawler/wiki/
|
43
|
+
post_install_message:
|
42
44
|
rdoc_options: []
|
43
45
|
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
46
|
+
require_paths:
|
47
|
+
- lib
|
48
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
49
|
+
requirements:
|
50
|
+
- - ">="
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: 1.8.2
|
53
|
+
version:
|
54
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
55
|
+
requirements:
|
56
|
+
- - ">="
|
57
|
+
- !ruby/object:Gem::Version
|
58
|
+
version: "0"
|
59
|
+
version:
|
50
60
|
requirements: []
|
51
61
|
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
- !ruby/object:Gem::Version
|
60
|
-
version: "0.5"
|
61
|
-
version:
|
62
|
+
rubyforge_project: simplecrawler
|
63
|
+
rubygems_version: 1.0.0
|
64
|
+
signing_key:
|
65
|
+
specification_version: 2
|
66
|
+
summary: A generic library for web crawling.
|
67
|
+
test_files:
|
68
|
+
- tests/simplecrawler_test.rb
|