simplecrawler 0.1.4 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/simplecrawler.rb +19 -3
- data/tests/simplecrawler_test.rb +8 -0
- metadata +4 -3
data/lib/simplecrawler.rb
CHANGED
@@ -23,11 +23,11 @@ module SimpleCrawler
|
|
23
23
|
require File.dirname(__FILE__) + '/document'
|
24
24
|
|
25
25
|
MARKUP_MIME_TYPES = ["text/html", "text/xml", "application/xml", "application/xhtml+xml"]
|
26
|
-
VERSION = "0.1.4"
|
26
|
+
VERSION = "0.1.6"
|
27
27
|
|
28
28
|
class Crawler
|
29
29
|
|
30
|
-
attr_accessor :user_agent, :skip_patterns, :load_binary_data, :queue, :visited, :site_uri, :maxcount, :current_count
|
30
|
+
attr_accessor :user_agent, :skip_patterns, :include_patterns, :load_binary_data, :queue, :visited, :site_uri, :maxcount, :current_count
|
31
31
|
|
32
32
|
def initialize(url)
|
33
33
|
@load_binary_data = false #default, skip loading of pagedata for binary files into Document.data
|
@@ -80,6 +80,19 @@ module SimpleCrawler
|
|
80
80
|
end
|
81
81
|
end
|
82
82
|
|
83
|
+
#Check if uri is in at least one of the include patterns
|
84
|
+
if @include_patterns
|
85
|
+
match_found = false
|
86
|
+
for include_pattern in @include_patterns
|
87
|
+
re = Regexp.new(include_pattern)
|
88
|
+
if re.match(uri.path)
|
89
|
+
match_found = true
|
90
|
+
end
|
91
|
+
end
|
92
|
+
|
93
|
+
return true unless match_found
|
94
|
+
end
|
95
|
+
|
83
96
|
return false
|
84
97
|
end
|
85
98
|
|
@@ -106,15 +119,17 @@ module SimpleCrawler
|
|
106
119
|
uri.path = path if path != "/"
|
107
120
|
doc.uri = uri
|
108
121
|
|
109
|
-
log("
|
122
|
+
log("Opening #{uri}")
|
110
123
|
|
111
124
|
file = open(uri)
|
112
125
|
|
113
126
|
mime_type = file.meta["content-type"].split(";")[0] if file.meta["content-type"]
|
114
127
|
|
115
128
|
if MARKUP_MIME_TYPES.include?(mime_type.downcase) or @load_binary_data
|
129
|
+
log("Loading data from #{uri}")
|
116
130
|
doc.data = file.read
|
117
131
|
else
|
132
|
+
log("Skipping data for #{uri}")
|
118
133
|
doc.data = nil
|
119
134
|
end
|
120
135
|
|
@@ -140,6 +155,7 @@ module SimpleCrawler
|
|
140
155
|
begin
|
141
156
|
uri = URI.parse(link.attributes["href"])
|
142
157
|
add_uri(uri)
|
158
|
+
log(" Added #{uri}")
|
143
159
|
rescue
|
144
160
|
#skip this link
|
145
161
|
end
|
data/tests/simplecrawler_test.rb
CHANGED
@@ -50,6 +50,14 @@ class SimpleCrawlerTest < Test::Unit::TestCase
|
|
50
50
|
assert_equal false, @simplecrawler.skip_uri?(URI.parse("http://www.example.com/doc.htm"))
|
51
51
|
end
|
52
52
|
|
53
|
+
def test_include_pattern
|
54
|
+
@simplecrawler.include_patterns = ["\\/test\\/", "docs"]
|
55
|
+
assert @simplecrawler.skip_uri?(URI.parse("http://www.example.com/word.doc"))
|
56
|
+
assert_equal false, @simplecrawler.skip_uri?(URI.parse("http://www.example.com/test/doc.htm"))
|
57
|
+
assert_equal false, @simplecrawler.skip_uri?(URI.parse("http://www.example.com/docs/doc.htm"))
|
58
|
+
end
|
59
|
+
|
60
|
+
|
53
61
|
|
54
62
|
def test_addded_paths_shuld_be_distinct
|
55
63
|
@simplecrawler.add_uri("http://www.example.com/") # This path is already in the queue
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: simplecrawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.4
|
4
|
+
version: 0.1.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Peter Krantz
|
@@ -9,11 +9,12 @@ autorequire: simplecrawler
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2008-
|
12
|
+
date: 2008-11-28 00:00:00 +01:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: hpricot
|
17
|
+
type: :runtime
|
17
18
|
version_requirement:
|
18
19
|
version_requirements: !ruby/object:Gem::Requirement
|
19
20
|
requirements:
|
@@ -60,7 +61,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
60
61
|
requirements: []
|
61
62
|
|
62
63
|
rubyforge_project: simplecrawler
|
63
|
-
rubygems_version: 1.
|
64
|
+
rubygems_version: 1.3.1
|
64
65
|
signing_key:
|
65
66
|
specification_version: 2
|
66
67
|
summary: A generic library for web crawling.
|