spidr 0.3.2 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog.md +9 -0
- data/lib/spidr/body.rb +2 -2
- data/lib/spidr/filters.rb +1 -1
- data/lib/spidr/headers.rb +68 -23
- data/lib/spidr/version.rb +1 -1
- data/spec/page_spec.rb +2 -2
- data/spidr.gemspec +8 -4
- metadata +43 -44
data/ChangeLog.md
CHANGED
@@ -1,3 +1,12 @@
|
|
1
|
+
### 0.4.0 / 2011-08-07
|
2
|
+
|
3
|
+
* Added {Spidr::Headers#content_charset}.
|
4
|
+
* Pass the Page `url` and `content_charset` to Nokogiri in {Spidr::Body#doc}.
|
5
|
+
This ensures that Nokogiri will preserve the body encoding.
|
6
|
+
* Made {Spidr::Headers#is_content_type?} public.
|
7
|
+
* Allow {Spidr::Headers#is_content_type?} to match the full Content-Type
|
8
|
+
or the sub-type.
|
9
|
+
|
1
10
|
### 0.3.2 / 2011-06-20
|
2
11
|
|
3
12
|
* Added separate intitialize methods for {Spidr::Actions}, {Spidr::Events},
|
data/lib/spidr/body.rb
CHANGED
@@ -27,9 +27,9 @@ module Spidr
|
|
27
27
|
unless body.empty?
|
28
28
|
begin
|
29
29
|
if html?
|
30
|
-
@doc ||= Nokogiri::HTML(body)
|
30
|
+
@doc ||= Nokogiri::HTML(body, @url.to_s, content_charset)
|
31
31
|
elsif (rss? || atom? || xml? || xsl?)
|
32
|
-
@doc ||= Nokogiri::XML(body)
|
32
|
+
@doc ||= Nokogiri::XML(body, @url.to_s, content_charset)
|
33
33
|
end
|
34
34
|
rescue
|
35
35
|
end
|
data/lib/spidr/filters.rb
CHANGED
data/lib/spidr/headers.rb
CHANGED
@@ -115,6 +115,67 @@ module Spidr
|
|
115
115
|
(headers['content-type'] || [])
|
116
116
|
end
|
117
117
|
|
118
|
+
#
|
119
|
+
# The charset included in the Content-Type.
|
120
|
+
#
|
121
|
+
# @return [String, nil]
|
122
|
+
# The charset of the content.
|
123
|
+
#
|
124
|
+
# @since 0.4.0
|
125
|
+
#
|
126
|
+
def content_charset
|
127
|
+
content_types.each do |value|
|
128
|
+
if value.include?(';')
|
129
|
+
value.split(';').each do |param|
|
130
|
+
param.strip!
|
131
|
+
|
132
|
+
if param.start_with?('charset=')
|
133
|
+
return param.split('=',2).last
|
134
|
+
end
|
135
|
+
end
|
136
|
+
end
|
137
|
+
end
|
138
|
+
|
139
|
+
return nil
|
140
|
+
end
|
141
|
+
|
142
|
+
#
|
143
|
+
# Determines if any of the content-types of the page include a given
|
144
|
+
# type.
|
145
|
+
#
|
146
|
+
# @param [String] type
|
147
|
+
# The content-type to test for.
|
148
|
+
#
|
149
|
+
# @return [Boolean]
|
150
|
+
# Specifies whether the page includes the given content-type.
|
151
|
+
#
|
152
|
+
# @example Match the Content-Type
|
153
|
+
# page.is_content_type?('application/json')
|
154
|
+
#
|
155
|
+
# @example Match the sub-type of the Content-Type
|
156
|
+
# page.is_content_type?('json')
|
157
|
+
#
|
158
|
+
# @since 0.4.0
|
159
|
+
#
|
160
|
+
def is_content_type?(type)
|
161
|
+
if type.include?('/')
|
162
|
+
# otherwise only match the first param
|
163
|
+
content_types.any? do |value|
|
164
|
+
value = value.split(';',2).first
|
165
|
+
|
166
|
+
value == type
|
167
|
+
end
|
168
|
+
else
|
169
|
+
# otherwise only match the sub-type
|
170
|
+
content_types.any? do |value|
|
171
|
+
value = value.split(';',2).first
|
172
|
+
value = value.split('/',2).last
|
173
|
+
|
174
|
+
value == type
|
175
|
+
end
|
176
|
+
end
|
177
|
+
end
|
178
|
+
|
118
179
|
#
|
119
180
|
# Determines if the page is plain-text.
|
120
181
|
#
|
@@ -291,35 +352,19 @@ module Spidr
|
|
291
352
|
def cookie_params
|
292
353
|
params = {}
|
293
354
|
|
294
|
-
cookies.each do |
|
295
|
-
|
296
|
-
|
355
|
+
cookies.each do |value|
|
356
|
+
value.split(';').each do |param|
|
357
|
+
param.strip!
|
358
|
+
|
359
|
+
name, value = param.split('=',2)
|
297
360
|
|
298
|
-
unless RESERVED_COOKIE_NAMES.include?(
|
299
|
-
params[
|
361
|
+
unless RESERVED_COOKIE_NAMES.include?(name)
|
362
|
+
params[name] = (value || '')
|
300
363
|
end
|
301
364
|
end
|
302
365
|
end
|
303
366
|
|
304
367
|
return params
|
305
368
|
end
|
306
|
-
|
307
|
-
protected
|
308
|
-
|
309
|
-
#
|
310
|
-
# Determines if any of the content-types of the page include a given
|
311
|
-
# type.
|
312
|
-
#
|
313
|
-
# @param [String] type
|
314
|
-
# The content-type to test for.
|
315
|
-
#
|
316
|
-
# @return [Boolean]
|
317
|
-
# Specifies whether the page includes the given content-type.
|
318
|
-
#
|
319
|
-
# @since 0.2.4
|
320
|
-
#
|
321
|
-
def is_content_type?(type)
|
322
|
-
content_types.any? { |content| content.include?(type) }
|
323
|
-
end
|
324
369
|
end
|
325
370
|
end
|
data/lib/spidr/version.rb
CHANGED
data/spec/page_spec.rb
CHANGED
@@ -17,7 +17,7 @@ describe Page do
|
|
17
17
|
end
|
18
18
|
|
19
19
|
it "should have a content-type" do
|
20
|
-
@page.content_type.should
|
20
|
+
@page.content_type.should include('text/html')
|
21
21
|
end
|
22
22
|
|
23
23
|
it "should be a html page" do
|
@@ -54,7 +54,7 @@ describe Page do
|
|
54
54
|
end
|
55
55
|
|
56
56
|
it "should have a content-type" do
|
57
|
-
@page.content_type.should
|
57
|
+
@page.content_type.should include('text/plain')
|
58
58
|
end
|
59
59
|
|
60
60
|
it "should be a txt page" do
|
data/spidr.gemspec
CHANGED
@@ -3,6 +3,8 @@
|
|
3
3
|
require 'yaml'
|
4
4
|
|
5
5
|
Gem::Specification.new do |gemspec|
|
6
|
+
root = File.dirname(__FILE__)
|
7
|
+
lib_dir = File.join(root,'lib')
|
6
8
|
files = if File.directory?('.git')
|
7
9
|
`git ls-files`.split($/)
|
8
10
|
elsif File.directory?('.hg')
|
@@ -23,12 +25,12 @@ Gem::Specification.new do |gemspec|
|
|
23
25
|
}
|
24
26
|
|
25
27
|
version = {
|
26
|
-
:file => '
|
28
|
+
:file => 'spidr/version',
|
27
29
|
:constant => 'Spidr::VERSION'
|
28
30
|
}
|
29
31
|
|
30
32
|
defaults = {
|
31
|
-
'name' => File.basename(
|
33
|
+
'name' => File.basename(root),
|
32
34
|
'files' => files,
|
33
35
|
'executables' => filter_files['bin/*'].map { |path| File.basename(path) },
|
34
36
|
'test_files' => filter_files['{test/{**/}*_test.rb,spec/{**/}*_spec.rb}'],
|
@@ -40,8 +42,10 @@ Gem::Specification.new do |gemspec|
|
|
40
42
|
gemspec.name = metadata.fetch('name',defaults[:name])
|
41
43
|
gemspec.version = if metadata['version']
|
42
44
|
metadata['version']
|
43
|
-
|
44
|
-
|
45
|
+
else
|
46
|
+
$LOAD_PATH << lib_dir unless $LOAD_PATH.include?(lib_dir)
|
47
|
+
|
48
|
+
require version[:file]
|
45
49
|
eval(version[:constant])
|
46
50
|
end
|
47
51
|
|
metadata
CHANGED
@@ -1,60 +1,60 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: spidr
|
3
|
-
version: !ruby/object:Gem::Version
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.4.0
|
4
5
|
prerelease:
|
5
|
-
version: 0.3.2
|
6
6
|
platform: ruby
|
7
|
-
authors:
|
7
|
+
authors:
|
8
8
|
- Postmodern
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
|
13
|
-
|
14
|
-
dependencies:
|
15
|
-
- !ruby/object:Gem::Dependency
|
12
|
+
date: 2011-08-07 00:00:00.000000000 -07:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
16
|
name: nokogiri
|
17
|
-
requirement: &
|
17
|
+
requirement: &15697980 !ruby/object:Gem::Requirement
|
18
18
|
none: false
|
19
|
-
requirements:
|
19
|
+
requirements:
|
20
20
|
- - ~>
|
21
|
-
- !ruby/object:Gem::Version
|
22
|
-
version:
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: '1.3'
|
23
23
|
type: :runtime
|
24
24
|
prerelease: false
|
25
|
-
version_requirements: *
|
26
|
-
- !ruby/object:Gem::Dependency
|
25
|
+
version_requirements: *15697980
|
26
|
+
- !ruby/object:Gem::Dependency
|
27
27
|
name: bundler
|
28
|
-
requirement: &
|
28
|
+
requirement: &15693140 !ruby/object:Gem::Requirement
|
29
29
|
none: false
|
30
|
-
requirements:
|
30
|
+
requirements:
|
31
31
|
- - ~>
|
32
|
-
- !ruby/object:Gem::Version
|
32
|
+
- !ruby/object:Gem::Version
|
33
33
|
version: 1.0.0
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
|
-
version_requirements: *
|
37
|
-
- !ruby/object:Gem::Dependency
|
36
|
+
version_requirements: *15693140
|
37
|
+
- !ruby/object:Gem::Dependency
|
38
38
|
name: yard
|
39
|
-
requirement: &
|
39
|
+
requirement: &15668040 !ruby/object:Gem::Requirement
|
40
40
|
none: false
|
41
|
-
requirements:
|
41
|
+
requirements:
|
42
42
|
- - ~>
|
43
|
-
- !ruby/object:Gem::Version
|
43
|
+
- !ruby/object:Gem::Version
|
44
44
|
version: 0.6.0
|
45
45
|
type: :development
|
46
46
|
prerelease: false
|
47
|
-
version_requirements: *
|
48
|
-
description: Spidr is a versatile Ruby web spidering library that can spider a site,
|
49
|
-
|
47
|
+
version_requirements: *15668040
|
48
|
+
description: Spidr is a versatile Ruby web spidering library that can spider a site,
|
49
|
+
multiple domains, certain links or infinitely. Spidr is designed to be fast and
|
50
|
+
easy to use.
|
51
|
+
email:
|
50
52
|
- postmodern.mod3@gmail.com
|
51
53
|
executables: []
|
52
|
-
|
53
54
|
extensions: []
|
54
|
-
|
55
|
-
extra_rdoc_files:
|
55
|
+
extra_rdoc_files:
|
56
56
|
- README.md
|
57
|
-
files:
|
57
|
+
files:
|
58
58
|
- .rspec
|
59
59
|
- .yardopts
|
60
60
|
- ChangeLog.md
|
@@ -105,34 +105,33 @@ files:
|
|
105
105
|
- spec/spec_helper.rb
|
106
106
|
- spec/spidr_spec.rb
|
107
107
|
- spidr.gemspec
|
108
|
+
has_rdoc: yard
|
108
109
|
homepage: http://github.com/postmodern/spidr
|
109
|
-
licenses:
|
110
|
+
licenses:
|
110
111
|
- MIT
|
111
112
|
post_install_message:
|
112
113
|
rdoc_options: []
|
113
|
-
|
114
|
-
require_paths:
|
114
|
+
require_paths:
|
115
115
|
- lib
|
116
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
116
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
117
117
|
none: false
|
118
|
-
requirements:
|
119
|
-
- -
|
120
|
-
- !ruby/object:Gem::Version
|
121
|
-
version:
|
122
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
118
|
+
requirements:
|
119
|
+
- - ! '>='
|
120
|
+
- !ruby/object:Gem::Version
|
121
|
+
version: '0'
|
122
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
123
123
|
none: false
|
124
|
-
requirements:
|
125
|
-
- -
|
126
|
-
- !ruby/object:Gem::Version
|
124
|
+
requirements:
|
125
|
+
- - ! '>='
|
126
|
+
- !ruby/object:Gem::Version
|
127
127
|
version: 1.3.6
|
128
128
|
requirements: []
|
129
|
-
|
130
129
|
rubyforge_project: spidr
|
131
|
-
rubygems_version: 1.
|
130
|
+
rubygems_version: 1.6.2
|
132
131
|
signing_key:
|
133
132
|
specification_version: 3
|
134
133
|
summary: A versatile Ruby web spidering library
|
135
|
-
test_files:
|
134
|
+
test_files:
|
136
135
|
- spec/agent_spec.rb
|
137
136
|
- spec/actions_spec.rb
|
138
137
|
- spec/rules_spec.rb
|