spidr 0.3.2 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- data/ChangeLog.md +9 -0
- data/lib/spidr/body.rb +2 -2
- data/lib/spidr/filters.rb +1 -1
- data/lib/spidr/headers.rb +68 -23
- data/lib/spidr/version.rb +1 -1
- data/spec/page_spec.rb +2 -2
- data/spidr.gemspec +8 -4
- metadata +43 -44
data/ChangeLog.md
CHANGED
@@ -1,3 +1,12 @@
|
|
1
|
+
### 0.4.0 / 2011-08-07
|
2
|
+
|
3
|
+
* Added {Spidr::Headers#content_charset}.
|
4
|
+
* Pass the Page `url` and `content_charset` to Nokogiri in {Spidr::Body#doc}.
|
5
|
+
This ensures that Nokogiri will preserve the body encoding.
|
6
|
+
* Made {Spidr::Headers#is_content_type?} public.
|
7
|
+
* Allow {Spidr::Headers#is_content_type?} to match the full Content-Type
|
8
|
+
or the sub-type.
|
9
|
+
|
1
10
|
### 0.3.2 / 2011-06-20
|
2
11
|
|
3
12
|
* Added separate initialize methods for {Spidr::Actions}, {Spidr::Events},
|
data/lib/spidr/body.rb
CHANGED
@@ -27,9 +27,9 @@ module Spidr
|
|
27
27
|
unless body.empty?
|
28
28
|
begin
|
29
29
|
if html?
|
30
|
-
@doc ||= Nokogiri::HTML(body)
|
30
|
+
@doc ||= Nokogiri::HTML(body, @url.to_s, content_charset)
|
31
31
|
elsif (rss? || atom? || xml? || xsl?)
|
32
|
-
@doc ||= Nokogiri::XML(body)
|
32
|
+
@doc ||= Nokogiri::XML(body, @url.to_s, content_charset)
|
33
33
|
end
|
34
34
|
rescue
|
35
35
|
end
|
data/lib/spidr/filters.rb
CHANGED
data/lib/spidr/headers.rb
CHANGED
@@ -115,6 +115,67 @@ module Spidr
|
|
115
115
|
(headers['content-type'] || [])
|
116
116
|
end
|
117
117
|
|
118
|
+
#
|
119
|
+
# The charset included in the Content-Type.
|
120
|
+
#
|
121
|
+
# @return [String, nil]
|
122
|
+
# The charset of the content.
|
123
|
+
#
|
124
|
+
# @since 0.4.0
|
125
|
+
#
|
126
|
+
def content_charset
|
127
|
+
content_types.each do |value|
|
128
|
+
if value.include?(';')
|
129
|
+
value.split(';').each do |param|
|
130
|
+
param.strip!
|
131
|
+
|
132
|
+
if param.start_with?('charset=')
|
133
|
+
return param.split('=',2).last
|
134
|
+
end
|
135
|
+
end
|
136
|
+
end
|
137
|
+
end
|
138
|
+
|
139
|
+
return nil
|
140
|
+
end
|
141
|
+
|
142
|
+
#
|
143
|
+
# Determines if any of the content-types of the page include a given
|
144
|
+
# type.
|
145
|
+
#
|
146
|
+
# @param [String] type
|
147
|
+
# The content-type to test for.
|
148
|
+
#
|
149
|
+
# @return [Boolean]
|
150
|
+
# Specifies whether the page includes the given content-type.
|
151
|
+
#
|
152
|
+
# @example Match the Content-Type
|
153
|
+
# page.is_content_type?('application/json')
|
154
|
+
#
|
155
|
+
# @example Match the sub-type of the Content-Type
|
156
|
+
# page.is_content_type?('json')
|
157
|
+
#
|
158
|
+
# @since 0.4.0
|
159
|
+
#
|
160
|
+
def is_content_type?(type)
|
161
|
+
if type.include?('/')
|
162
|
+
# match the full type/sub-type, ignoring any parameters
|
163
|
+
content_types.any? do |value|
|
164
|
+
value = value.split(';',2).first
|
165
|
+
|
166
|
+
value == type
|
167
|
+
end
|
168
|
+
else
|
169
|
+
# otherwise only match the sub-type
|
170
|
+
content_types.any? do |value|
|
171
|
+
value = value.split(';',2).first
|
172
|
+
value = value.split('/',2).last
|
173
|
+
|
174
|
+
value == type
|
175
|
+
end
|
176
|
+
end
|
177
|
+
end
|
178
|
+
|
118
179
|
#
|
119
180
|
# Determines if the page is plain-text.
|
120
181
|
#
|
@@ -291,35 +352,19 @@ module Spidr
|
|
291
352
|
def cookie_params
|
292
353
|
params = {}
|
293
354
|
|
294
|
-
cookies.each do |
|
295
|
-
|
296
|
-
|
355
|
+
cookies.each do |value|
|
356
|
+
value.split(';').each do |param|
|
357
|
+
param.strip!
|
358
|
+
|
359
|
+
name, value = param.split('=',2)
|
297
360
|
|
298
|
-
unless RESERVED_COOKIE_NAMES.include?(
|
299
|
-
params[
|
361
|
+
unless RESERVED_COOKIE_NAMES.include?(name)
|
362
|
+
params[name] = (value || '')
|
300
363
|
end
|
301
364
|
end
|
302
365
|
end
|
303
366
|
|
304
367
|
return params
|
305
368
|
end
|
306
|
-
|
307
|
-
protected
|
308
|
-
|
309
|
-
#
|
310
|
-
# Determines if any of the content-types of the page include a given
|
311
|
-
# type.
|
312
|
-
#
|
313
|
-
# @param [String] type
|
314
|
-
# The content-type to test for.
|
315
|
-
#
|
316
|
-
# @return [Boolean]
|
317
|
-
# Specifies whether the page includes the given content-type.
|
318
|
-
#
|
319
|
-
# @since 0.2.4
|
320
|
-
#
|
321
|
-
def is_content_type?(type)
|
322
|
-
content_types.any? { |content| content.include?(type) }
|
323
|
-
end
|
324
369
|
end
|
325
370
|
end
|
data/lib/spidr/version.rb
CHANGED
data/spec/page_spec.rb
CHANGED
@@ -17,7 +17,7 @@ describe Page do
|
|
17
17
|
end
|
18
18
|
|
19
19
|
it "should have a content-type" do
|
20
|
-
@page.content_type.should
|
20
|
+
@page.content_type.should include('text/html')
|
21
21
|
end
|
22
22
|
|
23
23
|
it "should be a html page" do
|
@@ -54,7 +54,7 @@ describe Page do
|
|
54
54
|
end
|
55
55
|
|
56
56
|
it "should have a content-type" do
|
57
|
-
@page.content_type.should
|
57
|
+
@page.content_type.should include('text/plain')
|
58
58
|
end
|
59
59
|
|
60
60
|
it "should be a txt page" do
|
data/spidr.gemspec
CHANGED
@@ -3,6 +3,8 @@
|
|
3
3
|
require 'yaml'
|
4
4
|
|
5
5
|
Gem::Specification.new do |gemspec|
|
6
|
+
root = File.dirname(__FILE__)
|
7
|
+
lib_dir = File.join(root,'lib')
|
6
8
|
files = if File.directory?('.git')
|
7
9
|
`git ls-files`.split($/)
|
8
10
|
elsif File.directory?('.hg')
|
@@ -23,12 +25,12 @@ Gem::Specification.new do |gemspec|
|
|
23
25
|
}
|
24
26
|
|
25
27
|
version = {
|
26
|
-
:file => '
|
28
|
+
:file => 'spidr/version',
|
27
29
|
:constant => 'Spidr::VERSION'
|
28
30
|
}
|
29
31
|
|
30
32
|
defaults = {
|
31
|
-
'name' => File.basename(
|
33
|
+
'name' => File.basename(root),
|
32
34
|
'files' => files,
|
33
35
|
'executables' => filter_files['bin/*'].map { |path| File.basename(path) },
|
34
36
|
'test_files' => filter_files['{test/{**/}*_test.rb,spec/{**/}*_spec.rb}'],
|
@@ -40,8 +42,10 @@ Gem::Specification.new do |gemspec|
|
|
40
42
|
gemspec.name = metadata.fetch('name',defaults[:name])
|
41
43
|
gemspec.version = if metadata['version']
|
42
44
|
metadata['version']
|
43
|
-
|
44
|
-
|
45
|
+
else
|
46
|
+
$LOAD_PATH << lib_dir unless $LOAD_PATH.include?(lib_dir)
|
47
|
+
|
48
|
+
require version[:file]
|
45
49
|
eval(version[:constant])
|
46
50
|
end
|
47
51
|
|
metadata
CHANGED
@@ -1,60 +1,60 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: spidr
|
3
|
-
version: !ruby/object:Gem::Version
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.4.0
|
4
5
|
prerelease:
|
5
|
-
version: 0.3.2
|
6
6
|
platform: ruby
|
7
|
-
authors:
|
7
|
+
authors:
|
8
8
|
- Postmodern
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
|
13
|
-
|
14
|
-
dependencies:
|
15
|
-
- !ruby/object:Gem::Dependency
|
12
|
+
date: 2011-08-07 00:00:00.000000000 -07:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
16
|
name: nokogiri
|
17
|
-
requirement: &
|
17
|
+
requirement: &15697980 !ruby/object:Gem::Requirement
|
18
18
|
none: false
|
19
|
-
requirements:
|
19
|
+
requirements:
|
20
20
|
- - ~>
|
21
|
-
- !ruby/object:Gem::Version
|
22
|
-
version:
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: '1.3'
|
23
23
|
type: :runtime
|
24
24
|
prerelease: false
|
25
|
-
version_requirements: *
|
26
|
-
- !ruby/object:Gem::Dependency
|
25
|
+
version_requirements: *15697980
|
26
|
+
- !ruby/object:Gem::Dependency
|
27
27
|
name: bundler
|
28
|
-
requirement: &
|
28
|
+
requirement: &15693140 !ruby/object:Gem::Requirement
|
29
29
|
none: false
|
30
|
-
requirements:
|
30
|
+
requirements:
|
31
31
|
- - ~>
|
32
|
-
- !ruby/object:Gem::Version
|
32
|
+
- !ruby/object:Gem::Version
|
33
33
|
version: 1.0.0
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
|
-
version_requirements: *
|
37
|
-
- !ruby/object:Gem::Dependency
|
36
|
+
version_requirements: *15693140
|
37
|
+
- !ruby/object:Gem::Dependency
|
38
38
|
name: yard
|
39
|
-
requirement: &
|
39
|
+
requirement: &15668040 !ruby/object:Gem::Requirement
|
40
40
|
none: false
|
41
|
-
requirements:
|
41
|
+
requirements:
|
42
42
|
- - ~>
|
43
|
-
- !ruby/object:Gem::Version
|
43
|
+
- !ruby/object:Gem::Version
|
44
44
|
version: 0.6.0
|
45
45
|
type: :development
|
46
46
|
prerelease: false
|
47
|
-
version_requirements: *
|
48
|
-
description: Spidr is a versatile Ruby web spidering library that can spider a site,
|
49
|
-
|
47
|
+
version_requirements: *15668040
|
48
|
+
description: Spidr is a versatile Ruby web spidering library that can spider a site,
|
49
|
+
multiple domains, certain links or infinitely. Spidr is designed to be fast and
|
50
|
+
easy to use.
|
51
|
+
email:
|
50
52
|
- postmodern.mod3@gmail.com
|
51
53
|
executables: []
|
52
|
-
|
53
54
|
extensions: []
|
54
|
-
|
55
|
-
extra_rdoc_files:
|
55
|
+
extra_rdoc_files:
|
56
56
|
- README.md
|
57
|
-
files:
|
57
|
+
files:
|
58
58
|
- .rspec
|
59
59
|
- .yardopts
|
60
60
|
- ChangeLog.md
|
@@ -105,34 +105,33 @@ files:
|
|
105
105
|
- spec/spec_helper.rb
|
106
106
|
- spec/spidr_spec.rb
|
107
107
|
- spidr.gemspec
|
108
|
+
has_rdoc: yard
|
108
109
|
homepage: http://github.com/postmodern/spidr
|
109
|
-
licenses:
|
110
|
+
licenses:
|
110
111
|
- MIT
|
111
112
|
post_install_message:
|
112
113
|
rdoc_options: []
|
113
|
-
|
114
|
-
require_paths:
|
114
|
+
require_paths:
|
115
115
|
- lib
|
116
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
116
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
117
117
|
none: false
|
118
|
-
requirements:
|
119
|
-
- -
|
120
|
-
- !ruby/object:Gem::Version
|
121
|
-
version:
|
122
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
118
|
+
requirements:
|
119
|
+
- - ! '>='
|
120
|
+
- !ruby/object:Gem::Version
|
121
|
+
version: '0'
|
122
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
123
123
|
none: false
|
124
|
-
requirements:
|
125
|
-
- -
|
126
|
-
- !ruby/object:Gem::Version
|
124
|
+
requirements:
|
125
|
+
- - ! '>='
|
126
|
+
- !ruby/object:Gem::Version
|
127
127
|
version: 1.3.6
|
128
128
|
requirements: []
|
129
|
-
|
130
129
|
rubyforge_project: spidr
|
131
|
-
rubygems_version: 1.
|
130
|
+
rubygems_version: 1.6.2
|
132
131
|
signing_key:
|
133
132
|
specification_version: 3
|
134
133
|
summary: A versatile Ruby web spidering library
|
135
|
-
test_files:
|
134
|
+
test_files:
|
136
135
|
- spec/agent_spec.rb
|
137
136
|
- spec/actions_spec.rb
|
138
137
|
- spec/rules_spec.rb
|