spidr 0.3.2 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,3 +1,12 @@
1
+ ### 0.4.0 / 2011-08-07
2
+
3
+ * Added {Spidr::Headers#content_charset}.
4
+ * Pass the Page `url` and `content_charset` to Nokogiri in {Spidr::Body#doc}.
5
+ This ensures that Nokogiri will preserve the body encoding.
6
+ * Made {Spidr::Headers#is_content_type?} public.
7
+ * Allow {Spidr::Headers#is_content_type?} to match the full Content-Type
8
+ or the sub-type.
9
+
1
10
  ### 0.3.2 / 2011-06-20
2
11
 
3
12
  * Added separate initialize methods for {Spidr::Actions}, {Spidr::Events},
@@ -27,9 +27,9 @@ module Spidr
27
27
  unless body.empty?
28
28
  begin
29
29
  if html?
30
- @doc ||= Nokogiri::HTML(body)
30
+ @doc ||= Nokogiri::HTML(body, @url.to_s, content_charset)
31
31
  elsif (rss? || atom? || xml? || xsl?)
32
- @doc ||= Nokogiri::XML(body)
32
+ @doc ||= Nokogiri::XML(body, @url.to_s, content_charset)
33
33
  end
34
34
  rescue
35
35
  end
@@ -414,7 +414,7 @@ module Spidr
414
414
  rescue Gem::LoadError => e
415
415
  raise(e)
416
416
  rescue ::LoadError
417
- STDERR.puts "Warning: cannot load 'net/https', https support disabled"
417
+ warn "Warning: cannot load 'net/https', https support disabled"
418
418
  end
419
419
  end
420
420
 
@@ -115,6 +115,67 @@ module Spidr
115
115
  (headers['content-type'] || [])
116
116
  end
117
117
 
118
+ #
119
+ # The charset included in the Content-Type.
120
+ #
121
+ # @return [String, nil]
122
+ # The charset of the content.
123
+ #
124
+ # @since 0.4.0
125
+ #
126
+ def content_charset
127
+ content_types.each do |value|
128
+ if value.include?(';')
129
+ value.split(';').each do |param|
130
+ param.strip!
131
+
132
+ if param.start_with?('charset=')
133
+ return param.split('=',2).last
134
+ end
135
+ end
136
+ end
137
+ end
138
+
139
+ return nil
140
+ end
141
+
142
+ #
143
+ # Determines if any of the content-types of the page include a given
144
+ # type.
145
+ #
146
+ # @param [String] type
147
+ # The content-type to test for.
148
+ #
149
+ # @return [Boolean]
150
+ # Specifies whether the page includes the given content-type.
151
+ #
152
+ # @example Match the Content-Type
153
+ # page.is_content_type?('application/json')
154
+ #
155
+ # @example Match the sub-type of the Content-Type
156
+ # page.is_content_type?('json')
157
+ #
158
+ # @since 0.4.0
159
+ #
160
+ def is_content_type?(type)
161
+ if type.include?('/')
162
+ # match the full Content-Type, ignoring any parameters
163
+ content_types.any? do |value|
164
+ value = value.split(';',2).first
165
+
166
+ value == type
167
+ end
168
+ else
169
+ # otherwise only match the sub-type
170
+ content_types.any? do |value|
171
+ value = value.split(';',2).first
172
+ value = value.split('/',2).last
173
+
174
+ value == type
175
+ end
176
+ end
177
+ end
178
+
118
179
  #
119
180
  # Determines if the page is plain-text.
120
181
  #
@@ -291,35 +352,19 @@ module Spidr
291
352
  def cookie_params
292
353
  params = {}
293
354
 
294
- cookies.each do |cookie|
295
- cookie.split('; ').each do |key_value|
296
- key, value = key_value.split('=',2)
355
+ cookies.each do |value|
356
+ value.split(';').each do |param|
357
+ param.strip!
358
+
359
+ name, value = param.split('=',2)
297
360
 
298
- unless RESERVED_COOKIE_NAMES.include?(key)
299
- params[key] = (value || '')
361
+ unless RESERVED_COOKIE_NAMES.include?(name)
362
+ params[name] = (value || '')
300
363
  end
301
364
  end
302
365
  end
303
366
 
304
367
  return params
305
368
  end
306
-
307
- protected
308
-
309
- #
310
- # Determines if any of the content-types of the page include a given
311
- # type.
312
- #
313
- # @param [String] type
314
- # The content-type to test for.
315
- #
316
- # @return [Boolean]
317
- # Specifies whether the page includes the given content-type.
318
- #
319
- # @since 0.2.4
320
- #
321
- def is_content_type?(type)
322
- content_types.any? { |content| content.include?(type) }
323
- end
324
369
  end
325
370
  end
@@ -1,4 +1,4 @@
1
1
  module Spidr
2
2
  # Spidr version
3
- VERSION = '0.3.2'
3
+ VERSION = '0.4.0'
4
4
  end
@@ -17,7 +17,7 @@ describe Page do
17
17
  end
18
18
 
19
19
  it "should have a content-type" do
20
- @page.content_type.should =~ /text\/html/
20
+ @page.content_type.should include('text/html')
21
21
  end
22
22
 
23
23
  it "should be a html page" do
@@ -54,7 +54,7 @@ describe Page do
54
54
  end
55
55
 
56
56
  it "should have a content-type" do
57
- @page.content_type.should =~ /text\/plain/
57
+ @page.content_type.should include('text/plain')
58
58
  end
59
59
 
60
60
  it "should be a txt page" do
@@ -3,6 +3,8 @@
3
3
  require 'yaml'
4
4
 
5
5
  Gem::Specification.new do |gemspec|
6
+ root = File.dirname(__FILE__)
7
+ lib_dir = File.join(root,'lib')
6
8
  files = if File.directory?('.git')
7
9
  `git ls-files`.split($/)
8
10
  elsif File.directory?('.hg')
@@ -23,12 +25,12 @@ Gem::Specification.new do |gemspec|
23
25
  }
24
26
 
25
27
  version = {
26
- :file => 'lib/spidr/version.rb',
28
+ :file => 'spidr/version',
27
29
  :constant => 'Spidr::VERSION'
28
30
  }
29
31
 
30
32
  defaults = {
31
- 'name' => File.basename(File.dirname(__FILE__)),
33
+ 'name' => File.basename(root),
32
34
  'files' => files,
33
35
  'executables' => filter_files['bin/*'].map { |path| File.basename(path) },
34
36
  'test_files' => filter_files['{test/{**/}*_test.rb,spec/{**/}*_spec.rb}'],
@@ -40,8 +42,10 @@ Gem::Specification.new do |gemspec|
40
42
  gemspec.name = metadata.fetch('name',defaults[:name])
41
43
  gemspec.version = if metadata['version']
42
44
  metadata['version']
43
- elsif File.file?(version[:file])
44
- require File.join('.',version[:file])
45
+ else
46
+ $LOAD_PATH << lib_dir unless $LOAD_PATH.include?(lib_dir)
47
+
48
+ require version[:file]
45
49
  eval(version[:constant])
46
50
  end
47
51
 
metadata CHANGED
@@ -1,60 +1,60 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: spidr
3
- version: !ruby/object:Gem::Version
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.4.0
4
5
  prerelease:
5
- version: 0.3.2
6
6
  platform: ruby
7
- authors:
7
+ authors:
8
8
  - Postmodern
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
-
13
- date: 2011-06-20 00:00:00 Z
14
- dependencies:
15
- - !ruby/object:Gem::Dependency
12
+ date: 2011-08-07 00:00:00.000000000 -07:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
16
  name: nokogiri
17
- requirement: &id001 !ruby/object:Gem::Requirement
17
+ requirement: &15697980 !ruby/object:Gem::Requirement
18
18
  none: false
19
- requirements:
19
+ requirements:
20
20
  - - ~>
21
- - !ruby/object:Gem::Version
22
- version: "1.3"
21
+ - !ruby/object:Gem::Version
22
+ version: '1.3'
23
23
  type: :runtime
24
24
  prerelease: false
25
- version_requirements: *id001
26
- - !ruby/object:Gem::Dependency
25
+ version_requirements: *15697980
26
+ - !ruby/object:Gem::Dependency
27
27
  name: bundler
28
- requirement: &id002 !ruby/object:Gem::Requirement
28
+ requirement: &15693140 !ruby/object:Gem::Requirement
29
29
  none: false
30
- requirements:
30
+ requirements:
31
31
  - - ~>
32
- - !ruby/object:Gem::Version
32
+ - !ruby/object:Gem::Version
33
33
  version: 1.0.0
34
34
  type: :development
35
35
  prerelease: false
36
- version_requirements: *id002
37
- - !ruby/object:Gem::Dependency
36
+ version_requirements: *15693140
37
+ - !ruby/object:Gem::Dependency
38
38
  name: yard
39
- requirement: &id003 !ruby/object:Gem::Requirement
39
+ requirement: &15668040 !ruby/object:Gem::Requirement
40
40
  none: false
41
- requirements:
41
+ requirements:
42
42
  - - ~>
43
- - !ruby/object:Gem::Version
43
+ - !ruby/object:Gem::Version
44
44
  version: 0.6.0
45
45
  type: :development
46
46
  prerelease: false
47
- version_requirements: *id003
48
- description: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.
49
- email:
47
+ version_requirements: *15668040
48
+ description: Spidr is a versatile Ruby web spidering library that can spider a site,
49
+ multiple domains, certain links or infinitely. Spidr is designed to be fast and
50
+ easy to use.
51
+ email:
50
52
  - postmodern.mod3@gmail.com
51
53
  executables: []
52
-
53
54
  extensions: []
54
-
55
- extra_rdoc_files:
55
+ extra_rdoc_files:
56
56
  - README.md
57
- files:
57
+ files:
58
58
  - .rspec
59
59
  - .yardopts
60
60
  - ChangeLog.md
@@ -105,34 +105,33 @@ files:
105
105
  - spec/spec_helper.rb
106
106
  - spec/spidr_spec.rb
107
107
  - spidr.gemspec
108
+ has_rdoc: yard
108
109
  homepage: http://github.com/postmodern/spidr
109
- licenses:
110
+ licenses:
110
111
  - MIT
111
112
  post_install_message:
112
113
  rdoc_options: []
113
-
114
- require_paths:
114
+ require_paths:
115
115
  - lib
116
- required_ruby_version: !ruby/object:Gem::Requirement
116
+ required_ruby_version: !ruby/object:Gem::Requirement
117
117
  none: false
118
- requirements:
119
- - - ">="
120
- - !ruby/object:Gem::Version
121
- version: "0"
122
- required_rubygems_version: !ruby/object:Gem::Requirement
118
+ requirements:
119
+ - - ! '>='
120
+ - !ruby/object:Gem::Version
121
+ version: '0'
122
+ required_rubygems_version: !ruby/object:Gem::Requirement
123
123
  none: false
124
- requirements:
125
- - - ">="
126
- - !ruby/object:Gem::Version
124
+ requirements:
125
+ - - ! '>='
126
+ - !ruby/object:Gem::Version
127
127
  version: 1.3.6
128
128
  requirements: []
129
-
130
129
  rubyforge_project: spidr
131
- rubygems_version: 1.8.5
130
+ rubygems_version: 1.6.2
132
131
  signing_key:
133
132
  specification_version: 3
134
133
  summary: A versatile Ruby web spidering library
135
- test_files:
134
+ test_files:
136
135
  - spec/agent_spec.rb
137
136
  - spec/actions_spec.rb
138
137
  - spec/rules_spec.rb