spidr 0.3.2 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,12 @@
1
+ ### 0.4.0 / 2011-08-07
2
+
3
+ * Added {Spidr::Headers#content_charset}.
4
+ * Pass the Page `url` and `content_charset` to Nokogiri in {Spidr::Body#doc}.
5
+ This ensures that Nokogiri will preserve the body encoding.
6
+ * Made {Spidr::Headers#is_content_type?} public.
7
+ * Allow {Spidr::Headers#is_content_type?} to match the full Content-Type
8
+ or the sub-type.
9
+
1
10
  ### 0.3.2 / 2011-06-20
2
11
 
3
12
  * Added separate initialize methods for {Spidr::Actions}, {Spidr::Events},
@@ -27,9 +27,9 @@ module Spidr
27
27
  unless body.empty?
28
28
  begin
29
29
  if html?
30
- @doc ||= Nokogiri::HTML(body)
30
+ @doc ||= Nokogiri::HTML(body, @url.to_s, content_charset)
31
31
  elsif (rss? || atom? || xml? || xsl?)
32
- @doc ||= Nokogiri::XML(body)
32
+ @doc ||= Nokogiri::XML(body, @url.to_s, content_charset)
33
33
  end
34
34
  rescue
35
35
  end
@@ -414,7 +414,7 @@ module Spidr
414
414
  rescue Gem::LoadError => e
415
415
  raise(e)
416
416
  rescue ::LoadError
417
- STDERR.puts "Warning: cannot load 'net/https', https support disabled"
417
+ warn "Warning: cannot load 'net/https', https support disabled"
418
418
  end
419
419
  end
420
420
 
@@ -115,6 +115,67 @@ module Spidr
115
115
  (headers['content-type'] || [])
116
116
  end
117
117
 
118
+ #
119
+ # The charset included in the Content-Type.
120
+ #
121
+ # @return [String, nil]
122
+ # The charset of the content.
123
+ #
124
+ # @since 0.4.0
125
+ #
126
+ def content_charset
127
+ content_types.each do |value|
128
+ if value.include?(';')
129
+ value.split(';').each do |param|
130
+ param.strip!
131
+
132
+ if param.start_with?('charset=')
133
+ return param.split('=',2).last
134
+ end
135
+ end
136
+ end
137
+ end
138
+
139
+ return nil
140
+ end
141
+
142
+ #
143
+ # Determines if any of the content-types of the page include a given
144
+ # type.
145
+ #
146
+ # @param [String] type
147
+ # The content-type to test for.
148
+ #
149
+ # @return [Boolean]
150
+ # Specifies whether the page includes the given content-type.
151
+ #
152
+ # @example Match the Content-Type
153
+ # page.is_content_type?('application/json')
154
+ #
155
+ # @example Match the sub-type of the Content-Type
156
+ # page.is_content_type?('json')
157
+ #
158
+ # @since 0.4.0
159
+ #
160
+ def is_content_type?(type)
161
+ if type.include?('/')
162
+ # match the full Content-Type, ignoring any parameters
163
+ content_types.any? do |value|
164
+ value = value.split(';',2).first
165
+
166
+ value == type
167
+ end
168
+ else
169
+ # otherwise only match the sub-type
170
+ content_types.any? do |value|
171
+ value = value.split(';',2).first
172
+ value = value.split('/',2).last
173
+
174
+ value == type
175
+ end
176
+ end
177
+ end
178
+
118
179
  #
119
180
  # Determines if the page is plain-text.
120
181
  #
@@ -291,35 +352,19 @@ module Spidr
291
352
  def cookie_params
292
353
  params = {}
293
354
 
294
- cookies.each do |cookie|
295
- cookie.split('; ').each do |key_value|
296
- key, value = key_value.split('=',2)
355
+ cookies.each do |value|
356
+ value.split(';').each do |param|
357
+ param.strip!
358
+
359
+ name, value = param.split('=',2)
297
360
 
298
- unless RESERVED_COOKIE_NAMES.include?(key)
299
- params[key] = (value || '')
361
+ unless RESERVED_COOKIE_NAMES.include?(name)
362
+ params[name] = (value || '')
300
363
  end
301
364
  end
302
365
  end
303
366
 
304
367
  return params
305
368
  end
306
-
307
- protected
308
-
309
- #
310
- # Determines if any of the content-types of the page include a given
311
- # type.
312
- #
313
- # @param [String] type
314
- # The content-type to test for.
315
- #
316
- # @return [Boolean]
317
- # Specifies whether the page includes the given content-type.
318
- #
319
- # @since 0.2.4
320
- #
321
- def is_content_type?(type)
322
- content_types.any? { |content| content.include?(type) }
323
- end
324
369
  end
325
370
  end
@@ -1,4 +1,4 @@
1
1
  module Spidr
2
2
  # Spidr version
3
- VERSION = '0.3.2'
3
+ VERSION = '0.4.0'
4
4
  end
@@ -17,7 +17,7 @@ describe Page do
17
17
  end
18
18
 
19
19
  it "should have a content-type" do
20
- @page.content_type.should =~ /text\/html/
20
+ @page.content_type.should include('text/html')
21
21
  end
22
22
 
23
23
  it "should be a html page" do
@@ -54,7 +54,7 @@ describe Page do
54
54
  end
55
55
 
56
56
  it "should have a content-type" do
57
- @page.content_type.should =~ /text\/plain/
57
+ @page.content_type.should include('text/plain')
58
58
  end
59
59
 
60
60
  it "should be a txt page" do
@@ -3,6 +3,8 @@
3
3
  require 'yaml'
4
4
 
5
5
  Gem::Specification.new do |gemspec|
6
+ root = File.dirname(__FILE__)
7
+ lib_dir = File.join(root,'lib')
6
8
  files = if File.directory?('.git')
7
9
  `git ls-files`.split($/)
8
10
  elsif File.directory?('.hg')
@@ -23,12 +25,12 @@ Gem::Specification.new do |gemspec|
23
25
  }
24
26
 
25
27
  version = {
26
- :file => 'lib/spidr/version.rb',
28
+ :file => 'spidr/version',
27
29
  :constant => 'Spidr::VERSION'
28
30
  }
29
31
 
30
32
  defaults = {
31
- 'name' => File.basename(File.dirname(__FILE__)),
33
+ 'name' => File.basename(root),
32
34
  'files' => files,
33
35
  'executables' => filter_files['bin/*'].map { |path| File.basename(path) },
34
36
  'test_files' => filter_files['{test/{**/}*_test.rb,spec/{**/}*_spec.rb}'],
@@ -40,8 +42,10 @@ Gem::Specification.new do |gemspec|
40
42
  gemspec.name = metadata.fetch('name',defaults[:name])
41
43
  gemspec.version = if metadata['version']
42
44
  metadata['version']
43
- elsif File.file?(version[:file])
44
- require File.join('.',version[:file])
45
+ else
46
+ $LOAD_PATH << lib_dir unless $LOAD_PATH.include?(lib_dir)
47
+
48
+ require version[:file]
45
49
  eval(version[:constant])
46
50
  end
47
51
 
metadata CHANGED
@@ -1,60 +1,60 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: spidr
3
- version: !ruby/object:Gem::Version
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.4.0
4
5
  prerelease:
5
- version: 0.3.2
6
6
  platform: ruby
7
- authors:
7
+ authors:
8
8
  - Postmodern
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
-
13
- date: 2011-06-20 00:00:00 Z
14
- dependencies:
15
- - !ruby/object:Gem::Dependency
12
+ date: 2011-08-07 00:00:00.000000000 -07:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
16
  name: nokogiri
17
- requirement: &id001 !ruby/object:Gem::Requirement
17
+ requirement: &15697980 !ruby/object:Gem::Requirement
18
18
  none: false
19
- requirements:
19
+ requirements:
20
20
  - - ~>
21
- - !ruby/object:Gem::Version
22
- version: "1.3"
21
+ - !ruby/object:Gem::Version
22
+ version: '1.3'
23
23
  type: :runtime
24
24
  prerelease: false
25
- version_requirements: *id001
26
- - !ruby/object:Gem::Dependency
25
+ version_requirements: *15697980
26
+ - !ruby/object:Gem::Dependency
27
27
  name: bundler
28
- requirement: &id002 !ruby/object:Gem::Requirement
28
+ requirement: &15693140 !ruby/object:Gem::Requirement
29
29
  none: false
30
- requirements:
30
+ requirements:
31
31
  - - ~>
32
- - !ruby/object:Gem::Version
32
+ - !ruby/object:Gem::Version
33
33
  version: 1.0.0
34
34
  type: :development
35
35
  prerelease: false
36
- version_requirements: *id002
37
- - !ruby/object:Gem::Dependency
36
+ version_requirements: *15693140
37
+ - !ruby/object:Gem::Dependency
38
38
  name: yard
39
- requirement: &id003 !ruby/object:Gem::Requirement
39
+ requirement: &15668040 !ruby/object:Gem::Requirement
40
40
  none: false
41
- requirements:
41
+ requirements:
42
42
  - - ~>
43
- - !ruby/object:Gem::Version
43
+ - !ruby/object:Gem::Version
44
44
  version: 0.6.0
45
45
  type: :development
46
46
  prerelease: false
47
- version_requirements: *id003
48
- description: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.
49
- email:
47
+ version_requirements: *15668040
48
+ description: Spidr is a versatile Ruby web spidering library that can spider a site,
49
+ multiple domains, certain links or infinitely. Spidr is designed to be fast and
50
+ easy to use.
51
+ email:
50
52
  - postmodern.mod3@gmail.com
51
53
  executables: []
52
-
53
54
  extensions: []
54
-
55
- extra_rdoc_files:
55
+ extra_rdoc_files:
56
56
  - README.md
57
- files:
57
+ files:
58
58
  - .rspec
59
59
  - .yardopts
60
60
  - ChangeLog.md
@@ -105,34 +105,33 @@ files:
105
105
  - spec/spec_helper.rb
106
106
  - spec/spidr_spec.rb
107
107
  - spidr.gemspec
108
+ has_rdoc: yard
108
109
  homepage: http://github.com/postmodern/spidr
109
- licenses:
110
+ licenses:
110
111
  - MIT
111
112
  post_install_message:
112
113
  rdoc_options: []
113
-
114
- require_paths:
114
+ require_paths:
115
115
  - lib
116
- required_ruby_version: !ruby/object:Gem::Requirement
116
+ required_ruby_version: !ruby/object:Gem::Requirement
117
117
  none: false
118
- requirements:
119
- - - ">="
120
- - !ruby/object:Gem::Version
121
- version: "0"
122
- required_rubygems_version: !ruby/object:Gem::Requirement
118
+ requirements:
119
+ - - ! '>='
120
+ - !ruby/object:Gem::Version
121
+ version: '0'
122
+ required_rubygems_version: !ruby/object:Gem::Requirement
123
123
  none: false
124
- requirements:
125
- - - ">="
126
- - !ruby/object:Gem::Version
124
+ requirements:
125
+ - - ! '>='
126
+ - !ruby/object:Gem::Version
127
127
  version: 1.3.6
128
128
  requirements: []
129
-
130
129
  rubyforge_project: spidr
131
- rubygems_version: 1.8.5
130
+ rubygems_version: 1.6.2
132
131
  signing_key:
133
132
  specification_version: 3
134
133
  summary: A versatile Ruby web spidering library
135
- test_files:
134
+ test_files:
136
135
  - spec/agent_spec.rb
137
136
  - spec/actions_spec.rb
138
137
  - spec/rules_spec.rb