rgrove-sanitize 1.0.8.3 → 1.0.8.4

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registry.
data/HISTORY CHANGED
@@ -1,8 +1,10 @@
1
1
  Sanitize History
2
2
  ================================================================================
3
3
 
4
- Version 1.0.8.2 (git)
4
+ Version 1.0.8.4 (git)
5
5
  * Migrated from Hpricot to Nokogiri. Requires libxml2 >= 2.7.2 [Adam Hooper]
6
+ * Added an :output config setting to allow the output format to be specified.
7
+ Supported formats are :xhtml (the default) and :html (which outputs HTML4).
6
8
  * Changed protocol regex to ensure Sanitize doesn't kill URLs with colons in
7
9
  path segments. [Peter Cooper]
8
10
 
@@ -1,3 +1,4 @@
1
+ # encoding: utf-8
1
2
  #--
2
3
  # Copyright (c) 2009 Ryan Grove <ryan@wonko.com>
3
4
  #
@@ -29,18 +30,6 @@ require 'sanitize/config/relaxed'
29
30
 
30
31
  class Sanitize
31
32
 
32
- # Characters that should be replaced with entities in text nodes.
33
- ENTITY_MAP = {
34
- '<' => '&lt;',
35
- '>' => '&gt;',
36
- '"' => '&quot;',
37
- "'" => '&#39;'
38
- }
39
-
40
- # Matches an unencoded ampersand that is not part of a valid character entity
41
- # reference.
42
- REGEX_AMPERSAND = /&(?!(?:[a-z]+[0-9]{0,2}|#[0-9]+|#x[0-9a-f]+);)/i
43
-
44
33
  # Matches an attribute value that could be treated by a browser as a URL
45
34
  # with a protocol prefix, such as "http:" or "javascript:". Any string of zero
46
35
  # or more characters followed by a colon is considered a match, even if the
@@ -124,7 +113,24 @@ class Sanitize
124
113
  end
125
114
  end
126
115
 
127
- result = fragment.to_xhtml(:encoding => 'UTF-8', :indent => 0).gsub(/>\n/, '>')
116
+ if @config[:output] == :xhtml
117
+ output_method = fragment.method(:to_xhtml)
118
+ elsif @config[:output] == :html
119
+ output_method = fragment.method(:to_html)
120
+ else
121
+ raise Error, "unsupported output format: #{@config[:output]}"
122
+ end
123
+
124
+ if RUBY_VERSION >= '1.9'
125
+ # Nokogiri 1.3.3 (and possibly earlier versions) always returns a US-ASCII
126
+ # string no matter what we ask for. This will be fixed in 1.4.0, but for
127
+ # now we have to hack around it to prevent errors.
128
+ result = output_method.call(:encoding => 'utf-8', :indent => 0).force_encoding('utf-8')
129
+ result.gsub!(">\n", '>')
130
+ else
131
+ result = output_method.call(:encoding => 'utf-8', :indent => 0).gsub(">\n", '>')
132
+ end
133
+
128
134
  return result == html ? nil : html[0, html.length] = result
129
135
  end
130
136
 
@@ -146,18 +152,6 @@ class Sanitize
146
152
  sanitize = Sanitize.new(config)
147
153
  sanitize.clean!(html)
148
154
  end
149
-
150
- # Encodes special HTML characters (<, >, ", ', and &) in _html_ as entity
151
- # references and returns the encoded string.
152
- def encode_html(html)
153
- str = html.dup
154
-
155
- # Encode special chars.
156
- ENTITY_MAP.each {|char, entity| str.gsub!(char, entity) }
157
-
158
- # Convert unencoded ampersands to entity references.
159
- str.gsub(REGEX_AMPERSAND, '&amp;')
160
- end
161
155
  end
162
156
 
163
157
  end
@@ -28,17 +28,21 @@ class Sanitize
28
28
  # comments.
29
29
  :allow_comments => false,
30
30
 
31
- # HTML elements to allow. By default, no elements are allowed (which means
32
- # that all HTML will be stripped).
33
- :elements => [],
31
+ # HTML attributes to add to specific elements. By default, no attributes
32
+ # are added.
33
+ :add_attributes => {},
34
34
 
35
35
  # HTML attributes to allow in specific elements. By default, no attributes
36
36
  # are allowed.
37
37
  :attributes => {},
38
38
 
39
- # HTML attributes to add to specific elements. By default, no attributes
40
- # are added.
41
- :add_attributes => {},
39
+ # HTML elements to allow. By default, no elements are allowed (which means
40
+ # that all HTML will be stripped).
41
+ :elements => [],
42
+
43
+ # Output format. Supported formats are :html and :xhtml (which is the
44
+ # default).
45
+ :output => :xhtml,
42
46
 
43
47
  # URL handling protocols to allow in specific attributes. By default, no
44
48
  # protocols are allowed. Use :relative in place of a protocol if you want
@@ -1,3 +1,3 @@
1
1
  class Sanitize
2
- VERSION = '1.0.8.3'
2
+ VERSION = '1.0.8.4'
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rgrove-sanitize
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.8.3
4
+ version: 1.0.8.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ryan Grove
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-09-04 00:00:00 -07:00
12
+ date: 2009-09-17 00:00:00 -07:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -62,6 +62,7 @@ files:
62
62
  - lib/sanitize.rb
63
63
  has_rdoc: false
64
64
  homepage: http://github.com/rgrove/sanitize/
65
+ licenses:
65
66
  post_install_message:
66
67
  rdoc_options: []
67
68
 
@@ -82,7 +83,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
82
83
  requirements: []
83
84
 
84
85
  rubyforge_project:
85
- rubygems_version: 1.2.0
86
+ rubygems_version: 1.3.5
86
87
  signing_key:
87
88
  specification_version: 3
88
89
  summary: Whitelist-based HTML sanitizer.