guess_html_encoding 0.0.10 → 0.0.11

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: f24b82e186d3e1a58cd2061c7cb1eef2f5b5d1b0
4
- data.tar.gz: cbfd0284000e074ef621763a36ca2be60cbed218
3
+ metadata.gz: c4a5a5d5cd40292d68650b9dd471adb932f424cd
4
+ data.tar.gz: b6f65abc65007e6cf570520eb78cf24adeb6d6bf
5
5
  SHA512:
6
- metadata.gz: 4d68030d7c0af216faa1e1dc029c65b6557287a8349aa89ec2a7a98833de4178a838693d2bf3e866b966edd597951e0d31c26f3a4c33daab30c7afa93692b7a5
7
- data.tar.gz: e2ddc685bae62c4cc6e962dd79a4f69863aef512b556957b2ab91113b492c3b07d7315f880c57179c8a9aea30d0279823983a21bbdcc6cbbc697cfc9ef2ada30
6
+ metadata.gz: 35d9b1b1b2b42b4b17bcaa49cf908143ba566a214f2637f2b335ba0157191ad52d20b632c9d4ee5de82aeecc088ba3bd6c6023b46d994af69c5c1680353deecd
7
+ data.tar.gz: 91534f086571eac16bd248bb22189e7a88af4b66438dd1ad690c98f4ed8d2122a6572231c6573e87cd7fecd84af0b38e1c1c52764456ddc20708daa848a96593
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- guess_html_encoding (0.0.9)
4
+ guess_html_encoding (0.0.11)
5
5
 
6
6
  GEM
7
7
  remote: http://rubygems.org/
@@ -19,11 +19,9 @@ module GuessHtmlEncoding
19
19
  end
20
20
 
21
21
  if out.nil? || out.empty? || !encoding_loaded?(out)
22
- if html =~ /<meta[^>]*HTTP-EQUIV=["']?Content-Type["']?[^>]*content=["']([^'"]*)["']/i && $1 =~ /charset=([\w\d-]+);?/i
23
- out = $1
24
- elsif html =~ /<meta\s+charset=["']([\w\d-]+)?/i
25
- out = $1
26
- end
22
+
23
+ out = HTMLScanner.new(html[0,2500]).encoding || out
24
+
27
25
  out.upcase! unless out.nil?
28
26
  end
29
27
 
@@ -55,4 +53,282 @@ module GuessHtmlEncoding
55
53
  def self.encoding_loaded?(encoding)
56
54
  !!Encoding.find(encoding) rescue nil
57
55
  end
56
+
57
+ class HTMLScanner
58
+
59
+ def initialize(html)
60
+ @html = html
61
+ end
62
+
63
+ # Returns the encoding sniffed from the content of an HTML page, as determined using an
64
+ # implementation of the algorithm to 'prescan a byte stream to determine its encoding', as
65
+ # specified by the HTML specification:
66
+ # http://www.w3.org/html/wg/drafts/html/master/syntax.html#prescan-a-byte-stream-to-determine-its-encoding
67
+ def encoding
68
+
69
+ position = 0
70
+ charset = nil
71
+ length = @html.length
72
+
73
+ done = false
74
+
75
+ while position < length && !done
76
+
77
+ # First look for a standard HTML comment (ie <!-- blah -->)
78
+ if @html[position, 4] == '<!--'
79
+
80
+ position += 2
81
+
82
+ position += (@html[position, length].index('-->') || length)
83
+
84
+ # Then look for the start of a meta tag
85
+ elsif @html[position, 6] =~ /\A\<meta[\s\/]/i
86
+
87
+ charset, position_increment = charset_from_meta(@html[position + 5, length])
88
+
89
+ break if charset
90
+
91
+ position += position_increment
92
+
93
+ # Then look for <! or </ or <?
94
+ elsif @html[position, 2] =~ /\A\<[\!\/\?]/
95
+
96
+ # Advance position to the first > that appears next in string, or end
97
+ position += @html[position, length].index('>') || length
98
+
99
+ else
100
+ # Do nothing. (This is just here to make the algorithm easier to follow)
101
+ end
102
+
103
+ # Advance position to next character
104
+ position += 1
105
+ end
106
+
107
+ charset
108
+ end
109
+
110
+ private
111
+
112
+
113
+ # Given a string which starts with the space or slash following a `<meta`,
114
+ # looks for a charset and returns it along with the position of the next
115
+ # character following the closing `>` character
116
+ def charset_from_meta(string)
117
+
118
+ position = 0
119
+ attribute_list = {}
120
+ got_pragma = false
121
+ need_pragma = nil
122
+ charset = nil
123
+ length = string.length
124
+
125
+ while position < length
126
+
127
+ attribute, position_increment = attribute(string[position, length])
128
+
129
+ position += position_increment.to_i
130
+
131
+ if attribute == nil
132
+
133
+ break
134
+
135
+ elsif attribute_list[attribute[:attribute_name]]
136
+
137
+ # Do nothing
138
+
139
+ else
140
+
141
+ # found a new attribute. Add it to the list
142
+ attribute_list[attribute[:attribute_name]] = attribute[:attribute_value]
143
+
144
+ if attribute[:attribute_name] == 'http-equiv'
145
+
146
+ got_pragma = true
147
+
148
+ elsif attribute[:attribute_name] == 'content'
149
+
150
+ content_charset = charset_from_meta_content(attribute[:attribute_value])
151
+
152
+ if content_charset && charset == nil
153
+ charset = content_charset
154
+ need_pragma = true
155
+ end
156
+
157
+ elsif attribute[:attribute_name] == 'charset'
158
+
159
+ charset = attribute[:attribute_value]
160
+ need_pragma = false
161
+
162
+ end
163
+
164
+ end
165
+
166
+ end
167
+
168
+ if need_pragma == nil || (need_pragma == true && got_pragma == false)
169
+ [nil, position]
170
+ else
171
+ [charset, position]
172
+ end
173
+
174
+ end
175
+
176
+ # Given a string representing the 'content' attribute value of a meta tag
177
+ # with an `http-equiv` attribute, returns the charset specified within that
178
+ # value, or nil.
179
+ def charset_from_meta_content(string)
180
+
181
+ charset_match = string.match(/charset\s*\=\s*(.+)/i)
182
+
183
+ if charset_match
184
+
185
+ charset_value = charset_match[1]
186
+
187
+ charset_value[/\A\"(.*)\"/, 1] ||
188
+ charset_value[/\A\'(.*)\'/, 1] ||
189
+ charset_value[/(.*)[\s;]/, 1] ||
190
+ charset_value[/(.*)/, 1]
191
+ else
192
+ nil
193
+ end
194
+
195
+ end
196
+
197
+ # Given a string, returns the first attribute in the string (as a hash), and
198
+ # the position of the next character in the string
199
+ def attribute(string)
200
+
201
+ attribute_name = ""
202
+ attribute_value = ""
203
+
204
+ length = string.length
205
+ position = 0
206
+
207
+ return [nil, nil] if length == 0
208
+
209
+ while position < (length)
210
+
211
+ # If character matches 0x09 (ASCII TAB), 0x0A (ASCII LF), 0x0C (ASCII FF), 0x0D (ASCII CR), 0x20 (ASCII space), or 0x2F (ASCII /) then advance position
212
+ if string[position] =~ /[\u{09}\u{0A}\u{0C}\u{0D}\u{20}\u{2f}]/
213
+
214
+ position += 1
215
+
216
+ elsif string[position] == '>'
217
+
218
+ attribute_name = nil
219
+ break
220
+
221
+ else
222
+
223
+ while position < length
224
+
225
+ if string[position] == '=' && attribute_name != ''
226
+
227
+ attribute_value, position_increment = attribute_value(string[position + 1, length])
228
+
229
+ position += position_increment + 1
230
+
231
+ break
232
+
233
+ elsif string[position] =~ /[\>\/]/
234
+
235
+ break
236
+
237
+ elsif string[position] =~ /[A-Z]/
238
+
239
+ attribute_name += string[position].downcase
240
+ position += 1
241
+
242
+ else
243
+ attribute_name += string[position]
244
+ position += 1
245
+ end
246
+
247
+ end
248
+
249
+ break
250
+
251
+ end
252
+
253
+ end
254
+
255
+ if attribute_name
256
+ [{attribute_name: attribute_name, attribute_value: attribute_value}, position]
257
+ else
258
+ [nil, position]
259
+ end
260
+
261
+ end
262
+
263
+ # Given a string, this returns the attribute value from the start of the string,
264
+ # and the position of the following character in the string
265
+ def attribute_value(string)
266
+
267
+ attribute_value = ''
268
+ position = 0
269
+ length = string.length
270
+
271
+ while position < length
272
+
273
+ # If the character is 0x09 (ASCII TAB), 0x0A (ASCII LF), 0x0C (ASCII FF), 0x0D (ASCII CR), or 0x20 (ASCII space), advance position to the next character and repeat this step.
274
+ if string[position] =~ /[\u{09}\u{0A}\u{0C}\u{0D}\u{20}]/
275
+
276
+ position += 1
277
+
278
+ elsif string[position] =~ /['"]/
279
+
280
+ attribute_value, position = quoted_value(string[position, length])
281
+ break
282
+
283
+ elsif string[position] == '>'
284
+ position += 1
285
+ break
286
+
287
+ else
288
+ attribute_value, position = unquoted_value(string[position, length])
289
+ break
290
+ end
291
+ end
292
+
293
+ [attribute_value, position]
294
+ end
295
+
296
+ # Given a string, at the start of which is a quoted attribute value, returns
297
+ # that attribute value, and the position of the next character in the string
298
+ # (following the second matching quote mark)
299
+ def quoted_value(string)
300
+
301
+ attribute_value = ""
302
+ quote_type = string[0]
303
+ position = 1
304
+ length = string.length
305
+
306
+ while position < length
307
+
308
+ if string[position] == quote_type
309
+ position += 1
310
+ break
311
+ else
312
+ attribute_value += downcase_A_to_Z_only(string[position])
313
+ position += 1
314
+ end
315
+
316
+ end
317
+
318
+ [attribute_value, position]
319
+ end
320
+
321
+ # Given a string, at the start of which is an unquoted attribute value, returns
322
+ # that attribute value, and the position of the next character in the string
323
+ def unquoted_value(string)
324
+ downcased_value = downcase_A_to_Z_only(string[/\A[^\t\u{0A}\u{0C}\u{0D}\u{20}\>]*/])
325
+ [downcased_value, downcased_value.length]
326
+ end
327
+
328
+ # Downcases the A-Z characters only (eg not É -> é)
329
+ def downcase_A_to_Z_only(string)
330
+ string.gsub(/([A-Z])/) { |match| match.downcase }
331
+ end
332
+
333
+ end
58
334
  end
@@ -1,3 +1,3 @@
1
1
  module GuessHtmlEncoding
2
- VERSION = "0.0.10"
2
+ VERSION = "0.0.11"
3
3
  end
@@ -3,6 +3,96 @@ require 'spec_helper'
3
3
 
4
4
  describe "GuessHtmlEncoding" do
5
5
  describe "#guess" do
6
+
7
+ it 'should use an uppercased unquoted meta tag' do
8
+ expect(GuessHtmlEncoding.guess('<META CHARSET=UTF-8>')).to eql('UTF-8')
9
+ end
10
+
11
+ it 'should use a quoted meta tag' do
12
+ expect(GuessHtmlEncoding.guess('<meta charset="UTF-8">')).to eql('UTF-8')
13
+ end
14
+
15
+ it 'should use a http-equiv meta tag' do
16
+ expect(GuessHtmlEncoding.guess('<meta http-equiv="content-type" content="charset=UTF-8">')).to eql('UTF-8')
17
+ end
18
+
19
+ it 'should use a http-equiv meta tag with semi-colons in the content value' do
20
+ expect(GuessHtmlEncoding.guess('<meta http-equiv="content-type" content="text/html; charset=UTF-8;">')).to eql('UTF-8')
21
+ end
22
+
23
+ it 'should use a http-equiv meta tag with attributes in unusual order' do
24
+ expect(GuessHtmlEncoding.guess('<meta content="text/html; charset=UTF-8;" http-equiv="content-type">')).to eql('UTF-8')
25
+ end
26
+
27
+ it 'should use a http-equiv meta tag with attributes in unusual order' do
28
+ expect(GuessHtmlEncoding.guess('<meta><meta charset="UTF-8">')).to eql('UTF-8')
29
+ end
30
+
31
+ it 'should use the first meta tag with a charset value' do
32
+ expect(GuessHtmlEncoding.guess('<meta charset="UTF-9"><meta charset="UTF-8">')).to eql('UTF-9')
33
+ end
34
+
35
+ it 'should use a meta http-equiv tag with spaces in the content value' do
36
+ expect(GuessHtmlEncoding.guess("<meta http-equiv='content-type' content=' text/html ; charset = UTF-8;'>")).to eql('UTF-8')
37
+ end
38
+
39
+ it 'should use a meta http-equiv tag with newlines in the content value' do
40
+ expect(GuessHtmlEncoding.guess("<meta http-equiv='content-type' content='\t\ncharset=UTF-8\n'>")).to eql('UTF-8')
41
+ end
42
+
43
+ it 'should use a meta http-equiv tag with double quotes in the content value' do
44
+ expect(GuessHtmlEncoding.guess("<meta http-equiv='content-type' content='text/html; charset=\"UTF-8\">")).to eql('UTF-8')
45
+ end
46
+
47
+ it 'should use a meta http-equiv tag with single quotes in the content value' do
48
+ expect(GuessHtmlEncoding.guess("<meta http-equiv='content-type' content=\"text/html; charset='UTF-8'\">")).to eql('UTF-8')
49
+ end
50
+
51
+ it 'should use the first charset attribute' do
52
+ expect(GuessHtmlEncoding.guess('<meta charset="UTF-9" charset="UTF-8">>')).to eql('UTF-9')
53
+ end
54
+
55
+ it 'should use the charset value over the content value' do
56
+ expect(GuessHtmlEncoding.guess('<meta http-equiv="content-type" content="charset=UTF-8" charset="UTF-9">')).to eql('UTF-9')
57
+ end
58
+
59
+ it 'should use the charset value if it appears before http-equiv' do
60
+ expect(GuessHtmlEncoding.guess('<meta content="charset=UTF-8" charset="UTF-9" http-equiv="content-type" >')).to eql('UTF-9')
61
+ end
62
+
63
+ it 'should ignore meta tags with content attribute but no http-equiv' do
64
+ expect(GuessHtmlEncoding.guess('<meta content="charset=UTF-8" ><meta charset="UTF-9">')).to eql('UTF-9')
65
+ end
66
+
67
+ it 'should ignore a commented-out meta tag' do
68
+ expect(GuessHtmlEncoding.guess('<!DOCTYPE html><!--<meta charset="UTF-9">--><meta charset="UTF-8">')).to eql('UTF-8')
69
+ end
70
+
71
+ it 'should ignore a minimal comment' do
72
+ expect(GuessHtmlEncoding.guess('<!DOCTYPE html><html><!--><meta charset="UTF-9"></html>')).to eql('UTF-9')
73
+ end
74
+
75
+ it 'should ignore an oddly commented out meta tag using <! >' do
76
+ expect(GuessHtmlEncoding.guess('<!DOCTYPE html><!<meta charset="UTF-9">><meta charset="UTF-8">')).to eql('UTF-8')
77
+ end
78
+
79
+ it 'should ignore an oddly commented out meta tag using </ >' do
80
+ expect(GuessHtmlEncoding.guess('<!DOCTYPE html></<meta charset="UTF-9">><meta charset="UTF-8">')).to eql('UTF-8')
81
+ end
82
+
83
+ it 'should ignore an oddly commented out meta tag using <? ?>' do
84
+ expect(GuessHtmlEncoding.guess('<!DOCTYPE html><?<meta charset="UTF-9">?><meta charset="UTF-8">')).to eql('UTF-8')
85
+ end
86
+
87
+ it 'should ignore a <metadata> tag' do
88
+ expect(GuessHtmlEncoding.guess('<metadata test="yes" charset="UTF-9"><meta charset="UTF-8">')).to eql('UTF-8')
89
+ end
90
+
91
+ it 'should only search the first 2500 characters' do
92
+ html = 2500.times.collect { ' ' }.join + '<meta charset="UTF-8">'
93
+ expect(GuessHtmlEncoding.guess(html)).to eql(nil)
94
+ end
95
+
6
96
  it "can use headers" do
7
97
  guess = GuessHtmlEncoding.guess("<html><body><div>hi!</div></body></html>",
8
98
  "Hello: world\nContent-Type: text/html; charset=LATIN1\nFoo: bar")
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: guess_html_encoding
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.10
4
+ version: 0.0.11
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Cantino (Iteration Labs, LLC)
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-12-14 00:00:00.000000000 Z
11
+ date: 2015-02-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec