guess_html_encoding 0.0.10 → 0.0.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/guess_html_encoding.rb +281 -5
- data/lib/guess_html_encoding/version.rb +1 -1
- data/spec/guess_html_encoding_spec.rb +90 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c4a5a5d5cd40292d68650b9dd471adb932f424cd
|
4
|
+
data.tar.gz: b6f65abc65007e6cf570520eb78cf24adeb6d6bf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 35d9b1b1b2b42b4b17bcaa49cf908143ba566a214f2637f2b335ba0157191ad52d20b632c9d4ee5de82aeecc088ba3bd6c6023b46d994af69c5c1680353deecd
|
7
|
+
data.tar.gz: 91534f086571eac16bd248bb22189e7a88af4b66438dd1ad690c98f4ed8d2122a6572231c6573e87cd7fecd84af0b38e1c1c52764456ddc20708daa848a96593
|
data/Gemfile.lock
CHANGED
data/lib/guess_html_encoding.rb
CHANGED
@@ -19,11 +19,9 @@ module GuessHtmlEncoding
|
|
19
19
|
end
|
20
20
|
|
21
21
|
if out.nil? || out.empty? || !encoding_loaded?(out)
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
out = $1
|
26
|
-
end
|
22
|
+
|
23
|
+
out = HTMLScanner.new(html[0,2500]).encoding || out
|
24
|
+
|
27
25
|
out.upcase! unless out.nil?
|
28
26
|
end
|
29
27
|
|
@@ -55,4 +53,282 @@ module GuessHtmlEncoding
|
|
55
53
|
def self.encoding_loaded?(encoding)
|
56
54
|
!!Encoding.find(encoding) rescue nil
|
57
55
|
end
|
56
|
+
|
57
|
+
# Sniffs the character encoding declared inside an HTML document by looking
# at its <meta> tags, following the HTML specification's algorithm to
# 'prescan a byte stream to determine its encoding':
# http://www.w3.org/html/wg/drafts/html/master/syntax.html#prescan-a-byte-stream-to-determine-its-encoding
#
# Only comments, <meta> tags and <! ...>, </ ...>, <? ...> constructs are
# interpreted; all other markup is stepped over one character at a time.
class HTMLScanner

  # Leading run of HTML whitespace (TAB, LF, FF, CR, space).
  LEADING_WHITESPACE = /\A[\t\n\f\r ]*/

  # Leading run of HTML whitespace or slashes, which may precede an attribute name.
  LEADING_WHITESPACE_OR_SLASH = /\A[\t\n\f\r \/]*/

  # An attribute name: its first character is taken literally (even '='),
  # then it runs until whitespace, '/', '=' or '>'.
  ATTRIBUTE_NAME = /\A.[^\t\n\f\r \/=>]*/m

  # An unquoted attribute value: runs until whitespace or '>'.
  UNQUOTED_VALUE = /\A[^\t\n\f\r >]*/

  # html - the (start of the) HTML document to scan. nil is treated as an
  #        empty document.
  def initialize(html)
    @html = html.to_s
  end

  # Returns the encoding sniffed from the content of the HTML page, or nil
  # when no usable <meta> declaration is found. The returned name has its
  # A-Z characters lowercased, because attribute values are lowercased
  # while they are parsed.
  def encoding
    position = 0
    length = @html.length

    while position < length

      if @html[position, 4] == '<!--'
        # Standard HTML comment. The search starts two characters in so that
        # the dashes of '<!--' may double as the dashes of '-->' (ie '<!-->').
        closing = @html.index('-->', position + 2)
        position = closing ? closing + 2 : length

      elsif @html[position, 6] =~ /\A<meta[\s\/]/i
        # Start of a meta tag: parse its attributes, beginning at the
        # whitespace or slash that follows '<meta'.
        charset, consumed = charset_from_meta(@html[position + 5, length])
        return charset if charset

        # `consumed` is relative to the substring handed to charset_from_meta,
        # so the five characters of '<meta' have to be added back in.
        position += 5 + consumed

      elsif @html[position, 2] =~ /\A<[!\/?]/
        # <! or </ or <? : skip to the next '>' (or the end of the input).
        position = @html.index('>', position) || length
      end

      # Advance position to the next character
      position += 1
    end

    nil
  end

  private

  # Given a string which starts with the space or slash following a `<meta`,
  # looks for a charset declaration.
  #
  # Returns [charset, position]: charset is the declared encoding or nil, and
  # position is the offset (within string) of the tag's closing `>`, or the
  # length of the string when the tag is never closed.
  def charset_from_meta(string)
    position = 0
    seen_names = {}
    got_pragma = false
    need_pragma = nil
    charset = nil
    length = string.length

    while position < length
      parsed, consumed = attribute(string[position, length])
      position += consumed.to_i

      # nil means the end of the tag (or of the input) was reached
      break if parsed.nil?

      name = parsed[:attribute_name]
      value = parsed[:attribute_value]

      # Only the first occurrence of an attribute name counts
      next if seen_names.key?(name)
      seen_names[name] = true

      case name
      when 'http-equiv'
        # Only the content-type pragma makes a `content` charset valid
        got_pragma = true if value == 'content-type'
      when 'content'
        content_charset = charset_from_meta_content(value)
        if content_charset && charset.nil?
          charset = content_charset
          need_pragma = true
        end
      when 'charset'
        charset = value
        need_pragma = false
      end
    end

    # No declaration at all, an empty one, or a `content` declaration that is
    # not backed by an http-equiv="content-type" attribute: nothing usable.
    unusable = need_pragma.nil? || charset.nil? || charset.empty? || (need_pragma && !got_pragma)

    [unusable ? nil : charset, position]
  end

  # Given a string representing the 'content' attribute value of a meta tag
  # with an `http-equiv` attribute, returns the charset specified within that
  # value, or nil.
  #
  # A quoted charset runs to its matching quote (nil when the quote is never
  # closed); an unquoted charset runs to the first whitespace or semicolon.
  def charset_from_meta_content(string)
    match = string.match(/charset[\t\n\f\r ]*=[\t\n\f\r ]*(.*)/mi)
    return nil unless match

    rest = match[1]
    quote = rest[0]

    if quote == '"' || quote == "'"
      closing = rest.index(quote, 1)
      closing ? rest[1...closing] : nil
    else
      value = rest[/\A[^\t\n\f\r ;]*/]
      value.empty? ? nil : value
    end
  end

  # Given a string, returns the first attribute in the string (as a hash with
  # :attribute_name and :attribute_value keys), and the position of the next
  # character in the string.
  #
  # Returns [nil, position] when a `>` (left unconsumed at position) or the
  # end of the string is reached before any attribute starts, and [nil, nil]
  # for an empty string.
  def attribute(string)
    length = string.length
    return [nil, nil] if length == 0

    # Whitespace and slashes before the attribute name are skipped
    position = string[LEADING_WHITESPACE_OR_SLASH].length
    return [nil, position] if position >= length || string[position] == '>'

    raw_name = string[position, length][ATTRIBUTE_NAME]
    position += raw_name.length
    attribute_name = downcase_A_to_Z_only(raw_name)

    # Whitespace is allowed between the name and the `=`
    position += string[position, length][LEADING_WHITESPACE].length

    attribute_value = ''
    if string[position] == '='
      attribute_value, consumed = attribute_value(string[position + 1, length])
      position += 1 + consumed
    end

    [{attribute_name: attribute_name, attribute_value: attribute_value}, position]
  end

  # Given a string, this returns the attribute value from the start of the string,
  # and the position of the following character in the string.
  #
  # A `>` in value position yields an empty value and is left unconsumed, so
  # that the caller can see the end of the tag.
  def attribute_value(string)
    length = string.length
    position = string[LEADING_WHITESPACE].length
    first = string[position]

    # End of input or end of tag: the value is empty
    return ['', position] if first.nil? || first == '>'

    value, consumed =
      if first == '"' || first == "'"
        quoted_value(string[position, length])
      else
        unquoted_value(string[position, length])
      end

    # `consumed` is relative to the substring, so add the skipped whitespace back
    [value, position + consumed]
  end

  # Given a string, at the start of which is quoted attribute value, returns
  # that attribute value, and the position of the next character in the string
  # (following the second matching quote mark). An unterminated quote takes
  # the rest of the string as its value.
  def quoted_value(string)
    closing = string.index(string[0], 1)

    if closing
      [downcase_A_to_Z_only(string[1...closing]), closing + 1]
    else
      [downcase_A_to_Z_only(string[1..-1].to_s), string.length]
    end
  end

  # Given a string, at the start of which is an unquoted attribute value, returns
  # that attribute value, and the position of the next character in the string
  def unquoted_value(string)
    downcased_value = downcase_A_to_Z_only(string[UNQUOTED_VALUE])
    [downcased_value, downcased_value.length]
  end

  # Downcases the A-Z characters only (eg not É -> é)
  def downcase_A_to_Z_only(string)
    string.tr('A-Z', 'a-z')
  end

end
|
58
334
|
end
|
@@ -3,6 +3,96 @@ require 'spec_helper'
|
|
3
3
|
|
4
4
|
describe "GuessHtmlEncoding" do
|
5
5
|
describe "#guess" do
|
6
|
+
|
7
|
+
it 'should use an uppercased unquoted meta tag' do
|
8
|
+
expect(GuessHtmlEncoding.guess('<META CHARSET=UTF-8>')).to eql('UTF-8')
|
9
|
+
end
|
10
|
+
|
11
|
+
it 'should use a quoted meta tag' do
|
12
|
+
expect(GuessHtmlEncoding.guess('<meta charset="UTF-8">')).to eql('UTF-8')
|
13
|
+
end
|
14
|
+
|
15
|
+
it 'should use a http-equiv meta tag' do
|
16
|
+
expect(GuessHtmlEncoding.guess('<meta http-equiv="content-type" content="charset=UTF-8">')).to eql('UTF-8')
|
17
|
+
end
|
18
|
+
|
19
|
+
it 'should use a http-equiv meta tag with semi-colons in the content value' do
|
20
|
+
expect(GuessHtmlEncoding.guess('<meta http-equiv="content-type" content="text/html; charset=UTF-8;">')).to eql('UTF-8')
|
21
|
+
end
|
22
|
+
|
23
|
+
it 'should use a http-equiv meta tag with attributes in unusual order' do
|
24
|
+
expect(GuessHtmlEncoding.guess('<meta content="text/html; charset=UTF-8;" http-equiv="content-type">')).to eql('UTF-8')
|
25
|
+
end
|
26
|
+
|
27
|
+
it 'should use a http-equiv meta tag with attributes in unusual order' do
|
28
|
+
expect(GuessHtmlEncoding.guess('<meta><meta charset="UTF-8">')).to eql('UTF-8')
|
29
|
+
end
|
30
|
+
|
31
|
+
it 'should use the first meta tag with a charset value' do
|
32
|
+
expect(GuessHtmlEncoding.guess('<meta charset="UTF-9"><meta charset="UTF-8">')).to eql('UTF-9')
|
33
|
+
end
|
34
|
+
|
35
|
+
it 'should use a meta http-equiv tag with spaces in the content value' do
|
36
|
+
expect(GuessHtmlEncoding.guess("<meta http-equiv='content-type' content=' text/html ; charset = UTF-8;'>")).to eql('UTF-8')
|
37
|
+
end
|
38
|
+
|
39
|
+
it 'should use a meta http-equiv tag with newlines in the content value' do
|
40
|
+
expect(GuessHtmlEncoding.guess("<meta http-equiv='content-type' content='\t\ncharset=UTF-8\n'>")).to eql('UTF-8')
|
41
|
+
end
|
42
|
+
|
43
|
+
it 'should use a meta http-equiv tag with double quotes in the content value' do
|
44
|
+
expect(GuessHtmlEncoding.guess("<meta http-equiv='content-type' content='text/html; charset=\"UTF-8\">")).to eql('UTF-8')
|
45
|
+
end
|
46
|
+
|
47
|
+
it 'should use a meta http-equiv tag with single quotes in the content value' do
|
48
|
+
expect(GuessHtmlEncoding.guess("<meta http-equiv='content-type' content=\"text/html; charset='UTF-8'\">")).to eql('UTF-8')
|
49
|
+
end
|
50
|
+
|
51
|
+
it 'should use the first charset attribute' do
|
52
|
+
expect(GuessHtmlEncoding.guess('<meta charset="UTF-9" charset="UTF-8">>')).to eql('UTF-9')
|
53
|
+
end
|
54
|
+
|
55
|
+
it 'should use the charset value over the content value' do
|
56
|
+
expect(GuessHtmlEncoding.guess('<meta http-equiv="content-type" content="charset=UTF-8" charset="UTF-9">')).to eql('UTF-9')
|
57
|
+
end
|
58
|
+
|
59
|
+
it 'should use the charset value if it appears before http-equiv' do
|
60
|
+
expect(GuessHtmlEncoding.guess('<meta content="charset=UTF-8" charset="UTF-9" http-equiv="content-type" >')).to eql('UTF-9')
|
61
|
+
end
|
62
|
+
|
63
|
+
it 'should ignore meta tags with content attribute but no http-equiv' do
|
64
|
+
expect(GuessHtmlEncoding.guess('<meta content="charset=UTF-8" ><meta charset="UTF-9">')).to eql('UTF-9')
|
65
|
+
end
|
66
|
+
|
67
|
+
it 'should ignore a commented-out meta tag' do
|
68
|
+
expect(GuessHtmlEncoding.guess('<!DOCTYPE html><!--<meta charset="UTF-9">--><meta charset="UTF-8">')).to eql('UTF-8')
|
69
|
+
end
|
70
|
+
|
71
|
+
it 'should ignore a minimal comment' do
|
72
|
+
expect(GuessHtmlEncoding.guess('<!DOCTYPE html><html><!--><meta charset="UTF-9"></html>')).to eql('UTF-9')
|
73
|
+
end
|
74
|
+
|
75
|
+
it 'should ignore an oddly commented out meta tag using <! >' do
|
76
|
+
expect(GuessHtmlEncoding.guess('<!DOCTYPE html><!<meta charset="UTF-9">><meta charset="UTF-8">')).to eql('UTF-8')
|
77
|
+
end
|
78
|
+
|
79
|
+
it 'should ignore an oddly commented out meta tag using </ >' do
|
80
|
+
expect(GuessHtmlEncoding.guess('<!DOCTYPE html></<meta charset="UTF-9">><meta charset="UTF-8">')).to eql('UTF-8')
|
81
|
+
end
|
82
|
+
|
83
|
+
it 'should ignore an oddly commented out meta tag using <? ?>' do
|
84
|
+
expect(GuessHtmlEncoding.guess('<!DOCTYPE html><?<meta charset="UTF-9">?><meta charset="UTF-8">')).to eql('UTF-8')
|
85
|
+
end
|
86
|
+
|
87
|
+
it 'should ignore a <metadata> tag' do
|
88
|
+
expect(GuessHtmlEncoding.guess('<metadata test="yes" charset="UTF-9"><meta charset="UTF-8">')).to eql('UTF-8')
|
89
|
+
end
|
90
|
+
|
91
|
+
it 'should only search the first 2500 characters' do
|
92
|
+
html = 2500.times.collect { ' ' }.join + '<meta charset="UTF-8">'
|
93
|
+
expect(GuessHtmlEncoding.guess(html)).to eql(nil)
|
94
|
+
end
|
95
|
+
|
6
96
|
it "can use headers" do
|
7
97
|
guess = GuessHtmlEncoding.guess("<html><body><div>hi!</div></body></html>",
|
8
98
|
"Hello: world\nContent-Type: text/html; charset=LATIN1\nFoo: bar")
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: guess_html_encoding
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.11
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Cantino (Iteration Labs, LLC)
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2015-02-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|