guess_html_encoding 0.0.10 → 0.0.11
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/guess_html_encoding.rb +281 -5
- data/lib/guess_html_encoding/version.rb +1 -1
- data/spec/guess_html_encoding_spec.rb +90 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c4a5a5d5cd40292d68650b9dd471adb932f424cd
|
4
|
+
data.tar.gz: b6f65abc65007e6cf570520eb78cf24adeb6d6bf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 35d9b1b1b2b42b4b17bcaa49cf908143ba566a214f2637f2b335ba0157191ad52d20b632c9d4ee5de82aeecc088ba3bd6c6023b46d994af69c5c1680353deecd
|
7
|
+
data.tar.gz: 91534f086571eac16bd248bb22189e7a88af4b66438dd1ad690c98f4ed8d2122a6572231c6573e87cd7fecd84af0b38e1c1c52764456ddc20708daa848a96593
|
data/Gemfile.lock
CHANGED
data/lib/guess_html_encoding.rb
CHANGED
@@ -19,11 +19,9 @@ module GuessHtmlEncoding
|
|
19
19
|
end
|
20
20
|
|
21
21
|
if out.nil? || out.empty? || !encoding_loaded?(out)
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
out = $1
|
26
|
-
end
|
22
|
+
|
23
|
+
out = HTMLScanner.new(html[0,2500]).encoding || out
|
24
|
+
|
27
25
|
out.upcase! unless out.nil?
|
28
26
|
end
|
29
27
|
|
@@ -55,4 +53,282 @@ module GuessHtmlEncoding
|
|
55
53
|
# Reports whether Ruby can resolve +encoding+ (an encoding name such as
# "UTF-8") to a known Encoding.
#
# Returns true when Encoding.find succeeds, and false when the name is
# unknown (ArgumentError) or is not something name-like, e.g. nil (TypeError).
# Only those lookup failures are rescued, so unrelated errors are no longer
# silently swallowed, and the predicate always returns a real boolean.
def self.encoding_loaded?(encoding)
  !!Encoding.find(encoding)
rescue ArgumentError, TypeError
  false
end
|
56
|
+
|
57
|
+
# Sniffs the character encoding declared inside an HTML document by looking
# at its <meta> tags, following the HTML specification's algorithm to
# 'prescan a byte stream to determine its encoding':
# http://www.w3.org/html/wg/drafts/html/master/syntax.html#prescan-a-byte-stream-to-determine-its-encoding
#
# Ordinary (non-meta) tags are not parsed; the scanner simply steps over them
# one character at a time.
class HTMLScanner

  # The characters the prescan algorithm treats as whitespace:
  # TAB (0x09), LF (0x0A), FF (0x0C), CR (0x0D) and SPACE (0x20).
  # Written with plain ASCII escapes so the pattern is not tied to UTF-8.
  WHITESPACE = /[\t\n\f\r ]/

  # Whitespace plus '/', all of which are skipped before an attribute name.
  WHITESPACE_OR_SLASH = /[\t\n\f\r \/]/

  # Number of characters in '<meta'; charset_from_meta is handed the text
  # that starts right after them.
  META_PREFIX_LENGTH = 5

  # html - the (beginning of the) HTML document to scan, as a String.
  def initialize(html)
    @html = html
  end

  # Returns the charset named by the first <meta> tag that declares one
  # (lower-cased A-Z, otherwise as written), or nil when none is found.
  def encoding
    position = 0
    length = @html.length

    while position < length

      # First look for a standard HTML comment (ie <!-- blah -->)
      if @html[position, 4] == '<!--'

        # Step over '<!' only, so that the two dashes which open the comment
        # may also be the ones which close it (as in '<!-->').
        position += 2
        position += (@html[position, length].index('-->') || length)

      # Then look for the start of a meta tag
      elsif @html[position, 6] =~ /\A<meta[\s\/]/i

        charset, consumed = charset_from_meta(@html[position + META_PREFIX_LENGTH, length])

        # An empty charset value declares nothing useful; keep looking.
        return charset unless charset.nil? || charset.empty?

        # `consumed` is relative to the text after '<meta', so the prefix
        # has to be added back to land on the tag's closing '>'.
        position += META_PREFIX_LENGTH + consumed

      # Then look for <! or </ or <?
      elsif @html[position, 2] =~ /\A<[!\/?]/

        # Advance position to the first > that appears next in string, or end
        position += (@html[position, length].index('>') || length)

      end

      # Advance position to next character
      position += 1
    end

    nil
  end

  private

  # Given a string which starts with the space or slash following a `<meta`,
  # looks for a charset declaration among the tag's attributes.
  #
  # Returns [charset_or_nil, position], where position is the index (within
  # +string+) of the tag's closing `>`, or the string length if the tag is
  # never closed. A charset taken from a `content` attribute only counts when
  # the tag also carries http-equiv="content-type"; a `charset` attribute
  # always counts and takes precedence.
  def charset_from_meta(string)
    position = 0
    length = string.length
    seen = {}
    got_pragma = false
    need_pragma = nil
    charset = nil

    while position < length
      parsed, consumed = attribute(string[position, length])
      position += consumed.to_i
      break if parsed.nil?

      name = parsed[:attribute_name]
      value = parsed[:attribute_value]

      # Only the first occurrence of each attribute is honoured.
      next if seen.key?(name)
      seen[name] = value

      case name
      when 'http-equiv'
        got_pragma = true if value == 'content-type'
      when 'content'
        content_charset = charset_from_meta_content(value)
        if content_charset && charset.nil?
          charset = content_charset
          need_pragma = true
        end
      when 'charset'
        charset = value
        need_pragma = false
      end
    end

    if need_pragma.nil? || (need_pragma && !got_pragma)
      [nil, position]
    else
      [charset, position]
    end
  end

  # Given a string representing the 'content' attribute value of a meta tag
  # with an `http-equiv` attribute, returns the charset specified within that
  # value, or nil.
  #
  # The value may be wrapped in matching single or double quotes; an unquoted
  # value ends at the first whitespace character or semicolon. An unterminated
  # quote or an empty value yields nil.
  def charset_from_meta_content(string)
    match = string.match(/charset[\t\n\f\r ]*=[\t\n\f\r ]*/i)
    return nil unless match

    rest = match.post_match

    value =
      case rest[0]
      when '"' then rest[/\A"([^"]*)"/, 1]
      when "'" then rest[/\A'([^']*)'/, 1]
      else rest[/\A[^\t\n\f\r ;]*/]
      end

    value unless value.nil? || value.empty?
  end

  # Given a string, returns the first attribute in the string (as a hash with
  # :attribute_name and :attribute_value keys), and the position of the next
  # character to be examined in the string.
  #
  # Returns [nil, position] when a `>` is reached before any attribute, and
  # [nil, nil] for an empty string. Attribute names have A-Z lower-cased.
  def attribute(string)
    length = string.length
    return [nil, nil] if length == 0

    position = 0
    position += 1 while position < length && string[position] =~ WHITESPACE_OR_SLASH

    # End of the tag: there are no more attributes.
    return [nil, position] if string[position] == '>'

    name = ''
    value = ''

    while position < length
      char = string[position]

      if char == '=' && !name.empty?
        value, consumed = attribute_value(string[position + 1, length])
        position += consumed + 1
        break

      elsif char =~ WHITESPACE
        # Whitespace ends the name. The attribute only has a value if the
        # next non-whitespace character is '='.
        position += 1 while position < length && string[position] =~ WHITESPACE
        if string[position] == '='
          value, consumed = attribute_value(string[position + 1, length])
          position += consumed + 1
        end
        break

      elsif char == '/' || char == '>'
        # Left in place so the next call sees the end of the tag.
        break

      else
        name << downcase_A_to_Z_only(char)
        position += 1
      end
    end

    [{attribute_name: name, attribute_value: value}, position]
  end

  # Given a string, this returns the attribute value from the start of the string,
  # and the position of the following character in the string.
  #
  # Leading whitespace is skipped. A `>` found before any value gives an empty
  # value and is NOT consumed, so the caller still sees the end of the tag.
  def attribute_value(string)
    length = string.length
    position = 0
    position += 1 while position < length && string[position] =~ WHITESPACE

    first = string[position]
    return ['', position] if first.nil? || first == '>'

    value, consumed =
      if first == '"' || first == "'"
        quoted_value(string[position, length])
      else
        unquoted_value(string[position, length])
      end

    # `consumed` is relative to where the value starts, not to `string`.
    [value, position + consumed]
  end

  # Given a string, at the start of which is quoted attribute value, returns
  # that attribute value, and the position of the next character in the string
  # (following the second matching quote mark). If the quote is never closed
  # the rest of the string is taken as the value.
  def quoted_value(string)
    quote_type = string[0]
    closing = string.index(quote_type, 1)

    if closing
      [downcase_A_to_Z_only(string[1...closing]), closing + 1]
    else
      [downcase_A_to_Z_only(string[1..-1] || ''), string.length]
    end
  end

  # Given a string, at the start of which is an unquoted attribute value, returns
  # that attribute value, and the position of the next character in the string
  def unquoted_value(string)
    downcased_value = downcase_A_to_Z_only(string[/\A[^\t\n\f\r >]*/])
    [downcased_value, downcased_value.length]
  end

  # Downcases the A-Z characters only (eg not É -> é)
  def downcase_A_to_Z_only(string)
    string.tr('A-Z', 'a-z')
  end

end
|
58
334
|
end
|
@@ -3,6 +3,96 @@ require 'spec_helper'
|
|
3
3
|
|
4
4
|
describe "GuessHtmlEncoding" do
|
5
5
|
describe "#guess" do
|
6
|
+
|
7
|
+
it 'should use an uppercased unquoted meta tag' do
|
8
|
+
expect(GuessHtmlEncoding.guess('<META CHARSET=UTF-8>')).to eql('UTF-8')
|
9
|
+
end
|
10
|
+
|
11
|
+
it 'should use a quoted meta tag' do
|
12
|
+
expect(GuessHtmlEncoding.guess('<meta charset="UTF-8">')).to eql('UTF-8')
|
13
|
+
end
|
14
|
+
|
15
|
+
it 'should use a http-equiv meta tag' do
|
16
|
+
expect(GuessHtmlEncoding.guess('<meta http-equiv="content-type" content="charset=UTF-8">')).to eql('UTF-8')
|
17
|
+
end
|
18
|
+
|
19
|
+
it 'should use a http-equiv meta tag with semi-colons in the content value' do
|
20
|
+
expect(GuessHtmlEncoding.guess('<meta http-equiv="content-type" content="text/html; charset=UTF-8;">')).to eql('UTF-8')
|
21
|
+
end
|
22
|
+
|
23
|
+
it 'should use a http-equiv meta tag with attributes in unusual order' do
|
24
|
+
expect(GuessHtmlEncoding.guess('<meta content="text/html; charset=UTF-8;" http-equiv="content-type">')).to eql('UTF-8')
|
25
|
+
end
|
26
|
+
|
27
|
+
it 'should use a http-equiv meta tag with attributes in unusual order' do
|
28
|
+
expect(GuessHtmlEncoding.guess('<meta><meta charset="UTF-8">')).to eql('UTF-8')
|
29
|
+
end
|
30
|
+
|
31
|
+
it 'should use the first meta tag with a charset value' do
|
32
|
+
expect(GuessHtmlEncoding.guess('<meta charset="UTF-9"><meta charset="UTF-8">')).to eql('UTF-9')
|
33
|
+
end
|
34
|
+
|
35
|
+
it 'should use a meta http-equiv tag with spaces in the content value' do
|
36
|
+
expect(GuessHtmlEncoding.guess("<meta http-equiv='content-type' content=' text/html ; charset = UTF-8;'>")).to eql('UTF-8')
|
37
|
+
end
|
38
|
+
|
39
|
+
it 'should use a meta http-equiv tag with newlines in the content value' do
|
40
|
+
expect(GuessHtmlEncoding.guess("<meta http-equiv='content-type' content='\t\ncharset=UTF-8\n'>")).to eql('UTF-8')
|
41
|
+
end
|
42
|
+
|
43
|
+
it 'should use a meta http-equiv tag with double quotes in the content value' do
|
44
|
+
expect(GuessHtmlEncoding.guess("<meta http-equiv='content-type' content='text/html; charset=\"UTF-8\">")).to eql('UTF-8')
|
45
|
+
end
|
46
|
+
|
47
|
+
it 'should use a meta http-equiv tag with single quotes in the content value' do
|
48
|
+
expect(GuessHtmlEncoding.guess("<meta http-equiv='content-type' content=\"text/html; charset='UTF-8'\">")).to eql('UTF-8')
|
49
|
+
end
|
50
|
+
|
51
|
+
it 'should use the first charset attribute' do
|
52
|
+
expect(GuessHtmlEncoding.guess('<meta charset="UTF-9" charset="UTF-8">>')).to eql('UTF-9')
|
53
|
+
end
|
54
|
+
|
55
|
+
it 'should use the charset value over the content value' do
|
56
|
+
expect(GuessHtmlEncoding.guess('<meta http-equiv="content-type" content="charset=UTF-8" charset="UTF-9">')).to eql('UTF-9')
|
57
|
+
end
|
58
|
+
|
59
|
+
it 'should use the charset value if it appears before http-equiv' do
|
60
|
+
expect(GuessHtmlEncoding.guess('<meta content="charset=UTF-8" charset="UTF-9" http-equiv="content-type" >')).to eql('UTF-9')
|
61
|
+
end
|
62
|
+
|
63
|
+
it 'should ignore meta tags with content attribute but no http-equiv' do
|
64
|
+
expect(GuessHtmlEncoding.guess('<meta content="charset=UTF-8" ><meta charset="UTF-9">')).to eql('UTF-9')
|
65
|
+
end
|
66
|
+
|
67
|
+
it 'should ignore a commented-out meta tag' do
|
68
|
+
expect(GuessHtmlEncoding.guess('<!DOCTYPE html><!--<meta charset="UTF-9">--><meta charset="UTF-8">')).to eql('UTF-8')
|
69
|
+
end
|
70
|
+
|
71
|
+
it 'should ignore a minimal comment' do
|
72
|
+
expect(GuessHtmlEncoding.guess('<!DOCTYPE html><html><!--><meta charset="UTF-9"></html>')).to eql('UTF-9')
|
73
|
+
end
|
74
|
+
|
75
|
+
it 'should ignore an oddly commented out meta tag using <! >' do
|
76
|
+
expect(GuessHtmlEncoding.guess('<!DOCTYPE html><!<meta charset="UTF-9">><meta charset="UTF-8">')).to eql('UTF-8')
|
77
|
+
end
|
78
|
+
|
79
|
+
it 'should ignore an oddly commented out meta tag using </ >' do
|
80
|
+
expect(GuessHtmlEncoding.guess('<!DOCTYPE html></<meta charset="UTF-9">><meta charset="UTF-8">')).to eql('UTF-8')
|
81
|
+
end
|
82
|
+
|
83
|
+
it 'should ignore an oddly commented out meta tag using <? ?>' do
|
84
|
+
expect(GuessHtmlEncoding.guess('<!DOCTYPE html><?<meta charset="UTF-9">?><meta charset="UTF-8">')).to eql('UTF-8')
|
85
|
+
end
|
86
|
+
|
87
|
+
it 'should ignore a <metadata> tag' do
|
88
|
+
expect(GuessHtmlEncoding.guess('<metadata test="yes" charset="UTF-9"><meta charset="UTF-8">')).to eql('UTF-8')
|
89
|
+
end
|
90
|
+
|
91
|
+
it 'should only search the first 2500 characters' do
|
92
|
+
html = 2500.times.collect { ' ' }.join + '<meta charset="UTF-8">'
|
93
|
+
expect(GuessHtmlEncoding.guess(html)).to eql(nil)
|
94
|
+
end
|
95
|
+
|
6
96
|
it "can use headers" do
|
7
97
|
guess = GuessHtmlEncoding.guess("<html><body><div>hi!</div></body></html>",
|
8
98
|
"Hello: world\nContent-Type: text/html; charset=LATIN1\nFoo: bar")
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: guess_html_encoding
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.11
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Cantino (Iteration Labs, LLC)
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2015-02-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|