guess_html_encoding 0.0.10 → 0.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: f24b82e186d3e1a58cd2061c7cb1eef2f5b5d1b0
4
- data.tar.gz: cbfd0284000e074ef621763a36ca2be60cbed218
3
+ metadata.gz: c4a5a5d5cd40292d68650b9dd471adb932f424cd
4
+ data.tar.gz: b6f65abc65007e6cf570520eb78cf24adeb6d6bf
5
5
  SHA512:
6
- metadata.gz: 4d68030d7c0af216faa1e1dc029c65b6557287a8349aa89ec2a7a98833de4178a838693d2bf3e866b966edd597951e0d31c26f3a4c33daab30c7afa93692b7a5
7
- data.tar.gz: e2ddc685bae62c4cc6e962dd79a4f69863aef512b556957b2ab91113b492c3b07d7315f880c57179c8a9aea30d0279823983a21bbdcc6cbbc697cfc9ef2ada30
6
+ metadata.gz: 35d9b1b1b2b42b4b17bcaa49cf908143ba566a214f2637f2b335ba0157191ad52d20b632c9d4ee5de82aeecc088ba3bd6c6023b46d994af69c5c1680353deecd
7
+ data.tar.gz: 91534f086571eac16bd248bb22189e7a88af4b66438dd1ad690c98f4ed8d2122a6572231c6573e87cd7fecd84af0b38e1c1c52764456ddc20708daa848a96593
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- guess_html_encoding (0.0.9)
4
+ guess_html_encoding (0.0.11)
5
5
 
6
6
  GEM
7
7
  remote: http://rubygems.org/
@@ -19,11 +19,9 @@ module GuessHtmlEncoding
19
19
  end
20
20
 
21
21
  if out.nil? || out.empty? || !encoding_loaded?(out)
22
- if html =~ /<meta[^>]*HTTP-EQUIV=["']?Content-Type["']?[^>]*content=["']([^'"]*)["']/i && $1 =~ /charset=([\w\d-]+);?/i
23
- out = $1
24
- elsif html =~ /<meta\s+charset=["']([\w\d-]+)?/i
25
- out = $1
26
- end
22
+
23
+ out = HTMLScanner.new(html[0,2500]).encoding || out
24
+
27
25
  out.upcase! unless out.nil?
28
26
  end
29
27
 
@@ -55,4 +53,282 @@ module GuessHtmlEncoding
55
53
  def self.encoding_loaded?(encoding)
56
54
  !!Encoding.find(encoding) rescue nil
57
55
  end
56
+
57
+ class HTMLScanner
58
+
59
+ def initialize(html)
60
+ @html = html
61
+ end
62
+
63
+ # Returns the encoding sniffed from the content of an HTML page, as determined using an
64
+ # implementation of the algorithm to 'prescan a byte stream to determine its encoding', as
65
+ # specified by the HTML specification:
66
+ # http://www.w3.org/html/wg/drafts/html/master/syntax.html#prescan-a-byte-stream-to-determine-its-encoding
67
+ def encoding
68
+
69
+ position = 0
70
+ charset = nil
71
+ length = @html.length
72
+
73
+ done = false
74
+
75
+ while position < length && !done
76
+
77
+ # First look for a standard HTML comment (ie <!-- blah -->)
78
+ if @html[position, 4] == '<!--'
79
+
80
+ position += 2
81
+
82
+ position += (@html[position, length].index('-->') || length)
83
+
84
+ # Then look for the start of a meta tag
85
+ elsif @html[position, 6] =~ /\A\<meta[\s\/]/i
86
+
87
+ charset, position_increment = charset_from_meta(@html[position + 5, length])
88
+
89
+ break if charset
90
+
91
+ position += position_increment
92
+
93
+ # Then look for <! or </ or <?
94
+ elsif @html[position, 2] =~ /\A\<[\!\/\?]/
95
+
96
+ # Advance position to the first > that appears next in string, or end
97
+ position += @html[position, length].index('>') || length
98
+
99
+ else
100
+ # Do nothing. (This is just here to make the algorithm easier to follow)
101
+ end
102
+
103
+ # Advance position to next character
104
+ position += 1
105
+ end
106
+
107
+ charset
108
+ end
109
+
110
+ private
111
+
112
+
113
+ # Given a string which starts with the space or slash following a `<meta`,
114
+ # looks for a charset and returns it along with the position of the next
115
+ # character following the closing `>` character
116
+ def charset_from_meta(string)
117
+
118
+ position = 0
119
+ attribute_list = {}
120
+ got_pragma = false
121
+ need_pragma = nil
122
+ charset = nil
123
+ length = string.length
124
+
125
+ while position < length
126
+
127
+ attribute, position_increment = attribute(string[position, length])
128
+
129
+ position += position_increment.to_i
130
+
131
+ if attribute == nil
132
+
133
+ break
134
+
135
+ elsif attribute_list[attribute[:attribute_name]]
136
+
137
+ # Do nothing
138
+
139
+ else
140
+
141
+ # found a new attribute. Add it to the list
142
+ attribute_list[attribute[:attribute_name]] = attribute[:attribute_value]
143
+
144
+ if attribute[:attribute_name] == 'http-equiv'
145
+
146
+ got_pragma = true
147
+
148
+ elsif attribute[:attribute_name] == 'content'
149
+
150
+ content_charset = charset_from_meta_content(attribute[:attribute_value])
151
+
152
+ if content_charset && charset == nil
153
+ charset = content_charset
154
+ need_pragma = true
155
+ end
156
+
157
+ elsif attribute[:attribute_name] == 'charset'
158
+
159
+ charset = attribute[:attribute_value]
160
+ need_pragma = false
161
+
162
+ end
163
+
164
+ end
165
+
166
+ end
167
+
168
+ if need_pragma == nil || (need_pragma == true && got_pragma == false)
169
+ [nil, position]
170
+ else
171
+ [charset, position]
172
+ end
173
+
174
+ end
175
+
176
+ # Given a string representing the 'content' attribute value of a meta tag
177
+ # with an `http-equiv` attribute, returns the charset specified within that
178
+ # value, or nil.
179
+ def charset_from_meta_content(string)
180
+
181
+ charset_match = string.match(/charset\s*\=\s*(.+)/i)
182
+
183
+ if charset_match
184
+
185
+ charset_value = charset_match[1]
186
+
187
+ charset_value[/\A\"(.*)\"/, 1] ||
188
+ charset_value[/\A\'(.*)\'/, 1] ||
189
+ charset_value[/(.*)[\s;]/, 1] ||
190
+ charset_value[/(.*)/, 1]
191
+ else
192
+ nil
193
+ end
194
+
195
+ end
196
+
197
+ # Given a string, returns the first attribute in the string (as a hash), and
198
+ # the position of the next character in the string
199
+ def attribute(string)
200
+
201
+ attribute_name = ""
202
+ attribute_value = ""
203
+
204
+ length = string.length
205
+ position = 0
206
+
207
+ return [nil, nil] if length == 0
208
+
209
+ while position < (length)
210
+
211
+ # If character matches 0x09 (ASCII TAB), 0x0A (ASCII LF), 0x0C (ASCII FF), 0x0D (ASCII CR), 0x20 (ASCII space), or 0x2F (ASCII /) then advance position
212
+ if string[position] =~ /[\u{09}\u{0A}\u{0C}\u{0D}\u{20}\u{2f}]/
213
+
214
+ position += 1
215
+
216
+ elsif string[position] == '>'
217
+
218
+ attribute_name = nil
219
+ break
220
+
221
+ else
222
+
223
+ while position < length
224
+
225
+ if string[position] == '=' && attribute_name != ''
226
+
227
+ attribute_value, position_increment = attribute_value(string[position + 1, length])
228
+
229
+ position += position_increment + 1
230
+
231
+ break
232
+
233
+ elsif string[position] =~ /[\>\/]/
234
+
235
+ break
236
+
237
+ elsif string[position] =~ /[A-Z]/
238
+
239
+ attribute_name += string[position].downcase
240
+ position += 1
241
+
242
+ else
243
+ attribute_name += string[position]
244
+ position += 1
245
+ end
246
+
247
+ end
248
+
249
+ break
250
+
251
+ end
252
+
253
+ end
254
+
255
+ if attribute_name
256
+ [{attribute_name: attribute_name, attribute_value: attribute_value}, position]
257
+ else
258
+ [nil, position]
259
+ end
260
+
261
+ end
262
+
263
+ # Given a string, this returns the attribute value from the start of the string,
264
+ # and the position of the following character in the string
265
+ def attribute_value(string)
266
+
267
+ attribute_value = ''
268
+ position = 0
269
+ length = string.length
270
+
271
+ while position < length
272
+
273
+ # If the character is 0x09 (ASCII TAB), 0x0A (ASCII LF), 0x0C (ASCII FF), 0x0D (ASCII CR), or 0x20 (ASCII space), advance position to the next character and repeat this step.
274
+ if string[position] =~ /[\u{09}\u{0A}\u{0C}\u{0D}\u{20}]/
275
+
276
+ position += 1
277
+
278
+ elsif string[position] =~ /['"]/
279
+
280
+ attribute_value, position = quoted_value(string[position, length])
281
+ break
282
+
283
+ elsif string[position] == '>'
284
+ position += 1
285
+ break
286
+
287
+ else
288
+ attribute_value, position = unquoted_value(string[position, length])
289
+ break
290
+ end
291
+ end
292
+
293
+ [attribute_value, position]
294
+ end
295
+
296
+ # Given a string, at the start of which is a quoted attribute value, returns
297
+ # that attribute value, and the position of the next character in the string
298
+ # (following the second matching quote mark)
299
+ def quoted_value(string)
300
+
301
+ attribute_value = ""
302
+ quote_type = string[0]
303
+ position = 1
304
+ length = string.length
305
+
306
+ while position < length
307
+
308
+ if string[position] == quote_type
309
+ position += 1
310
+ break
311
+ else
312
+ attribute_value += downcase_A_to_Z_only(string[position])
313
+ position += 1
314
+ end
315
+
316
+ end
317
+
318
+ [attribute_value, position]
319
+ end
320
+
321
+ # Given a string, at the start of which is an unquoted attribute value, returns
322
+ # that attribute value, and the position of the next character in the string
323
+ def unquoted_value(string)
324
+ downcased_value = downcase_A_to_Z_only(string[/\A[^\t\u{0A}\u{0C}\u{0D}\u{20}\>]*/])
325
+ [downcased_value, downcased_value.length]
326
+ end
327
+
328
+ # Downcases the A-Z characters only (eg not É -> é)
329
+ def downcase_A_to_Z_only(string)
330
+ string.gsub(/([A-Z])/) { |match| match.downcase }
331
+ end
332
+
333
+ end
58
334
  end
@@ -1,3 +1,3 @@
1
1
  module GuessHtmlEncoding
2
- VERSION = "0.0.10"
2
+ VERSION = "0.0.11"
3
3
  end
@@ -3,6 +3,96 @@ require 'spec_helper'
3
3
 
4
4
  describe "GuessHtmlEncoding" do
5
5
  describe "#guess" do
6
+
7
+ it 'should use an uppercased unquoted meta tag' do
8
+ expect(GuessHtmlEncoding.guess('<META CHARSET=UTF-8>')).to eql('UTF-8')
9
+ end
10
+
11
+ it 'should use a quoted meta tag' do
12
+ expect(GuessHtmlEncoding.guess('<meta charset="UTF-8">')).to eql('UTF-8')
13
+ end
14
+
15
+ it 'should use a http-equiv meta tag' do
16
+ expect(GuessHtmlEncoding.guess('<meta http-equiv="content-type" content="charset=UTF-8">')).to eql('UTF-8')
17
+ end
18
+
19
+ it 'should use a http-equiv meta tag with semi-colons in the content value' do
20
+ expect(GuessHtmlEncoding.guess('<meta http-equiv="content-type" content="text/html; charset=UTF-8;">')).to eql('UTF-8')
21
+ end
22
+
23
+ it 'should use a http-equiv meta tag with attributes in unusual order' do
24
+ expect(GuessHtmlEncoding.guess('<meta content="text/html; charset=UTF-8;" http-equiv="content-type">')).to eql('UTF-8')
25
+ end
26
+
27
+ it 'should use a http-equiv meta tag with attributes in unusual order' do
28
+ expect(GuessHtmlEncoding.guess('<meta><meta charset="UTF-8">')).to eql('UTF-8')
29
+ end
30
+
31
+ it 'should use the first meta tag with a charset value' do
32
+ expect(GuessHtmlEncoding.guess('<meta charset="UTF-9"><meta charset="UTF-8">')).to eql('UTF-9')
33
+ end
34
+
35
+ it 'should use a meta http-equiv tag with spaces in the content value' do
36
+ expect(GuessHtmlEncoding.guess("<meta http-equiv='content-type' content=' text/html ; charset = UTF-8;'>")).to eql('UTF-8')
37
+ end
38
+
39
+ it 'should use a meta http-equiv tag with newlines in the content value' do
40
+ expect(GuessHtmlEncoding.guess("<meta http-equiv='content-type' content='\t\ncharset=UTF-8\n'>")).to eql('UTF-8')
41
+ end
42
+
43
+ it 'should use a meta http-equiv tag with double quotes in the content value' do
44
+ expect(GuessHtmlEncoding.guess("<meta http-equiv='content-type' content='text/html; charset=\"UTF-8\">")).to eql('UTF-8')
45
+ end
46
+
47
+ it 'should use a meta http-equiv tag with single quotes in the content value' do
48
+ expect(GuessHtmlEncoding.guess("<meta http-equiv='content-type' content=\"text/html; charset='UTF-8'\">")).to eql('UTF-8')
49
+ end
50
+
51
+ it 'should use the first charset attribute' do
52
+ expect(GuessHtmlEncoding.guess('<meta charset="UTF-9" charset="UTF-8">>')).to eql('UTF-9')
53
+ end
54
+
55
+ it 'should use the charset value over the content value' do
56
+ expect(GuessHtmlEncoding.guess('<meta http-equiv="content-type" content="charset=UTF-8" charset="UTF-9">')).to eql('UTF-9')
57
+ end
58
+
59
+ it 'should use the charset value if it appears before http-equiv' do
60
+ expect(GuessHtmlEncoding.guess('<meta content="charset=UTF-8" charset="UTF-9" http-equiv="content-type" >')).to eql('UTF-9')
61
+ end
62
+
63
+ it 'should ignore meta tags with content attribute but no http-equiv' do
64
+ expect(GuessHtmlEncoding.guess('<meta content="charset=UTF-8" ><meta charset="UTF-9">')).to eql('UTF-9')
65
+ end
66
+
67
+ it 'should ignore a commented-out meta tag' do
68
+ expect(GuessHtmlEncoding.guess('<!DOCTYPE html><!--<meta charset="UTF-9">--><meta charset="UTF-8">')).to eql('UTF-8')
69
+ end
70
+
71
+ it 'should ignore a minimal comment' do
72
+ expect(GuessHtmlEncoding.guess('<!DOCTYPE html><html><!--><meta charset="UTF-9"></html>')).to eql('UTF-9')
73
+ end
74
+
75
+ it 'should ignore an oddly commented out meta tag using <! >' do
76
+ expect(GuessHtmlEncoding.guess('<!DOCTYPE html><!<meta charset="UTF-9">><meta charset="UTF-8">')).to eql('UTF-8')
77
+ end
78
+
79
+ it 'should ignore an oddly commented out meta tag using </ >' do
80
+ expect(GuessHtmlEncoding.guess('<!DOCTYPE html></<meta charset="UTF-9">><meta charset="UTF-8">')).to eql('UTF-8')
81
+ end
82
+
83
+ it 'should ignore an oddly commented out meta tag using <? ?>' do
84
+ expect(GuessHtmlEncoding.guess('<!DOCTYPE html><?<meta charset="UTF-9">?><meta charset="UTF-8">')).to eql('UTF-8')
85
+ end
86
+
87
+ it 'should ignore a <metadata> tag' do
88
+ expect(GuessHtmlEncoding.guess('<metadata test="yes" charset="UTF-9"><meta charset="UTF-8">')).to eql('UTF-8')
89
+ end
90
+
91
+ it 'should only search the first 2500 characters' do
92
+ html = 2500.times.collect { ' ' }.join + '<meta charset="UTF-8">'
93
+ expect(GuessHtmlEncoding.guess(html)).to eql(nil)
94
+ end
95
+
6
96
  it "can use headers" do
7
97
  guess = GuessHtmlEncoding.guess("<html><body><div>hi!</div></body></html>",
8
98
  "Hello: world\nContent-Type: text/html; charset=LATIN1\nFoo: bar")
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: guess_html_encoding
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.10
4
+ version: 0.0.11
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Cantino (Iteration Labs, LLC)
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-12-14 00:00:00.000000000 Z
11
+ date: 2015-02-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec