rchardet 1.9.0 → 1.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 28403d2d9a7f0a681c74f8f9e71a7cee248f5b369731c64a29f9e5a4cd221de8
4
- data.tar.gz: a1b76fc7660bdc0e15797d036af4f90116225985baa37b5cc14d1cb974d64469
3
+ metadata.gz: ae69d0c1bf33f290595011946f42312ccc7d3c9d4691096a2f26faf3125b398e
4
+ data.tar.gz: 51fcc82f0d020c9a9815bcec0cb10f12004bc8dbf699c55483e6e7a773da2a95
5
5
  SHA512:
6
- metadata.gz: 4381fd73d549f71bab23af5677801e0a4b130f406fdd2e0ac352c5319ae719db675791a3f1c205de874f50aace623ca0cd70fc23546e8cfe150ff5a325435fd7
7
- data.tar.gz: 65f6e93b9a127e247781a6273d7be44c0f1dd21374f5592d8938799da698b33443416b2806fe47f3d73c530085fd8af0e841637435adde1487fd4a09a0f35597
6
+ metadata.gz: 88e2bee99000fdb57c6c24f3e1d07ab543166979fb25050b9775964f83a838ab0e55d3f6abd158676250355a9b47e30333c574cb2090052f55dd763042798fcc
7
+ data.tar.gz: d7d5c7316e1004617b8767f0eab6cc8658f44519f08641888fb19c5c1cfc53e928a153cc4c85ddc8131fdef0fd51ac72508c9fd2263b8fc943466fdca7f7a792
@@ -42,6 +42,7 @@ module CharDet
42
42
  @highBitDetector = /[\x80-\xFF]/n
43
43
  @escDetector = /(\033|\~\{)/n
44
44
  @escCharSetProber = nil
45
+ @utf1632prober = nil
45
46
  @charSetProbers = []
46
47
  reset()
47
48
  end
@@ -56,6 +57,9 @@ module CharDet
56
57
  if @escCharSetProber
57
58
  @escCharSetProber.reset()
58
59
  end
60
+ if @utf1632prober
61
+ @utf1632prober.reset()
62
+ end
59
63
  for prober in @charSetProbers
60
64
  prober.reset()
61
65
  end
@@ -117,6 +121,22 @@ module CharDet
117
121
  end
118
122
 
119
123
  @lastChar = aBuf[-1, 1]
124
+
125
+ if !@utf1632prober
126
+ @utf1632prober = UTF1632Prober.new()
127
+ end
128
+
129
+ if @utf1632prober.get_state == EDetecting
130
+ if @utf1632prober.feed(aBuf) == EFoundIt
131
+ @result = {
132
+ "encoding" => @utf1632prober.get_charset_name(),
133
+ "confidence" => @utf1632prober.get_confidence()
134
+ }
135
+ @done = true
136
+ return
137
+ end
138
+ end
139
+
120
140
  if @inputState == EEscAscii
121
141
  if !@escCharSetProber
122
142
  @escCharSetProber = EscCharSetProber.new()
@@ -0,0 +1,232 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Jeff Hodges - port to Ruby
11
+ # Mark Pilgrim - port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ module CharDet
30
+ MIN_CHARS_FOR_DETECTION = 20
31
+ EXPECTED_RATIO = 0.94
32
+
33
+ class UTF1632Prober < CharSetProber
34
+ def initialize
35
+ super()
36
+ @position = 0
37
+ @zeros_at_mod = [0, 0, 0, 0]
38
+ @nonzeros_at_mod = [0, 0, 0, 0]
39
+ @state = EDetecting
40
+ @quad = [0, 0, 0, 0]
41
+ @invalid_utf16be = false
42
+ @invalid_utf16le = false
43
+ @invalid_utf32be = false
44
+ @invalid_utf32le = false
45
+ @first_half_surrogate_pair_detected_16be = false
46
+ @first_half_surrogate_pair_detected_16le = false
47
+ reset()
48
+ end
49
+
50
+ def reset
51
+ super()
52
+ @position = 0
53
+ @zeros_at_mod = [0, 0, 0, 0]
54
+ @nonzeros_at_mod = [0, 0, 0, 0]
55
+ @state = EDetecting
56
+ @invalid_utf16be = false
57
+ @invalid_utf16le = false
58
+ @invalid_utf32be = false
59
+ @invalid_utf32le = false
60
+ @first_half_surrogate_pair_detected_16be = false
61
+ @first_half_surrogate_pair_detected_16le = false
62
+ @quad = [0, 0, 0, 0]
63
+ end
64
+
65
+ def get_charset_name
66
+ if is_likely_utf32be
67
+ return "UTF-32BE"
68
+ end
69
+ if is_likely_utf32le
70
+ return "UTF-32LE"
71
+ end
72
+ if is_likely_utf16be
73
+ return "UTF-16BE"
74
+ end
75
+ if is_likely_utf16le
76
+ return "UTF-16LE"
77
+ end
78
+ # default to something valid
79
+ return "UTF-16"
80
+ end
81
+
82
+ def feed(aBuf)
83
+ aBuf.each_byte do |b|
84
+ mod4 = @position % 4
85
+ @quad[mod4] = b
86
+ if mod4 == 3
87
+ validate_utf32_characters(@quad)
88
+ validate_utf16_characters(@quad[0..2])
89
+ validate_utf16_characters(@quad[2..4])
90
+ end
91
+ if b == 0
92
+ @zeros_at_mod[mod4] += 1
93
+ else
94
+ @nonzeros_at_mod[mod4] += 1
95
+ end
96
+ @position += 1
97
+ end
98
+
99
+ return get_state()
100
+ end
101
+
102
+ def get_state
103
+ if [ENotMe, EFoundIt].include? @state
104
+ # terminal, decided states
105
+ return @state
106
+ end
107
+ if get_confidence > 0.80
108
+ @state = EFoundIt
109
+ elsif @position > 4 * 1024
110
+ # if we get to 4kb into the file, and we can't conclude it's UTF,
111
+ # let's give up
112
+ @state = ENotMe
113
+ end
114
+ return @state
115
+ end
116
+
117
+ def get_confidence
118
+ if is_likely_utf16le || is_likely_utf16be || is_likely_utf32le || is_likely_utf32be
119
+ 0.85
120
+ else
121
+ 0.00
122
+ end
123
+ end
124
+
125
+ private
126
+
127
+ def approx_32bit_chars
128
+ return [1.0, @position / 4.0].max
129
+ end
130
+
131
+ def approx_16bit_chars
132
+ return [1.0, @position / 2.0].max
133
+ end
134
+
135
+ def is_likely_utf32be
136
+ approx_chars = approx_32bit_chars
137
+ return approx_chars >= MIN_CHARS_FOR_DETECTION &&
138
+ @zeros_at_mod[0] / approx_chars > EXPECTED_RATIO &&
139
+ @zeros_at_mod[1] / approx_chars > EXPECTED_RATIO &&
140
+ @zeros_at_mod[2] / approx_chars > EXPECTED_RATIO &&
141
+ @nonzeros_at_mod[3] / approx_chars > EXPECTED_RATIO &&
142
+ !@invalid_utf32be
143
+
144
+ end
145
+
146
+ def is_likely_utf32le
147
+ approx_chars = approx_32bit_chars
148
+ return approx_chars >= MIN_CHARS_FOR_DETECTION &&
149
+ @nonzeros_at_mod[0] / approx_chars > EXPECTED_RATIO &&
150
+ @zeros_at_mod[1] / approx_chars > EXPECTED_RATIO &&
151
+ @zeros_at_mod[2] / approx_chars > EXPECTED_RATIO &&
152
+ @zeros_at_mod[3] / approx_chars > EXPECTED_RATIO &&
153
+ !@invalid_utf32le
154
+ end
155
+
156
+ def is_likely_utf16be
157
+ approx_chars = approx_16bit_chars
158
+ return approx_chars >= MIN_CHARS_FOR_DETECTION &&
159
+ (@nonzeros_at_mod[1] + @nonzeros_at_mod[3]) / approx_chars > EXPECTED_RATIO &&
160
+ (@zeros_at_mod[0] + @zeros_at_mod[2]) / approx_chars > EXPECTED_RATIO &&
161
+ !@invalid_utf16be
162
+ end
163
+
164
+ def is_likely_utf16le
165
+ approx_chars = approx_16bit_chars
166
+ return approx_chars >= MIN_CHARS_FOR_DETECTION &&
167
+ (@nonzeros_at_mod[0] + @nonzeros_at_mod[2]) / approx_chars > EXPECTED_RATIO &&
168
+ (@zeros_at_mod[1] + @zeros_at_mod[3]) / approx_chars > EXPECTED_RATIO &&
169
+ !@invalid_utf16le
170
+ end
171
+
172
+ # @param [Array<Integer>] quad four consecutive bytes
173
+ # @return [void]
174
+ def validate_utf32_characters(quad)
175
+ "" "
176
+ Validate if the quad of bytes is valid UTF-32.
177
+
178
+ UTF-32 is valid in the range 0x00000000 - 0x0010FFFF
179
+ excluding 0x0000D800 - 0x0000DFFF
180
+
181
+ https://en.wikipedia.org/wiki/UTF-32
182
+ " ""
183
+ if quad[0] != 0 or quad[1] > 0x10 or quad[0] == 0 and quad[1] == 0 and (0xD8..0xDF).include?(quad[2])
184
+ @invalid_utf32be = true
185
+ end
186
+ if quad[3] != 0 or quad[2] > 0x10 or quad[3] == 0 and quad[2] == 0 and (0xD8..0xDF).include?(quad[1])
187
+ @invalid_utf32le = true
188
+ end
189
+ end
190
+
191
+ # @param [Array<Integer>] pair two consecutive bytes
192
+ # @return [void]
193
+ def validate_utf16_characters(pair)
194
+ "" "
195
+ Validate if the pair of bytes is valid UTF-16.
196
+
197
+ UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
198
+ with an exception for surrogate pairs, which must be in the range
199
+ 0xD800-0xDBFF followed by 0xDC00-0xDFFF
200
+
201
+ https://en.wikipedia.org/wiki/UTF-16
202
+ " ""
203
+ if !@first_half_surrogate_pair_detected_16be
204
+ if (0xD8..0xDB).include? pair[0]
205
+ @first_half_surrogate_pair_detected_16be = true
206
+ elsif (0xDC..0xDF).include? pair[0]
207
+ @invalid_utf16be = true
208
+ end
209
+ else
210
+ if (0xDC..0xDF).include? pair[0]
211
+ @first_half_surrogate_pair_detected_16be = false
212
+ else
213
+ @invalid_utf16be = true
214
+ end
215
+ end
216
+
217
+ if not @first_half_surrogate_pair_detected_16le
218
+ if (0xD8..0xDB).include? pair[1]
219
+ @first_half_surrogate_pair_detected_16le = true
220
+ elsif (0xDC..0xDF).include? pair[1]
221
+ @invalid_utf16le = true
222
+ end
223
+ else
224
+ if (0xDC..0xDF).include? pair[1]
225
+ @first_half_surrogate_pair_detected_16le = false
226
+ else
227
+ @invalid_utf16le = true
228
+ end
229
+ end
230
+ end
231
+ end
232
+ end
@@ -1,3 +1,3 @@
1
1
  module CharDet
2
- VERSION = "1.9.0"
2
+ VERSION = "1.10.0"
3
3
  end
data/lib/rchardet.rb CHANGED
@@ -53,6 +53,7 @@ require 'rchardet/sbcsgroupprober'
53
53
  require 'rchardet/sjisprober'
54
54
  require 'rchardet/universaldetector'
55
55
  require 'rchardet/utf8prober'
56
+ require 'rchardet/utf1632prober'
56
57
 
57
58
  module CharDet
58
59
  def CharDet.detect(aBuf)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rchardet
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.9.0
4
+ version: 1.10.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Michael Grosser
8
8
  - Jeff Hodges
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-01-05 00:00:00.000000000 Z
11
+ date: 2025-09-10 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  email:
14
14
  - michael@grosser.it
@@ -52,6 +52,7 @@ files:
52
52
  - lib/rchardet/sbcsgroupprober.rb
53
53
  - lib/rchardet/sjisprober.rb
54
54
  - lib/rchardet/universaldetector.rb
55
+ - lib/rchardet/utf1632prober.rb
55
56
  - lib/rchardet/utf8prober.rb
56
57
  - lib/rchardet/version.rb
57
58
  homepage: https://github.com/jmhodges/rchardet