philiprehberger-encoding_kit 0.1.1 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +19 -0
- data/README.md +78 -9
- data/lib/philiprehberger/encoding_kit/converter.rb +2 -1
- data/lib/philiprehberger/encoding_kit/detection_result.rb +85 -0
- data/lib/philiprehberger/encoding_kit/detector.rb +96 -13
- data/lib/philiprehberger/encoding_kit/version.rb +1 -1
- data/lib/philiprehberger/encoding_kit.rb +121 -1
- metadata +6 -4
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 13e2132a0833297e1443ae3c9fc280b2c75ca43c9092e2b6d4d1431981ab90af
|
|
4
|
+
data.tar.gz: 86091aa48e10920818add35a6e9c147095495096f284d8e00a93d6a3f0ab0596
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 55423bdb247b979775199f4ff83043c5bfebff3963ae8f5c0256ca86482e2fd6ded64fe54aded1f06864c387e5a852129baa5a7e8ffb35f47e0dd756e5750b96
|
|
7
|
+
data.tar.gz: eb8ca82fa4fae2670b5381dca826e95bc7dd281811c97d9488080d2a7666771cc927f16ee8715a05a07418d1b0f4da6096e47f3dcb09bb419e9816c8e284a807
|
data/CHANGELOG.md
CHANGED
|
@@ -7,6 +7,25 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [0.2.1] - 2026-03-31
|
|
11
|
+
|
|
12
|
+
### Changed
|
|
13
|
+
- Standardize README badges, support section, and license format
|
|
14
|
+
|
|
15
|
+
## [0.2.0] - 2026-03-28
|
|
16
|
+
|
|
17
|
+
### Added
|
|
18
|
+
|
|
19
|
+
- Confidence scores: `detect` returns a `DetectionResult` with `.encoding` and `.confidence` (1.0 for BOM, 0.5-0.9 for heuristics)
|
|
20
|
+
- `DetectionResult` delegates to `Encoding` for backward compatibility (e.g., `result == Encoding::UTF_8` still works)
|
|
21
|
+
- Streaming detection: `detect_stream(io, sample_size: 4096)` reads a sample from IO objects
|
|
22
|
+
- Encoding analysis: `analyze(string)` returns byte distribution stats and ranked candidates
|
|
23
|
+
- Windows codepage support: CP1252, CP1250, CP1251 detection via 0x80-0x9F byte patterns
|
|
24
|
+
- Transcode alias: `transcode(string, to:, fallback:, replace:)` for simplified auto-detect-and-convert
|
|
25
|
+
- Issue templates for bug reports and feature requests
|
|
26
|
+
- Dependabot configuration for bundler and GitHub Actions
|
|
27
|
+
- Pull request template
|
|
28
|
+
|
|
10
29
|
## [0.1.1] - 2026-03-26
|
|
11
30
|
|
|
12
31
|
### Added
|
data/README.md
CHANGED
|
@@ -2,8 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
[](https://github.com/philiprehberger/rb-encoding-kit/actions/workflows/ci.yml)
|
|
4
4
|
[](https://rubygems.org/gems/philiprehberger-encoding_kit)
|
|
5
|
-
[](https://github.com/sponsors/philiprehberger)
|
|
5
|
+
[](https://github.com/philiprehberger/rb-encoding-kit/commits/main)
|
|
7
6
|
|
|
8
7
|
Character encoding detection, conversion, and normalization
|
|
9
8
|
|
|
@@ -30,19 +29,68 @@ gem install philiprehberger-encoding_kit
|
|
|
30
29
|
```ruby
|
|
31
30
|
require "philiprehberger/encoding_kit"
|
|
32
31
|
|
|
33
|
-
|
|
32
|
+
result = Philiprehberger::EncodingKit.detect(raw_bytes)
|
|
33
|
+
result.encoding # => Encoding::UTF_8
|
|
34
|
+
result.confidence # => 0.9
|
|
34
35
|
utf8 = Philiprehberger::EncodingKit.to_utf8(raw_bytes)
|
|
35
36
|
```
|
|
36
37
|
|
|
37
|
-
### Encoding Detection
|
|
38
|
+
### Encoding Detection with Confidence
|
|
38
39
|
|
|
39
40
|
```ruby
|
|
40
41
|
require "philiprehberger/encoding_kit"
|
|
41
42
|
|
|
42
|
-
#
|
|
43
|
-
Philiprehberger::EncodingKit.detect("\xEF\xBB\xBFhello".b)
|
|
44
|
-
|
|
45
|
-
|
|
43
|
+
# Returns a DetectionResult that delegates to Encoding
|
|
44
|
+
result = Philiprehberger::EncodingKit.detect("\xEF\xBB\xBFhello".b)
|
|
45
|
+
result == Encoding::UTF_8 # => true (backward compatible)
|
|
46
|
+
result.confidence # => 1.0 (BOM detected)
|
|
47
|
+
result.name # => "UTF-8"
|
|
48
|
+
result.to_h # => {encoding: Encoding::UTF_8, confidence: 1.0}
|
|
49
|
+
|
|
50
|
+
# Heuristic detection returns lower confidence
|
|
51
|
+
result = Philiprehberger::EncodingKit.detect("caf\xC3\xA9".b)
|
|
52
|
+
result.confidence # => 0.85-0.9
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### Streaming Detection
|
|
56
|
+
|
|
57
|
+
```ruby
|
|
58
|
+
require "philiprehberger/encoding_kit"
|
|
59
|
+
|
|
60
|
+
File.open("data.csv", "rb") do |file|
|
|
61
|
+
result = Philiprehberger::EncodingKit.detect_stream(file, sample_size: 8192)
|
|
62
|
+
result.encoding # => Encoding::UTF_8
|
|
63
|
+
result.confidence # => 0.9
|
|
64
|
+
end
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### Encoding Analysis
|
|
68
|
+
|
|
69
|
+
```ruby
|
|
70
|
+
require "philiprehberger/encoding_kit"
|
|
71
|
+
|
|
72
|
+
analysis = Philiprehberger::EncodingKit.analyze(raw_bytes)
|
|
73
|
+
analysis[:encoding] # => Encoding::UTF_8
|
|
74
|
+
analysis[:confidence] # => 0.9
|
|
75
|
+
analysis[:printable_ratio] # => 0.95
|
|
76
|
+
analysis[:ascii_ratio] # => 0.8
|
|
77
|
+
analysis[:high_bytes] # => 12
|
|
78
|
+
analysis[:candidates] # => [{encoding: Encoding::UTF_8, confidence: 0.9}, ...]
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
### Transcode
|
|
82
|
+
|
|
83
|
+
```ruby
|
|
84
|
+
require "philiprehberger/encoding_kit"
|
|
85
|
+
|
|
86
|
+
# Auto-detect source, convert to UTF-8
|
|
87
|
+
utf8 = Philiprehberger::EncodingKit.transcode(raw_bytes)
|
|
88
|
+
|
|
89
|
+
# Convert to a specific encoding
|
|
90
|
+
latin1 = Philiprehberger::EncodingKit.transcode(utf8_string, to: Encoding::ISO_8859_1)
|
|
91
|
+
|
|
92
|
+
# Custom fallback behavior
|
|
93
|
+
result = Philiprehberger::EncodingKit.transcode(data, to: "UTF-8", fallback: :replace, replace: "?")
|
|
46
94
|
```
|
|
47
95
|
|
|
48
96
|
### Convert to UTF-8
|
|
@@ -97,7 +145,10 @@ Philiprehberger::EncodingKit.valid?("hello", encoding: Encoding::US_ASCII) # =>
|
|
|
97
145
|
|
|
98
146
|
| Method | Description |
|
|
99
147
|
|--------|-------------|
|
|
100
|
-
| `EncodingKit.detect(string)` | Detect encoding via BOM and heuristics, returns
|
|
148
|
+
| `EncodingKit.detect(string)` | Detect encoding via BOM and heuristics, returns a `DetectionResult` with `.encoding` and `.confidence` |
|
|
149
|
+
| `EncodingKit.detect_stream(io, sample_size: 4096)` | Detect encoding from an IO stream by sampling bytes |
|
|
150
|
+
| `EncodingKit.analyze(string)` | Analyze byte distribution and return encoding candidates with stats |
|
|
151
|
+
| `EncodingKit.transcode(string, to:, fallback:, replace:)` | Auto-detect source and convert to target encoding |
|
|
101
152
|
| `EncodingKit.to_utf8(string, from: nil)` | Convert to UTF-8, auto-detect source if `from` is nil |
|
|
102
153
|
| `EncodingKit.normalize(string)` | Force to valid UTF-8, replacing bad bytes with U+FFFD |
|
|
103
154
|
| `EncodingKit.valid?(string, encoding: nil)` | Check if string is valid in given or current encoding |
|
|
@@ -113,6 +164,24 @@ bundle exec rspec
|
|
|
113
164
|
bundle exec rubocop
|
|
114
165
|
```
|
|
115
166
|
|
|
167
|
+
## Support
|
|
168
|
+
|
|
169
|
+
If you find this project useful:
|
|
170
|
+
|
|
171
|
+
⭐ [Star the repo](https://github.com/philiprehberger/rb-encoding-kit)
|
|
172
|
+
|
|
173
|
+
🐛 [Report issues](https://github.com/philiprehberger/rb-encoding-kit/issues?q=is%3Aissue+is%3Aopen+label%3Abug)
|
|
174
|
+
|
|
175
|
+
💡 [Suggest features](https://github.com/philiprehberger/rb-encoding-kit/issues?q=is%3Aissue+is%3Aopen+label%3Aenhancement)
|
|
176
|
+
|
|
177
|
+
❤️ [Sponsor development](https://github.com/sponsors/philiprehberger)
|
|
178
|
+
|
|
179
|
+
🌐 [All Open Source Projects](https://philiprehberger.com/open-source-packages)
|
|
180
|
+
|
|
181
|
+
💻 [GitHub Profile](https://github.com/philiprehberger)
|
|
182
|
+
|
|
183
|
+
🔗 [LinkedIn Profile](https://www.linkedin.com/in/philiprehberger)
|
|
184
|
+
|
|
116
185
|
## License
|
|
117
186
|
|
|
118
187
|
[MIT](LICENSE)
|
|
@@ -35,7 +35,8 @@ module Philiprehberger
|
|
|
35
35
|
# @param from [String, Encoding, nil] source encoding (auto-detect if nil)
|
|
36
36
|
# @return [String] UTF-8 encoded string
|
|
37
37
|
def to_utf8(string, from: nil)
  # Resolve the source encoding: an explicit +from+ wins, otherwise detect it.
  source_encoding =
    if from
      Encoding.find(from.to_s)
    else
      detection = Detector.call(string)
      # Detector.call may hand back a DetectionResult wrapper or a bare
      # Encoding; unwrap only when it is the wrapper.
      detection.is_a?(DetectionResult) ? detection.encoding : detection
    end

  # Work on a copy so the caller's string keeps its original encoding tag.
  relabelled = string.dup.force_encoding(source_encoding)
  relabelled.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: "\uFFFD")
end
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Philiprehberger
  module EncodingKit
    # Pairs a detected Encoding with a confidence score.
    #
    # Public Encoding methods that this class does not define itself (for
    # example #name or #ascii_compatible?) are forwarded to the wrapped
    # Encoding, and #== accepts a bare Encoding, so
    # `result == Encoding::UTF_8` keeps working for callers written against
    # an API that returned an Encoding. Only that left-hand form is handled
    # by this class; nothing here changes how `Encoding::UTF_8 == result`
    # is evaluated.
    class DetectionResult
      attr_reader :encoding, :confidence

      # @param encoding [Encoding] the detected encoding
      # @param confidence [Float] confidence score; coerced with #to_f
      def initialize(encoding, confidence)
        @encoding = encoding
        @confidence = confidence.to_f
      end

      # Compares by encoding only; the confidence score is ignored.
      #
      # @param other [Object] an Encoding, another DetectionResult, or anything else
      # @return [Boolean] true when +other+ is (or wraps) the same encoding;
      #   for any other object the inherited #== decides
      def ==(other)
        case other
        when Encoding then @encoding == other
        when DetectionResult then @encoding == other.encoding
        else super
        end
      end

      # Same rule as #==, so results can be used as hash keys.
      #
      # @param other [Object]
      # @return [Boolean]
      def eql?(other)
        self == other
      end

      # Hash of the wrapped encoding, consistent with #== / #eql? ignoring
      # the confidence score.
      #
      # @return [Integer]
      def hash
        @encoding.hash
      end

      # @return [String] string form of the wrapped encoding
      def to_s
        @encoding.to_s
      end

      # @return [String] class name plus encoding and confidence
      def inspect
        "#<#{self.class} encoding=#{@encoding} confidence=#{@confidence}>"
      end

      # @return [Hash] `{ encoding: Encoding, confidence: Float }`
      def to_h
        { encoding: @encoding, confidence: @confidence }
      end

      # Forwards unknown messages to the wrapped Encoding.
      #
      # public_send (rather than send) keeps the delegation limited to the
      # Encoding's public interface, matching the respond_to? check below.
      def method_missing(method, ...)
        return super unless @encoding.respond_to?(method)

        @encoding.public_send(method, ...)
      end

      # Reports delegated methods through #respond_to?.
      #
      # Only public Encoding methods are advertised, because those are the
      # only ones #method_missing will actually forward.
      def respond_to_missing?(method, include_private = false)
        @encoding.respond_to?(method) || super
      end
    end
  end
end
|
|
@@ -13,29 +13,54 @@ module Philiprehberger
|
|
|
13
13
|
["\xFF\xFE".b, Encoding::UTF_16LE]
|
|
14
14
|
].freeze
|
|
15
15
|
|
|
16
|
+
# Bytes in 0x80-0x9F to which CP1252 assigns a printable character.
# ISO-8859-1 reserves this whole range for C1 control codes, which are very
# unusual in real text, so seeing these bytes strongly suggests a Windows
# codepage. The five bytes CP1252 leaves undefined (0x81, 0x8D, 0x8F, 0x90,
# 0x9D) are deliberately absent from the list.
CP1252_SPECIFIC = [
  0x80, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88,
  0x89, 0x8A, 0x8B, 0x8C, 0x8E, 0x91, 0x92, 0x93,
  0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B,
  0x9C, 0x9E, 0x9F
].freeze

# Bytes treated as CP1250 (Central European) markers: 0x8A (S-caron),
# 0x8E (Z-caron), 0x9A (s-caron), 0x9E (z-caron).
# NOTE(review): CP1252 assigns the same four characters to these bytes and
# all four also appear in CP1252_SPECIFIC, so on their own they do not
# separate CP1250 from CP1252 - confirm the intended marker set.
CP1250_MARKERS = [0x8A, 0x8E, 0x9A, 0x9E].freeze

# CP1251 (Cyrillic) maps 0x80-0xFF almost entirely to Cyrillic letters.
# Bytes 0xC0-0xFF are the basic Cyrillic alphabet in CP1251.
CP1251_RANGE = (0xC0..0xFF).freeze
|
|
34
|
+
|
|
16
35
|
class << self
|
|
17
|
-
# Detect the encoding of a byte string
|
|
36
|
+
# Detect the encoding of a byte string, returning a DetectionResult
|
|
37
|
+
# with encoding and confidence score.
|
|
18
38
|
#
|
|
19
39
|
# Strategy:
|
|
20
|
-
# 1. Check for a byte order mark (BOM)
|
|
21
|
-
# 2. Try UTF-8 validity
|
|
22
|
-
# 3. Check pure ASCII
|
|
23
|
-
# 4.
|
|
24
|
-
# 5.
|
|
40
|
+
# 1. Check for a byte order mark (BOM) - confidence 1.0
|
|
41
|
+
# 2. Try UTF-8 validity - confidence 0.9
|
|
42
|
+
# 3. Check pure ASCII - confidence 0.9
|
|
43
|
+
# 4. Check Windows codepages (CP1252, CP1250, CP1251) - confidence 0.6-0.7
|
|
44
|
+
# 5. Apply Latin-1 heuristic - confidence 0.7
|
|
45
|
+
# 6. Fall back to BINARY - confidence 0.5
|
|
25
46
|
#
|
|
26
47
|
# @param string [String] the input string (ideally with BINARY/ASCII-8BIT encoding)
|
|
27
|
-
# @return [
|
|
48
|
+
# @return [DetectionResult] the detected encoding with confidence
|
|
28
49
|
def call(string)
  raw = string.b

  # A byte order mark is decisive, so it short-circuits every heuristic.
  marked = detect_bom_with_confidence(raw)
  return marked if marked

  # The checks below run in strict priority order; the first match wins.
  if valid_utf8?(raw)
    DetectionResult.new(Encoding::UTF_8, utf8_confidence(raw))
  elsif ascii_only?(raw)
    DetectionResult.new(Encoding::US_ASCII, 0.9)
  elsif (codepage = detect_windows_codepage(raw))
    codepage
  elsif latin1_heuristic?(raw)
    DetectionResult.new(Encoding::ISO_8859_1, 0.7)
  else
    # Nothing matched: report raw binary with the lowest confidence.
    DetectionResult.new(Encoding::BINARY, 0.5)
  end
end
|
|
40
65
|
|
|
41
66
|
# Check whether the string starts with a known BOM.
|
|
@@ -51,6 +76,32 @@ module Philiprehberger
|
|
|
51
76
|
|
|
52
77
|
private
|
|
53
78
|
|
|
79
|
+
# Look for a known byte order mark at the start of +bytes+.
#
# BOMS is searched in its declared order and the first matching prefix wins.
#
# @param bytes [String] binary string
# @return [DetectionResult, nil] result with confidence 1.0, or nil when
#   no BOM prefix matches
def detect_bom_with_confidence(bytes)
  match = BOMS.find { |bom, _encoding| bytes.start_with?(bom) }
  return nil unless match

  DetectionResult.new(match.last, 1.0)
end
|
|
89
|
+
|
|
90
|
+
# Score how convincing a UTF-8 verdict is from the share of non-ASCII bytes.
#
# Input in which more than 10% of the bytes are >= 0x80 scores 0.9; anything
# else scores 0.85. Empty input scores 0.9.
#
# @param bytes [String] binary string
# @return [Float] 0.85 or 0.9
def utf8_confidence(bytes)
  return 0.9 if bytes.empty?

  non_ascii = bytes.each_byte.count { |byte| byte > 0x7F }

  # A larger share of multibyte data makes an accidental match less likely.
  non_ascii.fdiv(bytes.bytesize) > 0.1 ? 0.9 : 0.85
end
|
|
104
|
+
|
|
54
105
|
# @param bytes [String] binary string
|
|
55
106
|
# @return [Boolean]
|
|
56
107
|
def valid_utf8?(bytes)
|
|
@@ -64,6 +115,38 @@ module Philiprehberger
|
|
|
64
115
|
bytes.each_byte.all? { |b| b < 128 }
|
|
65
116
|
end
|
|
66
117
|
|
|
118
|
+
# Guess a Windows codepage from bytes in the 0x80-0x9F range.
#
# Returns nil unless at least one byte falls in 0x80-0x9F. Otherwise the
# checks run in this order and the first match wins:
#
# 1. CP1251 (0.65) - more than 60% of all bytes >= 0x80 lie in CP1251_RANGE
# 2. CP1250 (0.6)  - at least two bytes are listed in CP1250_MARKERS
# 3. CP1252 (0.6, or 0.7 with more than three hits) - at least one byte is
#    listed in CP1252_SPECIFIC
#
# NOTE(review): text that contains no 0x80-0x9F byte at all is never
# classified here, whatever its share of 0xC0-0xFF bytes - confirm that is
# intended for CP1251 input.
#
# @param bytes [String] binary string
# @return [DetectionResult, nil]
def detect_windows_codepage(bytes)
  c1_bytes = []
  high_total = 0
  cyrillic_total = 0

  # One pass gathers everything the three checks need.
  bytes.each_byte do |byte|
    high_total += 1 if byte >= 0x80
    c1_bytes << byte if byte.between?(0x80, 0x9F)
    cyrillic_total += 1 if CP1251_RANGE.cover?(byte)
  end
  return nil if c1_bytes.empty?

  # high_total >= c1_bytes.size > 0 here, so the ratio is well defined.
  return DetectionResult.new(Encoding::Windows_1251, 0.65) if cyrillic_total.fdiv(high_total) > 0.6

  marker_hits = c1_bytes.count { |byte| CP1250_MARKERS.include?(byte) }
  return DetectionResult.new(Encoding::Windows_1250, 0.6) if marker_hits >= 2

  cp1252_hits = c1_bytes.count { |byte| CP1252_SPECIFIC.include?(byte) }
  return nil unless cp1252_hits.positive?

  DetectionResult.new(Encoding::Windows_1252, cp1252_hits > 3 ? 0.7 : 0.6)
end
|
|
149
|
+
|
|
67
150
|
# Simple heuristic: if every byte is in the ISO-8859-1 printable range
|
|
68
151
|
# (0x20..0x7E or 0xA0..0xFF) or is a common control character, treat as Latin-1.
|
|
69
152
|
#
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require_relative 'encoding_kit/version'
|
|
4
|
+
require_relative 'encoding_kit/detection_result'
|
|
4
5
|
require_relative 'encoding_kit/detector'
|
|
5
6
|
require_relative 'encoding_kit/converter'
|
|
6
7
|
|
|
@@ -12,13 +13,79 @@ module Philiprehberger
|
|
|
12
13
|
BOMS = Detector::BOMS
|
|
13
14
|
|
|
14
15
|
# Detect the encoding of a string via BOM and heuristics.
|
|
16
|
+
# Returns a DetectionResult that delegates to the underlying Encoding,
|
|
17
|
+
# so it can be compared directly (e.g., result == Encoding::UTF_8)
|
|
18
|
+
# while also providing a confidence score via result.confidence.
|
|
15
19
|
#
|
|
16
20
|
# @param string [String] the input string
|
|
17
|
-
# @return [
|
|
21
|
+
# @return [DetectionResult] the detected encoding with confidence score
|
|
18
22
|
# Public entry point for detection; simply forwards to Detector.call and
# returns whatever it produces.
def self.detect(string)
  Detector.call(string)
end
|
|
21
25
|
|
|
26
|
+
# Detect encoding from an IO stream by reading a sample of bytes.
#
# For seekable streams the read position is put back where it was, even if
# reading the sample raises. Non-seekable streams (pipes, sockets, TTYs)
# are supported too: their position cannot be queried, so the sampled bytes
# are simply consumed.
#
# @param io [IO, StringIO] the IO object to read from
# @param sample_size [Integer] number of bytes to sample (default: 4096)
# @return [DetectionResult] the detected encoding with confidence score;
#   BINARY with confidence 0.5 when the stream yields no bytes
def self.detect_stream(io, sample_size: 4096)
  original_pos =
    begin
      io.pos if io.respond_to?(:pos)
    rescue SystemCallError
      # Every IO responds to #pos, but it raises (e.g. Errno::ESPIPE) on
      # streams that cannot seek; there is nothing to restore in that case.
      nil
    end

  sample =
    begin
      io.read(sample_size)
    ensure
      io.seek(original_pos) if original_pos && io.respond_to?(:seek)
    end

  return DetectionResult.new(Encoding::BINARY, 0.5) if sample.nil? || sample.empty?

  Detector.call(sample)
end
|
|
44
|
+
|
|
45
|
+
# Summarise the byte make-up of a string together with its detected
# encoding and the alternative candidates.
#
# Ratios are relative to the total byte count and rounded to four decimals.
# "Printable" means ASCII 0x20-0x7E plus tab, line feed and carriage return.
# Empty input short-circuits to a fixed BINARY / 0.5 result with zeroed
# statistics.
#
# @param string [String] the input string
# @return [Hash] analysis results with keys :encoding, :confidence,
#   :printable_ratio, :ascii_ratio, :high_bytes, :candidates
def self.analyze(string)
  raw = string.b
  size = raw.bytesize

  if size.zero?
    return {
      encoding: Encoding::BINARY,
      confidence: 0.5,
      printable_ratio: 0.0,
      ascii_ratio: 0.0,
      high_bytes: 0,
      candidates: [{ encoding: Encoding::BINARY, confidence: 0.5 }]
    }
  end

  codes = raw.bytes
  ascii_total = codes.count { |code| code < 0x80 }
  printable_total = codes.count do |code|
    case code
    when 0x20..0x7E, 0x09, 0x0A, 0x0D then true
    else false
    end
  end

  detection = Detector.call(raw)
  ranked = build_candidates(raw, detection)

  {
    encoding: detection.encoding,
    confidence: detection.confidence,
    printable_ratio: printable_total.fdiv(size).round(4),
    ascii_ratio: ascii_total.fdiv(size).round(4),
    # Every byte is either ASCII (< 0x80) or high, so the rest are high bytes.
    high_bytes: size - ascii_total,
    candidates: ranked
  }
end
|
|
88
|
+
|
|
22
89
|
# Convert a string to UTF-8, auto-detecting source encoding if not specified.
|
|
23
90
|
#
|
|
24
91
|
# @param string [String] the input string
|
|
@@ -61,6 +128,23 @@ module Philiprehberger
|
|
|
61
128
|
Converter.convert(string, from: from, to: to)
|
|
62
129
|
end
|
|
63
130
|
|
|
131
|
+
# Convert a string to +to+, using the detector to work out the source
# encoding. Shorthand for "detect, then convert".
#
# @param string [String] the input string
# @param to [String, Encoding] target encoding (default: UTF-8); anything
#   that is not an Encoding is looked up by name via Encoding.find
# @param fallback [Symbol] fallback strategy, passed through to Converter.convert
# @param replace [String] replacement character, passed through to Converter.convert
# @return [String] whatever Converter.convert returns for the conversion
def self.transcode(string, to: Encoding::UTF_8, fallback: :replace, replace: '?')
  source_encoding = Detector.call(string).encoding

  target_encoding =
    case to
    when Encoding then to
    else Encoding.find(to.to_s)
    end

  Converter.convert(
    string,
    from: source_encoding,
    to: target_encoding,
    fallback: fallback,
    replace: replace
  )
end
|
|
147
|
+
|
|
64
148
|
# Remove a byte order mark from the beginning of a string.
|
|
65
149
|
#
|
|
66
150
|
# @param string [String] the input string
|
|
@@ -84,5 +168,41 @@ module Philiprehberger
|
|
|
84
168
|
bytes = string.b
|
|
85
169
|
BOMS.any? { |bom, _encoding| bytes.start_with?(bom) }
|
|
86
170
|
end
|
|
171
|
+
|
|
172
|
+
# Build the list of plausible encodings for +bytes+.
#
# The primary detection result always comes first in the unsorted list,
# followed by each alternative whose precondition holds and whose encoding
# differs from the primary one:
#
# * UTF-8 (0.6)        - bytes form valid UTF-8
# * US-ASCII (0.5)     - no byte >= 0x80
# * ISO-8859-1 (0.5)   - at least one byte >= 0x80
# * Windows-1252 (0.5) - at least one byte in 0x80-0x9F
#
# The list is then sorted by confidence, highest first. The original index
# is used as a tie-breaker because sort_by alone does not guarantee the
# relative order of equal keys; with it, equally confident candidates keep
# the order listed above and the primary result stays ahead of its ties.
#
# @param bytes [String] binary string
# @param primary [DetectionResult] the primary detection result
# @return [Array<Hash>] candidates sorted by confidence (descending)
private_class_method def self.build_candidates(bytes, primary)
  detected = primary.encoding
  non_ascii = bytes.each_byte.any? { |byte| byte >= 0x80 }
  c1_present = bytes.each_byte.any? { |byte| byte.between?(0x80, 0x9F) }

  alternatives = []
  alternatives << [Encoding::UTF_8, 0.6] if bytes.dup.force_encoding(Encoding::UTF_8).valid_encoding?
  alternatives << [Encoding::US_ASCII, 0.5] unless non_ascii
  alternatives << [Encoding::ISO_8859_1, 0.5] if non_ascii
  alternatives << [Encoding::Windows_1252, 0.5] if c1_present

  candidates = [{ encoding: detected, confidence: primary.confidence }]
  alternatives.each do |encoding, confidence|
    candidates << { encoding: encoding, confidence: confidence } unless encoding == detected
  end

  candidates
    .each_with_index
    .sort_by { |candidate, index| [-candidate[:confidence], index] }
    .map(&:first)
end
|
|
87
207
|
end
|
|
88
208
|
end
|
metadata
CHANGED
|
@@ -1,17 +1,18 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: philiprehberger-encoding_kit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.1
|
|
4
|
+
version: 0.2.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Philip Rehberger
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-03-
|
|
11
|
+
date: 2026-03-31 00:00:00.000000000 Z
|
|
12
12
|
dependencies: []
|
|
13
|
-
description: Detect encoding from BOM and heuristics
|
|
14
|
-
to UTF-8,
|
|
13
|
+
description: Detect encoding from BOM and heuristics with confidence scores, convert
|
|
14
|
+
between encodings, normalize to UTF-8, analyze byte distributions, and handle Windows
|
|
15
|
+
codepages. Zero dependencies.
|
|
15
16
|
email:
|
|
16
17
|
- me@philiprehberger.com
|
|
17
18
|
executables: []
|
|
@@ -23,6 +24,7 @@ files:
|
|
|
23
24
|
- README.md
|
|
24
25
|
- lib/philiprehberger/encoding_kit.rb
|
|
25
26
|
- lib/philiprehberger/encoding_kit/converter.rb
|
|
27
|
+
- lib/philiprehberger/encoding_kit/detection_result.rb
|
|
26
28
|
- lib/philiprehberger/encoding_kit/detector.rb
|
|
27
29
|
- lib/philiprehberger/encoding_kit/version.rb
|
|
28
30
|
homepage: https://github.com/philiprehberger/rb-encoding-kit
|