philiprehberger-encoding_kit 0.2.1 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/README.md +30 -0
- data/lib/philiprehberger/encoding_kit/version.rb +1 -1
- data/lib/philiprehberger/encoding_kit.rb +88 -0
- metadata +4 -4
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 135376935f830bcfabb9c706853c5a45d14b270d47cab9d5e54f1ddcfae5f1d4
|
|
4
|
+
data.tar.gz: 9b06d60d1fd8c3cea4c5197ecb0e26ababe21be31abd6ca92a029978039bc34a
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 4e3f14286a3ee38a666a246bc513c972c7483b9b3f77f7ce9e2bfc7324d13eb5cd8a4dc1c3fd3b2d524219b5173679ccfcb5bc510f1f9cedf8eb1f865210b1aa
|
|
7
|
+
data.tar.gz: a71393c68877452b9005b69415b321ddc2287c57d87de20ec55abb6f57f242dcf64bebe0412e62d9b984371a5682c797c0c58614bcc629ba485570399abf496b
|
data/CHANGELOG.md
CHANGED
|
@@ -7,6 +7,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [0.4.0] - 2026-04-20
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
- `guess_from_filename(path)` — extract an encoding hint from filename extensions (`.utf8`, `.utf-16`, `.latin1`, `.cp1252`, `.sjis`, etc.). Returns `nil` when no hint is present so callers can fall back to byte-based detection
|
|
14
|
+
- `FILENAME_ENCODING_HINTS` constant exposing the suffix → `Encoding` lookup table
|
|
15
|
+
|
|
16
|
+
## [0.3.0] - 2026-04-11
|
|
17
|
+
|
|
18
|
+
### Added
|
|
19
|
+
- `detect_file(path, sample_size:)` for file-based encoding detection
|
|
20
|
+
- `read_as_utf8(path, from:)` to read files directly as UTF-8
|
|
21
|
+
- `file_valid?(path, encoding:)` to check file encoding validity
|
|
22
|
+
|
|
10
23
|
## [0.2.1] - 2026-03-31
|
|
11
24
|
|
|
12
25
|
### Changed
|
data/README.md
CHANGED
|
@@ -131,6 +131,32 @@ Philiprehberger::EncodingKit.bom?("\xEF\xBB\xBFhello") # => true
|
|
|
131
131
|
Philiprehberger::EncodingKit.strip_bom("\xEF\xBB\xBFhello") # => "hello"
|
|
132
132
|
```
|
|
133
133
|
|
|
134
|
+
### File Operations
|
|
135
|
+
|
|
136
|
+
```ruby
|
|
137
|
+
require "philiprehberger/encoding_kit"
|
|
138
|
+
|
|
139
|
+
# Detect a file's encoding
|
|
140
|
+
result = Philiprehberger::EncodingKit.detect_file("data.csv")
|
|
141
|
+
result.encoding # => Encoding::UTF_8
|
|
142
|
+
result.confidence # => 0.9
|
|
143
|
+
|
|
144
|
+
# Read a file as UTF-8 (auto-detects source encoding)
|
|
145
|
+
content = Philiprehberger::EncodingKit.read_as_utf8("legacy.txt")
|
|
146
|
+
content.encoding # => Encoding::UTF_8
|
|
147
|
+
|
|
148
|
+
# Read with explicit source encoding
|
|
149
|
+
content = Philiprehberger::EncodingKit.read_as_utf8("latin1.txt", from: Encoding::ISO_8859_1)
|
|
150
|
+
|
|
151
|
+
# Check if a file's encoding is valid
|
|
152
|
+
Philiprehberger::EncodingKit.file_valid?("data.csv", encoding: Encoding::UTF_8) # => true
|
|
153
|
+
|
|
154
|
+
# Guess encoding from a filename hint without reading the bytes
|
|
155
|
+
Philiprehberger::EncodingKit.guess_from_filename("data.utf8.csv") # => Encoding::UTF_8
|
|
156
|
+
Philiprehberger::EncodingKit.guess_from_filename("legacy.latin1.txt") # => Encoding::ISO_8859_1
|
|
157
|
+
Philiprehberger::EncodingKit.guess_from_filename("report.csv") # => nil
|
|
158
|
+
```
|
|
159
|
+
|
|
134
160
|
### Validity Check
|
|
135
161
|
|
|
136
162
|
```ruby
|
|
@@ -155,6 +181,10 @@ Philiprehberger::EncodingKit.valid?("hello", encoding: Encoding::US_ASCII) # =>
|
|
|
155
181
|
| `EncodingKit.convert(string, from:, to:)` | Convert between arbitrary encodings |
|
|
156
182
|
| `EncodingKit.strip_bom(string)` | Remove byte order mark if present |
|
|
157
183
|
| `EncodingKit.bom?(string)` | Check if string starts with a BOM |
|
|
184
|
+
| `EncodingKit.detect_file(path, sample_size: 4096)` | Detect encoding of a file by reading a byte sample |
|
|
185
|
+
| `EncodingKit.read_as_utf8(path, from: nil)` | Read a file and return its content as UTF-8 |
|
|
186
|
+
| `EncodingKit.file_valid?(path, encoding: nil)` | Check if a file's content is valid in the given encoding |
|
|
187
|
+
| `EncodingKit.guess_from_filename(path)` | Guess `Encoding` from filename suffixes (e.g. `.utf8`, `.latin1`), `nil` if unknown |
|
|
158
188
|
|
|
159
189
|
## Development
|
|
160
190
|
|
|
data/lib/philiprehberger/encoding_kit.rb
CHANGED
|
@@ -169,6 +169,94 @@ module Philiprehberger
|
|
|
169
169
|
BOMS.any? { |bom, _encoding| bytes.start_with?(bom) }
|
|
170
170
|
end
|
|
171
171
|
|
|
172
|
+
# Detect the encoding of a file by reading a byte sample.
|
|
173
|
+
#
|
|
174
|
+
# @param path [String] path to the file
|
|
175
|
+
# @param sample_size [Integer] number of bytes to sample (default: 4096)
|
|
176
|
+
# @return [DetectionResult] the detected encoding with confidence score
|
|
177
|
+
def self.detect_file(path, sample_size: 4096)
|
|
178
|
+
File.open(path, 'rb') do |file|
|
|
179
|
+
detect_stream(file, sample_size: sample_size)
|
|
180
|
+
end
|
|
181
|
+
end
|
|
182
|
+
|
|
183
|
+
# Read a file and return its content as UTF-8.
|
|
184
|
+
# Auto-detects the source encoding unless specified via `from:`.
|
|
185
|
+
#
|
|
186
|
+
# @param path [String] path to the file
|
|
187
|
+
# @param from [String, Encoding, nil] source encoding (auto-detect if nil)
|
|
188
|
+
# @return [String] UTF-8 encoded file content
|
|
189
|
+
def self.read_as_utf8(path, from: nil)
|
|
190
|
+
raw = File.binread(path)
|
|
191
|
+
to_utf8(raw, from: from)
|
|
192
|
+
end
|
|
193
|
+
|
|
194
|
+
# Check if a file's content is valid in the detected or specified encoding.
|
|
195
|
+
#
|
|
196
|
+
# @param path [String] path to the file
|
|
197
|
+
# @param encoding [String, Encoding, nil] encoding to check against (auto-detect if nil)
|
|
198
|
+
# @return [Boolean]
|
|
199
|
+
def self.file_valid?(path, encoding: nil)
|
|
200
|
+
raw = File.binread(path)
|
|
201
|
+
valid?(raw, encoding: encoding)
|
|
202
|
+
end
|
|
203
|
+
|
|
204
|
+
# Filename suffix / extension hints that imply a specific encoding.
|
|
205
|
+
# Matched against the last three dot-separated tokens of the file's basename.
|
|
206
|
+
FILENAME_ENCODING_HINTS = {
|
|
207
|
+
'utf8' => Encoding::UTF_8,
|
|
208
|
+
'utf-8' => Encoding::UTF_8,
|
|
209
|
+
'utf16' => Encoding::UTF_16,
|
|
210
|
+
'utf-16' => Encoding::UTF_16,
|
|
211
|
+
'utf16le' => Encoding::UTF_16LE,
|
|
212
|
+
'utf-16le' => Encoding::UTF_16LE,
|
|
213
|
+
'utf16be' => Encoding::UTF_16BE,
|
|
214
|
+
'utf-16be' => Encoding::UTF_16BE,
|
|
215
|
+
'utf32' => Encoding::UTF_32,
|
|
216
|
+
'utf-32' => Encoding::UTF_32,
|
|
217
|
+
'ascii' => Encoding::US_ASCII,
|
|
218
|
+
'us-ascii' => Encoding::US_ASCII,
|
|
219
|
+
'latin1' => Encoding::ISO_8859_1,
|
|
220
|
+
'latin-1' => Encoding::ISO_8859_1,
|
|
221
|
+
'iso88591' => Encoding::ISO_8859_1,
|
|
222
|
+
'iso-8859-1' => Encoding::ISO_8859_1,
|
|
223
|
+
'iso88592' => Encoding::ISO_8859_2,
|
|
224
|
+
'iso-8859-2' => Encoding::ISO_8859_2,
|
|
225
|
+
'cp1252' => Encoding::Windows_1252,
|
|
226
|
+
'windows1252' => Encoding::Windows_1252,
|
|
227
|
+
'windows-1252' => Encoding::Windows_1252,
|
|
228
|
+
'sjis' => Encoding::Shift_JIS,
|
|
229
|
+
'shiftjis' => Encoding::Shift_JIS,
|
|
230
|
+
'shift-jis' => Encoding::Shift_JIS,
|
|
231
|
+
'shift_jis' => Encoding::Shift_JIS,
|
|
232
|
+
'euc-jp' => Encoding::EUC_JP,
|
|
233
|
+
'eucjp' => Encoding::EUC_JP,
|
|
234
|
+
'gbk' => Encoding::GBK,
|
|
235
|
+
'gb2312' => Encoding::GB2312,
|
|
236
|
+
'big5' => Encoding::Big5
|
|
237
|
+
}.freeze
|
|
238
|
+
|
|
239
|
+
# Guess the encoding based on filename suffixes/extensions alone.
|
|
240
|
+
# Useful when a file name carries an explicit encoding hint
|
|
241
|
+
# (e.g., "data.utf8.csv", "legacy.latin1.txt"). Falls back to nil
|
|
242
|
+
# when no hint can be extracted — callers should then use
|
|
243
|
+
# {.detect_file} to inspect the bytes.
|
|
244
|
+
#
|
|
245
|
+
# Matching is case-insensitive and considers the final three
|
|
246
|
+
# dot-separated tokens of the basename; the rightmost recognizable hint wins.
|
|
247
|
+
#
|
|
248
|
+
# @param filename [String] filename or path
|
|
249
|
+
# @return [Encoding, nil] detected encoding or nil when no hint matches
|
|
250
|
+
def self.guess_from_filename(filename)
|
|
251
|
+
name = File.basename(filename.to_s).downcase
|
|
252
|
+
tokens = name.split('.').last(3) # extension + up to two modifiers
|
|
253
|
+
tokens.reverse_each do |token|
|
|
254
|
+
enc = FILENAME_ENCODING_HINTS[token]
|
|
255
|
+
return enc if enc
|
|
256
|
+
end
|
|
257
|
+
nil
|
|
258
|
+
end
|
|
259
|
+
|
|
172
260
|
# Build a list of encoding candidates with confidence scores.
|
|
173
261
|
#
|
|
174
262
|
# @param bytes [String] binary string
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: philiprehberger-encoding_kit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.2.1
|
|
4
|
+
version: 0.4.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Philip Rehberger
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-
|
|
11
|
+
date: 2026-04-20 00:00:00.000000000 Z
|
|
12
12
|
dependencies: []
|
|
13
13
|
description: Detect encoding from BOM and heuristics with confidence scores, convert
|
|
14
14
|
between encodings, normalize to UTF-8, analyze byte distributions, and handle Windows
|
|
@@ -27,11 +27,11 @@ files:
|
|
|
27
27
|
- lib/philiprehberger/encoding_kit/detection_result.rb
|
|
28
28
|
- lib/philiprehberger/encoding_kit/detector.rb
|
|
29
29
|
- lib/philiprehberger/encoding_kit/version.rb
|
|
30
|
-
homepage: https://
|
|
30
|
+
homepage: https://philiprehberger.com/open-source-packages/ruby/philiprehberger-encoding_kit
|
|
31
31
|
licenses:
|
|
32
32
|
- MIT
|
|
33
33
|
metadata:
|
|
34
|
-
homepage_uri: https://
|
|
34
|
+
homepage_uri: https://philiprehberger.com/open-source-packages/ruby/philiprehberger-encoding_kit
|
|
35
35
|
source_code_uri: https://github.com/philiprehberger/rb-encoding-kit
|
|
36
36
|
changelog_uri: https://github.com/philiprehberger/rb-encoding-kit/blob/main/CHANGELOG.md
|
|
37
37
|
bug_tracker_uri: https://github.com/philiprehberger/rb-encoding-kit/issues
|