philiprehberger-encoding_kit 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/README.md +6 -0
- data/lib/philiprehberger/encoding_kit/version.rb +1 -1
- data/lib/philiprehberger/encoding_kit.rb +56 -0
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 135376935f830bcfabb9c706853c5a45d14b270d47cab9d5e54f1ddcfae5f1d4
|
|
4
|
+
data.tar.gz: 9b06d60d1fd8c3cea4c5197ecb0e26ababe21be31abd6ca92a029978039bc34a
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 4e3f14286a3ee38a666a246bc513c972c7483b9b3f77f7ce9e2bfc7324d13eb5cd8a4dc1c3fd3b2d524219b5173679ccfcb5bc510f1f9cedf8eb1f865210b1aa
|
|
7
|
+
data.tar.gz: a71393c68877452b9005b69415b321ddc2287c57d87de20ec55abb6f57f242dcf64bebe0412e62d9b984371a5682c797c0c58614bcc629ba485570399abf496b
|
data/CHANGELOG.md
CHANGED
|
@@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [0.4.0] - 2026-04-20
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
- `guess_from_filename(path)` — extract an encoding hint from filename extensions (`.utf8`, `.utf-16`, `.latin1`, `.cp1252`, `.sjis`, etc.). Returns `nil` when no hint is present so callers can fall back to byte-based detection
|
|
14
|
+
- `FILENAME_ENCODING_HINTS` constant exposing the suffix → `Encoding` lookup table
|
|
15
|
+
|
|
10
16
|
## [0.3.0] - 2026-04-11
|
|
11
17
|
|
|
12
18
|
### Added
|
data/README.md
CHANGED
|
@@ -150,6 +150,11 @@ content = Philiprehberger::EncodingKit.read_as_utf8("latin1.txt", from: Encoding
|
|
|
150
150
|
|
|
151
151
|
# Check if a file's encoding is valid
|
|
152
152
|
Philiprehberger::EncodingKit.file_valid?("data.csv", encoding: Encoding::UTF_8) # => true
|
|
153
|
+
|
|
154
|
+
# Guess encoding from a filename hint without reading the bytes
|
|
155
|
+
Philiprehberger::EncodingKit.guess_from_filename("data.utf8.csv") # => Encoding::UTF_8
|
|
156
|
+
Philiprehberger::EncodingKit.guess_from_filename("legacy.latin1.txt") # => Encoding::ISO_8859_1
|
|
157
|
+
Philiprehberger::EncodingKit.guess_from_filename("report.csv") # => nil
|
|
153
158
|
```
|
|
154
159
|
|
|
155
160
|
### Validity Check
|
|
@@ -179,6 +184,7 @@ Philiprehberger::EncodingKit.valid?("hello", encoding: Encoding::US_ASCII) # =>
|
|
|
179
184
|
| `EncodingKit.detect_file(path, sample_size: 4096)` | Detect encoding of a file by reading a byte sample |
|
|
180
185
|
| `EncodingKit.read_as_utf8(path, from: nil)` | Read a file and return its content as UTF-8 |
|
|
181
186
|
| `EncodingKit.file_valid?(path, encoding: nil)` | Check if a file's content is valid in the given encoding |
|
|
187
|
+
| `EncodingKit.guess_from_filename(path)` | Guess `Encoding` from filename suffixes (e.g. `.utf8`, `.latin1`), `nil` if unknown |
|
|
182
188
|
|
|
183
189
|
## Development
|
|
184
190
|
|
|
@@ -201,6 +201,62 @@ module Philiprehberger
|
|
|
201
201
|
valid?(raw, encoding: encoding)
|
|
202
202
|
end
|
|
203
203
|
|
|
204
|
+
# Filename suffix / extension hints that imply a specific encoding.
# Keys are lowercase dot-separated filename tokens (without the dot);
# values are the Encoding each token stands for. guess_from_filename
# looks up the last three dot-separated tokens of a basename here.
FILENAME_ENCODING_HINTS = {
  # Unicode
  'utf8' => Encoding::UTF_8,
  'utf-8' => Encoding::UTF_8,
  'utf16' => Encoding::UTF_16,
  'utf-16' => Encoding::UTF_16,
  'utf16le' => Encoding::UTF_16LE,
  'utf-16le' => Encoding::UTF_16LE,
  'utf16be' => Encoding::UTF_16BE,
  'utf-16be' => Encoding::UTF_16BE,
  'utf32' => Encoding::UTF_32,
  'utf-32' => Encoding::UTF_32,
  # ASCII and Western single-byte encodings
  'ascii' => Encoding::US_ASCII,
  'us-ascii' => Encoding::US_ASCII,
  'latin1' => Encoding::ISO_8859_1,
  'latin-1' => Encoding::ISO_8859_1,
  'iso88591' => Encoding::ISO_8859_1,
  'iso-8859-1' => Encoding::ISO_8859_1,
  'iso88592' => Encoding::ISO_8859_2,
  'iso-8859-2' => Encoding::ISO_8859_2,
  'cp1252' => Encoding::Windows_1252,
  'windows1252' => Encoding::Windows_1252,
  'windows-1252' => Encoding::Windows_1252,
  # Japanese
  'sjis' => Encoding::Shift_JIS,
  'shiftjis' => Encoding::Shift_JIS,
  'shift-jis' => Encoding::Shift_JIS,
  'shift_jis' => Encoding::Shift_JIS,
  'euc-jp' => Encoding::EUC_JP,
  'eucjp' => Encoding::EUC_JP,
  # Chinese
  'gbk' => Encoding::GBK,
  'gb2312' => Encoding::GB2312,
  'big5' => Encoding::Big5
}.freeze
|
|
238
|
+
|
|
239
|
+
# Guess the encoding from hints embedded in a filename, without reading
# any bytes (e.g., "data.utf8.csv", "legacy.latin1.txt").
#
# Only the basename is inspected; directory components are ignored. The
# basename is lowercased, split on ".", and the last three resulting
# tokens (the extension plus up to two modifiers) are looked up in
# FILENAME_ENCODING_HINTS. When several tokens match, the rightmost wins.
#
# NOTE(review): for names with fewer than three dots the stem itself is
# one of the candidates, so "big5.csv" resolves to Encoding::Big5 —
# confirm this is intended.
#
# Returns nil when no token matches — callers should then use
# {.detect_file} to inspect the bytes.
#
# @param filename [String, #to_s] filename or path
# @return [Encoding, nil] hinted encoding or nil when no hint matches
def self.guess_from_filename(filename)
  # extension + up to two modifiers
  candidates = File.basename(filename.to_s).downcase.split('.').last(3)

  # Scan right-to-left so the rightmost recognizable hint wins.
  hint = candidates.reverse_each.find { |token| FILENAME_ENCODING_HINTS.key?(token) }
  hint && FILENAME_ENCODING_HINTS[hint]
end
|
|
259
|
+
|
|
204
260
|
# Build a list of encoding candidates with confidence scores.
|
|
205
261
|
#
|
|
206
262
|
# @param bytes [String] binary string
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: philiprehberger-encoding_kit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.3.0
|
|
4
|
+
version: 0.4.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Philip Rehberger
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-04-11 00:00:00.000000000 Z
|
|
11
|
+
date: 2026-04-20 00:00:00.000000000 Z
|
|
12
12
|
dependencies: []
|
|
13
13
|
description: Detect encoding from BOM and heuristics with confidence scores, convert
|
|
14
14
|
between encodings, normalize to UTF-8, analyze byte distributions, and handle Windows
|