philiprehberger-encoding_kit 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a328360d24956ba2fbcbc7888da4ddfb6dbe1bfac53aa65c6d7431ea6f0c1a16
4
- data.tar.gz: f82f1a192f75e1ed0bf7ac386bb17cc4f259b4847af1a83cd5356ed63a2b675c
3
+ metadata.gz: 135376935f830bcfabb9c706853c5a45d14b270d47cab9d5e54f1ddcfae5f1d4
4
+ data.tar.gz: 9b06d60d1fd8c3cea4c5197ecb0e26ababe21be31abd6ca92a029978039bc34a
5
5
  SHA512:
6
- metadata.gz: 91b0e02d2d301db41bdc4eda1d3298db4fb517e0ec2bce4c4f79deae7787947db55eb0fdbe12557e9cf317b6b1d000c42ffb02b6901774afa531185b7ae46f1f
7
- data.tar.gz: bbab15360e72374f5c9a540c3c8b0c54ae4b673f3bee24ff31ea47ebf2daa29c982564b69069a4f2ead487d5afbefdf606a04fc701c1d6bffd3c75038fd17026
6
+ metadata.gz: 4e3f14286a3ee38a666a246bc513c972c7483b9b3f77f7ce9e2bfc7324d13eb5cd8a4dc1c3fd3b2d524219b5173679ccfcb5bc510f1f9cedf8eb1f865210b1aa
7
+ data.tar.gz: a71393c68877452b9005b69415b321ddc2287c57d87de20ec55abb6f57f242dcf64bebe0412e62d9b984371a5682c797c0c58614bcc629ba485570399abf496b
data/CHANGELOG.md CHANGED
@@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.4.0] - 2026-04-20
11
+
12
+ ### Added
13
+ - `guess_from_filename(path)` — extract an encoding hint from filename extensions (`.utf8`, `.utf-16`, `.latin1`, `.cp1252`, `.sjis`, etc.). Returns `nil` when no hint is present so callers can fall back to byte-based detection
14
+ - `FILENAME_ENCODING_HINTS` constant exposing the suffix → `Encoding` lookup table
15
+
10
16
  ## [0.3.0] - 2026-04-11
11
17
 
12
18
  ### Added
data/README.md CHANGED
@@ -150,6 +150,11 @@ content = Philiprehberger::EncodingKit.read_as_utf8("latin1.txt", from: Encoding
150
150
 
151
151
  # Check if a file's encoding is valid
152
152
  Philiprehberger::EncodingKit.file_valid?("data.csv", encoding: Encoding::UTF_8) # => true
153
+
154
+ # Guess encoding from a filename hint without reading the bytes
155
+ Philiprehberger::EncodingKit.guess_from_filename("data.utf8.csv") # => Encoding::UTF_8
156
+ Philiprehberger::EncodingKit.guess_from_filename("legacy.latin1.txt") # => Encoding::ISO_8859_1
157
+ Philiprehberger::EncodingKit.guess_from_filename("report.csv") # => nil
153
158
  ```
154
159
 
155
160
  ### Validity Check
@@ -179,6 +184,7 @@ Philiprehberger::EncodingKit.valid?("hello", encoding: Encoding::US_ASCII) # =>
179
184
  | `EncodingKit.detect_file(path, sample_size: 4096)` | Detect encoding of a file by reading a byte sample |
180
185
  | `EncodingKit.read_as_utf8(path, from: nil)` | Read a file and return its content as UTF-8 |
181
186
  | `EncodingKit.file_valid?(path, encoding: nil)` | Check if a file's content is valid in the given encoding |
187
+ | `EncodingKit.guess_from_filename(path)` | Guess the `Encoding` from filename suffixes (e.g. `.utf8`, `.latin1`); returns `nil` when no hint matches |
182
188
 
183
189
  ## Development
184
190
 
@@ -2,6 +2,6 @@
2
2
 
3
3
  module Philiprehberger
4
4
  module EncodingKit
5
- VERSION = '0.3.0'
5
+ VERSION = '0.4.0'
6
6
  end
7
7
  end
@@ -201,6 +201,62 @@ module Philiprehberger
201
201
  valid?(raw, encoding: encoding)
202
202
  end
203
203
 
204
+ # Filename suffix / extension hints that imply a specific encoding.
205
+ # Keys are lowercase; matched against the final three dot-separated tokens of the lowercased basename.
206
+ FILENAME_ENCODING_HINTS = {
207
+ 'utf8' => Encoding::UTF_8,
208
+ 'utf-8' => Encoding::UTF_8,
209
+ 'utf16' => Encoding::UTF_16,
210
+ 'utf-16' => Encoding::UTF_16,
211
+ 'utf16le' => Encoding::UTF_16LE,
212
+ 'utf-16le' => Encoding::UTF_16LE,
213
+ 'utf16be' => Encoding::UTF_16BE,
214
+ 'utf-16be' => Encoding::UTF_16BE,
215
+ 'utf32' => Encoding::UTF_32,
216
+ 'utf-32' => Encoding::UTF_32,
217
+ 'ascii' => Encoding::US_ASCII,
218
+ 'us-ascii' => Encoding::US_ASCII,
219
+ 'latin1' => Encoding::ISO_8859_1,
220
+ 'latin-1' => Encoding::ISO_8859_1,
221
+ 'iso88591' => Encoding::ISO_8859_1,
222
+ 'iso-8859-1' => Encoding::ISO_8859_1,
223
+ 'iso88592' => Encoding::ISO_8859_2,
224
+ 'iso-8859-2' => Encoding::ISO_8859_2,
225
+ 'cp1252' => Encoding::Windows_1252,
226
+ 'windows1252' => Encoding::Windows_1252,
227
+ 'windows-1252' => Encoding::Windows_1252,
228
+ 'sjis' => Encoding::Shift_JIS,
229
+ 'shiftjis' => Encoding::Shift_JIS,
230
+ 'shift-jis' => Encoding::Shift_JIS,
231
+ 'shift_jis' => Encoding::Shift_JIS,
232
+ 'euc-jp' => Encoding::EUC_JP,
233
+ 'eucjp' => Encoding::EUC_JP,
234
+ 'gbk' => Encoding::GBK,
235
+ 'gb2312' => Encoding::GB2312,
236
+ 'big5' => Encoding::Big5
237
+ }.freeze
238
+
239
+ # Guess the encoding based on filename suffixes/extensions alone.
240
+ # Useful when a file name carries an explicit encoding hint
241
+ # (e.g., "data.utf8.csv", "legacy.latin1.txt"). Falls back to nil
242
+ # when no hint can be extracted — callers should then use
243
+ # {.detect_file} to inspect the bytes.
244
+ #
245
+ # Matching is case-insensitive and considers the final three
246
+ # dot-separated tokens; the rightmost recognizable hint wins.
247
+ #
248
+ # @param filename [String] filename or path
249
+ # @return [Encoding, nil] detected encoding or nil when no hint matches
250
+ def self.guess_from_filename(filename)
251
+ name = File.basename(filename.to_s).downcase
252
+ tokens = name.split('.').last(3) # extension + up to two modifiers
253
+ tokens.reverse_each do |token|
254
+ enc = FILENAME_ENCODING_HINTS[token]
255
+ return enc if enc
256
+ end
257
+ nil
258
+ end
259
+
204
260
  # Build a list of encoding candidates with confidence scores.
205
261
  #
206
262
  # @param bytes [String] binary string
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: philiprehberger-encoding_kit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Philip Rehberger
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2026-04-11 00:00:00.000000000 Z
11
+ date: 2026-04-20 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Detect encoding from BOM and heuristics with confidence scores, convert
14
14
  between encodings, normalize to UTF-8, analyze byte distributions, and handle Windows