philiprehberger-encoding_kit 0.2.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 13e2132a0833297e1443ae3c9fc280b2c75ca43c9092e2b6d4d1431981ab90af
4
- data.tar.gz: 86091aa48e10920818add35a6e9c147095495096f284d8e00a93d6a3f0ab0596
3
+ metadata.gz: 135376935f830bcfabb9c706853c5a45d14b270d47cab9d5e54f1ddcfae5f1d4
4
+ data.tar.gz: 9b06d60d1fd8c3cea4c5197ecb0e26ababe21be31abd6ca92a029978039bc34a
5
5
  SHA512:
6
- metadata.gz: 55423bdb247b979775199f4ff83043c5bfebff3963ae8f5c0256ca86482e2fd6ded64fe54aded1f06864c387e5a852129baa5a7e8ffb35f47e0dd756e5750b96
7
- data.tar.gz: eb8ca82fa4fae2670b5381dca826e95bc7dd281811c97d9488080d2a7666771cc927f16ee8715a05a07418d1b0f4da6096e47f3dcb09bb419e9816c8e284a807
6
+ metadata.gz: 4e3f14286a3ee38a666a246bc513c972c7483b9b3f77f7ce9e2bfc7324d13eb5cd8a4dc1c3fd3b2d524219b5173679ccfcb5bc510f1f9cedf8eb1f865210b1aa
7
+ data.tar.gz: a71393c68877452b9005b69415b321ddc2287c57d87de20ec55abb6f57f242dcf64bebe0412e62d9b984371a5682c797c0c58614bcc629ba485570399abf496b
data/CHANGELOG.md CHANGED
@@ -7,6 +7,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.4.0] - 2026-04-20
11
+
12
+ ### Added
13
+ - `guess_from_filename(path)` — extract an encoding hint from filename extensions (`.utf8`, `.utf-16`, `.latin1`, `.cp1252`, `.sjis`, etc.). Returns `nil` when no hint is present so callers can fall back to byte-based detection
14
+ - `FILENAME_ENCODING_HINTS` constant exposing the suffix → `Encoding` lookup table
15
+
16
+ ## [0.3.0] - 2026-04-11
17
+
18
+ ### Added
19
+ - `detect_file(path, sample_size:)` for file-based encoding detection
20
+ - `read_as_utf8(path, from:)` to read files directly as UTF-8
21
+ - `file_valid?(path, encoding:)` to check file encoding validity
22
+
10
23
  ## [0.2.1] - 2026-03-31
11
24
 
12
25
  ### Changed
data/README.md CHANGED
@@ -131,6 +131,32 @@ Philiprehberger::EncodingKit.bom?("\xEF\xBB\xBFhello") # => true
131
131
  Philiprehberger::EncodingKit.strip_bom("\xEF\xBB\xBFhello") # => "hello"
132
132
  ```
133
133
 
134
+ ### File Operations
135
+
136
+ ```ruby
137
+ require "philiprehberger/encoding_kit"
138
+
139
+ # Detect a file's encoding
140
+ result = Philiprehberger::EncodingKit.detect_file("data.csv")
141
+ result.encoding # => Encoding::UTF_8
142
+ result.confidence # => 0.9
143
+
144
+ # Read a file as UTF-8 (auto-detects source encoding)
145
+ content = Philiprehberger::EncodingKit.read_as_utf8("legacy.txt")
146
+ content.encoding # => Encoding::UTF_8
147
+
148
+ # Read with explicit source encoding
149
+ content = Philiprehberger::EncodingKit.read_as_utf8("latin1.txt", from: Encoding::ISO_8859_1)
150
+
151
+ # Check if a file's encoding is valid
152
+ Philiprehberger::EncodingKit.file_valid?("data.csv", encoding: Encoding::UTF_8) # => true
153
+
154
+ # Guess encoding from a filename hint without reading the bytes
155
+ Philiprehberger::EncodingKit.guess_from_filename("data.utf8.csv") # => Encoding::UTF_8
156
+ Philiprehberger::EncodingKit.guess_from_filename("legacy.latin1.txt") # => Encoding::ISO_8859_1
157
+ Philiprehberger::EncodingKit.guess_from_filename("report.csv") # => nil
158
+ ```
159
+
134
160
  ### Validity Check
135
161
 
136
162
  ```ruby
@@ -155,6 +181,10 @@ Philiprehberger::EncodingKit.valid?("hello", encoding: Encoding::US_ASCII) # =>
155
181
  | `EncodingKit.convert(string, from:, to:)` | Convert between arbitrary encodings |
156
182
  | `EncodingKit.strip_bom(string)` | Remove byte order mark if present |
157
183
  | `EncodingKit.bom?(string)` | Check if string starts with a BOM |
184
+ | `EncodingKit.detect_file(path, sample_size: 4096)` | Detect encoding of a file by reading a byte sample |
185
+ | `EncodingKit.read_as_utf8(path, from: nil)` | Read a file and return its content as UTF-8 |
186
+ | `EncodingKit.file_valid?(path, encoding: nil)` | Check if a file's content is valid in the given encoding |
187
+ | `EncodingKit.guess_from_filename(path)` | Guess `Encoding` from filename suffixes (e.g. `.utf8`, `.latin1`), `nil` if unknown |
158
188
 
159
189
  ## Development
160
190
 
@@ -2,6 +2,6 @@
2
2
 
3
3
  module Philiprehberger
4
4
  module EncodingKit
5
- VERSION = '0.2.1'
5
+ VERSION = '0.4.0'
6
6
  end
7
7
  end
@@ -169,6 +169,94 @@ module Philiprehberger
169
169
  BOMS.any? { |bom, _encoding| bytes.start_with?(bom) }
170
170
  end
171
171
 
172
+ # Detect the encoding of a file by reading a byte sample.
173
+ #
174
+ # @param path [String] path to the file
175
+ # @param sample_size [Integer] number of bytes to sample (default: 4096)
176
+ # @return [DetectionResult] the detected encoding with confidence score
177
+ def self.detect_file(path, sample_size: 4096)
178
+ File.open(path, 'rb') do |file|
179
+ detect_stream(file, sample_size: sample_size)
180
+ end
181
+ end
182
+
183
+ # Read a file and return its content as UTF-8.
184
+ # Auto-detects the source encoding unless specified via `from:`.
185
+ #
186
+ # @param path [String] path to the file
187
+ # @param from [String, Encoding, nil] source encoding (auto-detect if nil)
188
+ # @return [String] UTF-8 encoded file content
189
+ def self.read_as_utf8(path, from: nil)
190
+ raw = File.binread(path)
191
+ to_utf8(raw, from: from)
192
+ end
193
+
194
+ # Check if a file's content is valid in the detected or specified encoding.
195
+ #
196
+ # @param path [String] path to the file
197
+ # @param encoding [String, Encoding, nil] encoding to check against (auto-detect if nil)
198
+ # @return [Boolean]
199
+ def self.file_valid?(path, encoding: nil)
200
+ raw = File.binread(path)
201
+ valid?(raw, encoding: encoding)
202
+ end
203
+
204
+ # Filename suffix / extension hints that imply a specific encoding.
205
+ # Matched against the final three dot-separated tokens of the filename (extension plus up to two modifiers).
206
+ FILENAME_ENCODING_HINTS = {
207
+ 'utf8' => Encoding::UTF_8,
208
+ 'utf-8' => Encoding::UTF_8,
209
+ 'utf16' => Encoding::UTF_16,
210
+ 'utf-16' => Encoding::UTF_16,
211
+ 'utf16le' => Encoding::UTF_16LE,
212
+ 'utf-16le' => Encoding::UTF_16LE,
213
+ 'utf16be' => Encoding::UTF_16BE,
214
+ 'utf-16be' => Encoding::UTF_16BE,
215
+ 'utf32' => Encoding::UTF_32,
216
+ 'utf-32' => Encoding::UTF_32,
217
+ 'ascii' => Encoding::US_ASCII,
218
+ 'us-ascii' => Encoding::US_ASCII,
219
+ 'latin1' => Encoding::ISO_8859_1,
220
+ 'latin-1' => Encoding::ISO_8859_1,
221
+ 'iso88591' => Encoding::ISO_8859_1,
222
+ 'iso-8859-1' => Encoding::ISO_8859_1,
223
+ 'iso88592' => Encoding::ISO_8859_2,
224
+ 'iso-8859-2' => Encoding::ISO_8859_2,
225
+ 'cp1252' => Encoding::Windows_1252,
226
+ 'windows1252' => Encoding::Windows_1252,
227
+ 'windows-1252' => Encoding::Windows_1252,
228
+ 'sjis' => Encoding::Shift_JIS,
229
+ 'shiftjis' => Encoding::Shift_JIS,
230
+ 'shift-jis' => Encoding::Shift_JIS,
231
+ 'shift_jis' => Encoding::Shift_JIS,
232
+ 'euc-jp' => Encoding::EUC_JP,
233
+ 'eucjp' => Encoding::EUC_JP,
234
+ 'gbk' => Encoding::GBK,
235
+ 'gb2312' => Encoding::GB2312,
236
+ 'big5' => Encoding::Big5
237
+ }.freeze
238
+
239
+ # Guess the encoding based on filename suffixes/extensions alone.
240
+ # Useful when a file name carries an explicit encoding hint
241
+ # (e.g., "data.utf8.csv", "legacy.latin1.txt"). Falls back to nil
242
+ # when no hint can be extracted — callers should then use
243
+ # {.detect_file} to inspect the bytes.
244
+ #
245
+ # Matching is case-insensitive and considers the final three
246
+ # dot-separated tokens of the basename; the rightmost recognizable hint wins.
247
+ #
248
+ # @param filename [String] filename or path
249
+ # @return [Encoding, nil] detected encoding or nil when no hint matches
250
+ def self.guess_from_filename(filename)
251
+ name = File.basename(filename.to_s).downcase
252
+ tokens = name.split('.').last(3) # extension + up to two modifiers
253
+ tokens.reverse_each do |token|
254
+ enc = FILENAME_ENCODING_HINTS[token]
255
+ return enc if enc
256
+ end
257
+ nil
258
+ end
259
+
172
260
  # Build a list of encoding candidates with confidence scores.
173
261
  #
174
262
  # @param bytes [String] binary string
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: philiprehberger-encoding_kit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Philip Rehberger
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2026-03-31 00:00:00.000000000 Z
11
+ date: 2026-04-20 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Detect encoding from BOM and heuristics with confidence scores, convert
14
14
  between encodings, normalize to UTF-8, analyze byte distributions, and handle Windows
@@ -27,11 +27,11 @@ files:
27
27
  - lib/philiprehberger/encoding_kit/detection_result.rb
28
28
  - lib/philiprehberger/encoding_kit/detector.rb
29
29
  - lib/philiprehberger/encoding_kit/version.rb
30
- homepage: https://github.com/philiprehberger/rb-encoding-kit
30
+ homepage: https://philiprehberger.com/open-source-packages/ruby/philiprehberger-encoding_kit
31
31
  licenses:
32
32
  - MIT
33
33
  metadata:
34
- homepage_uri: https://github.com/philiprehberger/rb-encoding-kit
34
+ homepage_uri: https://philiprehberger.com/open-source-packages/ruby/philiprehberger-encoding_kit
35
35
  source_code_uri: https://github.com/philiprehberger/rb-encoding-kit
36
36
  changelog_uri: https://github.com/philiprehberger/rb-encoding-kit/blob/main/CHANGELOG.md
37
37
  bug_tracker_uri: https://github.com/philiprehberger/rb-encoding-kit/issues