philiprehberger-encoding_kit 0.3.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a328360d24956ba2fbcbc7888da4ddfb6dbe1bfac53aa65c6d7431ea6f0c1a16
4
- data.tar.gz: f82f1a192f75e1ed0bf7ac386bb17cc4f259b4847af1a83cd5356ed63a2b675c
3
+ metadata.gz: 3e69668037179f14b92560b58c816a29a14134560bd32568775fd321325b4644
4
+ data.tar.gz: 3ad9997b697ca6cecca1d8e63955512eb860a86c48007a5138777167da9c4d13
5
5
  SHA512:
6
- metadata.gz: 91b0e02d2d301db41bdc4eda1d3298db4fb517e0ec2bce4c4f79deae7787947db55eb0fdbe12557e9cf317b6b1d000c42ffb02b6901774afa531185b7ae46f1f
7
- data.tar.gz: bbab15360e72374f5c9a540c3c8b0c54ae4b673f3bee24ff31ea47ebf2daa29c982564b69069a4f2ead487d5afbefdf606a04fc701c1d6bffd3c75038fd17026
6
+ metadata.gz: af626ca49ad283a08574162ed45b81fabdccf7aca6d978d78d9367df150234e28d7cd563a1c212145c3a399a7b431b65cd1508fbba67a784517344b124a85a38
7
+ data.tar.gz: ffd8620298177f0a689411d49bdadbcc61c0a93c0fb0c5afc6cd9e5f72ef77f627ec14aea5819ca898f4aeb774cbdef02957c1777d640712cab360c7eb6622e6
data/CHANGELOG.md CHANGED
@@ -7,6 +7,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.5.0] - 2026-04-30
11
+
12
+ ### Added
13
+ - `EncodingKit.scrub(string)` — strip invalid bytes from a string (vs. `normalize`, which replaces them with `�`)
14
+ - `EncodingKit.normalize_line_endings(string, to:)` — convert mixed CRLF/CR/LF to a single canonical form (`:lf`, `:crlf`, or `:cr`)
15
+ - `Converter.scrub` companion method on the `Converter` module
16
+
17
+ ## [0.4.0] - 2026-04-20
18
+
19
+ ### Added
20
+ - `guess_from_filename(path)` — extract an encoding hint from filename extensions (`.utf8`, `.utf-16`, `.latin1`, `.cp1252`, `.sjis`, etc.). Returns `nil` when no hint is present so callers can fall back to byte-based detection
21
+ - `FILENAME_ENCODING_HINTS` constant exposing the suffix → `Encoding` lookup table
22
+
10
23
  ## [0.3.0] - 2026-04-11
11
24
 
12
25
  ### Added
data/README.md CHANGED
@@ -150,6 +150,31 @@ content = Philiprehberger::EncodingKit.read_as_utf8("latin1.txt", from: Encoding
150
150
 
151
151
  # Check if a file's encoding is valid
152
152
  Philiprehberger::EncodingKit.file_valid?("data.csv", encoding: Encoding::UTF_8) # => true
153
+
154
+ # Guess encoding from a filename hint without reading the bytes
155
+ Philiprehberger::EncodingKit.guess_from_filename("data.utf8.csv") # => Encoding::UTF_8
156
+ Philiprehberger::EncodingKit.guess_from_filename("legacy.latin1.txt") # => Encoding::ISO_8859_1
157
+ Philiprehberger::EncodingKit.guess_from_filename("report.csv") # => nil
158
+ ```
159
+
160
+ ### Stripping Invalid Bytes
161
+
162
+ ```ruby
163
+ # normalize replaces invalid bytes with U+FFFD ('�')
164
+ Philiprehberger::EncodingKit.normalize("foo\xFFbar") # => "foo�bar"
165
+
166
+ # scrub removes them entirely
167
+ Philiprehberger::EncodingKit.scrub("foo\xFFbar") # => "foobar"
168
+ ```
169
+
170
+ ### Normalizing Line Endings
171
+
172
+ ```ruby
173
+ mixed = "alpha\r\nbeta\rgamma\ndelta"
174
+
175
+ Philiprehberger::EncodingKit.normalize_line_endings(mixed) # => "alpha\nbeta\ngamma\ndelta"
176
+ Philiprehberger::EncodingKit.normalize_line_endings(mixed, to: :crlf) # => "alpha\r\nbeta\r\ngamma\r\ndelta"
177
+ Philiprehberger::EncodingKit.normalize_line_endings(mixed, to: :cr) # => "alpha\rbeta\rgamma\rdelta"
153
178
  ```
154
179
 
155
180
  ### Validity Check
@@ -172,6 +197,8 @@ Philiprehberger::EncodingKit.valid?("hello", encoding: Encoding::US_ASCII) # =>
172
197
  | `EncodingKit.transcode(string, to:, fallback:, replace:)` | Auto-detect source and convert to target encoding |
173
198
  | `EncodingKit.to_utf8(string, from: nil)` | Convert to UTF-8, auto-detect source if `from` is nil |
174
199
  | `EncodingKit.normalize(string)` | Force to valid UTF-8, replacing bad bytes with U+FFFD |
200
+ | `EncodingKit.scrub(string)` | Force to valid UTF-8 by removing invalid bytes entirely |
201
+ | `EncodingKit.normalize_line_endings(string, to: :lf)` | Convert mixed CRLF/CR/LF to a single canonical form (`:lf`, `:crlf`, `:cr`) |
175
202
  | `EncodingKit.valid?(string, encoding: nil)` | Check if string is valid in given or current encoding |
176
203
  | `EncodingKit.convert(string, from:, to:)` | Convert between arbitrary encodings |
177
204
  | `EncodingKit.strip_bom(string)` | Remove byte order mark if present |
@@ -179,6 +206,7 @@ Philiprehberger::EncodingKit.valid?("hello", encoding: Encoding::US_ASCII) # =>
179
206
  | `EncodingKit.detect_file(path, sample_size: 4096)` | Detect encoding of a file by reading a byte sample |
180
207
  | `EncodingKit.read_as_utf8(path, from: nil)` | Read a file and return its content as UTF-8 |
181
208
  | `EncodingKit.file_valid?(path, encoding: nil)` | Check if a file's content is valid in the given encoding |
209
+ | `EncodingKit.guess_from_filename(path)` | Guess `Encoding` from filename suffixes (e.g. `.utf8`, `.latin1`), `nil` if unknown |
182
210
 
183
211
  ## Development
184
212
 
@@ -53,6 +53,20 @@ module Philiprehberger
53
53
 
54
54
  str.encode(Encoding::UTF_8, str.encoding, invalid: :replace, undef: :replace, replace: "\uFFFD")
55
55
  end
56
+
57
+ # Strip invalid bytes from a string, returning valid UTF-8 with bad bytes removed.
58
+ #
59
+ # Unlike {.normalize}, which replaces invalid bytes with `\uFFFD`, this method
60
+ # removes them entirely — useful when downstream consumers cannot tolerate
61
+ # any non-source content.
62
+ #
63
+ # @param string [String] the input string
64
+ # @return [String] valid UTF-8 string with invalid bytes removed
65
+ def scrub(string)
66
+ str = string.dup
67
+ str.force_encoding(Encoding::UTF_8) if [Encoding::BINARY, Encoding::ASCII_8BIT].include?(str.encoding)
68
+ str.scrub('')
69
+ end
56
70
  end
57
71
  end
58
72
  end
@@ -2,6 +2,6 @@
2
2
 
3
3
  module Philiprehberger
4
4
  module EncodingKit
5
- VERSION = '0.3.0'
5
+ VERSION = '0.5.0'
6
6
  end
7
7
  end
@@ -104,6 +104,31 @@ module Philiprehberger
104
104
  Converter.normalize(string)
105
105
  end
106
106
 
107
+ # Strip invalid bytes from a string, returning valid UTF-8.
108
+ #
109
+ # Unlike {.normalize}, which replaces invalid bytes with `�`, this method
110
+ # removes them entirely.
111
+ #
112
+ # @param string [String] the input string
113
+ # @return [String] valid UTF-8 string with invalid bytes removed
114
+ def self.scrub(string)
115
+ Converter.scrub(string)
116
+ end
117
+
118
+ LINE_ENDINGS = { lf: "\n", crlf: "\r\n", cr: "\r" }.freeze
119
+
120
+ # Normalize line endings to a single canonical form.
121
+ #
122
+ # @param string [String] the input string
123
+ # @param to [Symbol] target line ending: `:lf`, `:crlf`, or `:cr`
124
+ # @return [String] string with normalized line endings
125
+ # @raise [Error] if `to:` is not one of `:lf`, `:crlf`, or `:cr`
126
+ def self.normalize_line_endings(string, to: :lf)
127
+ target = LINE_ENDINGS[to] or raise Error, "Unknown line ending: #{to.inspect} (expected :lf, :crlf, or :cr)"
128
+
129
+ string.gsub(/\r\n|\r|\n/, target)
130
+ end
131
+
107
132
  # Check if a string is valid in the given encoding (or its current encoding).
108
133
  #
109
134
  # @param string [String] the input string
@@ -201,6 +226,62 @@ module Philiprehberger
201
226
  valid?(raw, encoding: encoding)
202
227
  end
203
228
 
229
+ # Filename suffix / extension hints that imply a specific encoding.
230
+ # Matched against the last three dot-separated tokens of the filename.
231
+ FILENAME_ENCODING_HINTS = {
232
+ 'utf8' => Encoding::UTF_8,
233
+ 'utf-8' => Encoding::UTF_8,
234
+ 'utf16' => Encoding::UTF_16,
235
+ 'utf-16' => Encoding::UTF_16,
236
+ 'utf16le' => Encoding::UTF_16LE,
237
+ 'utf-16le' => Encoding::UTF_16LE,
238
+ 'utf16be' => Encoding::UTF_16BE,
239
+ 'utf-16be' => Encoding::UTF_16BE,
240
+ 'utf32' => Encoding::UTF_32,
241
+ 'utf-32' => Encoding::UTF_32,
242
+ 'ascii' => Encoding::US_ASCII,
243
+ 'us-ascii' => Encoding::US_ASCII,
244
+ 'latin1' => Encoding::ISO_8859_1,
245
+ 'latin-1' => Encoding::ISO_8859_1,
246
+ 'iso88591' => Encoding::ISO_8859_1,
247
+ 'iso-8859-1' => Encoding::ISO_8859_1,
248
+ 'iso88592' => Encoding::ISO_8859_2,
249
+ 'iso-8859-2' => Encoding::ISO_8859_2,
250
+ 'cp1252' => Encoding::Windows_1252,
251
+ 'windows1252' => Encoding::Windows_1252,
252
+ 'windows-1252' => Encoding::Windows_1252,
253
+ 'sjis' => Encoding::Shift_JIS,
254
+ 'shiftjis' => Encoding::Shift_JIS,
255
+ 'shift-jis' => Encoding::Shift_JIS,
256
+ 'shift_jis' => Encoding::Shift_JIS,
257
+ 'euc-jp' => Encoding::EUC_JP,
258
+ 'eucjp' => Encoding::EUC_JP,
259
+ 'gbk' => Encoding::GBK,
260
+ 'gb2312' => Encoding::GB2312,
261
+ 'big5' => Encoding::Big5
262
+ }.freeze
263
+
264
+ # Guess the encoding based on filename suffixes/extensions alone.
265
+ # Useful when a file name carries an explicit encoding hint
266
+ # (e.g., "data.utf8.csv", "legacy.latin1.txt"). Falls back to nil
267
+ # when no hint can be extracted — callers should then use
268
+ # {.detect_file} to inspect the bytes.
269
+ #
270
+ # Matching is case-insensitive and considers the last three
271
+ # dot-separated tokens of the basename; the rightmost recognizable hint wins.
272
+ #
273
+ # @param filename [String] filename or path
274
+ # @return [Encoding, nil] detected encoding or nil when no hint matches
275
+ def self.guess_from_filename(filename)
276
+ name = File.basename(filename.to_s).downcase
277
+ tokens = name.split('.').last(3) # extension + up to two modifiers
278
+ tokens.reverse_each do |token|
279
+ enc = FILENAME_ENCODING_HINTS[token]
280
+ return enc if enc
281
+ end
282
+ nil
283
+ end
284
+
204
285
  # Build a list of encoding candidates with confidence scores.
205
286
  #
206
287
  # @param bytes [String] binary string
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: philiprehberger-encoding_kit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Philip Rehberger
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2026-04-11 00:00:00.000000000 Z
11
+ date: 2026-05-01 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Detect encoding from BOM and heuristics with confidence scores, convert
14
14
  between encodings, normalize to UTF-8, analyze byte distributions, and handle Windows