philiprehberger-encoding_kit 0.3.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/README.md +28 -0
- data/lib/philiprehberger/encoding_kit/converter.rb +14 -0
- data/lib/philiprehberger/encoding_kit/version.rb +1 -1
- data/lib/philiprehberger/encoding_kit.rb +81 -0
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 3e69668037179f14b92560b58c816a29a14134560bd32568775fd321325b4644
|
|
4
|
+
data.tar.gz: 3ad9997b697ca6cecca1d8e63955512eb860a86c48007a5138777167da9c4d13
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: af626ca49ad283a08574162ed45b81fabdccf7aca6d978d78d9367df150234e28d7cd563a1c212145c3a399a7b431b65cd1508fbba67a784517344b124a85a38
|
|
7
|
+
data.tar.gz: ffd8620298177f0a689411d49bdadbcc61c0a93c0fb0c5afc6cd9e5f72ef77f627ec14aea5819ca898f4aeb774cbdef02957c1777d640712cab360c7eb6622e6
|
data/CHANGELOG.md
CHANGED
|
@@ -7,6 +7,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [0.5.0] - 2026-04-30
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
- `EncodingKit.scrub(string)` — strip invalid bytes from a string (vs. `normalize` which replaces with `�`)
|
|
14
|
+
- `EncodingKit.normalize_line_endings(string, to:)` — convert mixed CRLF/CR/LF to a single canonical form (`:lf`, `:crlf`, or `:cr`)
|
|
15
|
+
- `Converter.scrub` companion method on the `Converter` module
|
|
16
|
+
|
|
17
|
+
## [0.4.0] - 2026-04-20
|
|
18
|
+
|
|
19
|
+
### Added
|
|
20
|
+
- `guess_from_filename(path)` — extract an encoding hint from filename extensions (`.utf8`, `.utf-16`, `.latin1`, `.cp1252`, `.sjis`, etc.). Returns `nil` when no hint is present so callers can fall back to byte-based detection
|
|
21
|
+
- `FILENAME_ENCODING_HINTS` constant exposing the suffix → `Encoding` lookup table
|
|
22
|
+
|
|
10
23
|
## [0.3.0] - 2026-04-11
|
|
11
24
|
|
|
12
25
|
### Added
|
data/README.md
CHANGED
|
@@ -150,6 +150,31 @@ content = Philiprehberger::EncodingKit.read_as_utf8("latin1.txt", from: Encoding
|
|
|
150
150
|
|
|
151
151
|
# Check if a file's encoding is valid
|
|
152
152
|
Philiprehberger::EncodingKit.file_valid?("data.csv", encoding: Encoding::UTF_8) # => true
|
|
153
|
+
|
|
154
|
+
# Guess encoding from a filename hint without reading the bytes
|
|
155
|
+
Philiprehberger::EncodingKit.guess_from_filename("data.utf8.csv") # => Encoding::UTF_8
|
|
156
|
+
Philiprehberger::EncodingKit.guess_from_filename("legacy.latin1.txt") # => Encoding::ISO_8859_1
|
|
157
|
+
Philiprehberger::EncodingKit.guess_from_filename("report.csv") # => nil
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
### Stripping Invalid Bytes
|
|
161
|
+
|
|
162
|
+
```ruby
|
|
163
|
+
# normalize replaces invalid bytes with U+FFFD ('�')
|
|
164
|
+
Philiprehberger::EncodingKit.normalize("foo\xFFbar") # => "foo�bar"
|
|
165
|
+
|
|
166
|
+
# scrub removes them entirely
|
|
167
|
+
Philiprehberger::EncodingKit.scrub("foo\xFFbar") # => "foobar"
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
### Normalizing Line Endings
|
|
171
|
+
|
|
172
|
+
```ruby
|
|
173
|
+
mixed = "alpha\r\nbeta\rgamma\ndelta"
|
|
174
|
+
|
|
175
|
+
Philiprehberger::EncodingKit.normalize_line_endings(mixed) # => "alpha\nbeta\ngamma\ndelta"
|
|
176
|
+
Philiprehberger::EncodingKit.normalize_line_endings(mixed, to: :crlf) # => "alpha\r\nbeta\r\ngamma\r\ndelta"
|
|
177
|
+
Philiprehberger::EncodingKit.normalize_line_endings(mixed, to: :cr) # => "alpha\rbeta\rgamma\rdelta"
|
|
153
178
|
```
|
|
154
179
|
|
|
155
180
|
### Validity Check
|
|
@@ -172,6 +197,8 @@ Philiprehberger::EncodingKit.valid?("hello", encoding: Encoding::US_ASCII) # =>
|
|
|
172
197
|
| `EncodingKit.transcode(string, to:, fallback:, replace:)` | Auto-detect source and convert to target encoding |
|
|
173
198
|
| `EncodingKit.to_utf8(string, from: nil)` | Convert to UTF-8, auto-detect source if `from` is nil |
|
|
174
199
|
| `EncodingKit.normalize(string)` | Force to valid UTF-8, replacing bad bytes with U+FFFD |
|
|
200
|
+
| `EncodingKit.scrub(string)` | Force to valid UTF-8 by removing invalid bytes entirely |
|
|
201
|
+
| `EncodingKit.normalize_line_endings(string, to: :lf)` | Convert mixed CRLF/CR/LF to a single canonical form (`:lf`, `:crlf`, `:cr`) |
|
|
175
202
|
| `EncodingKit.valid?(string, encoding: nil)` | Check if string is valid in given or current encoding |
|
|
176
203
|
| `EncodingKit.convert(string, from:, to:)` | Convert between arbitrary encodings |
|
|
177
204
|
| `EncodingKit.strip_bom(string)` | Remove byte order mark if present |
|
|
@@ -179,6 +206,7 @@ Philiprehberger::EncodingKit.valid?("hello", encoding: Encoding::US_ASCII) # =>
|
|
|
179
206
|
| `EncodingKit.detect_file(path, sample_size: 4096)` | Detect encoding of a file by reading a byte sample |
|
|
180
207
|
| `EncodingKit.read_as_utf8(path, from: nil)` | Read a file and return its content as UTF-8 |
|
|
181
208
|
| `EncodingKit.file_valid?(path, encoding: nil)` | Check if a file's content is valid in the given encoding |
|
|
209
|
+
| `EncodingKit.guess_from_filename(path)` | Guess `Encoding` from filename suffixes (e.g. `.utf8`, `.latin1`), `nil` if unknown |
|
|
182
210
|
|
|
183
211
|
## Development
|
|
184
212
|
|
|
@@ -53,6 +53,20 @@ module Philiprehberger
|
|
|
53
53
|
|
|
54
54
|
str.encode(Encoding::UTF_8, str.encoding, invalid: :replace, undef: :replace, replace: "\uFFFD")
|
|
55
55
|
end
|
|
56
|
+
|
|
57
|
+
# Remove invalid byte sequences from a string and return the result as UTF-8.
#
# Unlike {.normalize}, which substitutes U+FFFD for every invalid sequence,
# this method drops the offending bytes, so the output never contains
# characters that were not present in the input.
#
# Binary (ASCII-8BIT) input is interpreted as UTF-8. Input tagged with any
# other encoding is scrubbed in that encoding and then transcoded, so the
# returned string is tagged UTF-8 whatever the source encoding was. The
# argument itself is never mutated.
#
# @param string [String] the input string
# @return [String] valid UTF-8 string with invalid bytes removed
# @raise [Encoding::ConverterNotFoundError] if Ruby has no converter from
#   the source encoding to UTF-8
def scrub(string)
  str = string.dup
  # Encoding::BINARY and Encoding::ASCII_8BIT name the same encoding, so a
  # single comparison covers both.
  str.force_encoding(Encoding::UTF_8) if str.encoding == Encoding::BINARY

  cleaned = str.scrub('')
  return cleaned if cleaned.encoding == Encoding::UTF_8

  # String#scrub keeps the receiver's encoding; transcode so the documented
  # UTF-8 contract also holds for e.g. ISO-8859-1 or Shift_JIS input.
  cleaned.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
end
|
|
56
70
|
end
|
|
57
71
|
end
|
|
58
72
|
end
|
|
@@ -104,6 +104,31 @@ module Philiprehberger
|
|
|
104
104
|
Converter.normalize(string)
|
|
105
105
|
end
|
|
106
106
|
|
|
107
|
+
# Drop invalid byte sequences from a string.
#
# Thin convenience wrapper that delegates to {Converter.scrub}. Where
# {.normalize} replaces each invalid sequence with U+FFFD, this method
# removes it instead.
#
# @param string [String] the input string
# @return [String] valid UTF-8 string with invalid bytes removed
def self.scrub(string)
  Converter.scrub(string)
end
|
|
117
|
+
|
|
118
|
+
# Line-ending names accepted by {.normalize_line_endings}, mapped to the
# character sequence each one stands for.
LINE_ENDINGS = { lf: "\n", crlf: "\r\n", cr: "\r" }.freeze

# Rewrite every line ending (CRLF, lone CR, or LF) to one canonical form.
#
# A CRLF pair is always treated as a single line ending. The input string is
# not modified; a new string in the same encoding is returned.
#
# Strings that contain invalid byte sequences are supported as long as their
# encoding is ASCII-compatible: the replacement is then done on raw bytes,
# leaving the invalid bytes untouched, instead of raising `ArgumentError`
# from the regexp engine.
#
# @param string [String] the input string
# @param to [Symbol] target line ending: `:lf`, `:crlf`, or `:cr`
# @return [String] string with normalized line endings
# @raise [Error] if `to:` is not one of `:lf`, `:crlf`, or `:cr`
def self.normalize_line_endings(string, to: :lf)
  target = LINE_ENDINGS[to]
  raise Error, "Unknown line ending: #{to.inspect} (expected :lf, :crlf, or :cr)" unless target

  # Well-formed strings (and non-ASCII-compatible encodings, whose bytes
  # cannot be scanned safely) take the regular character-based path.
  return string.gsub(/\r\n?|\n/, target) if string.valid_encoding? || !string.encoding.ascii_compatible?

  # Regexp matching rejects strings with broken byte sequences. In an
  # ASCII-compatible encoding the CR and LF bytes always stand for
  # themselves, so substitute on a binary copy and restore the encoding tag.
  string.b.gsub(/\r\n?|\n/, target).force_encoding(string.encoding)
end
|
|
131
|
+
|
|
107
132
|
# Check if a string is valid in the given encoding (or its current encoding).
|
|
108
133
|
#
|
|
109
134
|
# @param string [String] the input string
|
|
@@ -201,6 +226,62 @@ module Philiprehberger
|
|
|
201
226
|
valid?(raw, encoding: encoding)
|
|
202
227
|
end
|
|
203
228
|
|
|
229
|
+
# Filename suffix / extension hints that imply a specific encoding.
# Keys are lowercase extension tokens (without the leading dot); they are
# matched against the trailing extension tokens of a filename by
# {.guess_from_filename}.
FILENAME_ENCODING_HINTS = {
  'utf8' => Encoding::UTF_8,
  'utf-8' => Encoding::UTF_8,
  'utf16' => Encoding::UTF_16,
  'utf-16' => Encoding::UTF_16,
  'utf16le' => Encoding::UTF_16LE,
  'utf-16le' => Encoding::UTF_16LE,
  'utf16be' => Encoding::UTF_16BE,
  'utf-16be' => Encoding::UTF_16BE,
  'utf32' => Encoding::UTF_32,
  'utf-32' => Encoding::UTF_32,
  'ascii' => Encoding::US_ASCII,
  'us-ascii' => Encoding::US_ASCII,
  'latin1' => Encoding::ISO_8859_1,
  'latin-1' => Encoding::ISO_8859_1,
  'iso88591' => Encoding::ISO_8859_1,
  'iso-8859-1' => Encoding::ISO_8859_1,
  'iso88592' => Encoding::ISO_8859_2,
  'iso-8859-2' => Encoding::ISO_8859_2,
  'cp1252' => Encoding::Windows_1252,
  'windows1252' => Encoding::Windows_1252,
  'windows-1252' => Encoding::Windows_1252,
  'sjis' => Encoding::Shift_JIS,
  'shiftjis' => Encoding::Shift_JIS,
  'shift-jis' => Encoding::Shift_JIS,
  'shift_jis' => Encoding::Shift_JIS,
  'euc-jp' => Encoding::EUC_JP,
  'eucjp' => Encoding::EUC_JP,
  'gbk' => Encoding::GBK,
  'gb2312' => Encoding::GB2312,
  'big5' => Encoding::Big5
}.freeze

# Guess the encoding based on filename suffixes/extensions alone.
# Useful when a file name carries an explicit encoding hint
# (e.g., "data.utf8.csv", "legacy.latin1.txt"). Returns nil when no
# hint can be extracted; callers should then use {.detect_file} to
# inspect the bytes.
#
# Matching is case-insensitive and considers up to the final three
# extension tokens; the rightmost recognizable hint wins. The base name
# (everything before the first dot) is never treated as a hint, so
# "ascii.txt" or a bare "big5" yield nil.
#
# @param filename [String] filename or path
# @return [Encoding, nil] detected encoding or nil when no hint matches
def self.guess_from_filename(filename)
  tokens = File.basename(filename.to_s).downcase.split('.')
  # The first token is the base name, not an extension; drop it before
  # taking the trailing tokens so it can never be mistaken for a hint.
  extensions = tokens.drop(1).last(3)

  extensions.reverse_each do |token|
    encoding = FILENAME_ENCODING_HINTS[token]
    return encoding if encoding
  end
  nil
end
|
|
284
|
+
|
|
204
285
|
# Build a list of encoding candidates with confidence scores.
|
|
205
286
|
#
|
|
206
287
|
# @param bytes [String] binary string
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: philiprehberger-encoding_kit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.3.0
|
|
4
|
+
version: 0.5.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Philip Rehberger
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-
|
|
11
|
+
date: 2026-05-01 00:00:00.000000000 Z
|
|
12
12
|
dependencies: []
|
|
13
13
|
description: Detect encoding from BOM and heuristics with confidence scores, convert
|
|
14
14
|
between encodings, normalize to UTF-8, analyze byte distributions, and handle Windows
|