philiprehberger-encoding_kit 0.4.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/README.md +30 -2
- data/lib/philiprehberger/encoding_kit/converter.rb +18 -2
- data/lib/philiprehberger/encoding_kit/version.rb +1 -1
- data/lib/philiprehberger/encoding_kit.rb +31 -4
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 47de974b9f87b11740744bced13177a6afe17de3ac74a589246b67604241085c
|
|
4
|
+
data.tar.gz: 3ee29c6b81b51166858bb9e051ae5652f2da84d242f80c8baa6ba45017ecbaa8
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: ba8d01eda47ece58ac39d5c494fa82a5a4fa761087cf7c97da6fda3188b1e060afe50dc90e1228a0dfd3a8ab21384255ba862d027fee98407d33ebfd5f9466ca
|
|
7
|
+
data.tar.gz: 2b5e70cb2767541713c59733e858a6b5f751bd838df739811308e7773a3383aea0955bcd24d161921fec1c1526b92e60b9e0734bafeaed3713f7f379ade00ece
|
data/CHANGELOG.md
CHANGED
|
@@ -7,6 +7,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [0.6.0] - 2026-05-20
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
- `strip_bom:` option on `EncodingKit.to_utf8` and `EncodingKit.read_as_utf8` to drop a leading UTF BOM after transcoding
|
|
14
|
+
- Card image reference in the README for registry-side rendering
|
|
15
|
+
|
|
16
|
+
## [0.5.0] - 2026-04-30
|
|
17
|
+
|
|
18
|
+
### Added
|
|
19
|
+
- `EncodingKit.scrub(string)` — strip invalid bytes from a string (vs. `normalize` which replaces with `�`)
|
|
20
|
+
- `EncodingKit.normalize_line_endings(string, to:)` — convert mixed CRLF/CR/LF to a single canonical form (`:lf`, `:crlf`, or `:cr`)
|
|
21
|
+
- `Converter.scrub` companion method on the `Converter` module
|
|
22
|
+
|
|
10
23
|
## [0.4.0] - 2026-04-20
|
|
11
24
|
|
|
12
25
|
### Added
|
data/README.md
CHANGED
|
@@ -4,6 +4,8 @@
|
|
|
4
4
|
[RubyGems](https://rubygems.org/gems/philiprehberger-encoding_kit)
|
|
5
5
|
[Commit history](https://github.com/philiprehberger/rb-encoding-kit/commits/main)
|
|
6
6
|
|
|
7
|
+

|
|
8
|
+
|
|
7
9
|
Character encoding detection, conversion, and normalization
|
|
8
10
|
|
|
9
11
|
## Requirements
|
|
@@ -103,6 +105,10 @@ utf8 = Philiprehberger::EncodingKit.to_utf8(raw_bytes)
|
|
|
103
105
|
|
|
104
106
|
# Specify source encoding
|
|
105
107
|
utf8 = Philiprehberger::EncodingKit.to_utf8(latin1_string, from: Encoding::ISO_8859_1)
|
|
108
|
+
|
|
109
|
+
# Strip a leading BOM after transcoding
|
|
110
|
+
clean = Philiprehberger::EncodingKit.to_utf8("\xEF\xBB\xBFhello".b, strip_bom: true)
|
|
111
|
+
# => "hello"
|
|
106
112
|
```
|
|
107
113
|
|
|
108
114
|
### Normalize
|
|
@@ -157,6 +163,26 @@ Philiprehberger::EncodingKit.guess_from_filename("legacy.latin1.txt") # => Encod
|
|
|
157
163
|
Philiprehberger::EncodingKit.guess_from_filename("report.csv") # => nil
|
|
158
164
|
```
|
|
159
165
|
|
|
166
|
+
### Stripping Invalid Bytes
|
|
167
|
+
|
|
168
|
+
```ruby
|
|
169
|
+
# normalize replaces invalid bytes with U+FFFD ('�')
|
|
170
|
+
Philiprehberger::EncodingKit.normalize("foo\xFFbar") # => "foo�bar"
|
|
171
|
+
|
|
172
|
+
# scrub removes them entirely
|
|
173
|
+
Philiprehberger::EncodingKit.scrub("foo\xFFbar") # => "foobar"
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
### Normalizing Line Endings
|
|
177
|
+
|
|
178
|
+
```ruby
|
|
179
|
+
mixed = "alpha\r\nbeta\rgamma\ndelta"
|
|
180
|
+
|
|
181
|
+
Philiprehberger::EncodingKit.normalize_line_endings(mixed) # => "alpha\nbeta\ngamma\ndelta"
|
|
182
|
+
Philiprehberger::EncodingKit.normalize_line_endings(mixed, to: :crlf) # => "alpha\r\nbeta\r\ngamma\r\ndelta"
|
|
183
|
+
Philiprehberger::EncodingKit.normalize_line_endings(mixed, to: :cr) # => "alpha\rbeta\rgamma\rdelta"
|
|
184
|
+
```
|
|
185
|
+
|
|
160
186
|
### Validity Check
|
|
161
187
|
|
|
162
188
|
```ruby
|
|
@@ -175,14 +201,16 @@ Philiprehberger::EncodingKit.valid?("hello", encoding: Encoding::US_ASCII) # =>
|
|
|
175
201
|
| `EncodingKit.detect_stream(io, sample_size: 4096)` | Detect encoding from an IO stream by sampling bytes |
|
|
176
202
|
| `EncodingKit.analyze(string)` | Analyze byte distribution and return encoding candidates with stats |
|
|
177
203
|
| `EncodingKit.transcode(string, to:, fallback:, replace:)` | Auto-detect source and convert to target encoding |
|
|
178
|
-
| `EncodingKit.to_utf8(string, from: nil)` | Convert to UTF-8, auto-detect source if `from` is nil |
|
|
204
|
+
| `EncodingKit.to_utf8(string, from: nil, strip_bom: false)` | Convert to UTF-8, auto-detect source if `from` is nil; pass `strip_bom: true` to drop a leading UTF BOM |
|
|
179
205
|
| `EncodingKit.normalize(string)` | Force to valid UTF-8, replacing bad bytes with U+FFFD |
|
|
206
|
+
| `EncodingKit.scrub(string)` | Force to valid UTF-8 by removing invalid bytes entirely |
|
|
207
|
+
| `EncodingKit.normalize_line_endings(string, to: :lf)` | Convert mixed CRLF/CR/LF to a single canonical form (`:lf`, `:crlf`, `:cr`) |
|
|
180
208
|
| `EncodingKit.valid?(string, encoding: nil)` | Check if string is valid in given or current encoding |
|
|
181
209
|
| `EncodingKit.convert(string, from:, to:)` | Convert between arbitrary encodings |
|
|
182
210
|
| `EncodingKit.strip_bom(string)` | Remove byte order mark if present |
|
|
183
211
|
| `EncodingKit.bom?(string)` | Check if string starts with a BOM |
|
|
184
212
|
| `EncodingKit.detect_file(path, sample_size: 4096)` | Detect encoding of a file by reading a byte sample |
|
|
185
|
-
| `EncodingKit.read_as_utf8(path, from: nil)` | Read a file and return its content as UTF-8 |
|
|
213
|
+
| `EncodingKit.read_as_utf8(path, from: nil, strip_bom: false)` | Read a file and return its content as UTF-8; pass `strip_bom: true` to drop a leading UTF BOM |
|
|
186
214
|
| `EncodingKit.file_valid?(path, encoding: nil)` | Check if a file's content is valid in the given encoding |
|
|
187
215
|
| `EncodingKit.guess_from_filename(path)` | Guess `Encoding` from filename suffixes (e.g. `.utf8`, `.latin1`), `nil` if unknown |
|
|
188
216
|
|
|
@@ -33,12 +33,14 @@ module Philiprehberger
|
|
|
33
33
|
#
|
|
34
34
|
# @param string [String] the input string
|
|
35
35
|
# @param from [String, Encoding, nil] source encoding (auto-detect if nil)
|
|
36
|
+
# @param strip_bom [Boolean] remove any leading UTF BOM from the result (default: false)
|
|
36
37
|
# @return [String] UTF-8 encoded string
|
|
37
|
-
def to_utf8(string, from: nil)
|
|
38
|
+
def to_utf8(string, from: nil, strip_bom: false)
|
|
38
39
|
detected = from ? Encoding.find(from.to_s) : Detector.call(string)
|
|
39
40
|
source = detected.is_a?(DetectionResult) ? detected.encoding : detected
|
|
40
41
|
str = string.dup.force_encoding(source)
|
|
41
|
-
str.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: "\uFFFD")
|
|
42
|
+
encoded = str.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: "\uFFFD")
|
|
43
|
+
strip_bom ? encoded.delete_prefix("\uFEFF") : encoded
|
|
42
44
|
end
|
|
43
45
|
|
|
44
46
|
# Force a string to valid UTF-8 by replacing invalid and undefined bytes.
|
|
@@ -53,6 +55,20 @@ module Philiprehberger
|
|
|
53
55
|
|
|
54
56
|
str.encode(Encoding::UTF_8, str.encoding, invalid: :replace, undef: :replace, replace: "\uFFFD")
|
|
55
57
|
end
|
|
58
|
+
|
|
59
|
+
# Strip invalid bytes from a string, returning valid UTF-8 with bad bytes removed.
#
# Unlike {.normalize}, which replaces invalid bytes with `\uFFFD`, this method
# removes them entirely — useful when downstream consumers cannot tolerate
# any non-source content.
#
# Binary-tagged input is reinterpreted as UTF-8 before scrubbing. Input tagged
# with any other non-UTF-8 encoding is scrubbed in that encoding and then
# transcoded, so the result is UTF-8 whenever a transcoder exists. The argument
# itself is never mutated.
#
# @param string [String] the input string
# @return [String] valid UTF-8 string with invalid bytes removed
def scrub(string)
  str = string.dup
  # Encoding::BINARY and Encoding::ASCII_8BIT name the same encoding object,
  # so a single comparison covers both.
  str.force_encoding(Encoding::UTF_8) if str.encoding == Encoding::BINARY
  cleaned = str.scrub('')
  return cleaned if cleaned.encoding == Encoding::UTF_8

  # Honour the documented UTF-8 return type for strings tagged with another
  # encoding. Characters without a UTF-8 mapping are dropped, matching the
  # "remove, don't replace" contract of this method.
  cleaned.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
rescue Encoding::ConverterNotFoundError
  # No transcoder is available for this encoding; fall back to the scrubbed
  # string in its own encoding rather than raising a new error type.
  cleaned
end
|
|
56
72
|
end
|
|
57
73
|
end
|
|
58
74
|
end
|
|
@@ -90,9 +90,10 @@ module Philiprehberger
|
|
|
90
90
|
#
|
|
91
91
|
# @param string [String] the input string
|
|
92
92
|
# @param from [String, Encoding, nil] source encoding (auto-detect if nil)
|
|
93
|
+
# @param strip_bom [Boolean] remove any leading UTF BOM from the result (default: false)
|
|
93
94
|
# @return [String] UTF-8 encoded string
|
|
94
|
-
def self.to_utf8(string, from: nil)
|
|
95
|
-
Converter.to_utf8(string, from: from)
|
|
95
|
+
def self.to_utf8(string, from: nil, strip_bom: false)
|
|
96
|
+
Converter.to_utf8(string, from: from, strip_bom: strip_bom)
|
|
96
97
|
end
|
|
97
98
|
|
|
98
99
|
# Normalize a string to valid UTF-8, replacing invalid/undefined bytes
|
|
@@ -104,6 +105,31 @@ module Philiprehberger
|
|
|
104
105
|
Converter.normalize(string)
|
|
105
106
|
end
|
|
106
107
|
|
|
108
|
+
# Remove invalid bytes from a string and return the cleaned result.
#
# Facade entry point: the work is delegated to {Converter.scrub}. Where
# {.normalize} substitutes `�` for each invalid byte, this method drops
# the offending bytes entirely.
#
# @param string [String] the input string
# @return [String] valid UTF-8 string with invalid bytes removed
def self.scrub(string)
  Converter.scrub(string)
end
|
|
118
|
+
|
|
119
|
+
# Maps each supported `to:` symbol to the line terminator it stands for.
LINE_ENDINGS = { lf: "\n", crlf: "\r\n", cr: "\r" }.freeze

# Normalize line endings to a single canonical form.
#
# Every CRLF pair, lone CR, and lone LF in the input is rewritten to the
# requested terminator. A new string is returned; the argument is not modified.
#
# @param string [String] the input string
# @param to [Symbol] target line ending: `:lf`, `:crlf`, or `:cr`
# @return [String] string with normalized line endings
# @raise [Error] if `to:` is not one of `:lf`, `:crlf`, or `:cr`
def self.normalize_line_endings(string, to: :lf)
  # fetch with a block keeps lookup and validation in one expression and
  # avoids the low-precedence `or` keyword for control flow.
  target = LINE_ENDINGS.fetch(to) do
    raise Error, "Unknown line ending: #{to.inspect} (expected :lf, :crlf, or :cr)"
  end

  # CRLF is listed first so a pair is consumed as one terminator, not two.
  string.gsub(/\r\n|\r|\n/, target)
end
|
|
132
|
+
|
|
107
133
|
# Check if a string is valid in the given encoding (or its current encoding).
|
|
108
134
|
#
|
|
109
135
|
# @param string [String] the input string
|
|
@@ -185,10 +211,11 @@ module Philiprehberger
|
|
|
185
211
|
#
|
|
186
212
|
# @param path [String] path to the file
|
|
187
213
|
# @param from [String, Encoding, nil] source encoding (auto-detect if nil)
|
|
214
|
+
# @param strip_bom [Boolean] remove any leading UTF BOM from the result (default: false)
|
|
188
215
|
# @return [String] UTF-8 encoded file content
|
|
189
|
-
def self.read_as_utf8(path, from: nil)
|
|
216
|
+
def self.read_as_utf8(path, from: nil, strip_bom: false)
|
|
190
217
|
raw = File.binread(path)
|
|
191
|
-
to_utf8(raw, from: from)
|
|
218
|
+
to_utf8(raw, from: from, strip_bom: strip_bom)
|
|
192
219
|
end
|
|
193
220
|
|
|
194
221
|
# Check if a file's content is valid in the detected or specified encoding.
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: philiprehberger-encoding_kit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.4.0
|
|
4
|
+
version: 0.6.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Philip Rehberger
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-
|
|
11
|
+
date: 2026-05-20 00:00:00.000000000 Z
|
|
12
12
|
dependencies: []
|
|
13
13
|
description: Detect encoding from BOM and heuristics with confidence scores, convert
|
|
14
14
|
between encodings, normalize to UTF-8, analyze byte distributions, and handle Windows
|