philiprehberger-encoding_kit 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/README.md +22 -0
- data/lib/philiprehberger/encoding_kit/converter.rb +14 -0
- data/lib/philiprehberger/encoding_kit/version.rb +1 -1
- data/lib/philiprehberger/encoding_kit.rb +25 -0
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 3e69668037179f14b92560b58c816a29a14134560bd32568775fd321325b4644
|
|
4
|
+
data.tar.gz: 3ad9997b697ca6cecca1d8e63955512eb860a86c48007a5138777167da9c4d13
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: af626ca49ad283a08574162ed45b81fabdccf7aca6d978d78d9367df150234e28d7cd563a1c212145c3a399a7b431b65cd1508fbba67a784517344b124a85a38
|
|
7
|
+
data.tar.gz: ffd8620298177f0a689411d49bdadbcc61c0a93c0fb0c5afc6cd9e5f72ef77f627ec14aea5819ca898f4aeb774cbdef02957c1777d640712cab360c7eb6622e6
|
data/CHANGELOG.md
CHANGED
|
@@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [0.5.0] - 2026-04-30
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
- `EncodingKit.scrub(string)` — strip invalid bytes from a string (vs. `normalize` which replaces with `�`)
|
|
14
|
+
- `EncodingKit.normalize_line_endings(string, to:)` — convert mixed CRLF/CR/LF to a single canonical form (`:lf`, `:crlf`, or `:cr`)
|
|
15
|
+
- `Converter.scrub` companion method on the `Converter` module
|
|
16
|
+
|
|
10
17
|
## [0.4.0] - 2026-04-20
|
|
11
18
|
|
|
12
19
|
### Added
|
data/README.md
CHANGED
|
@@ -157,6 +157,26 @@ Philiprehberger::EncodingKit.guess_from_filename("legacy.latin1.txt") # => Encod
|
|
|
157
157
|
Philiprehberger::EncodingKit.guess_from_filename("report.csv") # => nil
|
|
158
158
|
```
|
|
159
159
|
|
|
160
|
+
### Stripping Invalid Bytes
|
|
161
|
+
|
|
162
|
+
```ruby
|
|
163
|
+
# normalize replaces invalid bytes with U+FFFD ('�')
|
|
164
|
+
Philiprehberger::EncodingKit.normalize("foo\xFFbar") # => "foo�bar"
|
|
165
|
+
|
|
166
|
+
# scrub removes them entirely
|
|
167
|
+
Philiprehberger::EncodingKit.scrub("foo\xFFbar") # => "foobar"
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
### Normalizing Line Endings
|
|
171
|
+
|
|
172
|
+
```ruby
|
|
173
|
+
mixed = "alpha\r\nbeta\rgamma\ndelta"
|
|
174
|
+
|
|
175
|
+
Philiprehberger::EncodingKit.normalize_line_endings(mixed) # => "alpha\nbeta\ngamma\ndelta"
|
|
176
|
+
Philiprehberger::EncodingKit.normalize_line_endings(mixed, to: :crlf) # => "alpha\r\nbeta\r\ngamma\r\ndelta"
|
|
177
|
+
Philiprehberger::EncodingKit.normalize_line_endings(mixed, to: :cr) # => "alpha\rbeta\rgamma\rdelta"
|
|
178
|
+
```
|
|
179
|
+
|
|
160
180
|
### Validity Check
|
|
161
181
|
|
|
162
182
|
```ruby
|
|
@@ -177,6 +197,8 @@ Philiprehberger::EncodingKit.valid?("hello", encoding: Encoding::US_ASCII) # =>
|
|
|
177
197
|
| `EncodingKit.transcode(string, to:, fallback:, replace:)` | Auto-detect source and convert to target encoding |
|
|
178
198
|
| `EncodingKit.to_utf8(string, from: nil)` | Convert to UTF-8, auto-detect source if `from` is nil |
|
|
179
199
|
| `EncodingKit.normalize(string)` | Force to valid UTF-8, replacing bad bytes with U+FFFD |
|
|
200
|
+
| `EncodingKit.scrub(string)` | Force to valid UTF-8 by removing invalid bytes entirely |
|
|
201
|
+
| `EncodingKit.normalize_line_endings(string, to: :lf)` | Convert mixed CRLF/CR/LF to a single canonical form (`:lf`, `:crlf`, `:cr`) |
|
|
180
202
|
| `EncodingKit.valid?(string, encoding: nil)` | Check if string is valid in given or current encoding |
|
|
181
203
|
| `EncodingKit.convert(string, from:, to:)` | Convert between arbitrary encodings |
|
|
182
204
|
| `EncodingKit.strip_bom(string)` | Remove byte order mark if present |
|
|
data/lib/philiprehberger/encoding_kit/converter.rb
CHANGED
|
@@ -53,6 +53,20 @@ module Philiprehberger
|
|
|
53
53
|
|
|
54
54
|
str.encode(Encoding::UTF_8, str.encoding, invalid: :replace, undef: :replace, replace: "\uFFFD")
|
|
55
55
|
end
|
|
56
|
+
|
|
57
|
+
# Strip invalid bytes from a string, returning valid UTF-8 with bad bytes removed.
|
|
58
|
+
#
|
|
59
|
+
# Unlike {.normalize}, which replaces invalid bytes with `\uFFFD`, this method
|
|
60
|
+
# removes them entirely — useful when downstream consumers cannot tolerate
|
|
61
|
+
# any non-source content.
|
|
62
|
+
#
|
|
63
|
+
# @param string [String] the input string
|
|
64
|
+
# @return [String] valid UTF-8 string with invalid bytes removed
|
|
65
|
+
def scrub(string)
|
|
66
|
+
str = string.dup
|
|
67
|
+
str.force_encoding(Encoding::UTF_8) if [Encoding::BINARY, Encoding::ASCII_8BIT].include?(str.encoding)
|
|
68
|
+
str.scrub('')
|
|
69
|
+
end
|
|
56
70
|
end
|
|
57
71
|
end
|
|
58
72
|
end
|
|
data/lib/philiprehberger/encoding_kit.rb
CHANGED
|
@@ -104,6 +104,31 @@ module Philiprehberger
|
|
|
104
104
|
Converter.normalize(string)
|
|
105
105
|
end
|
|
106
106
|
|
|
107
|
+
# Strip invalid bytes from a string, returning valid UTF-8.
|
|
108
|
+
#
|
|
109
|
+
# Unlike {.normalize}, which replaces invalid bytes with `�`, this method
|
|
110
|
+
# removes them entirely.
|
|
111
|
+
#
|
|
112
|
+
# @param string [String] the input string
|
|
113
|
+
# @return [String] valid UTF-8 string with invalid bytes removed
|
|
114
|
+
def self.scrub(string)
|
|
115
|
+
Converter.scrub(string)
|
|
116
|
+
end
|
|
117
|
+
|
|
118
|
+
LINE_ENDINGS = { lf: "\n", crlf: "\r\n", cr: "\r" }.freeze
|
|
119
|
+
|
|
120
|
+
# Normalize line endings to a single canonical form.
|
|
121
|
+
#
|
|
122
|
+
# @param string [String] the input string
|
|
123
|
+
# @param to [Symbol] target line ending: `:lf`, `:crlf`, or `:cr`
|
|
124
|
+
# @return [String] string with normalized line endings
|
|
125
|
+
# @raise [Error] if `to:` is not one of `:lf`, `:crlf`, or `:cr`
|
|
126
|
+
def self.normalize_line_endings(string, to: :lf)
|
|
127
|
+
target = LINE_ENDINGS[to] or raise Error, "Unknown line ending: #{to.inspect} (expected :lf, :crlf, or :cr)"
|
|
128
|
+
|
|
129
|
+
string.gsub(/\r\n|\r|\n/, target)
|
|
130
|
+
end
|
|
131
|
+
|
|
107
132
|
# Check if a string is valid in the given encoding (or its current encoding).
|
|
108
133
|
#
|
|
109
134
|
# @param string [String] the input string
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: philiprehberger-encoding_kit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.4.0
|
|
4
|
+
version: 0.5.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Philip Rehberger
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-
|
|
11
|
+
date: 2026-05-01 00:00:00.000000000 Z
|
|
12
12
|
dependencies: []
|
|
13
13
|
description: Detect encoding from BOM and heuristics with confidence scores, convert
|
|
14
14
|
between encodings, normalize to UTF-8, analyze byte distributions, and handle Windows
|