philiprehberger-encoding_kit 0.4.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 135376935f830bcfabb9c706853c5a45d14b270d47cab9d5e54f1ddcfae5f1d4
4
- data.tar.gz: 9b06d60d1fd8c3cea4c5197ecb0e26ababe21be31abd6ca92a029978039bc34a
3
+ metadata.gz: 47de974b9f87b11740744bced13177a6afe17de3ac74a589246b67604241085c
4
+ data.tar.gz: 3ee29c6b81b51166858bb9e051ae5652f2da84d242f80c8baa6ba45017ecbaa8
5
5
  SHA512:
6
- metadata.gz: 4e3f14286a3ee38a666a246bc513c972c7483b9b3f77f7ce9e2bfc7324d13eb5cd8a4dc1c3fd3b2d524219b5173679ccfcb5bc510f1f9cedf8eb1f865210b1aa
7
- data.tar.gz: a71393c68877452b9005b69415b321ddc2287c57d87de20ec55abb6f57f242dcf64bebe0412e62d9b984371a5682c797c0c58614bcc629ba485570399abf496b
6
+ metadata.gz: ba8d01eda47ece58ac39d5c494fa82a5a4fa761087cf7c97da6fda3188b1e060afe50dc90e1228a0dfd3a8ab21384255ba862d027fee98407d33ebfd5f9466ca
7
+ data.tar.gz: 2b5e70cb2767541713c59733e858a6b5f751bd838df739811308e7773a3383aea0955bcd24d161921fec1c1526b92e60b9e0734bafeaed3713f7f379ade00ece
data/CHANGELOG.md CHANGED
@@ -7,6 +7,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.6.0] - 2026-05-20
11
+
12
+ ### Added
13
+ - `strip_bom:` option on `EncodingKit.to_utf8` and `EncodingKit.read_as_utf8` to drop a leading UTF BOM after transcoding
14
+ - Card image reference in the README for registry-side rendering
15
+
16
+ ## [0.5.0] - 2026-04-30
17
+
18
+ ### Added
19
+ - `EncodingKit.scrub(string)` — strip invalid bytes from a string (vs. `normalize` which replaces with `�`)
20
+ - `EncodingKit.normalize_line_endings(string, to:)` — convert mixed CRLF/CR/LF to a single canonical form (`:lf`, `:crlf`, or `:cr`)
21
+ - `Converter.scrub` companion method on the `Converter` module
22
+
10
23
  ## [0.4.0] - 2026-04-20
11
24
 
12
25
  ### Added
data/README.md CHANGED
@@ -4,6 +4,8 @@
4
4
  [![Gem Version](https://badge.fury.io/rb/philiprehberger-encoding_kit.svg)](https://rubygems.org/gems/philiprehberger-encoding_kit)
5
5
  [![Last updated](https://img.shields.io/github/last-commit/philiprehberger/rb-encoding-kit)](https://github.com/philiprehberger/rb-encoding-kit/commits/main)
6
6
 
7
+ ![philiprehberger-encoding_kit](https://raw.githubusercontent.com/philiprehberger/rb-encoding-kit/main/package-card.webp)
8
+
7
9
  Character encoding detection, conversion, and normalization
8
10
 
9
11
  ## Requirements
@@ -103,6 +105,10 @@ utf8 = Philiprehberger::EncodingKit.to_utf8(raw_bytes)
103
105
 
104
106
  # Specify source encoding
105
107
  utf8 = Philiprehberger::EncodingKit.to_utf8(latin1_string, from: Encoding::ISO_8859_1)
108
+
109
+ # Strip a leading BOM after transcoding
110
+ clean = Philiprehberger::EncodingKit.to_utf8("\xEF\xBB\xBFhello".b, strip_bom: true)
111
+ # => "hello"
106
112
  ```
107
113
 
108
114
  ### Normalize
@@ -157,6 +163,26 @@ Philiprehberger::EncodingKit.guess_from_filename("legacy.latin1.txt") # => Encod
157
163
  Philiprehberger::EncodingKit.guess_from_filename("report.csv") # => nil
158
164
  ```
159
165
 
166
+ ### Stripping Invalid Bytes
167
+
168
+ ```ruby
169
+ # normalize replaces invalid bytes with U+FFFD ('�')
170
+ Philiprehberger::EncodingKit.normalize("foo\xFFbar") # => "foo�bar"
171
+
172
+ # scrub removes them entirely
173
+ Philiprehberger::EncodingKit.scrub("foo\xFFbar") # => "foobar"
174
+ ```
175
+
176
+ ### Normalizing Line Endings
177
+
178
+ ```ruby
179
+ mixed = "alpha\r\nbeta\rgamma\ndelta"
180
+
181
+ Philiprehberger::EncodingKit.normalize_line_endings(mixed) # => "alpha\nbeta\ngamma\ndelta"
182
+ Philiprehberger::EncodingKit.normalize_line_endings(mixed, to: :crlf) # => "alpha\r\nbeta\r\ngamma\r\ndelta"
183
+ Philiprehberger::EncodingKit.normalize_line_endings(mixed, to: :cr) # => "alpha\rbeta\rgamma\rdelta"
184
+ ```
185
+
160
186
  ### Validity Check
161
187
 
162
188
  ```ruby
@@ -175,14 +201,16 @@ Philiprehberger::EncodingKit.valid?("hello", encoding: Encoding::US_ASCII) # =>
175
201
  | `EncodingKit.detect_stream(io, sample_size: 4096)` | Detect encoding from an IO stream by sampling bytes |
176
202
  | `EncodingKit.analyze(string)` | Analyze byte distribution and return encoding candidates with stats |
177
203
  | `EncodingKit.transcode(string, to:, fallback:, replace:)` | Auto-detect source and convert to target encoding |
178
- | `EncodingKit.to_utf8(string, from: nil)` | Convert to UTF-8, auto-detect source if `from` is nil |
204
+ | `EncodingKit.to_utf8(string, from: nil, strip_bom: false)` | Convert to UTF-8, auto-detect source if `from` is nil; pass `strip_bom: true` to drop a leading UTF BOM |
179
205
  | `EncodingKit.normalize(string)` | Force to valid UTF-8, replacing bad bytes with U+FFFD |
206
+ | `EncodingKit.scrub(string)` | Force to valid UTF-8 by removing invalid bytes entirely |
207
+ | `EncodingKit.normalize_line_endings(string, to: :lf)` | Convert mixed CRLF/CR/LF to a single canonical form (`:lf`, `:crlf`, `:cr`) |
180
208
  | `EncodingKit.valid?(string, encoding: nil)` | Check if string is valid in given or current encoding |
181
209
  | `EncodingKit.convert(string, from:, to:)` | Convert between arbitrary encodings |
182
210
  | `EncodingKit.strip_bom(string)` | Remove byte order mark if present |
183
211
  | `EncodingKit.bom?(string)` | Check if string starts with a BOM |
184
212
  | `EncodingKit.detect_file(path, sample_size: 4096)` | Detect encoding of a file by reading a byte sample |
185
- | `EncodingKit.read_as_utf8(path, from: nil)` | Read a file and return its content as UTF-8 |
213
+ | `EncodingKit.read_as_utf8(path, from: nil, strip_bom: false)` | Read a file and return its content as UTF-8; pass `strip_bom: true` to drop a leading UTF BOM |
186
214
  | `EncodingKit.file_valid?(path, encoding: nil)` | Check if a file's content is valid in the given encoding |
187
215
  | `EncodingKit.guess_from_filename(path)` | Guess `Encoding` from filename suffixes (e.g. `.utf8`, `.latin1`), `nil` if unknown |
188
216
 
@@ -33,12 +33,14 @@ module Philiprehberger
33
33
  #
34
34
  # @param string [String] the input string
35
35
  # @param from [String, Encoding, nil] source encoding (auto-detect if nil)
36
+ # @param strip_bom [Boolean] remove any leading UTF BOM from the result (default: false)
36
37
  # @return [String] UTF-8 encoded string
37
- def to_utf8(string, from: nil)
38
+ def to_utf8(string, from: nil, strip_bom: false)
38
39
  detected = from ? Encoding.find(from.to_s) : Detector.call(string)
39
40
  source = detected.is_a?(DetectionResult) ? detected.encoding : detected
40
41
  str = string.dup.force_encoding(source)
41
- str.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: "\uFFFD")
42
+ encoded = str.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: "\uFFFD")
43
+ strip_bom ? encoded.delete_prefix("\uFEFF") : encoded
42
44
  end
43
45
 
44
46
  # Force a string to valid UTF-8 by replacing invalid and undefined bytes.
@@ -53,6 +55,20 @@ module Philiprehberger
53
55
 
54
56
  str.encode(Encoding::UTF_8, str.encoding, invalid: :replace, undef: :replace, replace: "\uFFFD")
55
57
  end
58
+
59
+ # Strip invalid bytes from a string, returning valid UTF-8 with bad bytes removed.
60
+ #
61
+ # Unlike {.normalize}, which replaces invalid bytes with `\uFFFD`, this method
62
+ # removes them entirely — useful when downstream consumers cannot tolerate
63
+ # any non-source content.
64
+ #
65
+ # @param string [String] the input string
66
+ # @return [String] valid UTF-8 string with invalid bytes removed
67
+ def scrub(string)
68
+ str = string.dup
69
+ str.force_encoding(Encoding::UTF_8) if [Encoding::BINARY, Encoding::ASCII_8BIT].include?(str.encoding)
70
+ str.scrub('')
71
+ end
56
72
  end
57
73
  end
58
74
  end
@@ -2,6 +2,6 @@
2
2
 
3
3
  module Philiprehberger
4
4
  module EncodingKit
5
- VERSION = '0.4.0'
5
+ VERSION = '0.6.0'
6
6
  end
7
7
  end
@@ -90,9 +90,10 @@ module Philiprehberger
90
90
  #
91
91
  # @param string [String] the input string
92
92
  # @param from [String, Encoding, nil] source encoding (auto-detect if nil)
93
+ # @param strip_bom [Boolean] remove any leading UTF BOM from the result (default: false)
93
94
  # @return [String] UTF-8 encoded string
94
- def self.to_utf8(string, from: nil)
95
- Converter.to_utf8(string, from: from)
95
+ def self.to_utf8(string, from: nil, strip_bom: false)
96
+ Converter.to_utf8(string, from: from, strip_bom: strip_bom)
96
97
  end
97
98
 
98
99
  # Normalize a string to valid UTF-8, replacing invalid/undefined bytes
@@ -104,6 +105,31 @@ module Philiprehberger
104
105
  Converter.normalize(string)
105
106
  end
106
107
 
108
+ # Strip invalid bytes from a string, returning valid UTF-8.
109
+ #
110
+ # Unlike {.normalize}, which replaces invalid bytes with `�`, this method
111
+ # removes them entirely.
112
+ #
113
+ # @param string [String] the input string
114
+ # @return [String] valid UTF-8 string with invalid bytes removed
115
+ def self.scrub(string)
116
+ Converter.scrub(string)
117
+ end
118
+
119
+ LINE_ENDINGS = { lf: "\n", crlf: "\r\n", cr: "\r" }.freeze
120
+
121
+ # Normalize line endings to a single canonical form.
122
+ #
123
+ # @param string [String] the input string
124
+ # @param to [Symbol] target line ending: `:lf`, `:crlf`, or `:cr`
125
+ # @return [String] string with normalized line endings
126
+ # @raise [Error] if `to:` is not one of `:lf`, `:crlf`, or `:cr`
127
+ def self.normalize_line_endings(string, to: :lf)
128
+ target = LINE_ENDINGS[to] or raise Error, "Unknown line ending: #{to.inspect} (expected :lf, :crlf, or :cr)"
129
+
130
+ string.gsub(/\r\n|\r|\n/, target)
131
+ end
132
+
107
133
  # Check if a string is valid in the given encoding (or its current encoding).
108
134
  #
109
135
  # @param string [String] the input string
@@ -185,10 +211,11 @@ module Philiprehberger
185
211
  #
186
212
  # @param path [String] path to the file
187
213
  # @param from [String, Encoding, nil] source encoding (auto-detect if nil)
214
+ # @param strip_bom [Boolean] remove any leading UTF BOM from the result (default: false)
188
215
  # @return [String] UTF-8 encoded file content
189
- def self.read_as_utf8(path, from: nil)
216
+ def self.read_as_utf8(path, from: nil, strip_bom: false)
190
217
  raw = File.binread(path)
191
- to_utf8(raw, from: from)
218
+ to_utf8(raw, from: from, strip_bom: strip_bom)
192
219
  end
193
220
 
194
221
  # Check if a file's content is valid in the detected or specified encoding.
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: philiprehberger-encoding_kit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Philip Rehberger
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2026-04-20 00:00:00.000000000 Z
11
+ date: 2026-05-20 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Detect encoding from BOM and heuristics with confidence scores, convert
14
14
  between encodings, normalize to UTF-8, analyze byte distributions, and handle Windows