philiprehberger-encoding_kit 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/README.md +8 -2
- data/lib/philiprehberger/encoding_kit/converter.rb +4 -2
- data/lib/philiprehberger/encoding_kit/version.rb +1 -1
- data/lib/philiprehberger/encoding_kit.rb +6 -4
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 47de974b9f87b11740744bced13177a6afe17de3ac74a589246b67604241085c
|
|
4
|
+
data.tar.gz: 3ee29c6b81b51166858bb9e051ae5652f2da84d242f80c8baa6ba45017ecbaa8
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: ba8d01eda47ece58ac39d5c494fa82a5a4fa761087cf7c97da6fda3188b1e060afe50dc90e1228a0dfd3a8ab21384255ba862d027fee98407d33ebfd5f9466ca
|
|
7
|
+
data.tar.gz: 2b5e70cb2767541713c59733e858a6b5f751bd838df739811308e7773a3383aea0955bcd24d161921fec1c1526b92e60b9e0734bafeaed3713f7f379ade00ece
|
data/CHANGELOG.md
CHANGED
|
@@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [0.6.0] - 2026-05-20
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
- `strip_bom:` option on `EncodingKit.to_utf8` and `EncodingKit.read_as_utf8` to drop a leading UTF BOM after transcoding
|
|
14
|
+
- Card image reference in the README for registry-side rendering
|
|
15
|
+
|
|
10
16
|
## [0.5.0] - 2026-04-30
|
|
11
17
|
|
|
12
18
|
### Added
|
data/README.md
CHANGED
|
@@ -4,6 +4,8 @@
|
|
|
4
4
|
[](https://rubygems.org/gems/philiprehberger-encoding_kit)
|
|
5
5
|
[](https://github.com/philiprehberger/rb-encoding-kit/commits/main)
|
|
6
6
|
|
|
7
|
+

|
|
8
|
+
|
|
7
9
|
Character encoding detection, conversion, and normalization
|
|
8
10
|
|
|
9
11
|
## Requirements
|
|
@@ -103,6 +105,10 @@ utf8 = Philiprehberger::EncodingKit.to_utf8(raw_bytes)
|
|
|
103
105
|
|
|
104
106
|
# Specify source encoding
|
|
105
107
|
utf8 = Philiprehberger::EncodingKit.to_utf8(latin1_string, from: Encoding::ISO_8859_1)
|
|
108
|
+
|
|
109
|
+
# Strip a leading BOM after transcoding
|
|
110
|
+
clean = Philiprehberger::EncodingKit.to_utf8("\xEF\xBB\xBFhello".b, strip_bom: true)
|
|
111
|
+
# => "hello"
|
|
106
112
|
```
|
|
107
113
|
|
|
108
114
|
### Normalize
|
|
@@ -195,7 +201,7 @@ Philiprehberger::EncodingKit.valid?("hello", encoding: Encoding::US_ASCII) # =>
|
|
|
195
201
|
| `EncodingKit.detect_stream(io, sample_size: 4096)` | Detect encoding from an IO stream by sampling bytes |
|
|
196
202
|
| `EncodingKit.analyze(string)` | Analyze byte distribution and return encoding candidates with stats |
|
|
197
203
|
| `EncodingKit.transcode(string, to:, fallback:, replace:)` | Auto-detect source and convert to target encoding |
|
|
198
|
-
| `EncodingKit.to_utf8(string, from: nil)` | Convert to UTF-8, auto-detect source if `from` is nil |
|
|
204
|
+
| `EncodingKit.to_utf8(string, from: nil, strip_bom: false)` | Convert to UTF-8, auto-detect source if `from` is nil; pass `strip_bom: true` to drop a leading UTF BOM |
|
|
199
205
|
| `EncodingKit.normalize(string)` | Force to valid UTF-8, replacing bad bytes with U+FFFD |
|
|
200
206
|
| `EncodingKit.scrub(string)` | Force to valid UTF-8 by removing invalid bytes entirely |
|
|
201
207
|
| `EncodingKit.normalize_line_endings(string, to: :lf)` | Convert mixed CRLF/CR/LF to a single canonical form (`:lf`, `:crlf`, `:cr`) |
|
|
@@ -204,7 +210,7 @@ Philiprehberger::EncodingKit.valid?("hello", encoding: Encoding::US_ASCII) # =>
|
|
|
204
210
|
| `EncodingKit.strip_bom(string)` | Remove byte order mark if present |
|
|
205
211
|
| `EncodingKit.bom?(string)` | Check if string starts with a BOM |
|
|
206
212
|
| `EncodingKit.detect_file(path, sample_size: 4096)` | Detect encoding of a file by reading a byte sample |
|
|
207
|
-
| `EncodingKit.read_as_utf8(path, from: nil)` | Read a file and return its content as UTF-8 |
|
|
213
|
+
| `EncodingKit.read_as_utf8(path, from: nil, strip_bom: false)` | Read a file and return its content as UTF-8; pass `strip_bom: true` to drop a leading UTF BOM |
|
|
208
214
|
| `EncodingKit.file_valid?(path, encoding: nil)` | Check if a file's content is valid in the given encoding |
|
|
209
215
|
| `EncodingKit.guess_from_filename(path)` | Guess `Encoding` from filename suffixes (e.g. `.utf8`, `.latin1`), `nil` if unknown |
|
|
210
216
|
|
data/lib/philiprehberger/encoding_kit/converter.rb
CHANGED
|
@@ -33,12 +33,14 @@ module Philiprehberger
|
|
|
33
33
|
#
|
|
34
34
|
# @param string [String] the input string
|
|
35
35
|
# @param from [String, Encoding, nil] source encoding (auto-detect if nil)
|
|
36
|
+
# @param strip_bom [Boolean] remove any leading UTF BOM from the result (default: false)
|
|
36
37
|
# @return [String] UTF-8 encoded string
|
|
37
|
-
def to_utf8(string, from: nil)
|
|
38
|
+
def to_utf8(string, from: nil, strip_bom: false)
|
|
38
39
|
detected = from ? Encoding.find(from.to_s) : Detector.call(string)
|
|
39
40
|
source = detected.is_a?(DetectionResult) ? detected.encoding : detected
|
|
40
41
|
str = string.dup.force_encoding(source)
|
|
41
|
-
str.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: "\uFFFD")
|
|
42
|
+
encoded = str.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: "\uFFFD")
|
|
43
|
+
strip_bom ? encoded.delete_prefix("\uFEFF") : encoded
|
|
42
44
|
end
|
|
43
45
|
|
|
44
46
|
# Force a string to valid UTF-8 by replacing invalid and undefined bytes.
|
data/lib/philiprehberger/encoding_kit.rb
CHANGED
|
@@ -90,9 +90,10 @@ module Philiprehberger
|
|
|
90
90
|
#
|
|
91
91
|
# @param string [String] the input string
|
|
92
92
|
# @param from [String, Encoding, nil] source encoding (auto-detect if nil)
|
|
93
|
+
# @param strip_bom [Boolean] remove any leading UTF BOM from the result (default: false)
|
|
93
94
|
# @return [String] UTF-8 encoded string
|
|
94
|
-
def self.to_utf8(string, from: nil)
|
|
95
|
-
Converter.to_utf8(string, from: from)
|
|
95
|
+
def self.to_utf8(string, from: nil, strip_bom: false)
|
|
96
|
+
Converter.to_utf8(string, from: from, strip_bom: strip_bom)
|
|
96
97
|
end
|
|
97
98
|
|
|
98
99
|
# Normalize a string to valid UTF-8, replacing invalid/undefined bytes
|
|
@@ -210,10 +211,11 @@ module Philiprehberger
|
|
|
210
211
|
#
|
|
211
212
|
# @param path [String] path to the file
|
|
212
213
|
# @param from [String, Encoding, nil] source encoding (auto-detect if nil)
|
|
214
|
+
# @param strip_bom [Boolean] remove any leading UTF BOM from the result (default: false)
|
|
213
215
|
# @return [String] UTF-8 encoded file content
|
|
214
|
-
def self.read_as_utf8(path, from: nil)
|
|
216
|
+
def self.read_as_utf8(path, from: nil, strip_bom: false)
|
|
215
217
|
raw = File.binread(path)
|
|
216
|
-
to_utf8(raw, from: from)
|
|
218
|
+
to_utf8(raw, from: from, strip_bom: strip_bom)
|
|
217
219
|
end
|
|
218
220
|
|
|
219
221
|
# Check if a file's content is valid in the detected or specified encoding.
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: philiprehberger-encoding_kit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.5.0
|
|
4
|
+
version: 0.6.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Philip Rehberger
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-05-
|
|
11
|
+
date: 2026-05-20 00:00:00.000000000 Z
|
|
12
12
|
dependencies: []
|
|
13
13
|
description: Detect encoding from BOM and heuristics with confidence scores, convert
|
|
14
14
|
between encodings, normalize to UTF-8, analyze byte distributions, and handle Windows
|