philiprehberger-encoding_kit 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3e69668037179f14b92560b58c816a29a14134560bd32568775fd321325b4644
4
- data.tar.gz: 3ad9997b697ca6cecca1d8e63955512eb860a86c48007a5138777167da9c4d13
3
+ metadata.gz: 47de974b9f87b11740744bced13177a6afe17de3ac74a589246b67604241085c
4
+ data.tar.gz: 3ee29c6b81b51166858bb9e051ae5652f2da84d242f80c8baa6ba45017ecbaa8
5
5
  SHA512:
6
- metadata.gz: af626ca49ad283a08574162ed45b81fabdccf7aca6d978d78d9367df150234e28d7cd563a1c212145c3a399a7b431b65cd1508fbba67a784517344b124a85a38
7
- data.tar.gz: ffd8620298177f0a689411d49bdadbcc61c0a93c0fb0c5afc6cd9e5f72ef77f627ec14aea5819ca898f4aeb774cbdef02957c1777d640712cab360c7eb6622e6
6
+ metadata.gz: ba8d01eda47ece58ac39d5c494fa82a5a4fa761087cf7c97da6fda3188b1e060afe50dc90e1228a0dfd3a8ab21384255ba862d027fee98407d33ebfd5f9466ca
7
+ data.tar.gz: 2b5e70cb2767541713c59733e858a6b5f751bd838df739811308e7773a3383aea0955bcd24d161921fec1c1526b92e60b9e0734bafeaed3713f7f379ade00ece
data/CHANGELOG.md CHANGED
@@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.6.0] - 2026-05-20
11
+
12
+ ### Added
13
+ - `strip_bom:` option on `EncodingKit.to_utf8` and `EncodingKit.read_as_utf8` to drop a leading UTF BOM after transcoding
14
+ - Card image reference in the README for registry-side rendering
15
+
10
16
  ## [0.5.0] - 2026-04-30
11
17
 
12
18
  ### Added
data/README.md CHANGED
@@ -4,6 +4,8 @@
4
4
  [![Gem Version](https://badge.fury.io/rb/philiprehberger-encoding_kit.svg)](https://rubygems.org/gems/philiprehberger-encoding_kit)
5
5
  [![Last updated](https://img.shields.io/github/last-commit/philiprehberger/rb-encoding-kit)](https://github.com/philiprehberger/rb-encoding-kit/commits/main)
6
6
 
7
+ ![philiprehberger-encoding_kit](https://raw.githubusercontent.com/philiprehberger/rb-encoding-kit/main/package-card.webp)
8
+
7
9
  Character encoding detection, conversion, and normalization
8
10
 
9
11
  ## Requirements
@@ -103,6 +105,10 @@ utf8 = Philiprehberger::EncodingKit.to_utf8(raw_bytes)
103
105
 
104
106
  # Specify source encoding
105
107
  utf8 = Philiprehberger::EncodingKit.to_utf8(latin1_string, from: Encoding::ISO_8859_1)
108
+
109
+ # Strip a leading BOM after transcoding
110
+ clean = Philiprehberger::EncodingKit.to_utf8("\xEF\xBB\xBFhello".b, strip_bom: true)
111
+ # => "hello"
106
112
  ```
107
113
 
108
114
  ### Normalize
@@ -195,7 +201,7 @@ Philiprehberger::EncodingKit.valid?("hello", encoding: Encoding::US_ASCII) # =>
195
201
  | `EncodingKit.detect_stream(io, sample_size: 4096)` | Detect encoding from an IO stream by sampling bytes |
196
202
  | `EncodingKit.analyze(string)` | Analyze byte distribution and return encoding candidates with stats |
197
203
  | `EncodingKit.transcode(string, to:, fallback:, replace:)` | Auto-detect source and convert to target encoding |
198
- | `EncodingKit.to_utf8(string, from: nil)` | Convert to UTF-8, auto-detect source if `from` is nil |
204
+ | `EncodingKit.to_utf8(string, from: nil, strip_bom: false)` | Convert to UTF-8, auto-detect source if `from` is nil; pass `strip_bom: true` to drop a leading UTF BOM |
199
205
  | `EncodingKit.normalize(string)` | Force to valid UTF-8, replacing bad bytes with U+FFFD |
200
206
  | `EncodingKit.scrub(string)` | Force to valid UTF-8 by removing invalid bytes entirely |
201
207
  | `EncodingKit.normalize_line_endings(string, to: :lf)` | Convert mixed CRLF/CR/LF to a single canonical form (`:lf`, `:crlf`, `:cr`) |
@@ -204,7 +210,7 @@ Philiprehberger::EncodingKit.valid?("hello", encoding: Encoding::US_ASCII) # =>
204
210
  | `EncodingKit.strip_bom(string)` | Remove byte order mark if present |
205
211
  | `EncodingKit.bom?(string)` | Check if string starts with a BOM |
206
212
  | `EncodingKit.detect_file(path, sample_size: 4096)` | Detect encoding of a file by reading a byte sample |
207
- | `EncodingKit.read_as_utf8(path, from: nil)` | Read a file and return its content as UTF-8 |
213
+ | `EncodingKit.read_as_utf8(path, from: nil, strip_bom: false)` | Read a file and return its content as UTF-8; pass `strip_bom: true` to drop a leading UTF BOM |
208
214
  | `EncodingKit.file_valid?(path, encoding: nil)` | Check if a file's content is valid in the given encoding |
209
215
  | `EncodingKit.guess_from_filename(path)` | Guess `Encoding` from filename suffixes (e.g. `.utf8`, `.latin1`), `nil` if unknown |
210
216
 
@@ -33,12 +33,14 @@ module Philiprehberger
33
33
  #
34
34
  # @param string [String] the input string
35
35
  # @param from [String, Encoding, nil] source encoding (auto-detect if nil)
36
+ # @param strip_bom [Boolean] remove any leading UTF BOM from the result (default: false)
36
37
  # @return [String] UTF-8 encoded string
37
- def to_utf8(string, from: nil)
38
+ def to_utf8(string, from: nil, strip_bom: false)
38
39
  detected = from ? Encoding.find(from.to_s) : Detector.call(string)
39
40
  source = detected.is_a?(DetectionResult) ? detected.encoding : detected
40
41
  str = string.dup.force_encoding(source)
41
- str.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: "\uFFFD")
42
+ encoded = str.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: "\uFFFD")
43
+ strip_bom ? encoded.delete_prefix("\uFEFF") : encoded
42
44
  end
43
45
 
44
46
  # Force a string to valid UTF-8 by replacing invalid and undefined bytes.
@@ -2,6 +2,6 @@
2
2
 
3
3
  module Philiprehberger
4
4
  module EncodingKit
5
- VERSION = '0.5.0'
5
+ VERSION = '0.6.0'
6
6
  end
7
7
  end
@@ -90,9 +90,10 @@ module Philiprehberger
90
90
  #
91
91
  # @param string [String] the input string
92
92
  # @param from [String, Encoding, nil] source encoding (auto-detect if nil)
93
+ # @param strip_bom [Boolean] remove any leading UTF BOM from the result (default: false)
93
94
  # @return [String] UTF-8 encoded string
94
- def self.to_utf8(string, from: nil)
95
- Converter.to_utf8(string, from: from)
95
+ def self.to_utf8(string, from: nil, strip_bom: false)
96
+ Converter.to_utf8(string, from: from, strip_bom: strip_bom)
96
97
  end
97
98
 
98
99
  # Normalize a string to valid UTF-8, replacing invalid/undefined bytes
@@ -210,10 +211,11 @@ module Philiprehberger
210
211
  #
211
212
  # @param path [String] path to the file
212
213
  # @param from [String, Encoding, nil] source encoding (auto-detect if nil)
214
+ # @param strip_bom [Boolean] remove any leading UTF BOM from the result (default: false)
213
215
  # @return [String] UTF-8 encoded file content
214
- def self.read_as_utf8(path, from: nil)
216
+ def self.read_as_utf8(path, from: nil, strip_bom: false)
215
217
  raw = File.binread(path)
216
- to_utf8(raw, from: from)
218
+ to_utf8(raw, from: from, strip_bom: strip_bom)
217
219
  end
218
220
 
219
221
  # Check if a file's content is valid in the detected or specified encoding.
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: philiprehberger-encoding_kit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Philip Rehberger
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2026-05-01 00:00:00.000000000 Z
11
+ date: 2026-05-20 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Detect encoding from BOM and heuristics with confidence scores, convert
14
14
  between encodings, normalize to UTF-8, analyze byte distributions, and handle Windows