philiprehberger-encoding_kit 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/README.md +24 -0
- data/lib/philiprehberger/encoding_kit/version.rb +1 -1
- data/lib/philiprehberger/encoding_kit.rb +32 -0
- metadata +4 -4
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: a328360d24956ba2fbcbc7888da4ddfb6dbe1bfac53aa65c6d7431ea6f0c1a16
|
|
4
|
+
data.tar.gz: f82f1a192f75e1ed0bf7ac386bb17cc4f259b4847af1a83cd5356ed63a2b675c
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 91b0e02d2d301db41bdc4eda1d3298db4fb517e0ec2bce4c4f79deae7787947db55eb0fdbe12557e9cf317b6b1d000c42ffb02b6901774afa531185b7ae46f1f
|
|
7
|
+
data.tar.gz: bbab15360e72374f5c9a540c3c8b0c54ae4b673f3bee24ff31ea47ebf2daa29c982564b69069a4f2ead487d5afbefdf606a04fc701c1d6bffd3c75038fd17026
|
data/CHANGELOG.md
CHANGED
|
@@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [0.3.0] - 2026-04-11
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
- `detect_file(path, sample_size:)` for file-based encoding detection
|
|
14
|
+
- `read_as_utf8(path, from:)` to read files directly as UTF-8
|
|
15
|
+
- `file_valid?(path, encoding:)` to check file encoding validity
|
|
16
|
+
|
|
10
17
|
## [0.2.1] - 2026-03-31
|
|
11
18
|
|
|
12
19
|
### Changed
|
data/README.md
CHANGED
|
@@ -131,6 +131,27 @@ Philiprehberger::EncodingKit.bom?("\xEF\xBB\xBFhello") # => true
|
|
|
131
131
|
Philiprehberger::EncodingKit.strip_bom("\xEF\xBB\xBFhello") # => "hello"
|
|
132
132
|
```
|
|
133
133
|
|
|
134
|
+
### File Operations
|
|
135
|
+
|
|
136
|
+
```ruby
|
|
137
|
+
require "philiprehberger/encoding_kit"
|
|
138
|
+
|
|
139
|
+
# Detect a file's encoding
|
|
140
|
+
result = Philiprehberger::EncodingKit.detect_file("data.csv")
|
|
141
|
+
result.encoding # => Encoding::UTF_8
|
|
142
|
+
result.confidence # => 0.9
|
|
143
|
+
|
|
144
|
+
# Read a file as UTF-8 (auto-detects source encoding)
|
|
145
|
+
content = Philiprehberger::EncodingKit.read_as_utf8("legacy.txt")
|
|
146
|
+
content.encoding # => Encoding::UTF_8
|
|
147
|
+
|
|
148
|
+
# Read with explicit source encoding
|
|
149
|
+
content = Philiprehberger::EncodingKit.read_as_utf8("latin1.txt", from: Encoding::ISO_8859_1)
|
|
150
|
+
|
|
151
|
+
# Check if a file's encoding is valid
|
|
152
|
+
Philiprehberger::EncodingKit.file_valid?("data.csv", encoding: Encoding::UTF_8) # => true
|
|
153
|
+
```
|
|
154
|
+
|
|
134
155
|
### Validity Check
|
|
135
156
|
|
|
136
157
|
```ruby
|
|
@@ -155,6 +176,9 @@ Philiprehberger::EncodingKit.valid?("hello", encoding: Encoding::US_ASCII) # =>
|
|
|
155
176
|
| `EncodingKit.convert(string, from:, to:)` | Convert between arbitrary encodings |
|
|
156
177
|
| `EncodingKit.strip_bom(string)` | Remove byte order mark if present |
|
|
157
178
|
| `EncodingKit.bom?(string)` | Check if string starts with a BOM |
|
|
179
|
+
| `EncodingKit.detect_file(path, sample_size: 4096)` | Detect encoding of a file by reading a byte sample |
|
|
180
|
+
| `EncodingKit.read_as_utf8(path, from: nil)` | Read a file and return its content as UTF-8 |
|
|
181
|
+
| `EncodingKit.file_valid?(path, encoding: nil)` | Check if a file's content is valid in the given encoding |
|
|
158
182
|
|
|
159
183
|
## Development
|
|
160
184
|
|
|
data/lib/philiprehberger/encoding_kit.rb
CHANGED
|
@@ -169,6 +169,38 @@ module Philiprehberger
|
|
|
169
169
|
BOMS.any? { |bom, _encoding| bytes.start_with?(bom) }
|
|
170
170
|
end
|
|
171
171
|
|
|
172
|
+
# Detect a file's encoding by reading a byte sample from it.
#
# The file is opened in binary mode and the open handle is passed to
# `detect_stream` together with the requested sample size. Using the block
# form of `File.open` closes the handle when the block exits.
#
# @param path [String] path to the file
# @param sample_size [Integer] number of bytes to sample (default: 4096)
# @return [DetectionResult] the detected encoding with confidence score
def self.detect_file(path, sample_size: 4096)
  File.open(path, 'rb') { |io| detect_stream(io, sample_size: sample_size) }
end
|
|
182
|
+
|
|
183
|
+
# Read an entire file and return its content as UTF-8.
#
# The file is read as raw bytes and passed to `to_utf8`. When `from:` is
# nil the source encoding is auto-detected; otherwise the given encoding is
# used as the source.
#
# @param path [String] path to the file
# @param from [String, Encoding, nil] source encoding (auto-detect if nil)
# @return [String] UTF-8 encoded file content
def self.read_as_utf8(path, from: nil)
  to_utf8(File.binread(path), from: from)
end
|
|
193
|
+
|
|
194
|
+
# Check whether a file's content is valid in the detected or given encoding.
#
# The whole file is read as raw bytes and the check itself is delegated
# to `valid?`.
#
# @param path [String] path to the file
# @param encoding [String, Encoding, nil] encoding to check against (auto-detect if nil)
# @return [Boolean]
def self.file_valid?(path, encoding: nil)
  valid?(File.binread(path), encoding: encoding)
end
|
|
203
|
+
|
|
172
204
|
# Build a list of encoding candidates with confidence scores.
|
|
173
205
|
#
|
|
174
206
|
# @param bytes [String] binary string
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: philiprehberger-encoding_kit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.2.1
|
|
4
|
+
version: 0.3.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Philip Rehberger
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-03-31 00:00:00.000000000 Z
|
|
11
|
+
date: 2026-04-11 00:00:00.000000000 Z
|
|
12
12
|
dependencies: []
|
|
13
13
|
description: Detect encoding from BOM and heuristics with confidence scores, convert
|
|
14
14
|
between encodings, normalize to UTF-8, analyze byte distributions, and handle Windows
|
|
@@ -27,11 +27,11 @@ files:
|
|
|
27
27
|
- lib/philiprehberger/encoding_kit/detection_result.rb
|
|
28
28
|
- lib/philiprehberger/encoding_kit/detector.rb
|
|
29
29
|
- lib/philiprehberger/encoding_kit/version.rb
|
|
30
|
-
homepage: https://
|
|
30
|
+
homepage: https://philiprehberger.com/open-source-packages/ruby/philiprehberger-encoding_kit
|
|
31
31
|
licenses:
|
|
32
32
|
- MIT
|
|
33
33
|
metadata:
|
|
34
|
-
homepage_uri: https://
|
|
34
|
+
homepage_uri: https://philiprehberger.com/open-source-packages/ruby/philiprehberger-encoding_kit
|
|
35
35
|
source_code_uri: https://github.com/philiprehberger/rb-encoding-kit
|
|
36
36
|
changelog_uri: https://github.com/philiprehberger/rb-encoding-kit/blob/main/CHANGELOG.md
|
|
37
37
|
bug_tracker_uri: https://github.com/philiprehberger/rb-encoding-kit/issues
|