philiprehberger-encoding_kit 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/README.md +38 -8
- data/lib/philiprehberger/encoding_kit/version.rb +1 -1
- data/lib/philiprehberger/encoding_kit.rb +32 -0
- metadata +4 -4
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: a328360d24956ba2fbcbc7888da4ddfb6dbe1bfac53aa65c6d7431ea6f0c1a16
|
|
4
|
+
data.tar.gz: f82f1a192f75e1ed0bf7ac386bb17cc4f259b4847af1a83cd5356ed63a2b675c
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 91b0e02d2d301db41bdc4eda1d3298db4fb517e0ec2bce4c4f79deae7787947db55eb0fdbe12557e9cf317b6b1d000c42ffb02b6901774afa531185b7ae46f1f
|
|
7
|
+
data.tar.gz: bbab15360e72374f5c9a540c3c8b0c54ae4b673f3bee24ff31ea47ebf2daa29c982564b69069a4f2ead487d5afbefdf606a04fc701c1d6bffd3c75038fd17026
|
data/CHANGELOG.md
CHANGED
|
@@ -7,6 +7,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [0.3.0] - 2026-04-11
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
- `detect_file(path, sample_size:)` for file-based encoding detection
|
|
14
|
+
- `read_as_utf8(path, from:)` to read files directly as UTF-8
|
|
15
|
+
- `file_valid?(path, encoding:)` to check file encoding validity
|
|
16
|
+
|
|
17
|
+
## [0.2.1] - 2026-03-31
|
|
18
|
+
|
|
19
|
+
### Changed
|
|
20
|
+
- Standardize README badges, support section, and license format
|
|
21
|
+
|
|
10
22
|
## [0.2.0] - 2026-03-28
|
|
11
23
|
|
|
12
24
|
### Added
|
data/README.md
CHANGED
|
@@ -2,12 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
[](https://github.com/philiprehberger/rb-encoding-kit/actions/workflows/ci.yml)
|
|
4
4
|
[](https://rubygems.org/gems/philiprehberger-encoding_kit)
|
|
5
|
-
[](https://github.com/philiprehberger/rb-encoding-kit/releases)
|
|
6
5
|
[](https://github.com/philiprehberger/rb-encoding-kit/commits/main)
|
|
7
|
-
[](LICENSE)
|
|
8
|
-
[](https://github.com/philiprehberger/rb-encoding-kit/issues?q=is%3Aissue+is%3Aopen+label%3Abug)
|
|
9
|
-
[](https://github.com/philiprehberger/rb-encoding-kit/issues?q=is%3Aissue+is%3Aopen+label%3Aenhancement)
|
|
10
|
-
[](https://github.com/sponsors/philiprehberger)
|
|
11
6
|
|
|
12
7
|
Character encoding detection, conversion, and normalization
|
|
13
8
|
|
|
@@ -136,6 +131,27 @@ Philiprehberger::EncodingKit.bom?("\xEF\xBB\xBFhello") # => true
|
|
|
136
131
|
Philiprehberger::EncodingKit.strip_bom("\xEF\xBB\xBFhello") # => "hello"
|
|
137
132
|
```
|
|
138
133
|
|
|
134
|
+
### File Operations
|
|
135
|
+
|
|
136
|
+
```ruby
|
|
137
|
+
require "philiprehberger/encoding_kit"
|
|
138
|
+
|
|
139
|
+
# Detect a file's encoding
|
|
140
|
+
result = Philiprehberger::EncodingKit.detect_file("data.csv")
|
|
141
|
+
result.encoding # => Encoding::UTF_8
|
|
142
|
+
result.confidence # => 0.9
|
|
143
|
+
|
|
144
|
+
# Read a file as UTF-8 (auto-detects source encoding)
|
|
145
|
+
content = Philiprehberger::EncodingKit.read_as_utf8("legacy.txt")
|
|
146
|
+
content.encoding # => Encoding::UTF_8
|
|
147
|
+
|
|
148
|
+
# Read with explicit source encoding
|
|
149
|
+
content = Philiprehberger::EncodingKit.read_as_utf8("latin1.txt", from: Encoding::ISO_8859_1)
|
|
150
|
+
|
|
151
|
+
# Check if a file's encoding is valid
|
|
152
|
+
Philiprehberger::EncodingKit.file_valid?("data.csv", encoding: Encoding::UTF_8) # => true
|
|
153
|
+
```
|
|
154
|
+
|
|
139
155
|
### Validity Check
|
|
140
156
|
|
|
141
157
|
```ruby
|
|
@@ -160,6 +176,9 @@ Philiprehberger::EncodingKit.valid?("hello", encoding: Encoding::US_ASCII) # =>
|
|
|
160
176
|
| `EncodingKit.convert(string, from:, to:)` | Convert between arbitrary encodings |
|
|
161
177
|
| `EncodingKit.strip_bom(string)` | Remove byte order mark if present |
|
|
162
178
|
| `EncodingKit.bom?(string)` | Check if string starts with a BOM |
|
|
179
|
+
| `EncodingKit.detect_file(path, sample_size: 4096)` | Detect encoding of a file by reading a byte sample |
|
|
180
|
+
| `EncodingKit.read_as_utf8(path, from: nil)` | Read a file and return its content as UTF-8 |
|
|
181
|
+
| `EncodingKit.file_valid?(path, encoding: nil)` | Check if a file's content is valid in the given encoding |
|
|
163
182
|
|
|
164
183
|
## Development
|
|
165
184
|
|
|
@@ -171,10 +190,21 @@ bundle exec rubocop
|
|
|
171
190
|
|
|
172
191
|
## Support
|
|
173
192
|
|
|
174
|
-
If you find this
|
|
193
|
+
If you find this project useful:
|
|
194
|
+
|
|
195
|
+
⭐ [Star the repo](https://github.com/philiprehberger/rb-encoding-kit)
|
|
196
|
+
|
|
197
|
+
🐛 [Report issues](https://github.com/philiprehberger/rb-encoding-kit/issues?q=is%3Aissue+is%3Aopen+label%3Abug)
|
|
198
|
+
|
|
199
|
+
💡 [Suggest features](https://github.com/philiprehberger/rb-encoding-kit/issues?q=is%3Aissue+is%3Aopen+label%3Aenhancement)
|
|
200
|
+
|
|
201
|
+
❤️ [Sponsor development](https://github.com/sponsors/philiprehberger)
|
|
202
|
+
|
|
203
|
+
🌐 [All Open Source Projects](https://philiprehberger.com/open-source-packages)
|
|
204
|
+
|
|
205
|
+
💻 [GitHub Profile](https://github.com/philiprehberger)
|
|
175
206
|
|
|
176
|
-
[
|
|
177
|
-
[](https://philiprehberger.com/open-source-packages)
|
|
207
|
+
🔗 [LinkedIn Profile](https://www.linkedin.com/in/philiprehberger)
|
|
178
208
|
|
|
179
209
|
## License
|
|
180
210
|
|
|
data/lib/philiprehberger/encoding_kit.rb
CHANGED
|
@@ -169,6 +169,38 @@ module Philiprehberger
|
|
|
169
169
|
BOMS.any? { |bom, _encoding| bytes.start_with?(bom) }
|
|
170
170
|
end
|
|
171
171
|
|
|
172
|
+
# Detect the encoding of a file by reading a byte sample.
|
|
173
|
+
#
|
|
174
|
+
# @param path [String] path to the file
|
|
175
|
+
# @param sample_size [Integer] number of bytes to sample (default: 4096)
|
|
176
|
+
# @return [DetectionResult] the detected encoding with confidence score
|
|
177
|
+
def self.detect_file(path, sample_size: 4096)
|
|
178
|
+
File.open(path, 'rb') do |file|
|
|
179
|
+
detect_stream(file, sample_size: sample_size)
|
|
180
|
+
end
|
|
181
|
+
end
|
|
182
|
+
|
|
183
|
+
# Read a file and return its content as UTF-8.
|
|
184
|
+
# Auto-detects the source encoding unless specified via `from:`.
|
|
185
|
+
#
|
|
186
|
+
# @param path [String] path to the file
|
|
187
|
+
# @param from [String, Encoding, nil] source encoding (auto-detect if nil)
|
|
188
|
+
# @return [String] UTF-8 encoded file content
|
|
189
|
+
def self.read_as_utf8(path, from: nil)
|
|
190
|
+
raw = File.binread(path)
|
|
191
|
+
to_utf8(raw, from: from)
|
|
192
|
+
end
|
|
193
|
+
|
|
194
|
+
# Check if a file's content is valid in the detected or specified encoding.
|
|
195
|
+
#
|
|
196
|
+
# @param path [String] path to the file
|
|
197
|
+
# @param encoding [String, Encoding, nil] encoding to check against (auto-detect if nil)
|
|
198
|
+
# @return [Boolean]
|
|
199
|
+
def self.file_valid?(path, encoding: nil)
|
|
200
|
+
raw = File.binread(path)
|
|
201
|
+
valid?(raw, encoding: encoding)
|
|
202
|
+
end
|
|
203
|
+
|
|
172
204
|
# Build a list of encoding candidates with confidence scores.
|
|
173
205
|
#
|
|
174
206
|
# @param bytes [String] binary string
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: philiprehberger-encoding_kit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.2.0
|
|
4
|
+
version: 0.3.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Philip Rehberger
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-03-28 00:00:00.000000000 Z
|
|
11
|
+
date: 2026-04-11 00:00:00.000000000 Z
|
|
12
12
|
dependencies: []
|
|
13
13
|
description: Detect encoding from BOM and heuristics with confidence scores, convert
|
|
14
14
|
between encodings, normalize to UTF-8, analyze byte distributions, and handle Windows
|
|
@@ -27,11 +27,11 @@ files:
|
|
|
27
27
|
- lib/philiprehberger/encoding_kit/detection_result.rb
|
|
28
28
|
- lib/philiprehberger/encoding_kit/detector.rb
|
|
29
29
|
- lib/philiprehberger/encoding_kit/version.rb
|
|
30
|
-
homepage: https://
|
|
30
|
+
homepage: https://philiprehberger.com/open-source-packages/ruby/philiprehberger-encoding_kit
|
|
31
31
|
licenses:
|
|
32
32
|
- MIT
|
|
33
33
|
metadata:
|
|
34
|
-
homepage_uri: https://
|
|
34
|
+
homepage_uri: https://philiprehberger.com/open-source-packages/ruby/philiprehberger-encoding_kit
|
|
35
35
|
source_code_uri: https://github.com/philiprehberger/rb-encoding-kit
|
|
36
36
|
changelog_uri: https://github.com/philiprehberger/rb-encoding-kit/blob/main/CHANGELOG.md
|
|
37
37
|
bug_tracker_uri: https://github.com/philiprehberger/rb-encoding-kit/issues
|