philiprehberger-encoding_kit 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 13e2132a0833297e1443ae3c9fc280b2c75ca43c9092e2b6d4d1431981ab90af
4
- data.tar.gz: 86091aa48e10920818add35a6e9c147095495096f284d8e00a93d6a3f0ab0596
3
+ metadata.gz: a328360d24956ba2fbcbc7888da4ddfb6dbe1bfac53aa65c6d7431ea6f0c1a16
4
+ data.tar.gz: f82f1a192f75e1ed0bf7ac386bb17cc4f259b4847af1a83cd5356ed63a2b675c
5
5
  SHA512:
6
- metadata.gz: 55423bdb247b979775199f4ff83043c5bfebff3963ae8f5c0256ca86482e2fd6ded64fe54aded1f06864c387e5a852129baa5a7e8ffb35f47e0dd756e5750b96
7
- data.tar.gz: eb8ca82fa4fae2670b5381dca826e95bc7dd281811c97d9488080d2a7666771cc927f16ee8715a05a07418d1b0f4da6096e47f3dcb09bb419e9816c8e284a807
6
+ metadata.gz: 91b0e02d2d301db41bdc4eda1d3298db4fb517e0ec2bce4c4f79deae7787947db55eb0fdbe12557e9cf317b6b1d000c42ffb02b6901774afa531185b7ae46f1f
7
+ data.tar.gz: bbab15360e72374f5c9a540c3c8b0c54ae4b673f3bee24ff31ea47ebf2daa29c982564b69069a4f2ead487d5afbefdf606a04fc701c1d6bffd3c75038fd17026
data/CHANGELOG.md CHANGED
@@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.3.0] - 2026-04-11
11
+
12
+ ### Added
13
+ - `detect_file(path, sample_size:)` for file-based encoding detection
14
+ - `read_as_utf8(path, from:)` to read files directly as UTF-8
15
+ - `file_valid?(path, encoding:)` to check file encoding validity
16
+
10
17
  ## [0.2.1] - 2026-03-31
11
18
 
12
19
  ### Changed
data/README.md CHANGED
@@ -131,6 +131,27 @@ Philiprehberger::EncodingKit.bom?("\xEF\xBB\xBFhello") # => true
131
131
  Philiprehberger::EncodingKit.strip_bom("\xEF\xBB\xBFhello") # => "hello"
132
132
  ```
133
133
 
134
+ ### File Operations
135
+
136
+ ```ruby
137
+ require "philiprehberger/encoding_kit"
138
+
139
+ # Detect a file's encoding
140
+ result = Philiprehberger::EncodingKit.detect_file("data.csv")
141
+ result.encoding # => Encoding::UTF_8
142
+ result.confidence # => 0.9
143
+
144
+ # Read a file as UTF-8 (auto-detects source encoding)
145
+ content = Philiprehberger::EncodingKit.read_as_utf8("legacy.txt")
146
+ content.encoding # => Encoding::UTF_8
147
+
148
+ # Read with explicit source encoding
149
+ content = Philiprehberger::EncodingKit.read_as_utf8("latin1.txt", from: Encoding::ISO_8859_1)
150
+
151
+ # Check if a file's encoding is valid
152
+ Philiprehberger::EncodingKit.file_valid?("data.csv", encoding: Encoding::UTF_8) # => true
153
+ ```
154
+
134
155
  ### Validity Check
135
156
 
136
157
  ```ruby
@@ -155,6 +176,9 @@ Philiprehberger::EncodingKit.valid?("hello", encoding: Encoding::US_ASCII) # =>
155
176
  | `EncodingKit.convert(string, from:, to:)` | Convert between arbitrary encodings |
156
177
  | `EncodingKit.strip_bom(string)` | Remove byte order mark if present |
157
178
  | `EncodingKit.bom?(string)` | Check if string starts with a BOM |
179
+ | `EncodingKit.detect_file(path, sample_size: 4096)` | Detect encoding of a file by reading a byte sample |
180
+ | `EncodingKit.read_as_utf8(path, from: nil)` | Read a file and return its content as UTF-8 (source encoding is auto-detected when `from` is `nil`) |
181
+ | `EncodingKit.file_valid?(path, encoding: nil)` | Check if a file's content is valid in the given encoding (auto-detected when `encoding` is `nil`) |
158
182
 
159
183
  ## Development
160
184
 
@@ -2,6 +2,6 @@
2
2
 
3
3
  module Philiprehberger
4
4
  module EncodingKit
5
- VERSION = '0.2.1'
5
+ VERSION = '0.3.0'
6
6
  end
7
7
  end
@@ -169,6 +169,38 @@ module Philiprehberger
169
169
  BOMS.any? { |bom, _encoding| bytes.start_with?(bom) }
170
170
  end
171
171
 
172
+ # Detect the encoding of a file by reading a byte sample.
173
+ #
174
+ # @param path [String] path to the file
175
+ # @param sample_size [Integer] number of bytes to sample (default: 4096)
176
+ # @return [DetectionResult] the detected encoding with confidence score
177
+ def self.detect_file(path, sample_size: 4096)
178
+ File.open(path, 'rb') do |file|
179
+ detect_stream(file, sample_size: sample_size)
180
+ end
181
+ end
182
+
183
+ # Read a file and return its content as UTF-8.
184
+ # Auto-detects the source encoding unless specified via `from:`.
185
+ #
186
+ # @param path [String] path to the file
187
+ # @param from [String, Encoding, nil] source encoding (auto-detect if nil)
188
+ # @return [String] UTF-8 encoded file content
189
+ def self.read_as_utf8(path, from: nil)
190
+ raw = File.binread(path)
191
+ to_utf8(raw, from: from)
192
+ end
193
+
194
+ # Check if a file's content is valid in the detected or specified encoding.
195
+ #
196
+ # @param path [String] path to the file
197
+ # @param encoding [String, Encoding, nil] encoding to check against (auto-detect if nil)
198
+ # @return [Boolean]
199
+ def self.file_valid?(path, encoding: nil)
200
+ raw = File.binread(path)
201
+ valid?(raw, encoding: encoding)
202
+ end
203
+
172
204
  # Build a list of encoding candidates with confidence scores.
173
205
  #
174
206
  # @param bytes [String] binary string
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: philiprehberger-encoding_kit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Philip Rehberger
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2026-03-31 00:00:00.000000000 Z
11
+ date: 2026-04-11 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Detect encoding from BOM and heuristics with confidence scores, convert
14
14
  between encodings, normalize to UTF-8, analyze byte distributions, and handle Windows
@@ -27,11 +27,11 @@ files:
27
27
  - lib/philiprehberger/encoding_kit/detection_result.rb
28
28
  - lib/philiprehberger/encoding_kit/detector.rb
29
29
  - lib/philiprehberger/encoding_kit/version.rb
30
- homepage: https://github.com/philiprehberger/rb-encoding-kit
30
+ homepage: https://philiprehberger.com/open-source-packages/ruby/philiprehberger-encoding_kit
31
31
  licenses:
32
32
  - MIT
33
33
  metadata:
34
- homepage_uri: https://github.com/philiprehberger/rb-encoding-kit
34
+ homepage_uri: https://philiprehberger.com/open-source-packages/ruby/philiprehberger-encoding_kit
35
35
  source_code_uri: https://github.com/philiprehberger/rb-encoding-kit
36
36
  changelog_uri: https://github.com/philiprehberger/rb-encoding-kit/blob/main/CHANGELOG.md
37
37
  bug_tracker_uri: https://github.com/philiprehberger/rb-encoding-kit/issues