tre_regex 0.2.1-x86_64-linux-musl → 0.2.2-x86_64-linux-musl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a97a9dfcb628db7da557e73385b3fac07455238f1cbc01b9bad82fbade3377aa
4
- data.tar.gz: af28d4d591b9d3b2d5660794031785bb968d25173ed2dae53636138f91fa7e2d
3
+ metadata.gz: 36bd0de0f0a67c19b0869ad5f5d17e41147122cc6d0451fb2090fb3841385008
4
+ data.tar.gz: 15715b7d00b3dce17df3ac94d5765006b3be551ed729355f3861c7ded7268715
5
5
  SHA512:
6
- metadata.gz: 07ab5cf743d11d07ec8253d1bcd246a9d78602b61b564ae0c83e3ef6479fccabe87a1a85cfe44f31f9a933b6931a98948cddf10560c8502b9af69f10299c337d
7
- data.tar.gz: e96fbd3886331f6da5aa3ea2bed0939d45855209421215d94d1c58b16ddf0b4019fbab0386c3056df562089e49f81716808bb959d394b78548a89612fe538361
6
+ metadata.gz: 310f3a3c62bf398f44d1838989bcb2c2ae61d955381b340bed093e8b8592d91914e7c0e8d2f512c9b72a70038cb4a1de9645ba3a4098cdd8eeeef59749ebcdd3
7
+ data.tar.gz: 759b303f5d2a2da88197dfccc5335ec660424729a28a1501e560e725e1b101945a04846287483aabc010bdc7ad1dae85765d72d5a775d6db9e4b4b67134a0cad
data/README.md CHANGED
@@ -87,10 +87,10 @@ regex = TreRegex::Regex.new('cat')
87
87
  # Returns an array of match hashes
88
88
  regex.match_all('cat, cot, cut', max_errors: 1).to_a
89
89
  # => [
90
- # {match: "cat", submatches: [], index: 0, end_index: 3, cost: 0, errors: {insertions: 0, deletions: 0, substitutions: 0}},
90
+ # {match: "cat", submatches: [], index: 0, end_index: 3, cost: 0, errors: {insertions: 0, deletions: 0, substitutions: 0}},
91
91
  # {match: "cot", submatches: [], index: 5, end_index: 8, cost: 1, errors: {insertions: 0, deletions: 0, substitutions: 1}},
92
92
  # {match: "cut", submatches: [], index: 10, end_index: 13, cost: 1, errors: {insertions: 0, deletions: 0, substitutions: 1}}
93
- # ]
93
+ # ]
94
94
  ```
95
95
 
96
96
  ### Capture Groups (Submatches)
@@ -240,10 +240,10 @@ regex = TreRegex::Regex.new('cat')
240
240
  # but it also matches "" at the end of the string (3 deletions)!
241
241
  regex.match_all('cot, cow', max_errors: 3).to_a
242
242
  # => [
243
- # {match: "cot", submatches: [], index: 0, end_index: 3, cost: 1, errors: {insertions: 0, deletions: 0, substitutions: 1}},
244
- # {match: "cow", submatches: [], index: 5, end_index: 8, cost: 2, errors: {insertions: 0, deletions: 0, substitutions: 2}},
245
- # {match: "", submatches: [], index: 8, end_index: 8, cost: 3, errors: {insertions: 0, deletions: 3, substitutions: 0}}
246
- # ]
243
+ # {match: "cot", submatches: [], index: 0, end_index: 3, cost: 1, errors: {insertions: 0, deletions: 0, substitutions: 1}},
244
+ # {match: "cow", submatches: [], index: 5, end_index: 8, cost: 2, errors: {insertions: 0, deletions: 0, substitutions: 2}},
245
+ # {match: "", submatches: [], index: 8, end_index: 8, cost: 3, errors: {insertions: 0, deletions: 3, substitutions: 0}}
246
+ # ]
247
247
  ```
248
248
 
249
249
  **Best Practice**: if you need a high `max_errors` limit but want to prevent the engine from matching empty strings, explicitly cap the `max_deletions` option so that at least one character of your pattern must survive
@@ -252,9 +252,9 @@ regex.match_all('cot, cow', max_errors: 3).to_a
252
252
  # Allow 3 total errors, but strictly forbid the engine from deleting more than 2 characters
253
253
  regex.match_all('cot, cow', max_errors: 3, max_deletions: 2).to_a
254
254
  # => [
255
- # {match: "cot", submatches: [], index: 0, end_index: 3, cost: 1, errors: {insertions: 0, deletions: 0, substitutions: 1}},
256
- # {match: "cow", submatches: [], index: 5, end_index: 8, cost: 2, errors: {insertions: 0, deletions: 0, substitutions: 2}}
257
- # ] # The empty match is mathematically prevented
255
+ # {match: "cot", submatches: [], index: 0, end_index: 3, cost: 1, errors: {insertions: 0, deletions: 0, substitutions: 1}},
256
+ # {match: "cow", submatches: [], index: 5, end_index: 8, cost: 2, errors: {insertions: 0, deletions: 0, substitutions: 2}}
257
+ # ] # The empty match is mathematically prevented
258
258
  ```
259
259
 
260
260
  ### POSIX vs. PCRE Syntax
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module TreRegex
4
- VERSION = '0.2.1'
4
+ VERSION = '0.2.2'
5
5
  end
data/lib/tre_regex.rb CHANGED
@@ -199,43 +199,71 @@ module TreRegex
199
199
  end
200
200
  end
201
201
 
202
+ # Helper to safely align TRE's raw byte offsets to valid UTF-8 boundaries
203
+ def align_bounds(text, absolute_so, absolute_eo)
204
+ safe_so = absolute_so.clamp(0, text.bytesize)
205
+
206
+ # Shift start backward to the nearest valid character boundary
207
+ # (byte & 0xC0) == 0x80 checks if the byte is a UTF-8 continuation byte
208
+ safe_so -= 1 while safe_so.positive? && (text.getbyte(safe_so) & 0xC0) == 0x80
209
+
210
+ safe_eo = absolute_eo.clamp(0, text.bytesize)
211
+
212
+ # Shift end forward to the nearest valid character boundary
213
+ safe_eo += 1 while safe_eo < text.bytesize && (text.getbyte(safe_eo) & 0xC0) == 0x80
214
+
215
+ safe_so = safe_eo if safe_so > safe_eo
216
+
217
+ [safe_so, safe_eo]
218
+ end
219
+
202
220
  def extract_match_payload(text, byte_off, char_off, m_info)
203
221
  pmatch_array, nmatch, match_data = m_info
222
+ abs_so, abs_eo = primary_match_bounds(text, byte_off, pmatch_array)
204
223
 
205
- # Read the full match boundaries from index 0
206
- full_rm = Native::RegMatch.new(pmatch_array)
207
- rm_so = full_rm[:rm_so]
208
- rm_eo = full_rm[:rm_eo]
224
+ match_str = text.byteslice(abs_so...abs_eo) || ''
225
+ start_index = char_off + (text.byteslice(byte_off...abs_so) || '').length
226
+
227
+ payload = format_payload(
228
+ match_str, start_index, match_data,
229
+ extract_submatches(text, byte_off, pmatch_array, nmatch)
230
+ )
209
231
 
210
- prefix_len = (text.byteslice(byte_off, rm_so) || '').length
211
- match_str = text.byteslice((byte_off + rm_so)...(byte_off + rm_eo))
232
+ [payload, abs_eo - byte_off, start_index - char_off + match_str.length]
233
+ end
212
234
 
213
- payload = {
235
+ def primary_match_bounds(text, byte_off, pmatch_array)
236
+ full_rm = Native::RegMatch.new(pmatch_array)
237
+ align_bounds(text, byte_off + full_rm[:rm_so], byte_off + full_rm[:rm_eo])
238
+ end
239
+
240
+ def format_payload(match_str, start_index, match_data, submatches)
241
+ {
214
242
  match: match_str,
215
- submatches: extract_submatches(text, byte_off, pmatch_array, nmatch),
216
- index: char_off + prefix_len,
217
- end_index: char_off + prefix_len + match_str.length,
243
+ submatches:,
244
+ index: start_index,
245
+ end_index: start_index + match_str.length,
218
246
  cost: match_data[:cost],
219
247
  errors: parse_errors(match_data)
220
248
  }
221
-
222
- [payload, rm_eo, prefix_len + match_str.length]
223
249
  end
224
250
 
225
251
  def extract_submatches(text, byte_off, pmatch_array, nmatch)
226
252
  submatches = (1...nmatch).map do |i|
227
253
  # Advance the memory pointer by the size of the struct for each index
228
254
  rm = Native::RegMatch.new(pmatch_array + (i * Native::RegMatch.size))
229
- sub_so = rm[:rm_so]
230
- sub_eo = rm[:rm_eo]
231
-
232
- # Safely extract the group, inserting nil if it was optional and unmatched
233
- sub_so == -1 ? nil : text.byteslice((byte_off + sub_so)...(byte_off + sub_eo))
255
+ raw_so = rm[:rm_so]
256
+ raw_eo = rm[:rm_eo]
257
+
258
+ if raw_so == -1 || raw_so > raw_eo
259
+ nil
260
+ else
261
+ abs_so, abs_eo = align_bounds(text, byte_off + raw_so, byte_off + raw_eo)
262
+ text.byteslice(abs_so...abs_eo)
263
+ end
234
264
  end
235
265
 
236
- # Cleanup: Remove trailing nil values (unused capture groups)
237
266
  submatches.pop while submatches.last.nil? && !submatches.empty?
238
-
239
267
  submatches
240
268
  end
241
269
 
data/tre_regex.gemspec CHANGED
@@ -11,7 +11,7 @@ Gem::Specification.new do |spec|
11
11
 
12
12
  spec.summary = 'A fast Ruby FFI wrapper for the TRE approximate regex matching library'
13
13
  spec.description = [
14
- 'TreRegex provides a high-performance Ruby interface to the TRE C library using FFI.',
14
+ 'TreRegex provides a high-performance Ruby interface to the TRE C library.',
15
15
  'It brings robust approximate (fuzzy) regular expression matching to Ruby, featuring',
16
16
  'multi-byte Unicode string safety, and granular error limits'
17
17
  ].join(' ')
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tre_regex
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.2.2
5
5
  platform: x86_64-linux-musl
6
6
  authors:
7
7
  - Oleksii Vasyliev
@@ -23,9 +23,9 @@ dependencies:
23
23
  - - ">="
24
24
  - !ruby/object:Gem::Version
25
25
  version: '1.0'
26
- description: TreRegex provides a high-performance Ruby interface to the TRE C library
27
- using FFI. It brings robust approximate (fuzzy) regular expression matching to Ruby,
28
- featuring multi-byte Unicode string safety, and granular error limits
26
+ description: TreRegex provides a high-performance Ruby interface to the TRE C library.
27
+ It brings robust approximate (fuzzy) regular expression matching to Ruby, featuring
28
+ multi-byte Unicode string safety, and granular error limits
29
29
  email:
30
30
  - leopard.not.a@gmail.com
31
31
  executables: []