tre_regex 0.2.1 → 0.2.2
This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +9 -9
- data/lib/tre_regex/version.rb +1 -1
- data/lib/tre_regex.rb +47 -19
- data/tre_regex.gemspec +1 -1
- metadata +5 -5
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 76f866d01a4c36d9779bba23bd55f3a767e6b110e19a5b7c11797a9c48d88a48
|
|
4
|
+
data.tar.gz: 64181d7a9578eb66326ec9d898e9f439f117bd7180eb17e08d29de0bcd478b54
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: bad8e0d99218e8963f4805de46ac318692d1fbf50577953be3189f982595eca040f1f7ddcf2d63fe3a948a54a55649b61b763e46ab85f8d1fcb67d18c5eed27d
|
|
7
|
+
data.tar.gz: 07d4c6d70c3516adcf7f23a247aae92ee5b417b470ff5879b8f451411406ad511c19317ec79c2b505584d20be801029c7cb08976264456d870899dbee9042625
|
data/README.md
CHANGED
|
@@ -87,10 +87,10 @@ regex = TreRegex::Regex.new('cat')
|
|
|
87
87
|
# Returns an array of match hashes
|
|
88
88
|
regex.match_all('cat, cot, cut', max_errors: 1).to_a
|
|
89
89
|
# => [
|
|
90
|
-
#
|
|
90
|
+
# {match: "cat", submatches: [], index: 0, end_index: 3, cost: 0, errors: {insertions: 0, deletions: 0, substitutions: 0}},
|
|
91
91
|
# {match: "cot", submatches: [], index: 5, end_index: 8, cost: 1, errors: {insertions: 0, deletions: 0, substitutions: 1}},
|
|
92
92
|
# {match: "cut", submatches: [], index: 10, end_index: 13, cost: 1, errors: {insertions: 0, deletions: 0, substitutions: 1}}
|
|
93
|
-
#
|
|
93
|
+
# ]
|
|
94
94
|
```
|
|
95
95
|
|
|
96
96
|
### Capture Groups (Submatches)
|
|
@@ -240,10 +240,10 @@ regex = TreRegex::Regex.new('cat')
|
|
|
240
240
|
# but it also matches "" at the end of the string (3 deletions)!
|
|
241
241
|
regex.match_all('cot, cow', max_errors: 3).to_a
|
|
242
242
|
# => [
|
|
243
|
-
#
|
|
244
|
-
#
|
|
245
|
-
#
|
|
246
|
-
#
|
|
243
|
+
# {match: "cot", submatches: [], index: 0, end_index: 3, cost: 1, errors: {insertions: 0, deletions: 0, substitutions: 1}},
|
|
244
|
+
# {match: "cow", submatches: [], index: 5, end_index: 8, cost: 2, errors: {insertions: 0, deletions: 0, substitutions: 2}},
|
|
245
|
+
# {match: "", submatches: [], index: 8, end_index: 8, cost: 3, errors: {insertions: 0, deletions: 3, substitutions: 0}}
|
|
246
|
+
# ]
|
|
247
247
|
```
|
|
248
248
|
|
|
249
249
|
**Best Practice**: If you need a high `max_errors` limit but want to prevent the engine from matching empty strings, explicitly cap the `max_deletions` option so that at least one character of your pattern must survive.
|
|
@@ -252,9 +252,9 @@ regex.match_all('cot, cow', max_errors: 3).to_a
|
|
|
252
252
|
# Allow 3 total errors, but strictly forbid the engine from deleting more than 2 characters
|
|
253
253
|
regex.match_all('cot, cow', max_errors: 3, max_deletions: 2).to_a
|
|
254
254
|
# => [
|
|
255
|
-
#
|
|
256
|
-
#
|
|
257
|
-
#
|
|
255
|
+
# {match: "cot", submatches: [], index: 0, end_index: 3, cost: 1, errors: {insertions: 0, deletions: 0, substitutions: 1}},
|
|
256
|
+
# {match: "cow", submatches: [], index: 5, end_index: 8, cost: 2, errors: {insertions: 0, deletions: 0, substitutions: 2}}
|
|
257
|
+
# ] # The empty match is mathematically prevented
|
|
258
258
|
```
|
|
259
259
|
|
|
260
260
|
### POSIX vs. PCRE Syntax
|
data/lib/tre_regex/version.rb
CHANGED
data/lib/tre_regex.rb
CHANGED
|
@@ -199,43 +199,71 @@ module TreRegex
|
|
|
199
199
|
end
|
|
200
200
|
end
|
|
201
201
|
|
|
202
|
+
# Helper to safely align TRE's raw byte offsets to valid UTF-8 boundaries
|
|
203
|
+
def align_bounds(text, absolute_so, absolute_eo)
|
|
204
|
+
safe_so = absolute_so.clamp(0, text.bytesize)
|
|
205
|
+
|
|
206
|
+
# Shift start backward to the nearest valid character boundary
|
|
207
|
+
# (byte & 0xC0) == 0x80 checks if the byte is a UTF-8 continuation byte
|
|
208
|
+
safe_so -= 1 while safe_so.positive? && (text.getbyte(safe_so) & 0xC0) == 0x80
|
|
209
|
+
|
|
210
|
+
safe_eo = absolute_eo.clamp(0, text.bytesize)
|
|
211
|
+
|
|
212
|
+
# Shift end forward to the nearest valid character boundary
|
|
213
|
+
safe_eo += 1 while safe_eo < text.bytesize && (text.getbyte(safe_eo) & 0xC0) == 0x80
|
|
214
|
+
|
|
215
|
+
safe_so = safe_eo if safe_so > safe_eo
|
|
216
|
+
|
|
217
|
+
[safe_so, safe_eo]
|
|
218
|
+
end
|
|
219
|
+
|
|
202
220
|
def extract_match_payload(text, byte_off, char_off, m_info)
|
|
203
221
|
pmatch_array, nmatch, match_data = m_info
|
|
222
|
+
abs_so, abs_eo = primary_match_bounds(text, byte_off, pmatch_array)
|
|
204
223
|
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
224
|
+
match_str = text.byteslice(abs_so...abs_eo) || ''
|
|
225
|
+
start_index = char_off + (text.byteslice(byte_off...abs_so) || '').length
|
|
226
|
+
|
|
227
|
+
payload = format_payload(
|
|
228
|
+
match_str, start_index, match_data,
|
|
229
|
+
extract_submatches(text, byte_off, pmatch_array, nmatch)
|
|
230
|
+
)
|
|
209
231
|
|
|
210
|
-
|
|
211
|
-
|
|
232
|
+
[payload, abs_eo - byte_off, start_index - char_off + match_str.length]
|
|
233
|
+
end
|
|
212
234
|
|
|
213
|
-
|
|
235
|
+
def primary_match_bounds(text, byte_off, pmatch_array)
|
|
236
|
+
full_rm = Native::RegMatch.new(pmatch_array)
|
|
237
|
+
align_bounds(text, byte_off + full_rm[:rm_so], byte_off + full_rm[:rm_eo])
|
|
238
|
+
end
|
|
239
|
+
|
|
240
|
+
def format_payload(match_str, start_index, match_data, submatches)
|
|
241
|
+
{
|
|
214
242
|
match: match_str,
|
|
215
|
-
submatches
|
|
216
|
-
index:
|
|
217
|
-
end_index:
|
|
243
|
+
submatches:,
|
|
244
|
+
index: start_index,
|
|
245
|
+
end_index: start_index + match_str.length,
|
|
218
246
|
cost: match_data[:cost],
|
|
219
247
|
errors: parse_errors(match_data)
|
|
220
248
|
}
|
|
221
|
-
|
|
222
|
-
[payload, rm_eo, prefix_len + match_str.length]
|
|
223
249
|
end
|
|
224
250
|
|
|
225
251
|
def extract_submatches(text, byte_off, pmatch_array, nmatch)
|
|
226
252
|
submatches = (1...nmatch).map do |i|
|
|
227
253
|
# Advance the memory pointer by the size of the struct for each index
|
|
228
254
|
rm = Native::RegMatch.new(pmatch_array + (i * Native::RegMatch.size))
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
255
|
+
raw_so = rm[:rm_so]
|
|
256
|
+
raw_eo = rm[:rm_eo]
|
|
257
|
+
|
|
258
|
+
if raw_so == -1 || raw_so > raw_eo
|
|
259
|
+
nil
|
|
260
|
+
else
|
|
261
|
+
abs_so, abs_eo = align_bounds(text, byte_off + raw_so, byte_off + raw_eo)
|
|
262
|
+
text.byteslice(abs_so...abs_eo)
|
|
263
|
+
end
|
|
234
264
|
end
|
|
235
265
|
|
|
236
|
-
# Cleanup: Remove trailing nil values (unused capture groups)
|
|
237
266
|
submatches.pop while submatches.last.nil? && !submatches.empty?
|
|
238
|
-
|
|
239
267
|
submatches
|
|
240
268
|
end
|
|
241
269
|
|
data/tre_regex.gemspec
CHANGED
|
@@ -11,7 +11,7 @@ Gem::Specification.new do |spec|
|
|
|
11
11
|
|
|
12
12
|
spec.summary = 'A fast Ruby FFI wrapper for the TRE approximate regex matching library'
|
|
13
13
|
spec.description = [
|
|
14
|
-
'TreRegex provides a high-performance Ruby interface to the TRE C library
|
|
14
|
+
'TreRegex provides a high-performance Ruby interface to the TRE C library.',
|
|
15
15
|
'It brings robust approximate (fuzzy) regular expression matching to Ruby, featuring',
|
|
16
16
|
'multi-byte Unicode string safety, and granular error limits'
|
|
17
17
|
].join(' ')
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: tre_regex
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.2.
|
|
4
|
+
version: 0.2.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Oleksii Vasyliev
|
|
@@ -23,9 +23,9 @@ dependencies:
|
|
|
23
23
|
- - ">="
|
|
24
24
|
- !ruby/object:Gem::Version
|
|
25
25
|
version: '1.0'
|
|
26
|
-
description: TreRegex provides a high-performance Ruby interface to the TRE C library
|
|
27
|
-
|
|
28
|
-
|
|
26
|
+
description: TreRegex provides a high-performance Ruby interface to the TRE C library.
|
|
27
|
+
It brings robust approximate (fuzzy) regular expression matching to Ruby, featuring
|
|
28
|
+
multi-byte Unicode string safety, and granular error limits
|
|
29
29
|
email:
|
|
30
30
|
- leopard.not.a@gmail.com
|
|
31
31
|
executables: []
|
|
@@ -64,7 +64,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
64
64
|
- !ruby/object:Gem::Version
|
|
65
65
|
version: '0'
|
|
66
66
|
requirements: []
|
|
67
|
-
rubygems_version: 4.0.
|
|
67
|
+
rubygems_version: 4.0.10
|
|
68
68
|
specification_version: 4
|
|
69
69
|
summary: A fast Ruby FFI wrapper for the TRE approximate regex matching library
|
|
70
70
|
test_files: []
|