tre_regex 0.2.0-arm-linux-gnu → 0.2.2-arm-linux-gnu
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +26 -10
- data/ext/tre_regex/extconf.rb +0 -1
- data/lib/tre_regex/version.rb +1 -1
- data/lib/tre_regex.rb +61 -26
- data/tre_regex.gemspec +3 -3
- metadata +5 -6
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 9f8c20d8041890fc0d7916d92b76687e86568c32dee6783a83392300e4c29317
|
|
4
|
+
data.tar.gz: afa7350a95723e9ce754472ccaf62e1c6ae1446004ce085a382c5746631ae240
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 72d0afe25289d4504ef8a66480ec94e9f13abe5f930b7cec4e67411654aab649a30c2dbc9dc62d997094cd28d06528dbdc088a98fd0d9c54f4cedad9fc02346f
|
|
7
|
+
data.tar.gz: 9254dc3f9670d3b7ee1008aacc804f501e4e8e94f3e286e867a03828f7a7c7cb3e35cf3ffd538b7de346ceddfc84df603915e08909555e00128c229eb8aa0b1f
|
data/README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# TreRegex [CI status](https://github.com/le0pard/tre_regex/actions/workflows/main.yml)
|
|
2
2
|
|
|
3
|
-
`TreRegex`
|
|
3
|
+
`TreRegex` provides a high-performance Ruby interface to the [TRE](https://github.com/laurikari/tre) C library using FFI. It brings robust approximate (fuzzy) regular expression matching to Ruby, featuring multi-byte Unicode string safety and granular error limits.
|
|
4
4
|
|
|
5
5
|
## Why?
|
|
6
6
|
|
|
@@ -87,10 +87,10 @@ regex = TreRegex::Regex.new('cat')
|
|
|
87
87
|
# Returns an array of match hashes
|
|
88
88
|
regex.match_all('cat, cot, cut', max_errors: 1).to_a
|
|
89
89
|
# => [
|
|
90
|
-
#
|
|
90
|
+
# {match: "cat", submatches: [], index: 0, end_index: 3, cost: 0, errors: {insertions: 0, deletions: 0, substitutions: 0}},
|
|
91
91
|
# {match: "cot", submatches: [], index: 5, end_index: 8, cost: 1, errors: {insertions: 0, deletions: 0, substitutions: 1}},
|
|
92
92
|
# {match: "cut", submatches: [], index: 10, end_index: 13, cost: 1, errors: {insertions: 0, deletions: 0, substitutions: 1}}
|
|
93
|
-
#
|
|
93
|
+
# ]
|
|
94
94
|
```
|
|
95
95
|
|
|
96
96
|
### Capture Groups (Submatches)
|
|
@@ -240,10 +240,10 @@ regex = TreRegex::Regex.new('cat')
|
|
|
240
240
|
# but it also matches "" at the end of the string (3 deletions)!
|
|
241
241
|
regex.match_all('cot, cow', max_errors: 3).to_a
|
|
242
242
|
# => [
|
|
243
|
-
#
|
|
244
|
-
#
|
|
245
|
-
#
|
|
246
|
-
#
|
|
243
|
+
# {match: "cot", submatches: [], index: 0, end_index: 3, cost: 1, errors: {insertions: 0, deletions: 0, substitutions: 1}},
|
|
244
|
+
# {match: "cow", submatches: [], index: 5, end_index: 8, cost: 2, errors: {insertions: 0, deletions: 0, substitutions: 2}},
|
|
245
|
+
# {match: "", submatches: [], index: 8, end_index: 8, cost: 3, errors: {insertions: 0, deletions: 3, substitutions: 0}}
|
|
246
|
+
# ]
|
|
247
247
|
```
|
|
248
248
|
|
|
249
249
|
**Best Practice**: If you need a high `max_errors` limit but want to prevent the engine from matching empty strings, explicitly cap the `max_deletions` option so that at least one character of your pattern must survive.
|
|
@@ -252,9 +252,9 @@ regex.match_all('cot, cow', max_errors: 3).to_a
|
|
|
252
252
|
# Allow 3 total errors, but strictly forbid the engine from deleting more than 2 characters
|
|
253
253
|
regex.match_all('cot, cow', max_errors: 3, max_deletions: 2).to_a
|
|
254
254
|
# => [
|
|
255
|
-
#
|
|
256
|
-
#
|
|
257
|
-
#
|
|
255
|
+
# {match: "cot", submatches: [], index: 0, end_index: 3, cost: 1, errors: {insertions: 0, deletions: 0, substitutions: 1}},
|
|
256
|
+
# {match: "cow", submatches: [], index: 5, end_index: 8, cost: 2, errors: {insertions: 0, deletions: 0, substitutions: 2}}
|
|
257
|
+
# ] # The empty match is mathematically prevented
|
|
258
258
|
```
|
|
259
259
|
|
|
260
260
|
### POSIX vs. PCRE Syntax
|
|
@@ -316,6 +316,22 @@ If you need to find overlapping fuzzy matches, you will need to manually step th
|
|
|
316
316
|
|
|
317
317
|
## Development
|
|
318
318
|
|
|
319
|
+
Because `TreRegex` compiles the underlying TRE C library from source, you must have the standard C compilation toolchain and the `autotools` dependencies installed on your machine before running the setup script.
|
|
320
|
+
|
|
321
|
+
**Ubuntu / Debian Linux**
|
|
322
|
+
|
|
323
|
+
```bash
|
|
324
|
+
sudo apt-get update
|
|
325
|
+
sudo apt-get install build-essential autoconf automake libtool gettext autopoint pkg-config
|
|
326
|
+
```
|
|
327
|
+
|
|
328
|
+
**macOS**
|
|
329
|
+
|
|
330
|
+
Then, install the autotools suite via [Homebrew](https://brew.sh/):
|
|
331
|
+
```bash
|
|
332
|
+
brew install autoconf automake libtool gettext pkg-config
|
|
333
|
+
```
|
|
334
|
+
|
|
319
335
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
|
320
336
|
|
|
321
337
|
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
data/ext/tre_regex/extconf.rb
CHANGED
data/lib/tre_regex/version.rb
CHANGED
data/lib/tre_regex.rb
CHANGED
|
@@ -42,6 +42,12 @@ module TreRegex
|
|
|
42
42
|
REG_NEWLINE = 4
|
|
43
43
|
REG_NOSUB = 8
|
|
44
44
|
|
|
45
|
+
# TRE's regex_t struct
|
|
46
|
+
class RegexT < FFI::Struct
|
|
47
|
+
layout :re_nsub, :size_t,
|
|
48
|
+
:value, :pointer
|
|
49
|
+
end
|
|
50
|
+
|
|
45
51
|
# Memory layout for TRE match offsets
|
|
46
52
|
class RegMatch < FFI::Struct
|
|
47
53
|
layout :rm_so, :int,
|
|
@@ -82,13 +88,12 @@ module TreRegex
|
|
|
82
88
|
|
|
83
89
|
def initialize(pattern, ignore_case: false)
|
|
84
90
|
@pattern = pattern
|
|
85
|
-
|
|
86
|
-
@preg = FFI::MemoryPointer.new(:char, 256)
|
|
91
|
+
@preg = Native::RegexT.new
|
|
87
92
|
|
|
88
93
|
flags = Native::REG_EXTENDED
|
|
89
94
|
flags |= Native::REG_ICASE if ignore_case
|
|
90
95
|
|
|
91
|
-
res = Native.tre_regcomp(@preg, pattern, flags)
|
|
96
|
+
res = Native.tre_regcomp(@preg.to_ptr, pattern, flags)
|
|
92
97
|
raise TreRegex::Error, "Failed to compile regex pattern: #{pattern}" if res != 0
|
|
93
98
|
|
|
94
99
|
# Garbage Collection Hook: Tell Ruby to free the C memory when this object is destroyed
|
|
@@ -96,10 +101,12 @@ module TreRegex
|
|
|
96
101
|
end
|
|
97
102
|
|
|
98
103
|
# The GC finalizer proc
|
|
99
|
-
def self.finalize(
|
|
104
|
+
def self.finalize(preg)
|
|
100
105
|
proc do
|
|
101
|
-
|
|
102
|
-
|
|
106
|
+
# Free the internal arrays allocated by TRE
|
|
107
|
+
Native.tre_regfree(preg.to_ptr)
|
|
108
|
+
# Safely free the struct memory ourselves
|
|
109
|
+
preg.to_ptr.free
|
|
103
110
|
end
|
|
104
111
|
end
|
|
105
112
|
|
|
@@ -149,7 +156,7 @@ module TreRegex
|
|
|
149
156
|
pmatch_array = FFI::MemoryPointer.new(Native::RegMatch, MAX_NMATCH)
|
|
150
157
|
match_data = prepare_match_data(pmatch_array, MAX_NMATCH)
|
|
151
158
|
|
|
152
|
-
res = Native.tre_reganexec(@preg, text_ptr, len, match_data, params, 0)
|
|
159
|
+
res = Native.tre_reganexec(@preg.to_ptr, text_ptr, len, match_data, params, 0)
|
|
153
160
|
return nil unless res.zero?
|
|
154
161
|
|
|
155
162
|
# Return the entire array pointer to be parsed
|
|
@@ -192,43 +199,71 @@ module TreRegex
|
|
|
192
199
|
end
|
|
193
200
|
end
|
|
194
201
|
|
|
202
|
+
# Helper to safely align TRE's raw byte offsets to valid UTF-8 boundaries
|
|
203
|
+
def align_bounds(text, absolute_so, absolute_eo)
|
|
204
|
+
safe_so = absolute_so.clamp(0, text.bytesize)
|
|
205
|
+
|
|
206
|
+
# Shift start backward to the nearest valid character boundary
|
|
207
|
+
# (byte & 0xC0) == 0x80 checks if the byte is a UTF-8 continuation byte
|
|
208
|
+
safe_so -= 1 while safe_so.positive? && (text.getbyte(safe_so) & 0xC0) == 0x80
|
|
209
|
+
|
|
210
|
+
safe_eo = absolute_eo.clamp(0, text.bytesize)
|
|
211
|
+
|
|
212
|
+
# Shift end forward to the nearest valid character boundary
|
|
213
|
+
safe_eo += 1 while safe_eo < text.bytesize && (text.getbyte(safe_eo) & 0xC0) == 0x80
|
|
214
|
+
|
|
215
|
+
safe_so = safe_eo if safe_so > safe_eo
|
|
216
|
+
|
|
217
|
+
[safe_so, safe_eo]
|
|
218
|
+
end
|
|
219
|
+
|
|
195
220
|
def extract_match_payload(text, byte_off, char_off, m_info)
|
|
196
221
|
pmatch_array, nmatch, match_data = m_info
|
|
222
|
+
abs_so, abs_eo = primary_match_bounds(text, byte_off, pmatch_array)
|
|
197
223
|
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
rm_so = full_rm[:rm_so]
|
|
201
|
-
rm_eo = full_rm[:rm_eo]
|
|
224
|
+
match_str = text.byteslice(abs_so...abs_eo) || ''
|
|
225
|
+
start_index = char_off + (text.byteslice(byte_off...abs_so) || '').length
|
|
202
226
|
|
|
203
|
-
|
|
204
|
-
|
|
227
|
+
payload = format_payload(
|
|
228
|
+
match_str, start_index, match_data,
|
|
229
|
+
extract_submatches(text, byte_off, pmatch_array, nmatch)
|
|
230
|
+
)
|
|
205
231
|
|
|
206
|
-
payload
|
|
232
|
+
[payload, abs_eo - byte_off, start_index - char_off + match_str.length]
|
|
233
|
+
end
|
|
234
|
+
|
|
235
|
+
def primary_match_bounds(text, byte_off, pmatch_array)
|
|
236
|
+
full_rm = Native::RegMatch.new(pmatch_array)
|
|
237
|
+
align_bounds(text, byte_off + full_rm[:rm_so], byte_off + full_rm[:rm_eo])
|
|
238
|
+
end
|
|
239
|
+
|
|
240
|
+
def format_payload(match_str, start_index, match_data, submatches)
|
|
241
|
+
{
|
|
207
242
|
match: match_str,
|
|
208
|
-
submatches
|
|
209
|
-
index:
|
|
210
|
-
end_index:
|
|
243
|
+
submatches:,
|
|
244
|
+
index: start_index,
|
|
245
|
+
end_index: start_index + match_str.length,
|
|
211
246
|
cost: match_data[:cost],
|
|
212
247
|
errors: parse_errors(match_data)
|
|
213
248
|
}
|
|
214
|
-
|
|
215
|
-
[payload, rm_eo, prefix_len + match_str.length]
|
|
216
249
|
end
|
|
217
250
|
|
|
218
251
|
def extract_submatches(text, byte_off, pmatch_array, nmatch)
|
|
219
252
|
submatches = (1...nmatch).map do |i|
|
|
220
253
|
# Advance the memory pointer by the size of the struct for each index
|
|
221
254
|
rm = Native::RegMatch.new(pmatch_array + (i * Native::RegMatch.size))
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
255
|
+
raw_so = rm[:rm_so]
|
|
256
|
+
raw_eo = rm[:rm_eo]
|
|
257
|
+
|
|
258
|
+
if raw_so == -1 || raw_so > raw_eo
|
|
259
|
+
nil
|
|
260
|
+
else
|
|
261
|
+
abs_so, abs_eo = align_bounds(text, byte_off + raw_so, byte_off + raw_eo)
|
|
262
|
+
text.byteslice(abs_so...abs_eo)
|
|
263
|
+
end
|
|
227
264
|
end
|
|
228
265
|
|
|
229
|
-
# Cleanup: Remove trailing nil values (unused capture groups)
|
|
230
266
|
submatches.pop while submatches.last.nil? && !submatches.empty?
|
|
231
|
-
|
|
232
267
|
submatches
|
|
233
268
|
end
|
|
234
269
|
|
data/tre_regex.gemspec
CHANGED
|
@@ -9,11 +9,11 @@ Gem::Specification.new do |spec|
|
|
|
9
9
|
spec.email = ['leopard.not.a@gmail.com']
|
|
10
10
|
spec.license = 'MIT'
|
|
11
11
|
|
|
12
|
-
spec.summary = 'A fast Ruby FFI wrapper for the TRE approximate regex matching library
|
|
12
|
+
spec.summary = 'A fast Ruby FFI wrapper for the TRE approximate regex matching library'
|
|
13
13
|
spec.description = [
|
|
14
|
-
'TreRegex provides a high-performance Ruby interface to the TRE C library
|
|
14
|
+
'TreRegex provides a high-performance Ruby interface to the TRE C library.',
|
|
15
15
|
'It brings robust approximate (fuzzy) regular expression matching to Ruby, featuring',
|
|
16
|
-
'multi-byte Unicode string safety, granular error limits
|
|
16
|
+
'multi-byte Unicode string safety, and granular error limits'
|
|
17
17
|
].join(' ')
|
|
18
18
|
spec.homepage = 'https://github.com/le0pard/tre_regex'
|
|
19
19
|
spec.required_ruby_version = '>= 3.3.0'
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: tre_regex
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.2.
|
|
4
|
+
version: 0.2.2
|
|
5
5
|
platform: arm-linux-gnu
|
|
6
6
|
authors:
|
|
7
7
|
- Oleksii Vasyliev
|
|
@@ -23,10 +23,9 @@ dependencies:
|
|
|
23
23
|
- - ">="
|
|
24
24
|
- !ruby/object:Gem::Version
|
|
25
25
|
version: '1.0'
|
|
26
|
-
description: TreRegex provides a high-performance Ruby interface to the TRE C library
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
cross-platform native binaries
|
|
26
|
+
description: TreRegex provides a high-performance Ruby interface to the TRE C library.
|
|
27
|
+
It brings robust approximate (fuzzy) regular expression matching to Ruby, featuring
|
|
28
|
+
multi-byte Unicode string safety, and granular error limits
|
|
30
29
|
email:
|
|
31
30
|
- leopard.not.a@gmail.com
|
|
32
31
|
executables: []
|
|
@@ -73,5 +72,5 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
73
72
|
requirements: []
|
|
74
73
|
rubygems_version: 4.0.6
|
|
75
74
|
specification_version: 4
|
|
76
|
-
summary: A fast Ruby FFI wrapper for the TRE approximate regex matching library
|
|
75
|
+
summary: A fast Ruby FFI wrapper for the TRE approximate regex matching library
|
|
77
76
|
test_files: []
|