RubyGems - tre_regex - Versions diffs - 0.2.1 → 0.2.2 - Mend

tre_regex 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 3f996c2be74794c2803014dec0ad4a6f10cf5d1d6038b193b3783a0dc6860a47
-  data.tar.gz: 40ee68a2e227afea214a7cca1e6f4c8a09eae6fbbdba94af66a33b17c77fd500
+  metadata.gz: 76f866d01a4c36d9779bba23bd55f3a767e6b110e19a5b7c11797a9c48d88a48
+  data.tar.gz: 64181d7a9578eb66326ec9d898e9f439f117bd7180eb17e08d29de0bcd478b54
 SHA512:
-  metadata.gz: 75be84d904e510aea1439698d62bf02d07d18629153b7ded65df6d52ba0ff7139db0e2e9fbb019c77eb5c82daf97f70e1aab7717a8b8538cb6c03a0e69398ee5
-  data.tar.gz: 7dc4c879541d514c0ccbac0d88681f7c91246819042d6dcda5ca8ce8ad6e05a14112a4b909864a2d534e8946e69897d4acf06ece18f3b0ad9ecae49e74f94cd6
+  metadata.gz: bad8e0d99218e8963f4805de46ac318692d1fbf50577953be3189f982595eca040f1f7ddcf2d63fe3a948a54a55649b61b763e46ab85f8d1fcb67d18c5eed27d
+  data.tar.gz: 07d4c6d70c3516adcf7f23a247aae92ee5b417b470ff5879b8f451411406ad511c19317ec79c2b505584d20be801029c7cb08976264456d870899dbee9042625

data/README.md CHANGED Viewed

@@ -87,10 +87,10 @@ regex = TreRegex::Regex.new('cat')
 # Returns an array of match hashes
 regex.match_all('cat, cot, cut', max_errors: 1).to_a
 # => [
-#   {match: "cat", submatches: [], index: 0, end_index: 3, cost: 0, errors: {insertions: 0, deletions: 0, substitutions: 0}},
+#  {match: "cat", submatches: [], index: 0, end_index: 3, cost: 0, errors: {insertions: 0, deletions: 0, substitutions: 0}},
 #  {match: "cot", submatches: [], index: 5, end_index: 8, cost: 1, errors: {insertions: 0, deletions: 0, substitutions: 1}},
 #  {match: "cut", submatches: [], index: 10, end_index: 13, cost: 1, errors: {insertions: 0, deletions: 0, substitutions: 1}}
-#    ]
+# ]
 ```
 ### Capture Groups (Submatches)
@@ -240,10 +240,10 @@ regex = TreRegex::Regex.new('cat')
 # but it also matches "" at the end of the string (3 deletions)!
 regex.match_all('cot, cow', max_errors: 3).to_a
 # => [
-#     {match: "cot", submatches: [], index: 0, end_index: 3, cost: 1, errors: {insertions: 0, deletions: 0, substitutions: 1}},
-#     {match: "cow", submatches: [], index: 5, end_index: 8, cost: 2, errors: {insertions: 0, deletions: 0, substitutions: 2}},
-#     {match: "", submatches: [], index: 8, end_index: 8, cost: 3, errors: {insertions: 0, deletions: 3, substitutions: 0}}
-#    ]
+#  {match: "cot", submatches: [], index: 0, end_index: 3, cost: 1, errors: {insertions: 0, deletions: 0, substitutions: 1}},
+#  {match: "cow", submatches: [], index: 5, end_index: 8, cost: 2, errors: {insertions: 0, deletions: 0, substitutions: 2}},
+#  {match: "", submatches: [], index: 8, end_index: 8, cost: 3, errors: {insertions: 0, deletions: 3, substitutions: 0}}
+# ]
 ```
 **Best Practice**: if you need a high `max_errors` limit but want to prevent the engine from matching empty strings, explicitly cap the `max_deletions` option so that at least one character of your pattern must survive
@@ -252,9 +252,9 @@ regex.match_all('cot, cow', max_errors: 3).to_a
 # Allow 3 total errors, but strictly forbid the engine from deleting more than 2 characters
 regex.match_all('cot, cow', max_errors: 3, max_deletions: 2).to_a
 # => [
-#     {match: "cot", submatches: [], index: 0, end_index: 3, cost: 1, errors: {insertions: 0, deletions: 0, substitutions: 1}},
-#     {match: "cow", submatches: [], index: 5, end_index: 8, cost: 2, errors: {insertions: 0, deletions: 0, substitutions: 2}}
-#    ] # The empty match is mathematically prevented
+#  {match: "cot", submatches: [], index: 0, end_index: 3, cost: 1, errors: {insertions: 0, deletions: 0, substitutions: 1}},
+#  {match: "cow", submatches: [], index: 5, end_index: 8, cost: 2, errors: {insertions: 0, deletions: 0, substitutions: 2}}
+# ] # The empty match is mathematically prevented
 ```
 ### POSIX vs. PCRE Syntax

data/lib/tre_regex/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module TreRegex
-  VERSION = '0.2.1'
+  VERSION = '0.2.2'
 end

data/lib/tre_regex.rb CHANGED Viewed

@@ -199,43 +199,71 @@ module TreRegex
       end
     end
+    # Helper to safely align TRE's raw byte offsets to valid UTF-8 boundaries
+    def align_bounds(text, absolute_so, absolute_eo)
+      safe_so = absolute_so.clamp(0, text.bytesize)
+      # Shift start backward to the nearest valid character boundary
+      # (byte & 0xC0) == 0x80 checks if the byte is a UTF-8 continuation byte
+      safe_so -= 1 while safe_so.positive? && (text.getbyte(safe_so) & 0xC0) == 0x80
+      safe_eo = absolute_eo.clamp(0, text.bytesize)
+      # Shift end forward to the nearest valid character boundary
+      safe_eo += 1 while safe_eo < text.bytesize && (text.getbyte(safe_eo) & 0xC0) == 0x80
+      safe_so = safe_eo if safe_so > safe_eo
+      [safe_so, safe_eo]
+    end
     def extract_match_payload(text, byte_off, char_off, m_info)
       pmatch_array, nmatch, match_data = m_info
+      abs_so, abs_eo = primary_match_bounds(text, byte_off, pmatch_array)
-      # Read the full match boundaries from index 0
-      full_rm = Native::RegMatch.new(pmatch_array)
-      rm_so = full_rm[:rm_so]
-      rm_eo = full_rm[:rm_eo]
+      match_str = text.byteslice(abs_so...abs_eo) || ''
+      start_index = char_off + (text.byteslice(byte_off...abs_so) || '').length
+      payload = format_payload(
+        match_str, start_index, match_data,
+        extract_submatches(text, byte_off, pmatch_array, nmatch)
+      )
-      prefix_len = (text.byteslice(byte_off, rm_so) || '').length
-      match_str = text.byteslice((byte_off + rm_so)...(byte_off + rm_eo))
+      [payload, abs_eo - byte_off, start_index - char_off + match_str.length]
+    end
-      payload = {
+    def primary_match_bounds(text, byte_off, pmatch_array)
+      full_rm = Native::RegMatch.new(pmatch_array)
+      align_bounds(text, byte_off + full_rm[:rm_so], byte_off + full_rm[:rm_eo])
+    end
+    def format_payload(match_str, start_index, match_data, submatches)
+      {
         match: match_str,
-        submatches: extract_submatches(text, byte_off, pmatch_array, nmatch),
-        index: char_off + prefix_len,
-        end_index: char_off + prefix_len + match_str.length,
+        submatches:,
+        index: start_index,
+        end_index: start_index + match_str.length,
         cost: match_data[:cost],
         errors: parse_errors(match_data)
       }
-      [payload, rm_eo, prefix_len + match_str.length]
     end
     def extract_submatches(text, byte_off, pmatch_array, nmatch)
       submatches = (1...nmatch).map do |i|
         # Advance the memory pointer by the size of the struct for each index
         rm = Native::RegMatch.new(pmatch_array + (i * Native::RegMatch.size))
-        sub_so = rm[:rm_so]
-        sub_eo = rm[:rm_eo]
-        # Safely extract the group, inserting nil if it was optional and unmatched
-        sub_so == -1 ? nil : text.byteslice((byte_off + sub_so)...(byte_off + sub_eo))
+        raw_so = rm[:rm_so]
+        raw_eo = rm[:rm_eo]
+        if raw_so == -1 || raw_so > raw_eo
+          nil
+        else
+          abs_so, abs_eo = align_bounds(text, byte_off + raw_so, byte_off + raw_eo)
+          text.byteslice(abs_so...abs_eo)
+        end
       end
-      # Cleanup: Remove trailing nil values (unused capture groups)
       submatches.pop while submatches.last.nil? && !submatches.empty?
       submatches
     end

data/tre_regex.gemspec CHANGED Viewed

@@ -11,7 +11,7 @@ Gem::Specification.new do |spec|
   spec.summary = 'A fast Ruby FFI wrapper for the TRE approximate regex matching library'
   spec.description = [
-    'TreRegex provides a high-performance Ruby interface to the TRE C library using FFI.',
+    'TreRegex provides a high-performance Ruby interface to the TRE C library.',
     'It brings robust approximate (fuzzy) regular expression matching to Ruby, featuring',
     'multi-byte Unicode string safety, and granular error limits'
   ].join(' ')

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: tre_regex
 version: !ruby/object:Gem::Version
-  version: 0.2.1
+  version: 0.2.2
 platform: ruby
 authors:
 - Oleksii Vasyliev
@@ -23,9 +23,9 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '1.0'
-description: TreRegex provides a high-performance Ruby interface to the TRE C library
-  using FFI. It brings robust approximate (fuzzy) regular expression matching to Ruby,
-  featuring multi-byte Unicode string safety, and granular error limits
+description: TreRegex provides a high-performance Ruby interface to the TRE C library.
+  It brings robust approximate (fuzzy) regular expression matching to Ruby, featuring
+  multi-byte Unicode string safety, and granular error limits
 email:
 - leopard.not.a@gmail.com
 executables: []
@@ -64,7 +64,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 4.0.6
+rubygems_version: 4.0.10
 specification_version: 4
 summary: A fast Ruby FFI wrapper for the TRE approximate regex matching library
 test_files: []