PyPI - multiregex - Versions diffs - 2.0.2__py3-none-any.whl - Mend

multiregex 2.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

multiregex/__init__.py +338 -0
multiregex/py.typed +0 -0
multiregex-2.0.2.dist-info/LICENSE +11 -0
multiregex-2.0.2.dist-info/METADATA +125 -0
multiregex-2.0.2.dist-info/RECORD +7 -0
multiregex-2.0.2.dist-info/WHEEL +5 -0
multiregex-2.0.2.dist-info/top_level.txt +1 -0

multiregex/__init__.py ADDED Viewed

@@ -0,0 +1,338 @@
+r"""Speed up regex matching with non-regex substring "prematchers", similar to
+Bloom filters.
+For each regex pattern we use a list of simple (non-regex) substring prematchers.
+When evaluating regex patterns on a string, we use the prematchers to restrict
+the set of regex patterns to be run. Hence, the prematchers _must_ match each string
+unless it's impossible for the corresponding regex to match, similar to Bloom filters.
+Examples:
+    r"\bfoo\b"          -> ["foo"]
+    r"(foo|bar) \s*"    -> ["foo ", "bar "]
+    r"Gemäß Richtlinie" -> ["gemäß richtlinie"]
+    # Prematchers are all-lowercase (to support re.IGNORECASE).
+multiregex attempts to generate prematchers automatically from the regexes, see
+`RegexMatcher.generate_prematchers`.  You must provide a handcrafted list of
+prematchers for regexes for which this fails.  You may also override the
+automatically generated prematchers.
+"""
+import collections
+import functools
+import importlib
+import re
+import warnings
+try:
+    sre_constants = re._constants  # type: ignore
+    sre_parse = re._parser  # type: ignore
+except AttributeError:
+    import sre_constants
+    import sre_parse
+from typing import (
+    Dict,
+    Iterable,
+    List,
+    Optional,
+    Pattern,
+    Set,
+    Tuple,
+    TypeVar,
+    Union,
+    cast,
+)
+import ahocorasick
+# ``import importlib`` alone does not guarantee that the ``importlib.metadata``
+# submodule is loaded; import it explicitly so the lookup below cannot fail
+# with an AttributeError that the ``except`` clause would not catch.
+import importlib.metadata
+try:
+    # Version of the installed distribution that shares this module's name.
+    __version__ = importlib.metadata.version(__name__)
+except importlib.metadata.PackageNotFoundError as e:
+    # Not installed as a distribution (eg. run from a source checkout):
+    # warn and fall back to a placeholder instead of failing the import.
+    warnings.warn(f"Could not determine version of {__name__}", stacklevel=1)
+    warnings.warn(str(e), stacklevel=1)
+    __version__ = "unknown"
+# Type of the values stored in the ahocorasick automaton, see
+# `_ahocorasick_make_automaton`.
+V = TypeVar("V")
+# A regex given either as a compiled pattern or as a pattern source string.
+PatternOrStr = Union[Pattern, str]
+# The set of substring prematchers belonging to one pattern.
+Prematchers = Set[str]
+# Per-pattern profiling counters, keyed by "positives" and "false_positives".
+FalsePositivesCounter = Dict[str, int]
+class AhocorasickError(Exception):
+    """Raised when a pyahocorasick call reports failure by returning `False`."""
+    pass
+class RegexMatcher:
+    """Run many regex patterns against a string, skipping patterns that cannot match.
+    Every pattern has a set of lowercase substring "prematchers". An ahocorasick
+    automaton built from all prematchers is run over the lowercased input; only
+    patterns with at least one prematcher hit, plus patterns without any
+    prematchers, are then evaluated as regexes.
+    """
+    def __init__(
+        self,
+        patterns: Iterable[
+            Union[PatternOrStr, Tuple[PatternOrStr, Optional[Iterable[str]]]]
+        ],
+        count_prematcher_false_positives=False,
+    ):
+        """
+        Parameters
+        ----------
+        patterns : list of patterns or (pattern, prematchers) tuples
+            The patterns to match against. Patterns may either be instances of
+            `re.Pattern` (results from `re.compile`) or strings.
+            If given as `(pattern, prematchers)` tuples, `prematchers`
+            are custom prematchers (iterables of strings) or `None` for automatic
+            prematchers using `generate_prematchers`. To disable prematchers for
+            a specific pattern (ie., always run the "slow" matcher without any
+            prematching), use a `(pattern, [])` tuple. Bare patterns and tuples
+            may be mixed in the same list.
+        count_prematcher_false_positives : bool, default: False
+            If true, enable "profiling" to check the effectiveness of prematchers on
+            the input strings given to ``search``, ``match``, and ``fullmatch``.
+            Use ``format_prematcher_false_positives`` to retrieve the profile.
+        """
+        patterns = self._normalize_patterns(patterns)
+        patterns = self._generate_missing_prematchers(patterns)
+        self.patterns = [pattern for pattern, _ in patterns]
+        self.prematchers = dict(patterns)
+        enumerated_patterns = list(enumerate(patterns))
+        # Patterns with an empty prematcher set are always candidates.
+        self.patterns_without_prematchers = {
+            (idx, pattern)
+            for idx, (pattern, prematchers) in enumerated_patterns
+            if not prematchers
+        }
+        self.automaton = self._make_automaton(enumerated_patterns)
+        self.count_prematcher_false_positives = count_prematcher_false_positives
+        if count_prematcher_false_positives:
+            self.prematcher_false_positives = {
+                pattern: {"positives": 0, "false_positives": 0}
+                for pattern in self.patterns
+            }
+    @classmethod
+    def generate_prematchers(cls, pattern: Pattern) -> Prematchers:
+        """Generate prematchers for the given pattern."""
+        return generate_prematchers(pattern)
+    @staticmethod
+    def _normalize_patterns(patterns):
+        """Normalize the `patterns` param given to `__init__`.
+        Returns a list of `(compiled pattern, prematchers)` tuples in input
+        order, where `prematchers` is a set of strings, or `None` if the
+        prematchers are still to be generated. Whether an item is a bare pattern
+        or a `(pattern, prematchers)` tuple is decided per item, so both forms
+        may be mixed.
+        """
+        def safe_set(iterable):
+            # A plain string is iterable too, but `set("foo")` would silently
+            # produce single-character prematchers, so refuse it.
+            if isinstance(iterable, str):
+                raise TypeError(
+                    f"Refusing to interpret {iterable!r} as a list of patterns, pass a list of strings instead"
+                )
+            return set(iterable)
+        normalized = []
+        for item in patterns:
+            if isinstance(item, tuple):
+                pattern, prematchers = item
+            else:
+                pattern, prematchers = item, None
+            compiled = re.compile(pattern)
+            normalized.append(
+                (compiled, None if prematchers is None else safe_set(prematchers))
+            )
+        return normalized
+    def _generate_missing_prematchers(self, patterns):
+        """Fill in automatic prematchers where `None` was given and validate
+        all prematchers. Returns the list of `(pattern, prematchers)` tuples."""
+        patterns = [
+            (
+                pattern,
+                (
+                    self.generate_prematchers(pattern)
+                    if prematchers is None
+                    else prematchers
+                ),
+            )
+            for pattern, prematchers in patterns
+        ]
+        for _, prematchers in patterns:
+            for prematcher in prematchers:
+                validate_prematcher(prematcher)
+        return patterns
+    @staticmethod
+    def _make_automaton(enumerated_patterns):
+        """Create the pyahocorasick automaton.
+        The automaton maps each prematcher to the set of `(pattern_idx, pattern)`
+        tuples of all patterns that use this prematcher.
+        """
+        pattern_candidates_by_prematchers = collections.defaultdict(set)
+        for pattern_idx, (pattern, prematchers) in enumerated_patterns:
+            for prematcher in prematchers:
+                # `pattern_idx` is used for keeping patterns in order, see `get_pattern_candidates`.
+                pattern_candidates_by_prematchers[prematcher].add(
+                    (pattern_idx, pattern)
+                )
+        return _ahocorasick_make_automaton(pattern_candidates_by_prematchers)
+    def run(self, match_func, s, enable_prematchers=True):
+        """Quickly run `match_func` against `s` for all patterns.
+        Parameters
+        ----------
+        match_func : Callable[[Pattern, str], Optional[Match]]
+            The base matching function, eg. `re.search`. It is called as
+            `match_func(pattern, s)`.
+        s : str
+            The string to match against.
+        enable_prematchers : bool (default True)
+            If false, do not use prematchers; use `match_func` only.
+        Returns
+        -------
+        list of (pattern, match) tuples
+            One entry for each evaluated pattern whose match is not `None`.
+        """
+        if enable_prematchers:
+            candidates = self.get_pattern_candidates(s)
+        else:
+            candidates = self.patterns
+        # Inlined versions for match_func = re.match/search, up to 30% faster.
+        if match_func is re.search:
+            re_results = [(pattern, pattern.search(s)) for pattern in candidates]
+        elif match_func is re.match:
+            re_results = [(pattern, pattern.match(s)) for pattern in candidates]
+        elif match_func is re.fullmatch:
+            re_results = [(pattern, pattern.fullmatch(s)) for pattern in candidates]
+        else:
+            re_results = [(pattern, match_func(pattern, s)) for pattern in candidates]
+        if self.count_prematcher_false_positives:
+            # Every evaluated pattern counts as a "positive"; those that then
+            # fail to match count as "false positives".
+            for pattern, match in re_results:
+                self.prematcher_false_positives[pattern]["positives"] += 1
+                if match is None:
+                    self.prematcher_false_positives[pattern]["false_positives"] += 1
+        return [(pattern, match) for pattern, match in re_results if match is not None]
+    # Alias for ``run(re.search, ...)``.
+    search = functools.partialmethod(run, re.search)
+    # Alias for ``run(re.match, ...)``.
+    match = functools.partialmethod(run, re.match)
+    # Alias for ``run(re.fullmatch, ...)``.
+    fullmatch = functools.partialmethod(run, re.fullmatch)
+    def get_pattern_candidates(self, s: str) -> List[Pattern]:
+        """Get a list of patterns that potentially match `s`.
+        Pattern order is the same as the order of `patterns` given to `__init__`.
+        """
+        # Prematchers are lowercase, so the haystack is lowercased as well.
+        matches = self.automaton.iter(s.lower())
+        unordered_candidates = self.patterns_without_prematchers.union(
+            *(candidates for _, candidates in matches)
+        )
+        # Sort by `pattern_idx`, see `_make_automaton`.
+        ordered_candidates = sorted(unordered_candidates, key=lambda x: x[0])
+        return [pattern for _, pattern in ordered_candidates]
+    def get_prematcher_false_positives(
+        self,
+    ) -> List[Tuple[Pattern, FalsePositivesCounter]]:
+        """Get the profiling counters of all patterns with at least one false
+        positive, sorted by descending false positive count.
+        Raises
+        ------
+        RuntimeError
+            If profiling was not enabled in `__init__`.
+        """
+        if not self.count_prematcher_false_positives:
+            raise RuntimeError("Prematcher profiling not enabled")
+        return sorted(
+            (
+                (pattern, fp_counter)
+                for pattern, fp_counter in self.prematcher_false_positives.items()
+                if fp_counter["false_positives"]
+            ),
+            key=lambda x: -x[1]["false_positives"],
+        )
+    def format_prematcher_false_positives(self, worst_n: Optional[int] = None) -> str:
+        """Format the prematcher profile as a text table.
+        Parameters
+        ----------
+        worst_n : int, optional
+            Only include the `worst_n` patterns with the most false positives.
+            By default, include all patterns that had false positives.
+        """
+        output = [
+            "FP count | FP rate | Pattern / Prematchers",
+            "---------+---------+----------------------",
+        ]
+        fp_data = self.get_prematcher_false_positives()[:worst_n]
+        if fp_data:
+            for pattern, fp_counter in fp_data:
+                output.append(
+                    "{:>8d} |    {:.2f} | {} / {}".format(
+                        fp_counter["false_positives"],
+                        # "positives" >= "false_positives" > 0 here, so this
+                        # division cannot fail.
+                        fp_counter["false_positives"] / fp_counter["positives"],
+                        pattern.pattern,
+                        self.prematchers[pattern],
+                    )
+                )
+        else:
+            output.append("(No data)")
+        return "\n".join(output)
+def validate_prematcher(prematcher: str) -> None:
+    """Check that `prematcher` can be used as a prematcher.
+    Prematchers are searched for in the lowercased input string, so a
+    prematcher is only usable if it is non-empty and lowercasing leaves it
+    unchanged. Non-ASCII characters are allowed.
+    Raises
+    ------
+    ValueError
+        If `prematcher` is empty or is changed by `str.lower`.
+    """
+    # Compare against `str.lower()` rather than testing `str.isupper()` per
+    # character: title-case characters are not "upper" but are still changed
+    # by lowercasing and could therefore never be found in the input.
+    if not prematcher or prematcher != prematcher.lower():
+        raise ValueError(
+            f"Prematcher {prematcher!r} must be non-empty and all-lowercase"
+        )
+def generate_prematchers(pattern: Pattern) -> Prematchers:
+    """Generate fallback/default prematchers for the given regex `pattern`.
+    Currently the fallback prematcher is just the set of longest
+    terminal texts in the pattern, eg. "Fast(er)? regex(es| matching)"
+    -> {" regex"}. One level of branches with the "|" character is
+    supported, ie. "(a|bb|ccc)" -> {"ccc", "a", "bb"}.
+    Raises
+    ------
+    ValueError
+        If no prematchers could be generated for `pattern`.
+    """
+    def _get_top_level_prematcher(sre_ast):
+        # Longest streak of literals, lowercased; "" if there is none.
+        return max(_sre_find_terminals(sre_ast), key=len, default="").lower()
+    # Parse with the flags the pattern was compiled with, so that eg. insignificant
+    # whitespace in `re.VERBOSE` patterns does not end up in the prematchers.
+    # `re.DEBUG` is masked out because it would only print the parse tree.
+    parse_flags = pattern.flags & ~re.DEBUG
+    sre_ast = _simplify_sre_ast(sre_parse.parse(pattern.pattern, parse_flags))
+    # Simple case: We find a top-level terminal string (eg. r"Fast(er)" -> "Fast").
+    top_level_prematcher = _get_top_level_prematcher(sre_ast)
+    if top_level_prematcher:
+        return {top_level_prematcher}
+    # Branch case: We find a first-level terminal string in a branch (eg. r"(abc|de)" -> {"abc", "de"}).
+    # Each of the children must have a top-level simple prematcher. Nesting is not supported.
+    sre_branches = (
+        value[1] for type_, value in sre_ast if type_ == sre_constants.BRANCH
+    )
+    for children in sre_branches:
+        simplified_children = map(_simplify_sre_ast, children)
+        child_prematchers = set(map(_get_top_level_prematcher, simplified_children))
+        # An empty string in the set means that some alternative has no terminal.
+        if all(child_prematchers):
+            return child_prematchers
+    raise ValueError(f"Could not generate prematchers for {pattern.pattern!r}")
+def _simplify_sre_ast(sre_ast):
+    """Unwrap an sre AST that consists of exactly one flag-less group.
+    - Pattern r"(...)" becomes r"...".
+    - Any other AST is returned unchanged.
+    """
+    if len(sre_ast) != 1:
+        return sre_ast
+    node = sre_ast[0]
+    if node[0] is not sre_constants.SUBPATTERN:
+        return sre_ast
+    argument = node[1]
+    if len(argument) == 2:
+        # Two-element argument layout (Python < 3.6, no subpattern flags).
+        return argument[1]
+    _, flags_added, flags_removed, inner = argument
+    if flags_added or flags_removed:
+        # The group changes flags, so it cannot be dropped.
+        return sre_ast
+    return inner
+def _sre_find_terminals(sre_ast):
+    """Yield all terminals (streaks of consecutive LITERALs) of an sre AST.
+    A non-literal node that does not directly follow a literal yields an
+    empty string.
+    """
+    streak = []
+    for position in range(len(sre_ast)):
+        node = sre_ast[position]
+        if node[0] is sre_constants.LITERAL:
+            streak.append(chr(node[1]))
+            continue
+        # A non-literal node ends the current (possibly empty) streak.
+        yield "".join(streak)
+        streak = []
+    if streak:
+        # Streak that runs up to the end of the AST.
+        yield "".join(streak)
+def _ahocorasick_make_automaton(words: Dict[str, V]) -> "ahocorasick.Automaton[V]":
+    """Build a pyahocorasick automaton from a `needle -> value` dictionary.
+    Raises `AhocorasickError` if adding a word or building the automaton
+    reports failure.
+    """
+    result = ahocorasick.Automaton()  # type: ahocorasick.Automaton[V]
+    for needle, payload in words.items():
+        added = result.add_word(needle, payload)
+        _ahocorasick_ensure_successful(added)
+    # Finalize the automaton after all words have been added.
+    built = result.make_automaton()
+    _ahocorasick_ensure_successful(built)
+    return result
+def _ahocorasick_ensure_successful(res):
+    """Raise `AhocorasickError` if a pyahocorasick call signalled failure.
+    Pyahocorasick returns errors as bools: only the literal `False` counts as
+    failure, any other result passes.
+    """
+    if res is not False:
+        return
+    raise AhocorasickError("Error performing ahocorasick call")

multiregex/py.typed ADDED Viewed

File without changes

multiregex-2.0.2.dist-info/LICENSE ADDED Viewed

@@ -0,0 +1,11 @@
+Copyright 2022 QuantCo Inc
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

multiregex-2.0.2.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,125 @@
+Metadata-Version: 2.1
+Name: multiregex
+Version: 2.0.2
+Summary: Quickly match many regexes against a string. Provides 2-10x speedups over naïve regex matching.
+Author-email: "QuantCo, Inc." <noreply@quantco.com>, Jonas Haag <jonas@lophus.org>
+Maintainer-email: Bela Stoyan <bela.stoyan@quantco.com>
+Project-URL: Home, https://github.com/quantco/multiregex
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: pyahocorasick
+# multiregex
+[![CI](https://github.com/Quantco/multiregex/actions/workflows/ci.yml/badge.svg)](https://github.com/Quantco/multiregex/actions/workflows/ci.yml)
+[![Documentation](https://img.shields.io/badge/docs-latest-success?style=plastic)](https://docs.dev.quantco.cloud/qc-github-artifacts/Quantco/multiregex/latest/index.html)
+[![conda-forge](https://img.shields.io/conda/vn/conda-forge/multiregex?logoColor=white&logo=conda-forge)](https://anaconda.org/conda-forge/multiregex)
+[![pypi-version](https://img.shields.io/pypi/v/multiregex.svg?logo=pypi&logoColor=white)](https://pypi.org/project/multiregex)
+[![python-version](https://img.shields.io/pypi/pyversions/multiregex?logoColor=white&logo=python)](https://pypi.org/project/multiregex)
+Quickly match many regexes against a string. Provides 2-10x speedups over naïve regex matching.
+## Introduction
+See [this introductory blog post](https://tech.quantco.com/2022/07/31/multiregex.html).
+## Installation
+You can install the package in development mode using:
+```bash
+git clone https://github.com/quantco/multiregex
+cd multiregex
+pixi run pre-commit-install
+pixi run postinstall
+pixi run test
+```
+## Usage
+```py
+import multiregex
+# Create matcher from multiple regexes.
+my_patterns = [r"\w+@\w+\.com", r"\w+\.com"]
+matcher = multiregex.RegexMatcher(my_patterns)
+# Run `re.search` for all regexes.
+# Returns a list of matches as (re.Pattern, re.Match) tuples.
+matcher.search("john.doe@example.com")
+# => [(re.compile('\\w+@\\w+\\.com'), <re.Match ... 'doe@example.com'>),
+#     (re.compile('\\w+\\.com'), <re.Match ... 'example.com'>)]
+# Same as above, but with `re.match`.
+matcher.match(...)
+# Same as above, but with `re.fullmatch`.
+matcher.fullmatch(...)
+```
+### Custom prematchers
+To be able to quickly match many regexes against a string, `multiregex` uses
+"prematchers" under the hood. Prematchers are lists of non-regex strings of which
+at least one can be assumed to be present in the haystack if the corresponding regex matches.
+As an example, a valid prematcher of `r"\w+\.com"` could be `[".com"]` and a valid
+prematcher of `r"(B|b)aNäNa"` could be `["b"]` or `["anäna"]`.
+Note that prematchers must be all-lowercase (in order for `multiregex` to be able to support `re.IGNORECASE`).
+You will likely have to provide your own prematchers for all but the simplest
+regex patterns:
+```py
+multiregex.RegexMatcher([r"\d+"])
+# => ValueError: Could not generate prematchers for '\\d+'
+```
+To provide custom prematchers, pass `(pattern, prematchers)` tuples:
+```py
+multiregex.RegexMatcher([(r"\d+", map(str, range(10)))])
+```
+To use a mixture of automatic and custom prematchers, pass `prematchers=None`:
+```py
+matcher = multiregex.RegexMatcher([(r"\d+", map(str, range(10))), (r"\w+\.com", None)])
+matcher.prematchers
+# => {re.compile('\\d+'): {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9'},
+#     re.compile('\\w+\\.com'): {'.com'}}
+```
+### Disabling prematchers
+To disable prematching for a certain pattern entirely (ie., always run the regex
+without first running any prematchers), pass an empty list of prematchers:
+```py
+multiregex.RegexMatcher([(r"super complicated regex", [])])
+```
+### Profiling prematchers
+To check if your prematchers are effective, you can use the built-in prematcher "profiler":
+```py
+yyyy_mm_dd = r"(19|20)\d\d-\d\d-\d\d"  # Default prematchers: {'-'}
+matcher = multiregex.RegexMatcher([yyyy_mm_dd], count_prematcher_false_positives=True)
+for string in my_benchmark_dataset:
+    matcher.search(string)
+print(matcher.format_prematcher_false_positives())
+# => For example:
+# FP count | FP rate | Pattern / Prematchers
+# ---------+---------+----------------------
+#      137 |    0.72 | (19|20)\d\d-\d\d-\d\d / {'-'}
+```
+In this example, there were 137 input strings that were matched positive by the prematcher but negative by the regex.
+In other words, the prematcher failed to prevent slow regex evaluation in 72% of the cases.

multiregex-2.0.2.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,7 @@
+multiregex/__init__.py,sha256=6y-8Kj9Ka3wg9t-QDJ4QZHj1rZ3Z45jtgh560RbIo1M,12776
+multiregex/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+multiregex-2.0.2.dist-info/LICENSE,sha256=8iTh63RQnJRHksLaxQWFA2C4WxaXZUnN9LsvUjOdKFg,1456
+multiregex-2.0.2.dist-info/METADATA,sha256=JbQ8DP39FIJu0Y31U05lpAflI_shFJj7Ixp_MgN78es,4719
+multiregex-2.0.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+multiregex-2.0.2.dist-info/top_level.txt,sha256=sHA7Yg-eHBrGliGfhxx5V2urLZXf4TlhOgovX-5m0kY,11
+multiregex-2.0.2.dist-info/RECORD,,

multiregex-2.0.2.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: bdist_wheel (0.43.0)
+Root-Is-Purelib: true
+Tag: py3-none-any

multiregex-2.0.2.dist-info/top_level.txt ADDED Viewed

@@ -0,0 +1 @@
+multiregex