multiregex 2.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
multiregex/__init__.py ADDED
@@ -0,0 +1,338 @@
1
+ r"""Speed up regex matching with non-regex substring "prematchers", similar to
2
+ Bloom filters.
3
+
4
+ For each regex pattern we use a list of simple (non-regex) substring prematchers.
5
+ When evaluating regex patterns on a string, we use the prematchers to restrict
6
+ the set of regex patterns to be run. Hence, the prematchers _must_ match each string
7
+ unless it's impossible for the corresponding regex to match, similar to Bloom filters.
8
+
9
+ Examples:
10
+ r"\bfoo\b" -> ["foo"]
11
+ r"(foo|bar) \s*" -> ["foo ", "bar "]
12
+ r"Gemäß Richtlinie" -> ["gemäß richtlinie"]
13
+ # Prematchers are all-lowercase (to support re.IGNORECASE).
14
+
15
+ Prematchers are attempted to be automatically generated from the regexes, see
16
+ `RegexMatcher.generate_prematchers`. You must provide a handcrafted list of
17
+ prematchers for regexes that this fails for. You may also override the
18
+ automatically generated prematchers.
19
+ """
20
+
21
+ import collections
22
+ import functools
23
+ import importlib
24
+ import re
25
+ import warnings
26
+
27
+ try:
28
+ sre_constants = re._constants # type: ignore
29
+ sre_parse = re._parser # type: ignore
30
+ except AttributeError:
31
+ import sre_constants
32
+ import sre_parse
33
+ from typing import (
34
+ Dict,
35
+ Iterable,
36
+ List,
37
+ Optional,
38
+ Pattern,
39
+ Set,
40
+ Tuple,
41
+ TypeVar,
42
+ Union,
43
+ cast,
44
+ )
45
+
46
+ import ahocorasick
47
+
48
# `import importlib` alone does not guarantee that the `metadata` submodule is
# loaded; import it explicitly so that `importlib.metadata` is always available.
import importlib.metadata

try:
    # Version of the installed distribution whose name equals this module's name.
    __version__ = importlib.metadata.version(__name__)
except importlib.metadata.PackageNotFoundError as e:
    # No installed distribution metadata found: warn and fall back to a
    # placeholder instead of failing the import.
    warnings.warn(f"Could not determine version of {__name__}", stacklevel=1)
    warnings.warn(str(e), stacklevel=1)
    __version__ = "unknown"


# Type of the values stored in the Aho-Corasick automaton.
V = TypeVar("V")
# A regex given either as a compiled pattern or as a pattern string.
PatternOrStr = Union[Pattern, str]
# The set of substring prematchers belonging to one pattern.
Prematchers = Set[str]
# Profiling counters of one pattern ("positives" / "false_positives").
FalsePositivesCounter = Dict[str, int]
60
+
61
+
62
class AhocorasickError(Exception):
    """Raised when a pyahocorasick call signals failure by returning ``False``."""
64
+
65
+
66
class RegexMatcher:
    """Run many regex patterns against a string, using substring prematchers
    to skip patterns that cannot match."""

    def __init__(
        self,
        patterns: "Iterable[Union[PatternOrStr, Tuple[PatternOrStr, Optional[Iterable[str]]]]]",
        count_prematcher_false_positives=False,
    ):
        """
        Parameters
        ----------
        patterns : list of patterns or (pattern, prematchers) tuples
            The patterns to match against. Patterns may either be instances of
            `re.Pattern` (results from `re.compile`) or strings.
            If given as `(pattern, prematchers)` tuples, `prematchers`
            are custom prematchers (iterables of strings) or `None` for automatic
            prematchers using `generate_prematchers`. To disable prematchers for
            a specific pattern (ie., always run the "slow" matcher without any
            prematching), use a `(pattern, [])` tuple. Plain patterns and tuples
            may be mixed within the same list.
        count_prematcher_false_positives : bool, default: False
            If true, enable "profiling" to check the effectiveness of prematchers on
            the input strings given to ``search``, ``match``, and ``fullmatch``.
            Use ``format_prematcher_false_positives`` to retrieve the profile.
        """
        patterns = self._normalize_patterns(patterns)
        patterns = self._generate_missing_prematchers(patterns)
        self.patterns = [pattern for pattern, _ in patterns]
        self.prematchers = dict(patterns)
        enumerated_patterns = list(enumerate(patterns))
        # Patterns with an empty prematcher set are always candidates.
        self.patterns_without_prematchers = {
            (idx, pattern)
            for idx, (pattern, prematchers) in enumerated_patterns
            if not prematchers
        }
        self.automaton = self._make_automaton(enumerated_patterns)

        self.count_prematcher_false_positives = count_prematcher_false_positives
        if count_prematcher_false_positives:
            self.prematcher_false_positives = {
                pattern: {"positives": 0, "false_positives": 0}
                for pattern in self.patterns
            }

    @classmethod
    def generate_prematchers(cls, pattern: Pattern) -> "Prematchers":
        """Generate prematchers for the given pattern."""
        return generate_prematchers(pattern)

    @staticmethod
    def _normalize_patterns(patterns):
        """Normalize `patterns` param given to `__init__`.

        Returns a list of ``(compiled_pattern, prematchers)`` tuples where
        ``prematchers`` is either a set of strings or ``None`` (meaning "to be
        generated automatically"). Every item is inspected on its own, so plain
        patterns and ``(pattern, prematchers)`` tuples may be mixed.

        Raises
        ------
        TypeError
            If the prematchers of a pattern are given as a single string.
        """

        def safe_set(iterable):
            # A bare string would silently be split into single characters.
            if isinstance(iterable, str):
                raise TypeError(
                    f"Refusing to interpret {iterable!r} as a list of prematchers, pass a list of strings instead"
                )
            return set(iterable)

        normalized = []
        for item in patterns:
            if isinstance(item, tuple):
                pattern, prematchers = item
                if prematchers is not None:
                    prematchers = safe_set(prematchers)
            else:
                pattern, prematchers = item, None
            normalized.append((re.compile(pattern), prematchers))
        return normalized

    def _generate_missing_prematchers(self, patterns):
        """Fill in automatic prematchers where none were given and validate
        every prematcher."""
        patterns = [
            (
                pattern,
                (
                    self.generate_prematchers(pattern)
                    if prematchers is None
                    else prematchers
                ),
            )
            for pattern, prematchers in patterns
        ]
        for _, prematchers in patterns:
            for prematcher in prematchers:
                validate_prematcher(prematcher)
        return patterns

    @staticmethod
    def _make_automaton(enumerated_patterns):
        """Create the pyahocorasick automaton.

        Returns ``None`` if no pattern has any prematcher.
        """
        pattern_candidates_by_prematchers = collections.defaultdict(set)
        for pattern_idx, (pattern, prematchers) in enumerated_patterns:
            for prematcher in prematchers:
                # `pattern_idx` is used for keeping patterns in order, see `get_pattern_candidates`.
                pattern_candidates_by_prematchers[prematcher].add(
                    (pattern_idx, pattern)
                )
        if not pattern_candidates_by_prematchers:
            # NOTE(review): pyahocorasick appears unable to finalize or iterate a
            # trie that holds no words - confirm against the installed version.
            # Without any prematcher there is nothing to look up anyway.
            return None
        return _ahocorasick_make_automaton(pattern_candidates_by_prematchers)

    def run(self, match_func, s, enable_prematchers=True):
        """Quickly run `match_func` against `s` for all patterns.

        Parameters
        ----------
        match_func : Callable[[Pattern, str], Optional[Match]]
            The base matching function, eg. `re.search`.
        s : str
            The string to match against.
        enable_prematchers : bool (default True)
            If false, do not use prematchers; use `match_func` only.

        Returns
        -------
        list of (pattern, match) tuples
            One entry for every pattern for which `match_func` returned a
            match, in the order the patterns were evaluated.
        """
        if enable_prematchers:
            candidates = self.get_pattern_candidates(s)
        else:
            candidates = self.patterns

        # Inlined versions for match_func = re.match/search, up to 30% faster.
        if match_func is re.search:
            re_results = [(pattern, pattern.search(s)) for pattern in candidates]
        elif match_func is re.match:
            re_results = [(pattern, pattern.match(s)) for pattern in candidates]
        elif match_func is re.fullmatch:
            re_results = [(pattern, pattern.fullmatch(s)) for pattern in candidates]
        else:
            re_results = [(pattern, match_func(pattern, s)) for pattern in candidates]

        if self.count_prematcher_false_positives:
            for pattern, match in re_results:
                self.prematcher_false_positives[pattern]["positives"] += 1
                if match is None:
                    self.prematcher_false_positives[pattern]["false_positives"] += 1

        return [(pattern, match) for pattern, match in re_results if match is not None]

    # Alias for ``run(re.search, ...)``.
    search = functools.partialmethod(run, re.search)
    # Alias for ``run(re.match, ...)``.
    match = functools.partialmethod(run, re.match)
    # Alias for ``run(re.fullmatch, ...)``.
    fullmatch = functools.partialmethod(run, re.fullmatch)

    def get_pattern_candidates(self, s: str) -> List[Pattern]:
        """Get a list of patterns that potentially match `s`.

        Pattern order is the same as the order of `patterns` given to `__init__`.
        """
        if self.automaton is None:
            # No prematchers at all, see `_make_automaton`.
            matches = ()
        else:
            # Prematchers are all-lowercase, so the haystack is lowercased too.
            matches = self.automaton.iter(s.lower())
        unordered_candidates = self.patterns_without_prematchers.union(
            *(candidates for _, candidates in matches)
        )
        # Sort by `pattern_idx`, see `_make_automaton`.
        ordered_candidates = sorted(unordered_candidates, key=lambda x: x[0])
        return [pattern for _, pattern in ordered_candidates]

    def get_prematcher_false_positives(
        self,
    ) -> "List[Tuple[Pattern, FalsePositivesCounter]]":
        """Return `(pattern, counters)` for all patterns with at least one
        prematcher false positive, worst pattern first.

        Raises
        ------
        RuntimeError
            If profiling was not enabled in `__init__`.
        """
        if not self.count_prematcher_false_positives:
            raise RuntimeError("Prematcher profiling not enabled")
        return sorted(
            (
                (pattern, fp_counter)
                for pattern, fp_counter in self.prematcher_false_positives.items()
                if fp_counter["false_positives"]
            ),
            key=lambda x: -x[1]["false_positives"],
        )

    def format_prematcher_false_positives(self, worst_n: Optional[int] = None) -> str:
        """Format the prematcher profile as a text table.

        Parameters
        ----------
        worst_n : int, optional
            Only include the `worst_n` patterns with the most false positives.
        """
        output = [
            "FP count | FP rate | Pattern / Prematchers",
            "---------+---------+----------------------",
        ]
        fp_data = self.get_prematcher_false_positives()[:worst_n]
        if fp_data:
            for pattern, fp_counter in fp_data:
                output.append(
                    "{:>8d} |    {:.2f} | {} / {}".format(
                        fp_counter["false_positives"],
                        fp_counter["false_positives"] / fp_counter["positives"],
                        pattern.pattern,
                        self.prematchers[pattern],
                    )
                )
        else:
            output.append("(No data)")
        return "\n".join(output)
255
+
256
+
257
def validate_prematcher(prematcher: str) -> None:
    """Check that `prematcher` is usable as a prematcher.

    Prematchers are searched in the lowercased input string, so a prematcher
    must be non-empty and must not change when lowercased itself.

    Raises
    ------
    ValueError
        If `prematcher` is empty, contains uppercase characters or is otherwise
        changed by ``str.lower`` (eg. title-case characters).
    """
    if (
        not prematcher
        or any(map(str.isupper, prematcher))
        # Catches characters that are not "upper" but still get lowercased.
        or prematcher != prematcher.lower()
    ):
        raise ValueError(
            f"Prematcher {prematcher!r} must be non-empty and all-lowercase"
        )
262
+
263
+
264
def generate_prematchers(pattern: Pattern) -> "Prematchers":
    """Generate fallback/default prematchers for the given regex `pattern`.

    Currently the fallback prematcher is just the longest (lowercased)
    terminal text in the pattern, eg. "Fast(er)? regex(es| matching)"
    -> {" regex"}. One level of branches with the "|" character is
    supported, ie. "(a|bb|ccc)" -> {"ccc", "a", "bb"}.

    Raises
    ------
    ValueError
        If no prematchers could be generated for `pattern`.
    """

    def _get_top_level_prematcher(sre_ast):
        # Longest streak of literals, or "" if there is none.
        return max(_sre_find_terminals(sre_ast), key=len, default="").lower()

    # Parse with the pattern's own VERBOSE flag. Otherwise the whitespace and
    # comments of a pattern compiled with `re.VERBOSE` would be parsed as
    # literal text and end up in prematchers that the regex does not require.
    sre_ast = _simplify_sre_ast(
        sre_parse.parse(
            pattern.pattern, pattern.flags & sre_constants.SRE_FLAG_VERBOSE
        )
    )

    # Simple case: We find a top-level terminal string (eg. r"Fast(er)" -> "fast").
    top_level_prematcher = _get_top_level_prematcher(sre_ast)
    if top_level_prematcher:
        return {top_level_prematcher}

    # Branch case: We find a first-level terminal string in a branch (eg. r"(abc|de)" -> {"abc", "de"}).
    # Each of the children must have a top-level simple prematcher. Nesting is not supported.
    sre_branches = (
        value[1] for type_, value in sre_ast if type_ == sre_constants.BRANCH
    )
    for children in sre_branches:
        simplified_children = map(_simplify_sre_ast, children)
        child_prematchers = set(map(_get_top_level_prematcher, simplified_children))
        # An empty string means that one alternative has no prematcher.
        if all(child_prematchers):
            return child_prematchers

    raise ValueError(f"Could not generate prematchers for {pattern.pattern!r}")
295
+
296
+
297
def _simplify_sre_ast(sre_ast):
    """Simplify an sre AST.

    - Transform pattern r"(...)" to r"...", also for nested groups
      (r"((...))"). Groups that add or remove flags are kept as they are.
    """
    while len(sre_ast) == 1 and sre_ast[0][0] is sre_constants.SUBPATTERN:
        subpattern = sre_ast[0][1]
        if len(subpattern) == 2:
            # Python < 3.6 has no subpattern flags support
            sre_ast = subpattern[1]
            continue
        _, add_flags, del_flags, inner = subpattern
        if add_flags or del_flags:
            # The group changes flags for its content, do not unwrap it.
            break
        sre_ast = inner
    return sre_ast
311
+
312
+
313
def _sre_find_terminals(sre_ast):
    """Yield the terminals (runs of consecutive LITERAL nodes) of an sre AST.

    A non-literal node that does not end a run yields an empty string.
    """
    streak = []
    for node in sre_ast:
        if node[0] is sre_constants.LITERAL:
            streak.append(chr(node[1]))
            continue
        # A non-literal node closes the current (possibly empty) run.
        yield "".join(streak)
        streak = []
    if streak:
        yield "".join(streak)
323
+
324
+
325
def _ahocorasick_make_automaton(words: Dict[str, V]) -> "ahocorasick.Automaton[V]":
    """Build a pyahocorasick automaton from a dictionary of `needle -> value`
    items, raising `AhocorasickError` if any pyahocorasick call returns ``False``."""
    trie = ahocorasick.Automaton()  # type: ahocorasick.Automaton[V]
    for needle in words:
        added = trie.add_word(needle, words[needle])
        _ahocorasick_ensure_successful(added)
    finalized = trie.make_automaton()
    _ahocorasick_ensure_successful(finalized)
    return trie
333
+
334
+
335
def _ahocorasick_ensure_successful(res):
    """Raise `AhocorasickError` if `res` is ``False``.

    Pyahocorasick returns errors as bools.
    """
    if res is not False:
        return
    raise AhocorasickError("Error performing ahocorasick call")
multiregex/py.typed ADDED
File without changes
@@ -0,0 +1,11 @@
1
+ Copyright 2022 QuantCo Inc
2
+
3
+ Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
4
+
5
+ 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
6
+
7
+ 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
8
+
9
+ 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
10
+
11
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,125 @@
1
+ Metadata-Version: 2.1
2
+ Name: multiregex
3
+ Version: 2.0.2
4
+ Summary: Quickly match many regexes against a string. Provides 2-10x speedups over naïve regex matching.
5
+ Author-email: "QuantCo, Inc." <noreply@quantco.com>, Jonas Haag <jonas@lophus.org>
6
+ Maintainer-email: Bela Stoyan <bela.stoyan@quantco.com>
7
+ Project-URL: Home, https://github.com/quantco/multiregex
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.8
10
+ Classifier: Programming Language :: Python :: 3.9
11
+ Classifier: Programming Language :: Python :: 3.10
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Requires-Python: >=3.8
15
+ Description-Content-Type: text/markdown
16
+ License-File: LICENSE
17
+ Requires-Dist: pyahocorasick
18
+
19
+ # multiregex
20
+
21
+ [![CI](https://github.com/Quantco/multiregex/actions/workflows/ci.yml/badge.svg)](https://github.com/Quantco/multiregex/actions/workflows/ci.yml)
22
+ [![Documentation](https://img.shields.io/badge/docs-latest-success?style=plastic)](https://docs.dev.quantco.cloud/qc-github-artifacts/Quantco/multiregex/latest/index.html)
23
+ [![conda-forge](https://img.shields.io/conda/vn/conda-forge/multiregex?logoColor=white&logo=conda-forge)](https://anaconda.org/conda-forge/multiregex)
24
+ [![pypi-version](https://img.shields.io/pypi/v/multiregex.svg?logo=pypi&logoColor=white)](https://pypi.org/project/multiregex)
25
+ [![python-version](https://img.shields.io/pypi/pyversions/multiregex?logoColor=white&logo=python)](https://pypi.org/project/multiregex)
26
+
27
+ Quickly match many regexes against a string. Provides 2-10x speedups over naïve regex matching.
28
+
29
+ ## Introduction
30
+
31
+ See [this introductory blog post](https://tech.quantco.com/2022/07/31/multiregex.html).
32
+
33
+ ## Installation
34
+
35
+ You can install the package in development mode using:
36
+
37
+ ```bash
38
+ git clone https://github.com/quantco/multiregex
39
+ cd multiregex
40
+
41
+ pixi run pre-commit-install
42
+ pixi run postinstall
43
+ pixi run test
44
+ ```
45
+
46
+ ## Usage
47
+
48
+ ```py
49
+ import multiregex
50
+
51
+ # Create matcher from multiple regexes.
52
+ my_patterns = [r"\w+@\w+\.com", r"\w+\.com"]
53
+ matcher = multiregex.RegexMatcher(my_patterns)
54
+
55
+ # Run `re.search` for all regexes.
56
+ # Returns a list of matches as (re.Pattern, re.Match) tuples.
57
+ matcher.search("john.doe@example.com")
58
+ # => [(re.compile('\\w+@\\w+\\.com'), <re.Match ... 'doe@example.com'>),
59
+ # (re.compile('\\w+\\.com'), <re.Match ... 'example.com'>)]
60
+
61
+ # Same as above, but with `re.match`.
62
+ matcher.match(...)
63
+ # Same as above, but with `re.fullmatch`.
64
+ matcher.fullmatch(...)
65
+ ```
66
+
67
+ ### Custom prematchers
68
+
69
+ To be able to quickly match many regexes against a string, `multiregex` uses
70
+ "prematchers" under the hood. Prematchers are lists of non-regex strings of which
71
+ at least one can be assumed to be present in the haystack if the corresponding regex matches.
72
+ As an example, a valid prematcher of `r"\w+\.com"` could be `[".com"]` and a valid
73
+ prematcher of `r"(B|b)aNäNa"` could be `["b"]` or `["anäna"]`.
74
+ Note that prematchers must be all-lowercase (in order for `multiregex` to be able to support `re.IGNORECASE`).
75
+
76
+ You will likely have to provide your own prematchers for all but the simplest
77
+ regex patterns:
78
+
79
+ ```py
80
+ multiregex.RegexMatcher([r"\d+"])
81
+ # => ValueError: Could not generate prematchers for '\\d+'
82
+ ```
83
+
84
+ To provide custom prematchers, pass `(pattern, prematchers)` tuples:
85
+
86
+ ```py
87
+ multiregex.RegexMatcher([(r"\d+", map(str, range(10)))])
88
+ ```
89
+
90
+ To use a mixture of automatic and custom prematchers, pass `prematchers=None`:
91
+
92
+ ```py
93
+ matcher = multiregex.RegexMatcher([(r"\d+", map(str, range(10))), (r"\w+\.com", None)])
94
+ matcher.prematchers
95
+ # => {re.compile('\\d+'): {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9'},
96
+ #     re.compile('\\w+\\.com'): {'.com'}}
97
+ ```
98
+
99
+ ### Disabling prematchers
100
+
101
+ To disable prematching for certain patterns entirely (ie., always run the regex
102
+ without first running any prematchers), pass an empty list of prematchers:
103
+
104
+ ```py
105
+ multiregex.RegexMatcher([(r"super complicated regex", [])])
106
+ ```
107
+
108
+ ### Profiling prematchers
109
+
110
+ To check if your prematchers are effective, you can use the built-in prematcher "profiler":
111
+
112
+ ```py
113
+ yyyy_mm_dd = r"(19|20)\d\d-\d\d-\d\d" # Default prematchers: {'-'}
114
+ matcher = multiregex.RegexMatcher([yyyy_mm_dd], count_prematcher_false_positives=True)
115
+ for string in my_benchmark_dataset:
116
+ matcher.search(string)
117
+ print(matcher.format_prematcher_false_positives())
118
+ # => For example:
119
+ # FP count | FP rate | Pattern / Prematchers
120
+ # ---------+---------+----------------------
121
+ # 137 | 0.72 | (19|20)\d\d-\d\d-\d\d / {'-'}
122
+ ```
123
+
124
+ In this example, there were 137 input strings that were matched positive by the prematcher but negative by the regex.
125
+ In other words, the prematcher failed to prevent slow regex evaluation in 72% of the cases.
@@ -0,0 +1,7 @@
1
+ multiregex/__init__.py,sha256=6y-8Kj9Ka3wg9t-QDJ4QZHj1rZ3Z45jtgh560RbIo1M,12776
2
+ multiregex/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ multiregex-2.0.2.dist-info/LICENSE,sha256=8iTh63RQnJRHksLaxQWFA2C4WxaXZUnN9LsvUjOdKFg,1456
4
+ multiregex-2.0.2.dist-info/METADATA,sha256=JbQ8DP39FIJu0Y31U05lpAflI_shFJj7Ixp_MgN78es,4719
5
+ multiregex-2.0.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
6
+ multiregex-2.0.2.dist-info/top_level.txt,sha256=sHA7Yg-eHBrGliGfhxx5V2urLZXf4TlhOgovX-5m0kY,11
7
+ multiregex-2.0.2.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: bdist_wheel (0.43.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ multiregex