purere2 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- purere2-0.1.0/LICENSE +21 -0
- purere2-0.1.0/PKG-INFO +108 -0
- purere2-0.1.0/README.md +80 -0
- purere2-0.1.0/pyproject.toml +41 -0
- purere2-0.1.0/setup.cfg +4 -0
- purere2-0.1.0/src/purere2/__init__.py +281 -0
- purere2-0.1.0/src/purere2/compiler.py +151 -0
- purere2-0.1.0/src/purere2/parser.py +520 -0
- purere2-0.1.0/src/purere2/pikevm.py +128 -0
- purere2-0.1.0/src/purere2.egg-info/PKG-INFO +108 -0
- purere2-0.1.0/src/purere2.egg-info/SOURCES.txt +14 -0
- purere2-0.1.0/src/purere2.egg-info/dependency_links.txt +1 -0
- purere2-0.1.0/src/purere2.egg-info/top_level.txt +1 -0
- purere2-0.1.0/tests/test_basic.py +76 -0
- purere2-0.1.0/tests/test_conformance.py +113 -0
- purere2-0.1.0/tests/test_redos.py +46 -0
purere2-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 adam2go
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
purere2-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: purere2
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: RE2 in pure Python: linear-time, ReDoS-safe regular expressions - no C extension, no backtracking
|
|
5
|
+
Author: adam2go
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/adam2go/purere2
|
|
8
|
+
Keywords: regex,re2,redos,linear-time,nfa,pure-python,security,untrusted
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
19
|
+
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
20
|
+
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
|
21
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
22
|
+
Classifier: Topic :: Security
|
|
23
|
+
Classifier: Topic :: Text Processing
|
|
24
|
+
Requires-Python: >=3.9
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
License-File: LICENSE
|
|
27
|
+
Dynamic: license-file
|
|
28
|
+
|
|
29
|
+
# purere2
|
|
30
|
+
|
|
31
|
+
[CI status](https://github.com/adam2go/purere2/actions/workflows/ci.yml)
|
|
32
|
+
[PyPI package](https://pypi.org/project/purere2/)
|
|
33
|
+
[CI workflow](.github/workflows/ci.yml)
|
|
34
|
+
[Expected divergences](EXPECTED_DIVERGENCES.md)
|
|
35
|
+
[License: MIT](LICENSE)
|
|
36
|
+
|
|
37
|
+
**[RE2](https://github.com/google/re2) in pure Python: linear-time,
|
|
38
|
+
ReDoS-safe regular expressions — no C extension, no backtracking.**
|
|
39
|
+
|
|
40
|
+
Python's built-in `re` (like PCRE and Perl) backtracks, so a pattern like
|
|
41
|
+
`(a+)+$` against a non-matching string can run for years on a few dozen
|
|
42
|
+
characters — the classic **ReDoS** denial-of-service. purere2 compiles every
|
|
43
|
+
pattern to an NFA and runs it with a Pike VM, so **matching is always linear
|
|
44
|
+
in the input** and no pattern can blow up. That guarantee is exactly why
|
|
45
|
+
[RE2 exists](https://github.com/google/re2/wiki/WhyRE2) — and why it has no
|
|
46
|
+
backreferences or lookaround.
|
|
47
|
+
|
|
48
|
+
```sh
|
|
49
|
+
pip install purere2
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
```python
|
|
53
|
+
import purere2
|
|
54
|
+
|
|
55
|
+
# linear time: this returns instantly; re.search would hang for minutes
|
|
56
|
+
purere2.search(r"(a+)+$", "a" * 50 + "!") # None, in microseconds
|
|
57
|
+
|
|
58
|
+
purere2.search(r"(\w+)@(\w+)", "x@y").groups() # ('x', 'y')
|
|
59
|
+
purere2.findall(r"\d{4}-\d\d-\d\d", "2026-06-19") # ['2026-06-19']
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
The API mirrors the common subset of the stdlib `re` module
|
|
63
|
+
(`compile`, `search`, `match`, `fullmatch`, `finditer`, `findall`, `sub`,
|
|
64
|
+
`subn`, `split`, flags `I/M/S`, named groups), so it is close to a drop-in
|
|
65
|
+
replacement for **running untrusted or LLM-generated patterns safely**.
|
|
66
|
+
|
|
67
|
+
## Why pure Python
|
|
68
|
+
|
|
69
|
+
`google-re2` and `pyre2` already wrap RE2 — but they need the RE2 **C++
|
|
70
|
+
library** (and a compiler, or a matching binary wheel). There was no pure
|
|
71
|
+
Python RE2, even though [RE2/J](https://github.com/google/re2j) (Java) and
|
|
72
|
+
[RE2JS](https://github.com/le0pard/re2js) (JavaScript) have existed for years.
|
|
73
|
+
purere2 is the missing one: zero dependencies, zero binaries, runs anywhere
|
|
74
|
+
Python runs — Pyodide/WASM, AWS Lambda, locked-down sandboxes — exactly where
|
|
75
|
+
you most want to run a pattern you don't trust.
|
|
76
|
+
|
|
77
|
+
The trade-off, stated honestly: on ordinary patterns purere2 is **slower than
|
|
78
|
+
the C-backed `re`** (a pure-Python NFA can't beat a C engine). Its value is
|
|
79
|
+
**safety and portability**, not raw speed — use it where a pattern is
|
|
80
|
+
untrusted, or where a C extension isn't an option, not as a blanket `re`
|
|
81
|
+
replacement.
|
|
82
|
+
|
|
83
|
+
## Verified against the real RE2
|
|
84
|
+
|
|
85
|
+
Conformance is differential, the same way [purefzf](https://github.com/adam2go/purefzf)
|
|
86
|
+
checks itself against the `fzf` binary: random RE2 patterns and inputs are run
|
|
87
|
+
through both purere2 and `google-re2` and compared byte-for-byte. Across
|
|
88
|
+
**150,000+ random checks, agreement is ~99.996%**; the residue is one
|
|
89
|
+
documented edge (a lazy quantifier nested in a greedy loop) — see
|
|
90
|
+
[EXPECTED_DIVERGENCES.md](EXPECTED_DIVERGENCES.md). The conformance test locks
|
|
91
|
+
that level, so any regression fails CI. There is also a ReDoS-safety suite of
|
|
92
|
+
patterns that hang stdlib `re` and must finish in milliseconds here.
|
|
93
|
+
|
|
94
|
+
## Supported syntax (v0.1)
|
|
95
|
+
|
|
96
|
+
Literals, `.`, character classes `[...]` with ranges / negation / POSIX
|
|
97
|
+
`[[:alpha:]]`, perl classes `\d \w \s` (ASCII, per RE2) and negations,
|
|
98
|
+
anchors `^ $ \A \z \b \B`, groups `(...)` / `(?:...)` / `(?P<name>...)`,
|
|
99
|
+
alternation `|`, quantifiers `* + ? {m} {m,n}` greedy and lazy, inline flags
|
|
100
|
+
`(?i) (?m) (?s)` and scoped `(?i:...)`, escapes including `\xHH` / `\x{...}`.
|
|
101
|
+
|
|
102
|
+
**Intentionally absent** (this is what makes it safe): backreferences and
|
|
103
|
+
lookaround. `(a)\1` raises `RegexError`. Deferred to a later version: Unicode
|
|
104
|
+
property classes `\p{...}` and full Unicode case folding.
|
|
105
|
+
|
|
106
|
+
## License
|
|
107
|
+
|
|
108
|
+
[MIT](LICENSE)
|
purere2-0.1.0/README.md
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# purere2
|
|
2
|
+
|
|
3
|
+
[CI status](https://github.com/adam2go/purere2/actions/workflows/ci.yml)
|
|
4
|
+
[PyPI package](https://pypi.org/project/purere2/)
|
|
5
|
+
[CI workflow](.github/workflows/ci.yml)
|
|
6
|
+
[Expected divergences](EXPECTED_DIVERGENCES.md)
|
|
7
|
+
[License: MIT](LICENSE)
|
|
8
|
+
|
|
9
|
+
**[RE2](https://github.com/google/re2) in pure Python: linear-time,
|
|
10
|
+
ReDoS-safe regular expressions — no C extension, no backtracking.**
|
|
11
|
+
|
|
12
|
+
Python's built-in `re` (like PCRE and Perl) backtracks, so a pattern like
|
|
13
|
+
`(a+)+$` against a non-matching string can run for years on a few dozen
|
|
14
|
+
characters — the classic **ReDoS** denial-of-service. purere2 compiles every
|
|
15
|
+
pattern to an NFA and runs it with a Pike VM, so **matching is always linear
|
|
16
|
+
in the input** and no pattern can blow up. That guarantee is exactly why
|
|
17
|
+
[RE2 exists](https://github.com/google/re2/wiki/WhyRE2) — and why it has no
|
|
18
|
+
backreferences or lookaround.
|
|
19
|
+
|
|
20
|
+
```sh
|
|
21
|
+
pip install purere2
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
```python
|
|
25
|
+
import purere2
|
|
26
|
+
|
|
27
|
+
# linear time: this returns instantly; re.search would hang for minutes
|
|
28
|
+
purere2.search(r"(a+)+$", "a" * 50 + "!") # None, in microseconds
|
|
29
|
+
|
|
30
|
+
purere2.search(r"(\w+)@(\w+)", "x@y").groups() # ('x', 'y')
|
|
31
|
+
purere2.findall(r"\d{4}-\d\d-\d\d", "2026-06-19") # ['2026-06-19']
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
The API mirrors the common subset of the stdlib `re` module
|
|
35
|
+
(`compile`, `search`, `match`, `fullmatch`, `finditer`, `findall`, `sub`,
|
|
36
|
+
`subn`, `split`, flags `I/M/S`, named groups), so it is close to a drop-in
|
|
37
|
+
replacement for **running untrusted or LLM-generated patterns safely**.
|
|
38
|
+
|
|
39
|
+
## Why pure Python
|
|
40
|
+
|
|
41
|
+
`google-re2` and `pyre2` already wrap RE2 — but they need the RE2 **C++
|
|
42
|
+
library** (and a compiler, or a matching binary wheel). There was no pure
|
|
43
|
+
Python RE2, even though [RE2/J](https://github.com/google/re2j) (Java) and
|
|
44
|
+
[RE2JS](https://github.com/le0pard/re2js) (JavaScript) have existed for years.
|
|
45
|
+
purere2 is the missing one: zero dependencies, zero binaries, runs anywhere
|
|
46
|
+
Python runs — Pyodide/WASM, AWS Lambda, locked-down sandboxes — exactly where
|
|
47
|
+
you most want to run a pattern you don't trust.
|
|
48
|
+
|
|
49
|
+
The trade-off, stated honestly: on ordinary patterns purere2 is **slower than
|
|
50
|
+
the C-backed `re`** (a pure-Python NFA can't beat a C engine). Its value is
|
|
51
|
+
**safety and portability**, not raw speed — use it where a pattern is
|
|
52
|
+
untrusted, or where a C extension isn't an option, not as a blanket `re`
|
|
53
|
+
replacement.
|
|
54
|
+
|
|
55
|
+
## Verified against the real RE2
|
|
56
|
+
|
|
57
|
+
Conformance is differential, the same way [purefzf](https://github.com/adam2go/purefzf)
|
|
58
|
+
checks itself against the `fzf` binary: random RE2 patterns and inputs are run
|
|
59
|
+
through both purere2 and `google-re2` and compared byte-for-byte. Across
|
|
60
|
+
**150,000+ random checks, agreement is ~99.996%**; the residue is one
|
|
61
|
+
documented edge (a lazy quantifier nested in a greedy loop) — see
|
|
62
|
+
[EXPECTED_DIVERGENCES.md](EXPECTED_DIVERGENCES.md). The conformance test locks
|
|
63
|
+
that level, so any regression fails CI. There is also a ReDoS-safety suite of
|
|
64
|
+
patterns that hang stdlib `re` and must finish in milliseconds here.
|
|
65
|
+
|
|
66
|
+
## Supported syntax (v0.1)
|
|
67
|
+
|
|
68
|
+
Literals, `.`, character classes `[...]` with ranges / negation / POSIX
|
|
69
|
+
`[[:alpha:]]`, perl classes `\d \w \s` (ASCII, per RE2) and negations,
|
|
70
|
+
anchors `^ $ \A \z \b \B`, groups `(...)` / `(?:...)` / `(?P<name>...)`,
|
|
71
|
+
alternation `|`, quantifiers `* + ? {m} {m,n}` greedy and lazy, inline flags
|
|
72
|
+
`(?i) (?m) (?s)` and scoped `(?i:...)`, escapes including `\xHH` / `\x{...}`.
|
|
73
|
+
|
|
74
|
+
**Intentionally absent** (this is what makes it safe): backreferences and
|
|
75
|
+
lookaround. `(a)\1` raises `RegexError`. Deferred to a later version: Unicode
|
|
76
|
+
property classes `\p{...}` and full Unicode case folding.
|
|
77
|
+
|
|
78
|
+
## License
|
|
79
|
+
|
|
80
|
+
[MIT](LICENSE)
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "purere2"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "RE2 in pure Python: linear-time, ReDoS-safe regular expressions - no C extension, no backtracking"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [{ name = "adam2go" }]
|
|
13
|
+
keywords = ["regex", "re2", "redos", "linear-time", "nfa", "pure-python",
|
|
14
|
+
"security", "untrusted"]
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Development Status :: 3 - Alpha",
|
|
17
|
+
"Intended Audience :: Developers",
|
|
18
|
+
"License :: OSI Approved :: MIT License",
|
|
19
|
+
"Programming Language :: Python :: 3",
|
|
20
|
+
"Programming Language :: Python :: 3.9",
|
|
21
|
+
"Programming Language :: Python :: 3.10",
|
|
22
|
+
"Programming Language :: Python :: 3.11",
|
|
23
|
+
"Programming Language :: Python :: 3.12",
|
|
24
|
+
"Programming Language :: Python :: 3.13",
|
|
25
|
+
"Programming Language :: Python :: 3.14",
|
|
26
|
+
"Programming Language :: Python :: Implementation :: CPython",
|
|
27
|
+
"Programming Language :: Python :: Implementation :: PyPy",
|
|
28
|
+
"Topic :: Software Development :: Libraries",
|
|
29
|
+
"Topic :: Security",
|
|
30
|
+
"Topic :: Text Processing",
|
|
31
|
+
]
|
|
32
|
+
dependencies = []
|
|
33
|
+
|
|
34
|
+
[project.urls]
|
|
35
|
+
Homepage = "https://github.com/adam2go/purere2"
|
|
36
|
+
|
|
37
|
+
[tool.setuptools.packages.find]
|
|
38
|
+
where = ["src"]
|
|
39
|
+
|
|
40
|
+
[tool.pytest.ini_options]
|
|
41
|
+
testpaths = ["tests"]
|
purere2-0.1.0/setup.cfg
ADDED
|
@@ -0,0 +1,281 @@
|
|
|
1
|
+
"""purere2: RE2 in pure Python - linear-time, ReDoS-safe regular expressions.
|
|
2
|
+
|
|
3
|
+
A pattern compiles to an NFA run by a Pike VM, so matching is always linear in
|
|
4
|
+
the input and no pattern can cause catastrophic backtracking. Like RE2 (and
|
|
5
|
+
unlike Python's `re`), backreferences and lookaround are intentionally absent -
|
|
6
|
+
that absence is exactly what makes the linear-time guarantee possible.
|
|
7
|
+
|
|
8
|
+
import purere2
|
|
9
|
+
purere2.search(r"(\\w+)@(\\w+)", "x@y").groups() # ('x', 'y')
|
|
10
|
+
pat = purere2.compile(r"\\d{4}-\\d\\d-\\d\\d")
|
|
11
|
+
pat.findall("2026-06-19 and 2027-01-01")
|
|
12
|
+
|
|
13
|
+
The API mirrors the common subset of the stdlib `re` module, so it is close to
|
|
14
|
+
a drop-in replacement for running untrusted or LLM-generated patterns safely.
|
|
15
|
+
"""
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
from .compiler import compile_ast
|
|
19
|
+
from .parser import Anchor, Concat, RegexError, parse
|
|
20
|
+
from .pikevm import Program
|
|
21
|
+
|
|
22
|
+
__version__ = "0.1.0"
|
|
23
|
+
__all__ = ["compile", "search", "match", "fullmatch", "finditer", "findall",
|
|
24
|
+
"sub", "subn", "split", "escape", "Pattern", "Match", "RegexError",
|
|
25
|
+
"error", "I", "IGNORECASE", "M", "MULTILINE", "S", "DOTALL"]
|
|
26
|
+
|
|
27
|
+
# flag constants (compatible values with re)
|
|
28
|
+
I = IGNORECASE = 2
|
|
29
|
+
M = MULTILINE = 8
|
|
30
|
+
S = DOTALL = 16
|
|
31
|
+
|
|
32
|
+
error = RegexError
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _nslots(ngroups):
|
|
36
|
+
return 2 * (ngroups + 1)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class Match:
    """Result of a successful match, mirroring the common ``re.Match`` API.

    ``_slots`` is a flat sequence of capture offsets: entries ``2 * g`` and
    ``2 * g + 1`` hold the start and end of group ``g`` (group 0 is the whole
    match). A ``None`` entry means the group did not take part in the match.
    """

    __slots__ = ("_slots", "string", "_names", "re", "pos", "endpos")

    def __init__(self, slots, string, names, pattern, pos=0, endpos=None):
        """Store the capture slots together with the searched string.

        ``names`` maps group names to group numbers and ``pattern`` is the
        object exposed as ``.re``. ``pos`` / ``endpos`` are optional and
        default to the whole string.
        """
        self._slots = slots
        self.string = string
        self._names = names
        self.re = pattern
        # Both attributes are declared in __slots__; they must be assigned
        # here, otherwise reading them raises AttributeError.
        self.pos = pos
        self.endpos = len(string) if endpos is None else endpos

    def _idx(self, group):
        """Resolve a group name or number to a validated group number.

        Raises IndexError for an unknown name or an out-of-range number.
        """
        if isinstance(group, str):
            try:
                return self._names[group]
            except KeyError:
                raise IndexError("no such group %r" % group) from None
        # Reject negative and too-large numbers explicitly; a negative number
        # would otherwise index the slot list from the end and return the
        # text of an unrelated group.
        if not 0 <= group < len(self._slots) // 2:
            raise IndexError("no such group %r" % (group,))
        return group

    def group(self, *groups):
        """Return one group (a string or None) or a tuple of several groups.

        With no arguments the whole match (group 0) is returned.
        """
        if not groups:
            groups = (0,)
        res = tuple(self._one(g) for g in groups)
        return res[0] if len(res) == 1 else res

    def _one(self, group):
        """Return the text of a single group, or None if it did not match."""
        g = self._idx(group)
        a = self._slots[2 * g]
        b = self._slots[2 * g + 1]
        if a is None or b is None:
            return None
        return self.string[a:b]

    def groups(self, default=None):
        """Return all groups from 1 upwards, using ``default`` for unmatched ones."""
        out = []
        for g in range(1, len(self._slots) // 2):
            v = self._one(g)
            out.append(default if v is None else v)
        return tuple(out)

    def groupdict(self, default=None):
        """Return a dict of named groups, using ``default`` for unmatched ones."""
        out = {}
        for name, index in self._names.items():
            value = self._one(index)  # evaluated once per name
            out[name] = default if value is None else value
        return out

    def start(self, group=0):
        """Return the start offset of ``group``, or -1 if it did not match."""
        s = self._slots[2 * self._idx(group)]
        return -1 if s is None else s

    def end(self, group=0):
        """Return the end offset of ``group``, or -1 if it did not match."""
        e = self._slots[2 * self._idx(group) + 1]
        return -1 if e is None else e

    def span(self, group=0):
        """Return ``(start(group), end(group))``."""
        return (self.start(group), self.end(group))

    def __getitem__(self, group):
        """Support ``m[group]`` as shorthand for a single-group lookup."""
        return self._one(group)

    def __repr__(self):
        return "<purere2.Match span=%r match=%r>" % (self.span(), self.group(0))
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
class Pattern:
    """A compiled regular expression, mirroring the common ``re.Pattern`` API."""

    __slots__ = ("pattern", "flags", "groups", "groupindex", "_prog", "_full",
                 "_ast")

    def __init__(self, pattern, flags=0):
        """Parse ``pattern`` under ``flags`` and compile it to a VM program."""
        self.pattern = pattern
        self.flags = flags
        ast, ngroups, names = parse(pattern,
                                    flags_i=bool(flags & I),
                                    flags_m=bool(flags & M),
                                    flags_s=bool(flags & S))
        self._ast = ast
        self.groups = ngroups
        self.groupindex = dict(names)
        self._prog = Program(compile_ast(ast, ngroups), _nslots(ngroups))
        self._full = None  # fullmatch program, built lazily by _fullprog()

    def _fullprog(self):
        """Return (building on first use) the program with a trailing ``\\z``."""
        if self._full is None:
            ast = Concat([self._ast, Anchor("\\z")])
            self._full = Program(compile_ast(ast, self.groups),
                                 _nslots(self.groups))
        return self._full

    @staticmethod
    def _bounds(string, pos, endpos):
        """Clamp ``pos`` / ``endpos`` into ``[0, len(string)]`` as ``re`` does.

        ``endpos=None`` means the end of the string. Clamping keeps a negative
        ``endpos`` from being read as a from-the-end slice index.
        """
        n = len(string)
        end = n if endpos is None else max(0, min(endpos, n))
        start = max(0, min(pos, n))
        return start, end

    def _wrap(self, slots, string, pos, endpos):
        """Build a Match for ``slots`` and record the searched bounds on it."""
        m = Match(slots, string, self.groupindex, self)
        m.pos = pos
        m.endpos = endpos
        return m

    def search(self, string, pos=0, endpos=None):
        """Return the first match found from ``pos`` onwards, or None."""
        return self._run(self._prog, string, pos, endpos, anchored=False)

    def match(self, string, pos=0, endpos=None):
        """Return a match starting exactly at ``pos``, or None."""
        return self._run(self._prog, string, pos, endpos, anchored=True)

    def fullmatch(self, string, pos=0, endpos=None):
        """Return a match spanning from ``pos`` to the end of the subject, or None."""
        return self._run(self._fullprog(), string, pos, endpos, anchored=True)

    def _run(self, prog, string, pos, endpos, anchored):
        """Run ``prog`` over ``string[:endpos]`` from ``pos``; return a Match or None."""
        pos, endpos = self._bounds(string, pos, endpos)
        if pos > endpos:
            return None  # empty search window, nothing can match
        # Only slice when endpos actually truncates the string.
        subject = string if endpos == len(string) else string[:endpos]
        slots = prog.search(subject, pos, anchored=anchored)
        if slots is None:
            return None
        return self._wrap(slots, string, pos, endpos)

    def finditer(self, string, pos=0, endpos=None):
        """Yield successive non-overlapping matches, left to right."""
        pos, endpos = self._bounds(string, pos, endpos)
        subject = string if endpos == len(string) else string[:endpos]
        first_pos = pos
        while pos <= endpos:
            slots = self._prog.search(subject, pos, anchored=False)
            if slots is None:
                return
            yield self._wrap(slots, string, first_pos, endpos)
            a, b = slots[0], slots[1]
            pos = b + 1 if b == a else b  # avoid stalling on empty matches

    def findall(self, string, pos=0, endpos=None):
        """Return all matches as a list.

        Items are the whole match when the pattern has no groups, the single
        group when it has one, and a tuple of groups otherwise. Unmatched
        groups appear as "".
        """
        out = []
        for m in self.finditer(string, pos, endpos):
            if self.groups == 0:
                out.append(m.group(0))
            elif self.groups == 1:
                out.append(m.group(1) or "")
            else:
                out.append(tuple(g or "" for g in m.groups()))
        return out

    def sub(self, repl, string, count=0):
        """Return ``string`` with matches replaced by ``repl`` (see subn)."""
        return self.subn(repl, string, count)[0]

    def subn(self, repl, string, count=0):
        """Replace matches and return ``(new_string, number_of_replacements)``.

        ``repl`` is either a callable taking the Match or a template string.
        ``count=0`` replaces every match.
        """
        out = []
        last = 0
        n = 0
        for m in self.finditer(string):
            if count and n >= count:
                break
            out.append(string[last:m.start()])
            out.append(repl(m) if callable(repl) else _expand(repl, m))
            last = m.end()
            n += 1
        out.append(string[last:])
        return "".join(out), n

    def split(self, string, maxsplit=0):
        """Split ``string`` on matches; captured groups are included in the result.

        ``maxsplit=0`` means no limit.
        """
        out = []
        last = 0
        n = 0
        for m in self.finditer(string):
            if maxsplit and n >= maxsplit:
                break
            if m.end() == m.start():
                # Empty matches never split. This is the behaviour of `re`
                # before Python 3.7; newer `re` versions do split on them.
                continue
            out.append(string[last:m.start()])
            out.extend(m.groups())
            last = m.end()
            n += 1
        out.append(string[last:])
        return out

    def __repr__(self):
        return "purere2.compile(%r)" % self.pattern
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def _expand(template, m):
|
|
199
|
+
"""Expand a replacement template: \\1 \\g<1> \\g<name> and escapes."""
|
|
200
|
+
out = []
|
|
201
|
+
i = 0
|
|
202
|
+
n = len(template)
|
|
203
|
+
while i < n:
|
|
204
|
+
c = template[i]
|
|
205
|
+
if c != "\\":
|
|
206
|
+
out.append(c)
|
|
207
|
+
i += 1
|
|
208
|
+
continue
|
|
209
|
+
i += 1
|
|
210
|
+
if i >= n:
|
|
211
|
+
out.append("\\")
|
|
212
|
+
break
|
|
213
|
+
c = template[i]
|
|
214
|
+
if c == "g":
|
|
215
|
+
j = template.index(">", i) if ">" in template[i:] else -1
|
|
216
|
+
assert template[i + 1] == "<" and j != -1, "bad group reference"
|
|
217
|
+
ref = template[i + 2:j]
|
|
218
|
+
i = j + 1
|
|
219
|
+
out.append(m.group(int(ref) if ref.isdigit() else ref) or "")
|
|
220
|
+
elif c.isdigit():
|
|
221
|
+
j = i
|
|
222
|
+
while j < n and template[j].isdigit():
|
|
223
|
+
j += 1
|
|
224
|
+
out.append(m.group(int(template[i:j])) or "")
|
|
225
|
+
i = j
|
|
226
|
+
else:
|
|
227
|
+
out.append({"n": "\n", "t": "\t", "r": "\r", "\\": "\\"}.get(c, c))
|
|
228
|
+
i += 1
|
|
229
|
+
return "".join(out)
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
_cache = {}
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def compile(pattern, flags=0):
    """Return a Pattern for ``pattern`` and ``flags``, reusing a cached one if present.

    The module-level cache holds at most 512 entries. Once it is full, new
    patterns are still compiled but are not stored.
    """
    cache_key = (pattern, flags)
    try:
        return _cache[cache_key]
    except KeyError:
        pass
    compiled = Pattern(pattern, flags)
    if len(_cache) < 512:
        _cache[cache_key] = compiled
    return compiled
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def search(pattern, string, flags=0):
    """Compile ``pattern`` and return its first match anywhere in ``string``, or None."""
    compiled = compile(pattern, flags)
    return compiled.search(string)
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def match(pattern, string, flags=0):
    """Compile ``pattern`` and return a match at the start of ``string``, or None."""
    compiled = compile(pattern, flags)
    return compiled.match(string)
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def fullmatch(pattern, string, flags=0):
    """Compile ``pattern`` and return a match covering all of ``string``, or None."""
    compiled = compile(pattern, flags)
    return compiled.fullmatch(string)
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def finditer(pattern, string, flags=0):
    """Compile ``pattern`` and return an iterator over its matches in ``string``."""
    compiled = compile(pattern, flags)
    return compiled.finditer(string)
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def findall(pattern, string, flags=0):
    """Compile ``pattern`` and return the list produced by ``Pattern.findall``."""
    compiled = compile(pattern, flags)
    return compiled.findall(string)
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def sub(pattern, repl, string, count=0, flags=0):
    """Compile ``pattern`` and return ``string`` with matches replaced by ``repl``."""
    compiled = compile(pattern, flags)
    return compiled.sub(repl, string, count)
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
def subn(pattern, repl, string, count=0, flags=0):
    """Compile ``pattern`` and return ``(new_string, number_of_replacements)``."""
    compiled = compile(pattern, flags)
    return compiled.subn(repl, string, count)
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def split(pattern, string, maxsplit=0, flags=0):
    """Compile ``pattern`` and split ``string`` on its matches."""
    compiled = compile(pattern, flags)
    return compiled.split(string, maxsplit)
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
# Characters that receive a backslash from escape().
_SPECIAL = set("\\^$.|?*+()[]{}")


def escape(s):
    """Return ``s`` with every character in ``_SPECIAL`` and every whitespace
    character preceded by a backslash; all other characters are unchanged.
    """
    pieces = []
    for ch in s:
        needs_escape = ch in _SPECIAL or ch.isspace()
        if needs_escape:
            pieces.append("\\" + ch)
        else:
            pieces.append(ch)
    return "".join(pieces)
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
"""Compile an AST into a flat NFA program for the Pike VM.
|
|
2
|
+
|
|
3
|
+
Instructions (tuples, pc-indexed list):
|
|
4
|
+
("char", cp) consume one codepoint == cp
|
|
5
|
+
("set", ranges) consume one codepoint within ranges
|
|
6
|
+
("any", dotall) consume any codepoint (excl. \\n unless dotall)
|
|
7
|
+
("split", x, y) epsilon-fork; x has priority (greedy preference)
|
|
8
|
+
("jmp", x) epsilon jump
|
|
9
|
+
("save", slot) record current input position into capture slot
|
|
10
|
+
("assert", kind, ml) zero-width assertion (^ $ \\b \\B \\A \\z \\Z)
|
|
11
|
+
("match",) accept
|
|
12
|
+
|
|
13
|
+
Greedy vs non-greedy is encoded purely by the order of split targets, so the
|
|
14
|
+
VM stays a plain priority-ordered Thompson simulation - linear in input.
|
|
15
|
+
"""
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
from . import parser as P
|
|
19
|
+
|
|
20
|
+
__all__ = ["compile_ast"]
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class _Compiler:
|
|
24
|
+
def __init__(self):
|
|
25
|
+
self.prog = []
|
|
26
|
+
|
|
27
|
+
def emit(self, instr):
|
|
28
|
+
self.prog.append(instr)
|
|
29
|
+
return len(self.prog) - 1
|
|
30
|
+
|
|
31
|
+
def patch(self, pc, instr):
|
|
32
|
+
self.prog[pc] = instr
|
|
33
|
+
|
|
34
|
+
def compile(self, node, ngroups):
|
|
35
|
+
self.emit(("save", 0))
|
|
36
|
+
self._c(node)
|
|
37
|
+
self.emit(("save", 1))
|
|
38
|
+
self.emit(("match",))
|
|
39
|
+
return self.prog
|
|
40
|
+
|
|
41
|
+
# emit code for `node`; falls through to the next instruction on success
|
|
42
|
+
def _c(self, node):
|
|
43
|
+
t = type(node)
|
|
44
|
+
if t is P.Empty:
|
|
45
|
+
return
|
|
46
|
+
if t is P.Lit:
|
|
47
|
+
if len(node.cps) == 1:
|
|
48
|
+
self.emit(("char", node.cps[0]))
|
|
49
|
+
else:
|
|
50
|
+
self.emit(("set", tuple((c, c) for c in node.cps)))
|
|
51
|
+
elif t is P.Dot:
|
|
52
|
+
self.emit(("any", node.dotall))
|
|
53
|
+
elif t is P.Class:
|
|
54
|
+
self.emit(("set", node.ranges))
|
|
55
|
+
elif t is P.Anchor:
|
|
56
|
+
self.emit(("assert", node.kind, node.multiline))
|
|
57
|
+
elif t is P.Concat:
|
|
58
|
+
for it in node.items:
|
|
59
|
+
self._c(it)
|
|
60
|
+
elif t is P.Group:
|
|
61
|
+
if node.index is not None:
|
|
62
|
+
self.emit(("save", 2 * node.index))
|
|
63
|
+
self._c(node.sub)
|
|
64
|
+
self.emit(("save", 2 * node.index + 1))
|
|
65
|
+
else:
|
|
66
|
+
self._c(node.sub)
|
|
67
|
+
elif t is P.Alt:
|
|
68
|
+
self._alt(node.opts)
|
|
69
|
+
elif t is P.Star:
|
|
70
|
+
self._star(node.sub, node.greedy)
|
|
71
|
+
elif t is P.Plus:
|
|
72
|
+
self._plus(node.sub, node.greedy)
|
|
73
|
+
elif t is P.Quest:
|
|
74
|
+
self._quest(node.sub, node.greedy)
|
|
75
|
+
elif t is P.Repeat:
|
|
76
|
+
self._repeat(node)
|
|
77
|
+
else:
|
|
78
|
+
raise AssertionError("unknown node %r" % t)
|
|
79
|
+
|
|
80
|
+
def _alt(self, opts):
|
|
81
|
+
# opt0 | (opt1 | (opt2 | ...)) with priority left-to-right
|
|
82
|
+
jmps = []
|
|
83
|
+
for k, opt in enumerate(opts):
|
|
84
|
+
last = k == len(opts) - 1
|
|
85
|
+
if not last:
|
|
86
|
+
sp = self.emit(("split", None, None))
|
|
87
|
+
a = len(self.prog)
|
|
88
|
+
self._c(opt)
|
|
89
|
+
jmps.append(self.emit(("jmp", None)))
|
|
90
|
+
b = len(self.prog)
|
|
91
|
+
self.patch(sp, ("split", a, b))
|
|
92
|
+
else:
|
|
93
|
+
self._c(opt)
|
|
94
|
+
end = len(self.prog)
|
|
95
|
+
for j in jmps:
|
|
96
|
+
self.patch(j, ("jmp", end))
|
|
97
|
+
|
|
98
|
+
def _star(self, sub, greedy):
|
|
99
|
+
# Compile e* as (e+)? rather than a bare loop. With a plain loop the
|
|
100
|
+
# exit branch is taken before the body's `save` instructions run, so a
|
|
101
|
+
# nullable body (e.g. a capturing group that can match empty) would
|
|
102
|
+
# never record its empty capture. Routing through e+ runs the body's
|
|
103
|
+
# captures once before the optional exit - matching RE2's submatches.
|
|
104
|
+
sp = self.emit(("split", None, None))
|
|
105
|
+
plus = len(self.prog)
|
|
106
|
+
self._plus(sub, greedy)
|
|
107
|
+
exit_ = len(self.prog)
|
|
108
|
+
self.patch(sp, ("split", plus, exit_) if greedy
|
|
109
|
+
else ("split", exit_, plus))
|
|
110
|
+
|
|
111
|
+
def _plus(self, sub, greedy):
|
|
112
|
+
body = len(self.prog)
|
|
113
|
+
self._c(sub)
|
|
114
|
+
sp = self.emit(("split", None, None))
|
|
115
|
+
exit_ = len(self.prog)
|
|
116
|
+
self.patch(sp, ("split", body, exit_) if greedy else ("split", exit_, body))
|
|
117
|
+
|
|
118
|
+
def _quest(self, sub, greedy):
|
|
119
|
+
sp = self.emit(("split", None, None))
|
|
120
|
+
body = len(self.prog)
|
|
121
|
+
self._c(sub)
|
|
122
|
+
exit_ = len(self.prog)
|
|
123
|
+
self.patch(sp, ("split", body, exit_) if greedy else ("split", exit_, body))
|
|
124
|
+
|
|
125
|
+
def _repeat(self, node):
|
|
126
|
+
lo, hi = node.lo, node.hi
|
|
127
|
+
for _ in range(lo):
|
|
128
|
+
self._c(node.sub)
|
|
129
|
+
if hi is None:
|
|
130
|
+
if lo == 0:
|
|
131
|
+
self._star(node.sub, node.greedy)
|
|
132
|
+
else:
|
|
133
|
+
self._star(node.sub, node.greedy)
|
|
134
|
+
else:
|
|
135
|
+
# (hi - lo) nested optionals: sub(sub(...)?)? to preserve greedy
|
|
136
|
+
n_opt = hi - lo
|
|
137
|
+
self._nested_opt(node.sub, n_opt, node.greedy)
|
|
138
|
+
|
|
139
|
+
def _nested_opt(self, sub, k, greedy):
|
|
140
|
+
if k <= 0:
|
|
141
|
+
return
|
|
142
|
+
sp = self.emit(("split", None, None))
|
|
143
|
+
body = len(self.prog)
|
|
144
|
+
self._c(sub)
|
|
145
|
+
self._nested_opt(sub, k - 1, greedy)
|
|
146
|
+
exit_ = len(self.prog)
|
|
147
|
+
self.patch(sp, ("split", body, exit_) if greedy else ("split", exit_, body))
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def compile_ast(node, ngroups):
    """Compile the AST ``node`` with a fresh compiler and return its instruction list."""
    compiler = _Compiler()
    return compiler.compile(node, ngroups)
|