purere2 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
purere2-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 adam2go
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
purere2-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,108 @@
1
+ Metadata-Version: 2.4
2
+ Name: purere2
3
+ Version: 0.1.0
4
+ Summary: RE2 in pure Python: linear-time, ReDoS-safe regular expressions - no C extension, no backtracking
5
+ Author: adam2go
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/adam2go/purere2
8
+ Keywords: regex,re2,redos,linear-time,nfa,pure-python,security,untrusted
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.9
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Programming Language :: Python :: 3.14
19
+ Classifier: Programming Language :: Python :: Implementation :: CPython
20
+ Classifier: Programming Language :: Python :: Implementation :: PyPy
21
+ Classifier: Topic :: Software Development :: Libraries
22
+ Classifier: Topic :: Security
23
+ Classifier: Topic :: Text Processing
24
+ Requires-Python: >=3.9
25
+ Description-Content-Type: text/markdown
26
+ License-File: LICENSE
27
+ Dynamic: license-file
28
+
29
+ # purere2
30
+
31
+ [![CI](https://github.com/adam2go/purere2/actions/workflows/ci.yml/badge.svg)](https://github.com/adam2go/purere2/actions/workflows/ci.yml)
32
+ [![PyPI](https://img.shields.io/pypi/v/purere2)](https://pypi.org/project/purere2/)
33
+ [![Python](https://img.shields.io/badge/python-3.9%E2%80%933.14%20%7C%20PyPy-blue)](.github/workflows/ci.yml)
34
+ [![RE2 conformance](https://img.shields.io/badge/vs%20real%20RE2-99.996%25-brightgreen)](EXPECTED_DIVERGENCES.md)
35
+ [![License: MIT](https://img.shields.io/badge/license-MIT-green)](LICENSE)
36
+
37
+ **[RE2](https://github.com/google/re2) in pure Python: linear-time,
38
+ ReDoS-safe regular expressions — no C extension, no backtracking.**
39
+
40
+ Python's built-in `re` (like PCRE and Perl) backtracks, so a pattern like
41
+ `(a+)+$` against a non-matching string can run for years on a few dozen
42
+ characters — the classic **ReDoS** denial-of-service. purere2 compiles every
43
+ pattern to an NFA and runs it with a Pike VM, so **matching is always linear
44
+ in the input** and no pattern can blow up. That guarantee is exactly why
45
+ [RE2 exists](https://github.com/google/re2/wiki/WhyRE2) — and why it has no
46
+ backreferences or lookaround.
47
+
48
+ ```sh
49
+ pip install purere2
50
+ ```
51
+
52
+ ```python
53
+ import purere2
54
+
55
+ # linear time: this returns instantly; re.search would hang for minutes
56
+ purere2.search(r"(a+)+$", "a" * 50 + "!") # None, in microseconds
57
+
58
+ purere2.search(r"(\w+)@(\w+)", "x@y").groups() # ('x', 'y')
59
+ purere2.findall(r"\d{4}-\d\d-\d\d", "2026-06-19") # ['2026-06-19']
60
+ ```
61
+
62
+ The API mirrors the common subset of the stdlib `re` module
63
+ (`compile`, `search`, `match`, `fullmatch`, `finditer`, `findall`, `sub`,
64
+ `subn`, `split`, flags `I/M/S`, named groups), so it is close to a drop-in
65
+ replacement for **running untrusted or LLM-generated patterns safely**.
66
+
67
+ ## Why pure Python
68
+
69
+ `google-re2` and `pyre2` already wrap RE2 — but they need the RE2 **C++
70
+ library** (and a compiler, or a matching binary wheel). There was no pure
71
+ Python RE2, even though [RE2/J](https://github.com/google/re2j) (Java) and
72
+ [RE2JS](https://github.com/le0pard/re2js) (JavaScript) have existed for years.
73
+ purere2 is the missing one: zero dependencies, zero binaries, runs anywhere
74
+ Python runs — Pyodide/WASM, AWS Lambda, locked-down sandboxes — exactly where
75
+ you most want to run a pattern you don't trust.
76
+
77
+ The trade-off, stated honestly: on ordinary patterns purere2 is **slower than
78
+ the C-backed `re`** (a pure-Python NFA can't beat a C engine). Its value is
79
+ **safety and portability**, not raw speed — use it where a pattern is
80
+ untrusted, or where a C extension isn't an option, not as a blanket `re`
81
+ replacement.
82
+
83
+ ## Verified against the real RE2
84
+
85
+ Conformance is differential, the same way [purefzf](https://github.com/adam2go/purefzf)
86
+ checks itself against the `fzf` binary: random RE2 patterns and inputs are run
87
+ through both purere2 and `google-re2` and compared byte-for-byte. Across
88
+ **150,000+ random checks, agreement is ~99.996%**; the residue is one
89
+ documented edge (a lazy quantifier nested in a greedy loop) — see
90
+ [EXPECTED_DIVERGENCES.md](EXPECTED_DIVERGENCES.md). The conformance test locks
91
+ that level, so any regression fails CI. There is also a ReDoS-safety suite of
92
+ patterns that hang stdlib `re` and must finish in milliseconds here.
93
+
94
+ ## Supported syntax (v0.1)
95
+
96
+ Literals, `.`, character classes `[...]` with ranges / negation / POSIX
97
+ `[[:alpha:]]`, perl classes `\d \w \s` (ASCII, per RE2) and negations,
98
+ anchors `^ $ \A \z \b \B`, groups `(...)` / `(?:...)` / `(?P<name>...)`,
99
+ alternation `|`, quantifiers `* + ? {m} {m,n}` greedy and lazy, inline flags
100
+ `(?i) (?m) (?s)` and scoped `(?i:...)`, escapes including `\xHH` / `\x{...}`.
101
+
102
+ **Intentionally absent** (this is what makes it safe): backreferences and
103
+ lookaround. `(a)\1` raises `RegexError`. Deferred to a later version: Unicode
104
+ property classes `\p{...}` and full Unicode case folding.
105
+
106
+ ## License
107
+
108
+ [MIT](LICENSE)
@@ -0,0 +1,80 @@
1
+ # purere2
2
+
3
+ [![CI](https://github.com/adam2go/purere2/actions/workflows/ci.yml/badge.svg)](https://github.com/adam2go/purere2/actions/workflows/ci.yml)
4
+ [![PyPI](https://img.shields.io/pypi/v/purere2)](https://pypi.org/project/purere2/)
5
+ [![Python](https://img.shields.io/badge/python-3.9%E2%80%933.14%20%7C%20PyPy-blue)](.github/workflows/ci.yml)
6
+ [![RE2 conformance](https://img.shields.io/badge/vs%20real%20RE2-99.996%25-brightgreen)](EXPECTED_DIVERGENCES.md)
7
+ [![License: MIT](https://img.shields.io/badge/license-MIT-green)](LICENSE)
8
+
9
+ **[RE2](https://github.com/google/re2) in pure Python: linear-time,
10
+ ReDoS-safe regular expressions — no C extension, no backtracking.**
11
+
12
+ Python's built-in `re` (like PCRE and Perl) backtracks, so a pattern like
13
+ `(a+)+$` against a non-matching string can run for years on a few dozen
14
+ characters — the classic **ReDoS** denial-of-service. purere2 compiles every
15
+ pattern to an NFA and runs it with a Pike VM, so **matching is always linear
16
+ in the input** and no pattern can blow up. That guarantee is exactly why
17
+ [RE2 exists](https://github.com/google/re2/wiki/WhyRE2) — and why it has no
18
+ backreferences or lookaround.
19
+
20
+ ```sh
21
+ pip install purere2
22
+ ```
23
+
24
+ ```python
25
+ import purere2
26
+
27
+ # linear time: this returns instantly; re.search would hang for minutes
28
+ purere2.search(r"(a+)+$", "a" * 50 + "!") # None, in microseconds
29
+
30
+ purere2.search(r"(\w+)@(\w+)", "x@y").groups() # ('x', 'y')
31
+ purere2.findall(r"\d{4}-\d\d-\d\d", "2026-06-19") # ['2026-06-19']
32
+ ```
33
+
34
+ The API mirrors the common subset of the stdlib `re` module
35
+ (`compile`, `search`, `match`, `fullmatch`, `finditer`, `findall`, `sub`,
36
+ `subn`, `split`, flags `I/M/S`, named groups), so it is close to a drop-in
37
+ replacement for **running untrusted or LLM-generated patterns safely**.
38
+
39
+ ## Why pure Python
40
+
41
+ `google-re2` and `pyre2` already wrap RE2 — but they need the RE2 **C++
42
+ library** (and a compiler, or a matching binary wheel). There was no pure
43
+ Python RE2, even though [RE2/J](https://github.com/google/re2j) (Java) and
44
+ [RE2JS](https://github.com/le0pard/re2js) (JavaScript) have existed for years.
45
+ purere2 is the missing one: zero dependencies, zero binaries, runs anywhere
46
+ Python runs — Pyodide/WASM, AWS Lambda, locked-down sandboxes — exactly where
47
+ you most want to run a pattern you don't trust.
48
+
49
+ The trade-off, stated honestly: on ordinary patterns purere2 is **slower than
50
+ the C-backed `re`** (a pure-Python NFA can't beat a C engine). Its value is
51
+ **safety and portability**, not raw speed — use it where a pattern is
52
+ untrusted, or where a C extension isn't an option, not as a blanket `re`
53
+ replacement.
54
+
55
+ ## Verified against the real RE2
56
+
57
+ Conformance is differential, the same way [purefzf](https://github.com/adam2go/purefzf)
58
+ checks itself against the `fzf` binary: random RE2 patterns and inputs are run
59
+ through both purere2 and `google-re2` and compared byte-for-byte. Across
60
+ **150,000+ random checks, agreement is ~99.996%**; the residue is one
61
+ documented edge (a lazy quantifier nested in a greedy loop) — see
62
+ [EXPECTED_DIVERGENCES.md](EXPECTED_DIVERGENCES.md). The conformance test locks
63
+ that level, so any regression fails CI. There is also a ReDoS-safety suite of
64
+ patterns that hang stdlib `re` and must finish in milliseconds here.
65
+
66
+ ## Supported syntax (v0.1)
67
+
68
+ Literals, `.`, character classes `[...]` with ranges / negation / POSIX
69
+ `[[:alpha:]]`, perl classes `\d \w \s` (ASCII, per RE2) and negations,
70
+ anchors `^ $ \A \z \b \B`, groups `(...)` / `(?:...)` / `(?P<name>...)`,
71
+ alternation `|`, quantifiers `* + ? {m} {m,n}` greedy and lazy, inline flags
72
+ `(?i) (?m) (?s)` and scoped `(?i:...)`, escapes including `\xHH` / `\x{...}`.
73
+
74
+ **Intentionally absent** (this is what makes it safe): backreferences and
75
+ lookaround. `(a)\1` raises `RegexError`. Deferred to a later version: Unicode
76
+ property classes `\p{...}` and full Unicode case folding.
77
+
78
+ ## License
79
+
80
+ [MIT](LICENSE)
@@ -0,0 +1,41 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "purere2"
7
+ version = "0.1.0"
8
+ description = "RE2 in pure Python: linear-time, ReDoS-safe regular expressions - no C extension, no backtracking"
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ license = { text = "MIT" }
12
+ authors = [{ name = "adam2go" }]
13
+ keywords = ["regex", "re2", "redos", "linear-time", "nfa", "pure-python",
14
+ "security", "untrusted"]
15
+ classifiers = [
16
+ "Development Status :: 3 - Alpha",
17
+ "Intended Audience :: Developers",
18
+ "License :: OSI Approved :: MIT License",
19
+ "Programming Language :: Python :: 3",
20
+ "Programming Language :: Python :: 3.9",
21
+ "Programming Language :: Python :: 3.10",
22
+ "Programming Language :: Python :: 3.11",
23
+ "Programming Language :: Python :: 3.12",
24
+ "Programming Language :: Python :: 3.13",
25
+ "Programming Language :: Python :: 3.14",
26
+ "Programming Language :: Python :: Implementation :: CPython",
27
+ "Programming Language :: Python :: Implementation :: PyPy",
28
+ "Topic :: Software Development :: Libraries",
29
+ "Topic :: Security",
30
+ "Topic :: Text Processing",
31
+ ]
32
+ dependencies = []
33
+
34
+ [project.urls]
35
+ Homepage = "https://github.com/adam2go/purere2"
36
+
37
+ [tool.setuptools.packages.find]
38
+ where = ["src"]
39
+
40
+ [tool.pytest.ini_options]
41
+ testpaths = ["tests"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,281 @@
1
+ """purere2: RE2 in pure Python - linear-time, ReDoS-safe regular expressions.
2
+
3
+ A pattern compiles to an NFA run by a Pike VM, so matching is always linear in
4
+ the input and no pattern can cause catastrophic backtracking. Like RE2 (and
5
+ unlike Python's `re`), backreferences and lookaround are intentionally absent -
6
+ that absence is exactly what makes the linear-time guarantee possible.
7
+
8
+ import purere2
9
+ purere2.search(r"(\\w+)@(\\w+)", "x@y").groups() # ('x', 'y')
10
+ pat = purere2.compile(r"\\d{4}-\\d\\d-\\d\\d")
11
+ pat.findall("2026-06-19 and 2027-01-01")
12
+
13
+ The API mirrors the common subset of the stdlib `re` module, so it is close to
14
+ a drop-in replacement for running untrusted or LLM-generated patterns safely.
15
+ """
16
+ from __future__ import annotations
17
+
18
+ from .compiler import compile_ast
19
+ from .parser import Anchor, Concat, RegexError, parse
20
+ from .pikevm import Program
21
+
22
+ __version__ = "0.1.0"
23
+ __all__ = ["compile", "search", "match", "fullmatch", "finditer", "findall",
24
+ "sub", "subn", "split", "escape", "Pattern", "Match", "RegexError",
25
+ "error", "I", "IGNORECASE", "M", "MULTILINE", "S", "DOTALL"]
26
+
27
+ # flag constants (compatible values with re)
28
+ I = IGNORECASE = 2
29
+ M = MULTILINE = 8
30
+ S = DOTALL = 16
31
+
32
+ error = RegexError
33
+
34
+
35
def _nslots(ngroups):
    """Return how many capture slots a program with `ngroups` groups needs.

    Every group owns two slots (start and end position); one extra pair is
    reserved for group 0, the overall match.
    """
    total_groups = ngroups + 1
    return total_groups * 2
37
+
38
+
39
class Match:
    """Result of one successful match, modelled on `re.Match`.

    `slots` is a flat list of positions: entries ``2*g`` and ``2*g + 1`` hold
    the start and end of group ``g`` (group 0 is the whole match), or None
    when the group did not participate. `names` maps group names to group
    numbers and `pattern` is the object that produced the match.
    """

    __slots__ = ("_slots", "string", "_names", "re", "pos", "endpos")

    def __init__(self, slots, string, names, pattern, pos=0, endpos=None):
        self._slots = slots
        self.string = string
        self._names = names
        self.re = pattern
        # Both attributes are declared in __slots__; assign them so reading
        # them never raises AttributeError. Callers that do not pass them get
        # the bounds of the whole string.
        self.pos = pos
        self.endpos = len(string) if endpos is None else endpos

    def _idx(self, group):
        """Resolve a group name or number to a valid group number.

        Raises IndexError for an unknown name or an out-of-range number.
        """
        if isinstance(group, str):
            try:
                return self._names[group]
            except KeyError:
                raise IndexError("no such group %r" % group) from None
        # Reject negative numbers explicitly: they would otherwise index the
        # slot list from the end and silently return the wrong group.
        if not isinstance(group, int) or not 0 <= group < len(self._slots) // 2:
            raise IndexError("no such group %r" % (group,))
        return group

    def group(self, *groups):
        """Return one group's text, or a tuple when several are requested.

        With no arguments the whole match (group 0) is returned.
        """
        if not groups:
            groups = (0,)
        res = tuple(self._one(g) for g in groups)
        return res[0] if len(res) == 1 else res

    def _one(self, group):
        """Return the text of a single group, or None if it did not match."""
        g = self._idx(group)
        a = self._slots[2 * g]
        b = self._slots[2 * g + 1]
        if a is None or b is None:
            return None
        return self.string[a:b]

    def groups(self, default=None):
        """Return a tuple of groups 1..N, with `default` for unmatched ones."""
        out = []
        for g in range(1, len(self._slots) // 2):
            v = self._one(g)
            out.append(default if v is None else v)
        return tuple(out)

    def groupdict(self, default=None):
        """Return ``{name: text}`` for named groups, `default` if unmatched."""
        out = {}
        for name, i in self._names.items():
            v = self._one(i)
            out[name] = default if v is None else v
        return out

    def start(self, group=0):
        """Return the start offset of `group`, or -1 if it did not match."""
        s = self._slots[2 * self._idx(group)]
        return -1 if s is None else s

    def end(self, group=0):
        """Return the end offset of `group`, or -1 if it did not match."""
        e = self._slots[2 * self._idx(group) + 1]
        return -1 if e is None else e

    def span(self, group=0):
        """Return ``(start(group), end(group))``."""
        return (self.start(group), self.end(group))

    def __getitem__(self, group):
        return self._one(group)

    def __repr__(self):
        return "<purere2.Match span=%r match=%r>" % (self.span(), self.group(0))
96
+
97
+
98
class Pattern:
    """A compiled pattern exposing the common subset of `re.Pattern`.

    The pattern is parsed once; the resulting AST is compiled into the
    program used by search/match/finditer. A second program that additionally
    requires the end of text is built lazily for `fullmatch`.
    """

    __slots__ = ("pattern", "flags", "groups", "groupindex", "_prog", "_full",
                 "_ast")

    def __init__(self, pattern, flags=0):
        self.pattern = pattern
        self.flags = flags
        ast, ngroups, names = parse(pattern,
                                    flags_i=bool(flags & I),
                                    flags_m=bool(flags & M),
                                    flags_s=bool(flags & S))
        self._ast = ast
        self.groups = ngroups
        self.groupindex = dict(names)
        self._prog = Program(compile_ast(ast, ngroups), _nslots(ngroups))
        self._full = None

    def _fullprog(self):
        """Return (building on first use) the program for `fullmatch`."""
        if self._full is None:
            # Same AST followed by an end-of-text anchor.
            ast = Concat([self._ast, Anchor("\\z")])
            self._full = Program(compile_ast(ast, self.groups),
                                 _nslots(self.groups))
        return self._full

    @staticmethod
    def _window(string, pos, endpos):
        """Normalise `pos`/`endpos` the way stdlib `re` does.

        Both bounds are clamped to ``[0, len(string)]``. Returns
        ``(subject, pos)`` where `subject` is `string` cut at `endpos`, or
        None when `pos` lies beyond `endpos` (no match is possible).
        """
        size = len(string)
        pos = min(max(pos, 0), size)
        if endpos is None:
            return string, pos
        endpos = min(max(endpos, 0), size)
        if pos > endpos:
            return None
        return string[:endpos], pos

    def search(self, string, pos=0, endpos=None):
        """Return a Match for the first match at or after `pos`, or None."""
        return self._run(self._prog, string, pos, endpos, anchored=False)

    def match(self, string, pos=0, endpos=None):
        """Return a Match for a match starting exactly at `pos`, or None."""
        return self._run(self._prog, string, pos, endpos, anchored=True)

    def fullmatch(self, string, pos=0, endpos=None):
        """Return a Match only if the pattern spans `pos` up to the end."""
        return self._run(self._fullprog(), string, pos, endpos, anchored=True)

    def _run(self, prog, string, pos, endpos, anchored):
        """Run `prog` once over the clamped window and wrap the result."""
        window = self._window(string, pos, endpos)
        if window is None:
            return None
        subject, start = window
        slots = prog.search(subject, start, anchored=anchored)
        if slots is None:
            return None
        # The Match keeps the original, uncut string; offsets are unaffected
        # because only the tail is ever cut off.
        return Match(slots, string, self.groupindex, self)

    def finditer(self, string, pos=0, endpos=None):
        """Yield a Match for each successive non-overlapping match."""
        window = self._window(string, pos, endpos)
        if window is None:
            return
        s, pos = window
        n = len(s)
        while pos <= n:
            slots = self._prog.search(s, pos, anchored=False)
            if slots is None:
                return
            yield Match(slots, string, self.groupindex, self)
            a, b = slots[0], slots[1]
            pos = b + 1 if b == a else b  # avoid stalling on empty matches

    def findall(self, string, pos=0, endpos=None):
        """Return all matches as a list.

        Items are the whole match when the pattern has no groups, the single
        group when it has one, and a tuple of all groups otherwise. Groups
        that did not participate are reported as "".
        """
        out = []
        for m in self.finditer(string, pos, endpos):
            if self.groups == 0:
                out.append(m.group(0))
            elif self.groups == 1:
                out.append(m.group(1) or "")
            else:
                out.append(tuple(g or "" for g in m.groups()))
        return out

    def sub(self, repl, string, count=0):
        """Return `string` with matches replaced; see `subn`."""
        return self.subn(repl, string, count)[0]

    def subn(self, repl, string, count=0):
        """Replace matches and return ``(new_string, replacements_made)``.

        `repl` is either a callable receiving the Match or a template string
        expanded by `_expand`. A non-zero `count` caps the replacements.
        """
        out = []
        last = 0
        n = 0
        for m in self.finditer(string):
            if count and n >= count:
                break
            out.append(string[last:m.start()])
            out.append(repl(m) if callable(repl) else _expand(repl, m))
            last = m.end()
            n += 1
        out.append(string[last:])
        return "".join(out), n

    def split(self, string, maxsplit=0):
        """Split `string` at each non-empty match.

        The text of every capturing group is inserted after the piece that
        precedes a separator. A non-zero `maxsplit` caps the splits.
        """
        out = []
        last = 0
        n = 0
        for m in self.finditer(string):
            if maxsplit and n >= maxsplit:
                break
            if m.end() == m.start():
                # Empty matches never act as separators here. NOTE(review):
                # stdlib `re.split` has split on empty matches since Python
                # 3.7, so this differs from modern `re`.
                continue
            out.append(string[last:m.start()])
            out.extend(m.groups())
            last = m.end()
            n += 1
        out.append(string[last:])
        return out

    def __repr__(self):
        return "purere2.compile(%r)" % self.pattern
196
+
197
+
198
def _expand(template, m):
    """Expand a replacement template against the match `m`.

    Recognised forms are a backslash followed by a run of digits (numbered
    group), ``\\g<number>`` / ``\\g<name>``, and the escapes ``\\n``,
    ``\\t``, ``\\r`` and ``\\\\``. Any other escaped character stands for
    itself, and a lone trailing backslash is kept. Groups that did not
    participate expand to the empty string.

    Raises RegexError when a ``\\g`` reference is not of the form
    ``\\g<...>``.
    """
    out = []
    i = 0
    n = len(template)
    while i < n:
        c = template[i]
        if c != "\\":
            out.append(c)
            i += 1
            continue
        i += 1
        if i >= n:
            out.append("\\")
            break
        c = template[i]
        if c == "g":
            j = template.find(">", i)
            # Validate with a real exception: an `assert` disappears under
            # `python -O`, and the slice (unlike indexing) cannot raise
            # IndexError when the template ends right after "\g".
            if template[i + 1:i + 2] != "<" or j == -1:
                raise RegexError("bad group reference")
            ref = template[i + 2:j]
            i = j + 1
            out.append(m.group(int(ref) if ref.isdigit() else ref) or "")
        elif c.isdigit():
            j = i
            while j < n and template[j].isdigit():
                j += 1
            out.append(m.group(int(template[i:j])) or "")
            i = j
        else:
            out.append({"n": "\n", "t": "\t", "r": "\r", "\\": "\\"}.get(c, c))
            i += 1
    return "".join(out)
230
+
231
+
232
_cache = {}
_MAXCACHE = 512


def compile(pattern, flags=0):
    """Return a `Pattern` for `pattern`/`flags`, reusing a cached one.

    At most `_MAXCACHE` compiled patterns are kept. When the cache is full
    the oldest entry is dropped to make room, so patterns first seen after
    the cache has filled up are still cached instead of being recompiled on
    every call.
    """
    key = (pattern, flags)
    cached = _cache.get(key)
    if cached is not None:
        return cached
    compiled = Pattern(pattern, flags)
    if len(_cache) >= _MAXCACHE:
        # dicts keep insertion order, so the first key is the oldest entry.
        # Tolerate another thread mutating the cache at the same time.
        try:
            del _cache[next(iter(_cache))]
        except (StopIteration, RuntimeError, KeyError):
            pass
    _cache[key] = compiled
    return compiled
243
+
244
+
245
def search(pattern, string, flags=0):
    """Compile `pattern` with `flags` and search `string` with it.

    Returns whatever `Pattern.search` returns: a Match, or None.
    """
    compiled = compile(pattern, flags)
    return compiled.search(string)
247
+
248
+
249
def match(pattern, string, flags=0):
    """Compile `pattern` with `flags` and match it at the start of `string`.

    Returns whatever `Pattern.match` returns: a Match, or None.
    """
    compiled = compile(pattern, flags)
    return compiled.match(string)
251
+
252
+
253
def fullmatch(pattern, string, flags=0):
    """Compile `pattern` with `flags` and require it to span all of `string`.

    Returns whatever `Pattern.fullmatch` returns: a Match, or None.
    """
    compiled = compile(pattern, flags)
    return compiled.fullmatch(string)
255
+
256
+
257
def finditer(pattern, string, flags=0):
    """Compile `pattern` with `flags` and iterate over its matches in `string`.

    Returns the generator produced by `Pattern.finditer`.
    """
    compiled = compile(pattern, flags)
    return compiled.finditer(string)
259
+
260
+
261
def findall(pattern, string, flags=0):
    """Compile `pattern` with `flags` and list all of its matches in `string`.

    Returns the list built by `Pattern.findall`.
    """
    compiled = compile(pattern, flags)
    return compiled.findall(string)
263
+
264
+
265
def sub(pattern, repl, string, count=0, flags=0):
    """Compile `pattern` with `flags` and replace its matches in `string`.

    `repl` and `count` are forwarded unchanged to `Pattern.sub`, whose
    result (the new string) is returned.
    """
    compiled = compile(pattern, flags)
    return compiled.sub(repl, string, count)
267
+
268
+
269
def subn(pattern, repl, string, count=0, flags=0):
    """Like `sub`, but return ``(new_string, number_of_replacements)``.

    `repl` and `count` are forwarded unchanged to `Pattern.subn`.
    """
    compiled = compile(pattern, flags)
    return compiled.subn(repl, string, count)
271
+
272
+
273
def split(pattern, string, maxsplit=0, flags=0):
    """Compile `pattern` with `flags` and split `string` at its matches.

    `maxsplit` is forwarded unchanged to `Pattern.split`, whose list is
    returned.
    """
    compiled = compile(pattern, flags)
    return compiled.split(string, maxsplit)
275
+
276
+
277
_SPECIAL = set("\\^$.|?*+()[]{}")


def escape(s):
    """Return `s` with regex metacharacters and whitespace backslash-escaped.

    Characters in `_SPECIAL` and anything for which `str.isspace` is true
    get a leading backslash; every other character is copied unchanged.
    """
    pieces = []
    for ch in s:
        if ch in _SPECIAL or ch.isspace():
            pieces.append("\\" + ch)
        else:
            pieces.append(ch)
    return "".join(pieces)
@@ -0,0 +1,151 @@
1
+ """Compile an AST into a flat NFA program for the Pike VM.
2
+
3
+ Instructions (tuples, pc-indexed list):
4
+ ("char", cp) consume one codepoint == cp
5
+ ("set", ranges) consume one codepoint within ranges
6
+ ("any", dotall) consume any codepoint (excl. \\n unless dotall)
7
+ ("split", x, y) epsilon-fork; x has priority (greedy preference)
8
+ ("jmp", x) epsilon jump
9
+ ("save", slot) record current input position into capture slot
10
+ ("assert", kind, ml) zero-width assertion (^ $ \\b \\B \\A \\z \\Z)
11
+ ("match",) accept
12
+
13
+ Greedy vs non-greedy is encoded purely by the order of split targets, so the
14
+ VM stays a plain priority-ordered Thompson simulation - linear in input.
15
+ """
16
+ from __future__ import annotations
17
+
18
+ from . import parser as P
19
+
20
+ __all__ = ["compile_ast"]
21
+
22
+
23
+ class _Compiler:
24
+ def __init__(self):
25
+ self.prog = []
26
+
27
+ def emit(self, instr):
28
+ self.prog.append(instr)
29
+ return len(self.prog) - 1
30
+
31
+ def patch(self, pc, instr):
32
+ self.prog[pc] = instr
33
+
34
+ def compile(self, node, ngroups):
35
+ self.emit(("save", 0))
36
+ self._c(node)
37
+ self.emit(("save", 1))
38
+ self.emit(("match",))
39
+ return self.prog
40
+
41
+ # emit code for `node`; falls through to the next instruction on success
42
+ def _c(self, node):
43
+ t = type(node)
44
+ if t is P.Empty:
45
+ return
46
+ if t is P.Lit:
47
+ if len(node.cps) == 1:
48
+ self.emit(("char", node.cps[0]))
49
+ else:
50
+ self.emit(("set", tuple((c, c) for c in node.cps)))
51
+ elif t is P.Dot:
52
+ self.emit(("any", node.dotall))
53
+ elif t is P.Class:
54
+ self.emit(("set", node.ranges))
55
+ elif t is P.Anchor:
56
+ self.emit(("assert", node.kind, node.multiline))
57
+ elif t is P.Concat:
58
+ for it in node.items:
59
+ self._c(it)
60
+ elif t is P.Group:
61
+ if node.index is not None:
62
+ self.emit(("save", 2 * node.index))
63
+ self._c(node.sub)
64
+ self.emit(("save", 2 * node.index + 1))
65
+ else:
66
+ self._c(node.sub)
67
+ elif t is P.Alt:
68
+ self._alt(node.opts)
69
+ elif t is P.Star:
70
+ self._star(node.sub, node.greedy)
71
+ elif t is P.Plus:
72
+ self._plus(node.sub, node.greedy)
73
+ elif t is P.Quest:
74
+ self._quest(node.sub, node.greedy)
75
+ elif t is P.Repeat:
76
+ self._repeat(node)
77
+ else:
78
+ raise AssertionError("unknown node %r" % t)
79
+
80
+ def _alt(self, opts):
81
+ # opt0 | (opt1 | (opt2 | ...)) with priority left-to-right
82
+ jmps = []
83
+ for k, opt in enumerate(opts):
84
+ last = k == len(opts) - 1
85
+ if not last:
86
+ sp = self.emit(("split", None, None))
87
+ a = len(self.prog)
88
+ self._c(opt)
89
+ jmps.append(self.emit(("jmp", None)))
90
+ b = len(self.prog)
91
+ self.patch(sp, ("split", a, b))
92
+ else:
93
+ self._c(opt)
94
+ end = len(self.prog)
95
+ for j in jmps:
96
+ self.patch(j, ("jmp", end))
97
+
98
+ def _star(self, sub, greedy):
99
+ # Compile e* as (e+)? rather than a bare loop. With a plain loop the
100
+ # exit branch is taken before the body's `save` instructions run, so a
101
+ # nullable body (e.g. a capturing group that can match empty) would
102
+ # never record its empty capture. Routing through e+ runs the body's
103
+ # captures once before the optional exit - matching RE2's submatches.
104
+ sp = self.emit(("split", None, None))
105
+ plus = len(self.prog)
106
+ self._plus(sub, greedy)
107
+ exit_ = len(self.prog)
108
+ self.patch(sp, ("split", plus, exit_) if greedy
109
+ else ("split", exit_, plus))
110
+
111
+ def _plus(self, sub, greedy):
112
+ body = len(self.prog)
113
+ self._c(sub)
114
+ sp = self.emit(("split", None, None))
115
+ exit_ = len(self.prog)
116
+ self.patch(sp, ("split", body, exit_) if greedy else ("split", exit_, body))
117
+
118
+ def _quest(self, sub, greedy):
119
+ sp = self.emit(("split", None, None))
120
+ body = len(self.prog)
121
+ self._c(sub)
122
+ exit_ = len(self.prog)
123
+ self.patch(sp, ("split", body, exit_) if greedy else ("split", exit_, body))
124
+
125
+ def _repeat(self, node):
126
+ lo, hi = node.lo, node.hi
127
+ for _ in range(lo):
128
+ self._c(node.sub)
129
+ if hi is None:
130
+ if lo == 0:
131
+ self._star(node.sub, node.greedy)
132
+ else:
133
+ self._star(node.sub, node.greedy)
134
+ else:
135
+ # (hi - lo) nested optionals: sub(sub(...)?)? to preserve greedy
136
+ n_opt = hi - lo
137
+ self._nested_opt(node.sub, n_opt, node.greedy)
138
+
139
+ def _nested_opt(self, sub, k, greedy):
140
+ if k <= 0:
141
+ return
142
+ sp = self.emit(("split", None, None))
143
+ body = len(self.prog)
144
+ self._c(sub)
145
+ self._nested_opt(sub, k - 1, greedy)
146
+ exit_ = len(self.prog)
147
+ self.patch(sp, ("split", body, exit_) if greedy else ("split", exit_, body))
148
+
149
+
150
def compile_ast(node, ngroups):
    """Compile the AST rooted at `node` into a new instruction list.

    A fresh `_Compiler` is used for every call, and its program list is
    returned.
    """
    compiler = _Compiler()
    return compiler.compile(node, ngroups)