stream-replace 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- stream_replace-0.1.0/.github/workflows/ci.yml +21 -0
- stream_replace-0.1.0/.github/workflows/publish.yml +28 -0
- stream_replace-0.1.0/.gitignore +10 -0
- stream_replace-0.1.0/.python-version +1 -0
- stream_replace-0.1.0/PKG-INFO +126 -0
- stream_replace-0.1.0/README.md +114 -0
- stream_replace-0.1.0/main.py +6 -0
- stream_replace-0.1.0/pyproject.toml +21 -0
- stream_replace-0.1.0/stream_replace/__init__.py +4 -0
- stream_replace-0.1.0/stream_replace/_functional.py +22 -0
- stream_replace-0.1.0/stream_replace/_replacer.py +146 -0
- stream_replace-0.1.0/stream_replace/_rule.py +109 -0
- stream_replace-0.1.0/stream_replace/_utils.py +50 -0
- stream_replace-0.1.0/tests/__init__.py +0 -0
- stream_replace-0.1.0/tests/test_stream_replace.py +241 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
strategy:
|
|
13
|
+
matrix:
|
|
14
|
+
python-version: ["3.13"]
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v4
|
|
17
|
+
- uses: actions/setup-python@v5
|
|
18
|
+
with:
|
|
19
|
+
python-version: ${{ matrix.python-version }}
|
|
20
|
+
- run: pip install -e ".[dev]" pytest pytest-asyncio
|
|
21
|
+
- run: pytest
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
|
|
7
|
+
permissions:
|
|
8
|
+
id-token: write
|
|
9
|
+
|
|
10
|
+
jobs:
|
|
11
|
+
publish:
|
|
12
|
+
runs-on: ubuntu-latest
|
|
13
|
+
environment: pypi
|
|
14
|
+
steps:
|
|
15
|
+
- uses: actions/checkout@v4
|
|
16
|
+
|
|
17
|
+
- uses: actions/setup-python@v5
|
|
18
|
+
with:
|
|
19
|
+
python-version: "3.13"
|
|
20
|
+
|
|
21
|
+
- name: Install build tools
|
|
22
|
+
run: pip install build
|
|
23
|
+
|
|
24
|
+
- name: Build package
|
|
25
|
+
run: python -m build
|
|
26
|
+
|
|
27
|
+
- name: Publish to PyPI
|
|
28
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.13
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: stream-replace
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Streaming text replacement for AI token streams — handles partial matches across chunk boundaries
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: Topic :: Text Processing :: Filters
|
|
9
|
+
Classifier: Typing :: Typed
|
|
10
|
+
Requires-Python: >=3.13
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
|
|
13
|
+
# stream-replace
|
|
14
|
+
|
|
15
|
+
Streaming text replacement for AI token streams — correctly handles partial matches across chunk boundaries.
|
|
16
|
+
|
|
17
|
+
## Install
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
pip install stream-replace
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Quick Start
|
|
24
|
+
|
|
25
|
+
```python
|
|
26
|
+
import re
|
|
27
|
+
from stream_replace import Replacer
|
|
28
|
+
|
|
29
|
+
r = Replacer([
|
|
30
|
+
("敏感词", "***"), # string → string
|
|
31
|
+
("secret", lambda s: s[0] + "***"), # string → callable
|
|
32
|
+
(re.compile(r"1[3-9]\d{9}"), "[PHONE]"), # regex → string
|
|
33
|
+
(re.compile(r"<think>[\s\S]*?</think>"), ""), # regex → remove
|
|
34
|
+
(re.compile(r"(\d+)"), lambda m: str(int(m.group()) * 2)), # regex → callable
|
|
35
|
+
])
|
|
36
|
+
|
|
37
|
+
for chunk in ai_stream:
|
|
38
|
+
safe_text = r.feed(chunk)
|
|
39
|
+
print(safe_text, end="")
|
|
40
|
+
|
|
41
|
+
print(r.flush(), end="")
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Why?
|
|
45
|
+
|
|
46
|
+
AI models stream tokens incrementally. A word you want to replace may be split across chunks:
|
|
47
|
+
|
|
48
|
+
```
|
|
49
|
+
chunk 1: "hel"
|
|
50
|
+
chunk 2: "lo world"
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
Naive per-chunk replacement would miss `"hello"`. **stream-replace** buffers just enough text at chunk boundaries to detect partial matches, while emitting safe text as early as possible.
|
|
54
|
+
|
|
55
|
+
## API
|
|
56
|
+
|
|
57
|
+
### `Replacer(rules)`
|
|
58
|
+
|
|
59
|
+
Create a replacer with a list of `(pattern, replacement)` tuples.
|
|
60
|
+
|
|
61
|
+
| Pattern | Replacement | Description |
|
|
62
|
+
|---|---|---|
|
|
63
|
+
| `str` | `str` | Exact string replacement |
|
|
64
|
+
| `str` | `callable(matched_str) → str` | Dynamic string replacement |
|
|
65
|
+
| `re.Pattern` | `str` | Regex replacement (supports `\1` backrefs) |
|
|
66
|
+
| `re.Pattern` | `callable(re.Match) → str` | Dynamic regex replacement |
|
|
67
|
+
|
|
68
|
+
#### `r.feed(chunk: str) → str`
|
|
69
|
+
|
|
70
|
+
Process one incoming chunk. Returns text that is safe to emit (fully resolved, no pending partial matches).
|
|
71
|
+
|
|
72
|
+
#### `r.flush() → str`
|
|
73
|
+
|
|
74
|
+
Flush the internal buffer after the stream ends. Must be called once to get any remaining text.
|
|
75
|
+
|
|
76
|
+
#### `r.reset()`
|
|
77
|
+
|
|
78
|
+
Clear internal state so the replacer can be reused for another stream.
|
|
79
|
+
|
|
80
|
+
#### `r.wrap(iterable) → Iterable[str]`
|
|
81
|
+
|
|
82
|
+
Convenience wrapper for a sync chunk stream. Handles `feed` + `flush` automatically.
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
for text in r.wrap(chunks):
|
|
86
|
+
print(text, end="")
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
#### `r.wrap_async(async_iterable) → AsyncIterable[str]`
|
|
90
|
+
|
|
91
|
+
Same as `wrap`, but for async iterables.
|
|
92
|
+
|
|
93
|
+
```python
|
|
94
|
+
async for text in r.wrap_async(async_chunks):
|
|
95
|
+
print(text, end="")
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
### Functional API
|
|
99
|
+
|
|
100
|
+
For one-off use without creating a `Replacer` instance:
|
|
101
|
+
|
|
102
|
+
```python
|
|
103
|
+
from stream_replace import stream_replace, astream_replace
|
|
104
|
+
|
|
105
|
+
# sync
|
|
106
|
+
for text in stream_replace(chunks, [("hello", "world")]):
|
|
107
|
+
print(text, end="")
|
|
108
|
+
|
|
109
|
+
# async
|
|
110
|
+
async for text in astream_replace(async_chunks, [("hello", "world")]):
|
|
111
|
+
print(text, end="")
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
## How It Works
|
|
115
|
+
|
|
116
|
+
1. **Buffer**: Incoming chunks accumulate in an internal buffer.
|
|
117
|
+
2. **Match**: On each `feed()`, the buffer is scanned for complete matches across all rules. The earliest match wins.
|
|
118
|
+
3. **Replace**: Matched text is replaced; scanning continues from after the replacement.
|
|
119
|
+
4. **Hold back**: After all matches, the buffer tail is checked for *potential* partial matches (a suffix that could be the start of a pattern). This tail is held back for the next `feed()`.
|
|
120
|
+
5. **Flush**: On `flush()`, the remaining buffer is processed without holding anything back.
|
|
121
|
+
|
|
122
|
+
For regex rules, the library automatically extracts literal prefixes from the pattern (e.g., `"<think>"` from `r"<think>[\s\S]*?</think>"`) to detect both partial prefix matches and open-but-unclosed matches spanning multiple chunks.
|
|
123
|
+
|
|
124
|
+
## License
|
|
125
|
+
|
|
126
|
+
MIT
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
# stream-replace
|
|
2
|
+
|
|
3
|
+
Streaming text replacement for AI token streams — correctly handles partial matches across chunk boundaries.
|
|
4
|
+
|
|
5
|
+
## Install
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install stream-replace
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Quick Start
|
|
12
|
+
|
|
13
|
+
```python
|
|
14
|
+
import re
|
|
15
|
+
from stream_replace import Replacer
|
|
16
|
+
|
|
17
|
+
r = Replacer([
|
|
18
|
+
("敏感词", "***"), # string → string
|
|
19
|
+
("secret", lambda s: s[0] + "***"), # string → callable
|
|
20
|
+
(re.compile(r"1[3-9]\d{9}"), "[PHONE]"), # regex → string
|
|
21
|
+
(re.compile(r"<think>[\s\S]*?</think>"), ""), # regex → remove
|
|
22
|
+
(re.compile(r"(\d+)"), lambda m: str(int(m.group()) * 2)), # regex → callable
|
|
23
|
+
])
|
|
24
|
+
|
|
25
|
+
for chunk in ai_stream:
|
|
26
|
+
safe_text = r.feed(chunk)
|
|
27
|
+
print(safe_text, end="")
|
|
28
|
+
|
|
29
|
+
print(r.flush(), end="")
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Why?
|
|
33
|
+
|
|
34
|
+
AI models stream tokens incrementally. A word you want to replace may be split across chunks:
|
|
35
|
+
|
|
36
|
+
```
|
|
37
|
+
chunk 1: "hel"
|
|
38
|
+
chunk 2: "lo world"
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
Naive per-chunk replacement would miss `"hello"`. **stream-replace** buffers just enough text at chunk boundaries to detect partial matches, while emitting safe text as early as possible.
|
|
42
|
+
|
|
43
|
+
## API
|
|
44
|
+
|
|
45
|
+
### `Replacer(rules)`
|
|
46
|
+
|
|
47
|
+
Create a replacer with a list of `(pattern, replacement)` tuples.
|
|
48
|
+
|
|
49
|
+
| Pattern | Replacement | Description |
|
|
50
|
+
|---|---|---|
|
|
51
|
+
| `str` | `str` | Exact string replacement |
|
|
52
|
+
| `str` | `callable(matched_str) → str` | Dynamic string replacement |
|
|
53
|
+
| `re.Pattern` | `str` | Regex replacement (supports `\1` backrefs) |
|
|
54
|
+
| `re.Pattern` | `callable(re.Match) → str` | Dynamic regex replacement |
|
|
55
|
+
|
|
56
|
+
#### `r.feed(chunk: str) → str`
|
|
57
|
+
|
|
58
|
+
Process one incoming chunk. Returns text that is safe to emit (fully resolved, no pending partial matches).
|
|
59
|
+
|
|
60
|
+
#### `r.flush() → str`
|
|
61
|
+
|
|
62
|
+
Flush the internal buffer after the stream ends. Must be called once to get any remaining text.
|
|
63
|
+
|
|
64
|
+
#### `r.reset()`
|
|
65
|
+
|
|
66
|
+
Clear internal state so the replacer can be reused for another stream.
|
|
67
|
+
|
|
68
|
+
#### `r.wrap(iterable) → Iterable[str]`
|
|
69
|
+
|
|
70
|
+
Convenience wrapper for a sync chunk stream. Handles `feed` + `flush` automatically.
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
for text in r.wrap(chunks):
|
|
74
|
+
print(text, end="")
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
#### `r.wrap_async(async_iterable) → AsyncIterable[str]`
|
|
78
|
+
|
|
79
|
+
Same as `wrap`, but for async iterables.
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
async for text in r.wrap_async(async_chunks):
|
|
83
|
+
print(text, end="")
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### Functional API
|
|
87
|
+
|
|
88
|
+
For one-off use without creating a `Replacer` instance:
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
from stream_replace import stream_replace, astream_replace
|
|
92
|
+
|
|
93
|
+
# sync
|
|
94
|
+
for text in stream_replace(chunks, [("hello", "world")]):
|
|
95
|
+
print(text, end="")
|
|
96
|
+
|
|
97
|
+
# async
|
|
98
|
+
async for text in astream_replace(async_chunks, [("hello", "world")]):
|
|
99
|
+
print(text, end="")
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
## How It Works
|
|
103
|
+
|
|
104
|
+
1. **Buffer**: Incoming chunks accumulate in an internal buffer.
|
|
105
|
+
2. **Match**: On each `feed()`, the buffer is scanned for complete matches across all rules. The earliest match wins.
|
|
106
|
+
3. **Replace**: Matched text is replaced; scanning continues from after the replacement.
|
|
107
|
+
4. **Hold back**: After all matches, the buffer tail is checked for *potential* partial matches (a suffix that could be the start of a pattern). This tail is held back for the next `feed()`.
|
|
108
|
+
5. **Flush**: On `flush()`, the remaining buffer is processed without holding anything back.
|
|
109
|
+
|
|
110
|
+
For regex rules, the library automatically extracts literal prefixes from the pattern (e.g., `"<think>"` from `r"<think>[\s\S]*?</think>"`) to detect both partial prefix matches and open-but-unclosed matches spanning multiple chunks.
|
|
111
|
+
|
|
112
|
+
## License
|
|
113
|
+
|
|
114
|
+
MIT
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "stream-replace"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Streaming text replacement for AI token streams — handles partial matches across chunk boundaries"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.13"
|
|
11
|
+
license = "MIT"
|
|
12
|
+
classifiers = [
|
|
13
|
+
"Programming Language :: Python :: 3",
|
|
14
|
+
"License :: OSI Approved :: MIT License",
|
|
15
|
+
"Topic :: Text Processing :: Filters",
|
|
16
|
+
"Typing :: Typed",
|
|
17
|
+
]
|
|
18
|
+
dependencies = []
|
|
19
|
+
|
|
20
|
+
[tool.pytest.ini_options]
|
|
21
|
+
testpaths = ["tests"]
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import AsyncIterable, Iterable
|
|
4
|
+
|
|
5
|
+
from stream_replace._replacer import Replacer
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def stream_replace(
    iterable: Iterable[str],
    rules: list[tuple],
) -> Iterable[str]:
    """Apply replacement *rules* to a synchronous chunk stream.

    One-off functional form of :class:`Replacer`: builds a throwaway
    replacer and delegates to its ``wrap`` helper, which feeds every
    chunk and flushes the held-back tail automatically.
    """
    replacer = Replacer(rules)
    return replacer.wrap(iterable)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
async def astream_replace(
    iterable: AsyncIterable[str],
    rules: list[tuple],
) -> AsyncIterable[str]:
    """Apply replacement *rules* to an asynchronous chunk stream.

    Async counterpart of :func:`stream_replace`: builds a throwaway
    :class:`Replacer` and re-yields everything its ``wrap_async``
    helper produces (feed per chunk, then a final flush).
    """
    replacer = Replacer(rules)
    async for piece in replacer.wrap_async(iterable):
        yield piece
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import AsyncIterable, Iterable
|
|
4
|
+
|
|
5
|
+
from stream_replace._rule import Rule, parse_rule
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class Replacer:
    """Streaming text replacer that correctly handles partial matches across
    chunk boundaries.

    Incoming chunks accumulate in an internal buffer; complete matches are
    replaced immediately, while a tail that *could* be the start of a match
    is held back until the next ``feed()`` (or until ``flush()``).

    Usage::

        r = Replacer([("hello", "world")])
        for chunk in token_stream:
            yield r.feed(chunk)
        yield r.flush()
    """

    # Fixed attribute set: one parsed rule list plus the held-back text.
    __slots__ = ("_rules", "_buffer")

    def __init__(self, rules: list[tuple]) -> None:
        # Each user tuple is normalized into a StringRule/RegexRule object.
        self._rules: list[Rule] = [parse_rule(r) for r in rules]
        # Text received but not yet emitted (pending partial matches).
        self._buffer: str = ""

    # ------------------------------------------------------------------
    # Core API
    # ------------------------------------------------------------------

    def feed(self, chunk: str) -> str:
        """Process an incoming chunk. Returns text that is safe to emit."""
        if not chunk:
            # Empty chunks carry no information; skip the scan entirely.
            return ""
        self._buffer += chunk
        return self._process(flushing=False)

    def flush(self) -> str:
        """Flush any remaining buffered text. Call once after the stream ends."""
        return self._process(flushing=True)

    def reset(self) -> None:
        """Clear internal state so the instance can be reused."""
        self._buffer = ""

    # ------------------------------------------------------------------
    # Convenience wrappers
    # ------------------------------------------------------------------

    def wrap(self, iterable: Iterable[str]) -> Iterable[str]:
        """Wrap a sync iterable of chunks, yielding replaced text.

        Resets state first, feeds every chunk, and finishes with a flush;
        empty outputs are skipped.
        """
        self.reset()
        for chunk in iterable:
            out = self.feed(chunk)
            if out:
                yield out
        out = self.flush()
        if out:
            yield out

    async def wrap_async(self, iterable: AsyncIterable[str]) -> AsyncIterable[str]:
        """Wrap an async iterable of chunks, yielding replaced text.

        Async mirror of :meth:`wrap` — same reset/feed/flush sequence.
        """
        self.reset()
        async for chunk in iterable:
            out = self.feed(chunk)
            if out:
                yield out
        out = self.flush()
        if out:
            yield out

    # ------------------------------------------------------------------
    # Internal
    # ------------------------------------------------------------------

    def _process(self, *, flushing: bool) -> str:
        """Scan the buffer, apply complete matches, and decide what to emit.

        When *flushing* is False, a tail that might still grow into a match
        is kept in ``self._buffer``; when True everything is emitted and the
        buffer is cleared.
        """
        result: list[str] = []
        buf = self._buffer

        # Phase 1: find and apply all complete matches iteratively
        # NOTE(review): a match is resolved as soon as it is complete, so a
        # greedy pattern (e.g. r"\d+") whose match touches the buffer edge is
        # replaced now even though the next chunk could have extended it —
        # unless a rule's hold-back (literal prefix / lookback) keeps that
        # tail buffered. Confirm this is acceptable for greedy regexes.
        pos = 0
        while pos < len(buf):
            earliest = self._find_earliest_match(buf, pos)
            if earliest is None:
                break
            result.append(buf[pos:earliest.start])
            result.append(earliest.replacement_text)
            pos = earliest.end

        # Text after the last applied match; contains no complete match.
        remaining = buf[pos:]

        if flushing:
            # No more data coming — do one final replacement pass on the
            # remaining tail, then emit everything.
            # NOTE(review): phase 1 already consumed every complete match,
            # so this extra pass looks like a no-op safety net — verify.
            final = self._replace_all(remaining)
            result.append(final)
            self._buffer = ""
        else:
            # Determine safe-to-emit boundary in *remaining*.
            safe = self._safe_boundary(remaining)
            result.append(remaining[:safe])
            # Hold back the possibly-matching tail for the next feed().
            self._buffer = remaining[safe:]

        return "".join(result)

    def _find_earliest_match(self, text: str, pos: int) -> "MatchResult | None":
        """Return the earliest MatchResult among all rules, or None.

        Ties on start position go to the rule listed first (strict ``<``).
        """
        # Local import avoids a circular import at module load time.
        from stream_replace._rule import MatchResult

        best: MatchResult | None = None
        for rule in self._rules:
            m = rule.search(text, pos)
            if m is not None and (best is None or m.start < best.start):
                best = m
        return best

    def _safe_boundary(self, remaining: str) -> int:
        """How many characters from *remaining* can be safely emitted.

        The answer is the earliest hold-back point requested by any rule
        (a suffix that could begin a future match).
        """
        if not remaining:
            return 0

        safe = len(remaining)
        for rule in self._rules:
            p = rule.find_partial_start(remaining)
            if p is not None and p < safe:
                safe = p
        return safe

    def _replace_all(self, text: str) -> str:
        """Run all rules on *text* until no more matches (for flush).

        Repeats whole-text passes until a pass applies no replacement.
        """
        buf = text
        changed = True
        while changed:
            changed = False
            pos = 0
            parts: list[str] = []
            while pos < len(buf):
                m = self._find_earliest_match(buf, pos)
                if m is None:
                    break
                parts.append(buf[pos:m.start])
                parts.append(m.replacement_text)
                pos = m.end
                changed = True
            parts.append(buf[pos:])
            buf = "".join(parts)
        return buf
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from typing import Callable, Union
|
|
6
|
+
|
|
7
|
+
from stream_replace._utils import extract_literal_prefix, find_suffix_prefix_overlap
|
|
8
|
+
|
|
9
|
+
Replacement = Union[str, Callable]
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass(slots=True)
class MatchResult:
    """A resolved match: its span in the scanned text plus the replacement."""

    start: int             # index of the first matched character
    end: int               # index one past the last matched character
    replacement_text: str  # fully resolved replacement (callables already applied)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass(slots=True)
class StringRule:
    """Replacement rule for an exact substring *pattern*.

    ``replacement`` is either a literal string or a callable that receives
    the matched string and returns the substitution.
    """

    pattern: str
    replacement: Replacement

    def search(self, text: str, pos: int = 0) -> MatchResult | None:
        """Locate the first occurrence of the pattern at or after *pos*."""
        start = text.find(self.pattern, pos)
        if start < 0:
            return None
        if callable(self.replacement):
            replacement_text = self.replacement(self.pattern)
        else:
            replacement_text = self.replacement
        end = start + len(self.pattern)
        return MatchResult(start=start, end=end, replacement_text=replacement_text)

    def find_partial_start(self, text: str) -> int | None:
        """Earliest index where a suffix of *text* could begin the pattern."""
        return find_suffix_prefix_overlap(text, self.pattern)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass(slots=True)
class RegexRule:
    """Replacement rule driven by a compiled regex.

    ``replacement`` is either a template string (``\\1`` backrefs expanded
    via ``Match.expand``) or a callable receiving the ``re.Match``.
    """

    regex: re.Pattern[str]
    replacement: Replacement
    # Literal characters the pattern must start with; computed in __post_init__.
    literal_prefix: str = field(init=False)
    # When the pattern has no literal prefix, hold back this many trailing
    # characters on every feed as a conservative safety margin.
    _fallback_lookback: int = 0

    def __post_init__(self) -> None:
        self.literal_prefix = extract_literal_prefix(self.regex)
        if not self.literal_prefix and self._fallback_lookback == 0:
            # Default safety margin for prefix-less patterns.
            self._fallback_lookback = 32

    def search(self, text: str, pos: int = 0) -> MatchResult | None:
        """Return the first regex match at or after *pos*, fully resolved."""
        m = self.regex.search(text, pos)
        if m is None:
            return None
        if callable(self.replacement):
            rep = self.replacement(m)
        else:
            # Template string: expand \1-style backreferences.
            rep = m.expand(self.replacement)
        return MatchResult(start=m.start(), end=m.end(), replacement_text=rep)

    def find_partial_start(self, text: str) -> int | None:
        """Earliest index from which *text* must be held back, or None."""
        prefix = self.literal_prefix
        if prefix:
            # Check whether a complete literal prefix exists without a full
            # regex match — that means an open (unfinished) match is in
            # progress and we must hold back from that position.
            search_start = 0
            while True:
                idx = text.find(prefix, search_start)
                if idx == -1:
                    break
                if self.regex.search(text, idx) is None:
                    return idx
                search_start = idx + 1

            # Also check if the very tail is a *partial* literal prefix
            # (token split mid-prefix).
            return find_suffix_prefix_overlap(text, prefix)

        # No literal prefix to anchor on: conservatively hold back the last
        # _fallback_lookback characters of any non-empty text.
        if self._fallback_lookback > 0 and len(text) > 0:
            return max(0, len(text) - self._fallback_lookback)
        return None
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
Rule = StringRule | RegexRule
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def parse_rule(spec: tuple) -> Rule:
|
|
86
|
+
"""Convert a user-supplied tuple into a Rule object.
|
|
87
|
+
|
|
88
|
+
Accepted forms:
|
|
89
|
+
(str, str | callable)
|
|
90
|
+
(re.Pattern, str | callable)
|
|
91
|
+
(str, str | callable, dict) -- extra options forwarded
|
|
92
|
+
(re.Pattern, str | callable, dict)
|
|
93
|
+
"""
|
|
94
|
+
if len(spec) == 2:
|
|
95
|
+
pattern, replacement = spec
|
|
96
|
+
opts: dict = {}
|
|
97
|
+
elif len(spec) == 3:
|
|
98
|
+
pattern, replacement, opts = spec
|
|
99
|
+
else:
|
|
100
|
+
raise ValueError(f"Rule tuple must have 2 or 3 elements, got {len(spec)}")
|
|
101
|
+
|
|
102
|
+
if isinstance(pattern, str):
|
|
103
|
+
return StringRule(pattern=pattern, replacement=replacement)
|
|
104
|
+
if isinstance(pattern, re.Pattern):
|
|
105
|
+
lookback = opts.get("lookback", 0)
|
|
106
|
+
rule = RegexRule(regex=pattern, replacement=replacement, _fallback_lookback=lookback)
|
|
107
|
+
return rule
|
|
108
|
+
|
|
109
|
+
raise TypeError(f"Pattern must be str or re.Pattern, got {type(pattern).__name__}")
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
try:
|
|
6
|
+
import re._parser as _sre_parse # Python 3.13+
|
|
7
|
+
except ImportError:
|
|
8
|
+
import sre_parse as _sre_parse # type: ignore[no-redef]
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def extract_literal_prefix(pattern: re.Pattern[str]) -> str:
    """Return the run of leading literal characters of a compiled regex.

    Examples:
        r"<think>[\\s\\S]*?</think>" -> "<think>"
        r"hello\\s+world" -> "hello"
        r"\\d{11}" -> ""
    """
    try:
        ops = _sre_parse.parse(pattern.pattern)
    except Exception:
        # Unparseable source (unexpected for an already-compiled regex):
        # treat it as having no literal prefix.
        return ""

    prefix_chars: list[str] = []
    for opcode, argument in ops:
        if opcode != _sre_parse.LITERAL:
            # First non-literal opcode ends the prefix.
            break
        prefix_chars.append(chr(argument))
    return "".join(prefix_chars)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def find_suffix_prefix_overlap(text: str, pattern: str) -> int | None:
|
|
34
|
+
"""Find the earliest position in *text* where a suffix of *text* equals
|
|
35
|
+
a proper prefix of *pattern*.
|
|
36
|
+
|
|
37
|
+
Returns the start index in *text*, or None if no overlap exists.
|
|
38
|
+
|
|
39
|
+
Example:
|
|
40
|
+
text="abc hel", pattern="hello" -> 4 (suffix "hel" is prefix of "hello")
|
|
41
|
+
"""
|
|
42
|
+
if not pattern:
|
|
43
|
+
return None
|
|
44
|
+
|
|
45
|
+
max_check = min(len(text), len(pattern) - 1)
|
|
46
|
+
for length in range(max_check, 0, -1):
|
|
47
|
+
suffix = text[-length:]
|
|
48
|
+
if pattern[:length] == suffix:
|
|
49
|
+
return len(text) - length
|
|
50
|
+
return None
|
|
File without changes
|
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import re
|
|
5
|
+
|
|
6
|
+
import pytest
|
|
7
|
+
|
|
8
|
+
from stream_replace import Replacer, astream_replace, stream_replace
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# ---------------------------------------------------------------
|
|
12
|
+
# Helpers
|
|
13
|
+
# ---------------------------------------------------------------
|
|
14
|
+
|
|
15
|
+
def collect(replacer: Replacer, chunks: list[str]) -> str:
    """Feed every chunk through *replacer*, flush, and join all output."""
    streamed = (replacer.feed(chunk) for chunk in chunks)
    # join() drains all feeds first; flush() then releases the tail.
    return "".join(streamed) + replacer.flush()
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
async def _async_gen(chunks: list[str]):
|
|
22
|
+
for c in chunks:
|
|
23
|
+
yield c
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# ---------------------------------------------------------------
|
|
27
|
+
# Basic string replacement
|
|
28
|
+
# ---------------------------------------------------------------
|
|
29
|
+
|
|
30
|
+
class TestStringBasic:
    """Plain string rules resolved within a single chunk."""

    def test_single_chunk(self):
        # A match fully contained in one chunk is replaced immediately.
        r = Replacer([("hello", "world")])
        assert collect(r, ["hello there"]) == "world there"

    def test_multiple_occurrences(self):
        # Every non-overlapping occurrence is replaced, not just the first.
        r = Replacer([("ab", "X")])
        assert collect(r, ["ab cd ab"]) == "X cd X"

    def test_no_match(self):
        # Text without the pattern passes through unchanged.
        r = Replacer([("xyz", "!")])
        assert collect(r, ["hello world"]) == "hello world"

    def test_empty_chunks(self):
        # Empty chunks are ignored and do not disturb buffering.
        r = Replacer([("a", "b")])
        assert collect(r, ["", "a", "", "a", ""]) == "bb"

    def test_replacement_with_callable(self):
        # A callable replacement receives the matched string itself.
        r = Replacer([("bad", lambda s: s.upper())])
        assert collect(r, ["this is bad"]) == "this is BAD"
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
# ---------------------------------------------------------------
|
|
53
|
+
# Cross-chunk partial matching (strings)
|
|
54
|
+
# ---------------------------------------------------------------
|
|
55
|
+
|
|
56
|
+
class TestStringPartial:
    """String patterns whose matches straddle chunk boundaries."""

    def test_split_across_two_chunks(self):
        r = Replacer([("hello", "world")])
        assert collect(r, ["hel", "lo"]) == "world"

    def test_split_across_three_chunks(self):
        r = Replacer([("hello", "world")])
        assert collect(r, ["he", "ll", "o done"]) == "world done"

    def test_partial_that_never_completes(self):
        # Held-back text is released once it can no longer become a match.
        r = Replacer([("hello", "world")])
        assert collect(r, ["hel", " nope"]) == "hel nope"

    def test_overlapping_partial(self):
        # "abab" + "c": only the trailing "abc" is a real match.
        r = Replacer([("abc", "X")])
        assert collect(r, ["ab", "ab", "c"]) == "abX"

    def test_partial_at_stream_end(self):
        # flush() releases a still-pending partial match unchanged.
        r = Replacer([("hello", "world")])
        assert collect(r, ["hel"]) == "hel"

    def test_match_then_partial(self):
        # A complete match is replaced even while a new partial follows it.
        r = Replacer([("aa", "X")])
        assert collect(r, ["aa", "a"]) == "Xa"
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
# ---------------------------------------------------------------
|
|
83
|
+
# Regex replacement
|
|
84
|
+
# ---------------------------------------------------------------
|
|
85
|
+
|
|
86
|
+
class TestRegex:
    """Regex rules resolved within a single chunk."""

    def test_simple_regex(self):
        r = Replacer([(re.compile(r"\d+"), "#")])
        assert collect(r, ["abc 123 def"]) == "abc # def"

    def test_phone_number_masking(self):
        # Mainland-China mobile number shape: 1[3-9] plus nine digits.
        r = Replacer([(re.compile(r"1[3-9]\d{9}"), "[PHONE]")])
        assert collect(r, ["call 13800138000 now"]) == "call [PHONE] now"

    def test_regex_with_groups(self):
        # A callable replacement receives the re.Match object.
        r = Replacer([(re.compile(r"(\d+)"), lambda m: str(int(m.group()) * 2))])
        assert collect(r, ["val=5 x=10"]) == "val=10 x=20"

    def test_regex_expand(self):
        # String replacements support \1-style backreferences.
        r = Replacer([(re.compile(r"(\w+)@(\w+)"), r"\1[at]\2")])
        assert collect(r, ["email: a@b"]) == "email: a[at]b"
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
# ---------------------------------------------------------------
|
|
105
|
+
# Tag-style removal (unbounded regex)
|
|
106
|
+
# ---------------------------------------------------------------
|
|
107
|
+
|
|
108
|
+
class TestTagRemoval:
    """Removing whole <think>…</think> spans, including tags split across chunks."""

    def test_think_tag_single_chunk(self):
        r = Replacer([(re.compile(r"<think>[\s\S]*?</think>"), "")])
        assert collect(r, ["<think>reasoning</think>answer"]) == "answer"

    def test_think_tag_across_chunks(self):
        # Open tag, body, and close tag each arrive in separate chunks.
        r = Replacer([(re.compile(r"<think>[\s\S]*?</think>"), "")])
        chunks = ["<think>", "some reasoning", "</think>", "final answer"]
        assert collect(r, chunks) == "final answer"

    def test_think_tag_split_open(self):
        # The opening tag itself is split mid-token.
        r = Replacer([(re.compile(r"<think>[\s\S]*?</think>"), "")])
        chunks = ["<thi", "nk>stuff</think>ok"]
        assert collect(r, chunks) == "ok"

    def test_think_tag_split_close(self):
        # The closing tag is split mid-token.
        r = Replacer([(re.compile(r"<think>[\s\S]*?</think>"), "")])
        chunks = ["<think>stuff</thi", "nk>ok"]
        assert collect(r, chunks) == "ok"

    def test_text_before_think_tag(self):
        r = Replacer([(re.compile(r"<think>[\s\S]*?</think>"), "")])
        chunks = ["prefix <think>", "inner", "</think> suffix"]
        # Only the tag span is removed; the space before "<think>" and the
        # space after "</think>" both survive, so TWO spaces remain between
        # "prefix" and "suffix".
        assert collect(r, chunks) == "prefix  suffix"
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
# ---------------------------------------------------------------
|
|
135
|
+
# Multiple rules
|
|
136
|
+
# ---------------------------------------------------------------
|
|
137
|
+
|
|
138
|
+
class TestMultipleRules:
    """Several rules active at once: the earliest match in the text wins."""

    def test_string_and_regex(self):
        # String and regex rules can be mixed freely in one Replacer.
        r = Replacer([
            ("foo", "bar"),
            (re.compile(r"\d+"), "#"),
        ])
        assert collect(r, ["foo has 3 items"]) == "bar has # items"

    def test_earliest_match_wins(self):
        # "ab" starts at index 0, before "bc" at 1 — rule order is irrelevant.
        r = Replacer([
            ("bc", "X"),
            ("ab", "Y"),
        ])
        assert collect(r, ["abc"]) == "Yc"

    def test_non_overlapping(self):
        # Adjacent matches of different rules are both applied.
        r = Replacer([
            ("aa", "X"),
            ("bb", "Y"),
        ])
        assert collect(r, ["aabb"]) == "XY"
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
# ---------------------------------------------------------------
|
|
162
|
+
# wrap / wrap_async
|
|
163
|
+
# ---------------------------------------------------------------
|
|
164
|
+
|
|
165
|
+
class TestWrap:
    """wrap()/wrap_async() iterator helpers and the functional front-ends."""

    def test_wrap_sync(self):
        # wrap() handles feed-per-chunk plus the final flush automatically.
        r = Replacer([("hello", "world")])
        result = "".join(r.wrap(["hel", "lo there"]))
        assert result == "world there"

    def test_wrap_sync_functional(self):
        # stream_replace() is the one-shot functional form of wrap().
        result = "".join(stream_replace(["hel", "lo"], [("hello", "world")]))
        assert result == "world"

    @pytest.mark.asyncio
    async def test_wrap_async(self):
        r = Replacer([("hello", "world")])
        parts = []
        async for chunk in r.wrap_async(_async_gen(["hel", "lo there"])):
            parts.append(chunk)
        assert "".join(parts) == "world there"

    @pytest.mark.asyncio
    async def test_astream_replace(self):
        # astream_replace() is the one-shot functional form of wrap_async().
        parts = []
        async for chunk in astream_replace(
            _async_gen(["hel", "lo"]),
            [("hello", "world")],
        ):
            parts.append(chunk)
        assert "".join(parts) == "world"
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
# ---------------------------------------------------------------
|
|
195
|
+
# Reset / reuse
|
|
196
|
+
# ---------------------------------------------------------------
|
|
197
|
+
|
|
198
|
+
class TestReset:
    """reset() discards buffered partial matches so the instance is reusable."""

    def test_reset_clears_buffer(self):
        r = Replacer([("hello", "world")])
        r.feed("hel")  # leaves "hel" held back in the internal buffer
        r.reset()
        # The stale partial must not leak into the next stream.
        assert collect(r, ["hello"]) == "world"
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
# ---------------------------------------------------------------
|
|
207
|
+
# Edge cases
|
|
208
|
+
# ---------------------------------------------------------------
|
|
209
|
+
|
|
210
|
+
class TestEdgeCases:
    """Boundary conditions: empty input, unicode, degenerate patterns."""

    def test_empty_stream(self):
        r = Replacer([("x", "y")])
        assert collect(r, []) == ""

    def test_replacement_longer_than_pattern(self):
        r = Replacer([("a", "xyz")])
        assert collect(r, ["aaa"]) == "xyzxyzxyz"

    def test_replacement_empty(self):
        # Deleting "remove" leaves the spaces on BOTH sides of it, so two
        # consecutive spaces remain between "please" and "this".
        r = Replacer([("remove", "")])
        assert collect(r, ["please remove this"]) == "please  this"

    def test_chinese_characters(self):
        # Multi-byte characters split across chunks must still match.
        r = Replacer([("敏感词", "***")])
        assert collect(r, ["这是一个敏", "感词测试"]) == "这是一个***测试"

    def test_single_char_pattern(self):
        r = Replacer([("x", "O")])
        assert collect(r, ["axbxc"]) == "aObOc"

    def test_pattern_same_as_replacement(self):
        # Identity replacement must not loop or duplicate output.
        r = Replacer([("a", "a")])
        assert collect(r, ["aaa"]) == "aaa"

    def test_unicode_emoji(self):
        r = Replacer([("😀", "smile")])
        assert collect(r, ["hello 😀 world"]) == "hello smile world"

    def test_many_small_chunks(self):
        # One character per chunk — worst case for partial-match buffering.
        r = Replacer([("hello", "world")])
        assert collect(r, list("hello")) == "world"
|