stream_replace-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,4 @@
1
# Public package surface for stream_replace: the stateful Replacer class and
# the one-shot functional wrappers built on top of it.
from stream_replace._replacer import Replacer
from stream_replace._functional import stream_replace, astream_replace

__all__ = ["Replacer", "stream_replace", "astream_replace"]
@@ -0,0 +1,22 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import AsyncIterable, Iterable
4
+
5
+ from stream_replace._replacer import Replacer
6
+
7
+
8
def stream_replace(
    iterable: Iterable[str],
    rules: list[tuple],
) -> Iterable[str]:
    """Apply replacement *rules* to a synchronous stream of text chunks.

    One-shot functional form of ``Replacer(rules).wrap(iterable)`` for
    callers that do not need to reuse the replacer instance.
    """
    replacer = Replacer(rules)
    return replacer.wrap(iterable)
14
+
15
+
16
async def astream_replace(
    iterable: AsyncIterable[str],
    rules: list[tuple],
) -> AsyncIterable[str]:
    """Apply replacement *rules* to an asynchronous stream of text chunks.

    One-shot functional form of ``Replacer(rules).wrap_async(iterable)`` for
    callers that do not need to reuse the replacer instance.
    """
    replacer = Replacer(rules)
    async for piece in replacer.wrap_async(iterable):
        yield piece
@@ -0,0 +1,146 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import AsyncIterable, Iterable
4
+
5
+ from stream_replace._rule import Rule, parse_rule
6
+
7
+
8
class Replacer:
    """Streaming text replacer that correctly handles partial matches across
    chunk boundaries.

    Usage::

        r = Replacer([("hello", "world")])
        for chunk in token_stream:
            yield r.feed(chunk)
        yield r.flush()
    """

    __slots__ = ("_rules", "_buffer")

    def __init__(self, rules: list[tuple]) -> None:
        """Compile *rules*; see ``parse_rule`` for the accepted tuple forms."""
        self._rules: list[Rule] = [parse_rule(r) for r in rules]
        # Text received but not yet emitted (its tail may be a partial match).
        self._buffer: str = ""

    # ------------------------------------------------------------------
    # Core API
    # ------------------------------------------------------------------

    def feed(self, chunk: str) -> str:
        """Process an incoming chunk. Returns text that is safe to emit."""
        if not chunk:
            return ""
        self._buffer += chunk
        return self._process(flushing=False)

    def flush(self) -> str:
        """Flush any remaining buffered text. Call once after the stream ends."""
        return self._process(flushing=True)

    def reset(self) -> None:
        """Clear internal state so the instance can be reused."""
        self._buffer = ""

    # ------------------------------------------------------------------
    # Convenience wrappers
    # ------------------------------------------------------------------

    def wrap(self, iterable: Iterable[str]) -> Iterable[str]:
        """Wrap a sync iterable of chunks, yielding replaced text.

        Handles ``feed`` for every chunk plus the final ``flush``; empty
        outputs are skipped.
        """
        self.reset()
        for chunk in iterable:
            out = self.feed(chunk)
            if out:
                yield out
        out = self.flush()
        if out:
            yield out

    async def wrap_async(self, iterable: AsyncIterable[str]) -> AsyncIterable[str]:
        """Wrap an async iterable of chunks, yielding replaced text.

        Async counterpart of :meth:`wrap`.
        """
        self.reset()
        async for chunk in iterable:
            out = self.feed(chunk)
            if out:
                yield out
        out = self.flush()
        if out:
            yield out

    # ------------------------------------------------------------------
    # Internal
    # ------------------------------------------------------------------

    def _process(self, *, flushing: bool) -> str:
        """Apply matches to the buffer and return the emittable text.

        When *flushing*, the whole buffer is drained; otherwise any tail
        that could still grow into a match is held back for the next feed.
        """
        result: list[str] = []
        buf = self._buffer

        # Phase 1: find and apply all complete matches iteratively.
        # Replacement output is emitted as-is and never re-scanned, so one
        # rule's output cannot cascade into another match.
        pos = 0
        while pos < len(buf):
            earliest = self._find_earliest_match(buf, pos)
            if earliest is None:
                break
            result.append(buf[pos:earliest.start])
            result.append(earliest.replacement_text)
            pos = earliest.end

        remaining = buf[pos:]

        if flushing:
            # No more data coming. Phase 1 searched *buf* with a position
            # offset, which position-sensitive constructs (^, \b,
            # lookbehind) evaluate against the full buffer; re-scan the
            # tail as a standalone string so such patterns get one final
            # chance, then emit everything.
            final = self._replace_all(remaining)
            result.append(final)
            self._buffer = ""
        else:
            # Determine safe-to-emit boundary in *remaining*.
            safe = self._safe_boundary(remaining)
            result.append(remaining[:safe])
            self._buffer = remaining[safe:]

        return "".join(result)

    def _find_earliest_match(self, text: str, pos: int):
        """Return the earliest MatchResult among all rules, or None."""
        # Imported locally so _replacer has no import-time dependency on the
        # MatchResult name; the annotation below is only for type checkers.
        from stream_replace._rule import MatchResult

        best: MatchResult | None = None
        for rule in self._rules:
            m = rule.search(text, pos)
            # Earliest start wins; ties go to the rule listed first.
            if m is not None and (best is None or m.start < best.start):
                best = m
        return best

    def _safe_boundary(self, remaining: str) -> int:
        """How many characters from *remaining* can be safely emitted."""
        if not remaining:
            return 0

        # Hold back from the earliest position any rule reports as a
        # possible (partial or still-open) match start.
        safe = len(remaining)
        for rule in self._rules:
            p = rule.find_partial_start(remaining)
            if p is not None and p < safe:
                safe = p
        return safe

    def _replace_all(self, text: str) -> str:
        """Run all rules once, left to right, over *text* (flush-time pass).

        Deliberately a single pass, consistent with the ``feed`` path:
        replacement output is not re-scanned. (The previous
        repeat-until-stable loop could fail to terminate whenever a
        replacement re-introduced text its own pattern matches.)
        """
        pos = 0
        parts: list[str] = []
        while pos < len(text):
            m = self._find_earliest_match(text, pos)
            if m is None:
                break
            parts.append(text[pos:m.start])
            parts.append(m.replacement_text)
            pos = m.end
        parts.append(text[pos:])
        return "".join(parts)
@@ -0,0 +1,109 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from dataclasses import dataclass, field
5
+ from typing import Callable, Union
6
+
7
+ from stream_replace._utils import extract_literal_prefix, find_suffix_prefix_overlap
8
+
9
+ Replacement = Union[str, Callable]
10
+
11
+
12
@dataclass(slots=True)
class MatchResult:
    """Location and fully-resolved replacement text for one rule match."""

    # Half-open span [start, end) of the match within the searched text.
    start: int
    end: int
    # Text to emit in place of the matched span (callable replacements and
    # backreferences already expanded).
    replacement_text: str
17
+
18
+
19
@dataclass(slots=True)
class StringRule:
    """Replacement rule driven by plain substring matching."""

    pattern: str
    replacement: Replacement

    def search(self, text: str, pos: int = 0) -> MatchResult | None:
        """Locate the first occurrence of ``pattern`` at or after *pos*."""
        start = text.find(self.pattern, pos)
        if start < 0:
            return None
        # Callable replacements receive the matched substring itself.
        if callable(self.replacement):
            rep = self.replacement(self.pattern)
        else:
            rep = self.replacement
        return MatchResult(
            start=start,
            end=start + len(self.pattern),
            replacement_text=rep,
        )

    def find_partial_start(self, text: str) -> int | None:
        """Index where a trailing fragment of *text* could begin ``pattern``."""
        return find_suffix_prefix_overlap(text, self.pattern)
34
+
35
+
36
@dataclass(slots=True)
class RegexRule:
    """Replacement rule driven by a compiled regular expression.

    ``literal_prefix`` is derived from the pattern in ``__post_init__`` and
    is used to detect matches still forming at a chunk boundary. When no
    literal prefix can be extracted, the last ``_fallback_lookback``
    characters of the buffer are always held back instead (defaults to 32
    unless the caller supplied a positive value).
    """

    regex: re.Pattern[str]
    replacement: Replacement
    # Computed in __post_init__; excluded from the generated __init__.
    literal_prefix: str = field(init=False)
    # 0 means "no explicit lookback requested"; may be promoted to 32 below.
    _fallback_lookback: int = 0

    def __post_init__(self) -> None:
        self.literal_prefix = extract_literal_prefix(self.regex)
        if not self.literal_prefix and self._fallback_lookback == 0:
            # No usable prefix: conservatively retain a fixed-size tail.
            self._fallback_lookback = 32

    def search(self, text: str, pos: int = 0) -> MatchResult | None:
        """Return the first complete regex match at or after *pos*, or None."""
        m = self.regex.search(text, pos)
        if m is None:
            return None
        if callable(self.replacement):
            # Callable replacements receive the re.Match object.
            rep = self.replacement(m)
        else:
            # String replacements support backreferences via Match.expand.
            rep = m.expand(self.replacement)
        return MatchResult(start=m.start(), end=m.end(), replacement_text=rep)

    def find_partial_start(self, text: str) -> int | None:
        """Earliest index of *text* that must be held back, or None.

        Detects two situations via the literal prefix: a complete prefix
        with no finished match from that point (an "open" match awaiting
        more chunks), and a tail that is only part of the prefix.
        """
        prefix = self.literal_prefix
        if prefix:
            # Check whether a complete literal prefix exists without a full
            # regex match — that means an open (unfinished) match is in
            # progress and we must hold back from that position.
            search_start = 0
            while True:
                idx = text.find(prefix, search_start)
                if idx == -1:
                    break
                if self.regex.search(text, idx) is None:
                    return idx
                # A finished match exists from idx; keep scanning for later,
                # possibly unfinished, prefix occurrences.
                search_start = idx + 1

            # Also check if the very tail is a *partial* literal prefix
            # (token split mid-prefix).
            return find_suffix_prefix_overlap(text, prefix)

        if self._fallback_lookback > 0 and len(text) > 0:
            # No prefix knowledge: always hold back the last N characters.
            return max(0, len(text) - self._fallback_lookback)
        return None
80
+
81
+
82
+ Rule = StringRule | RegexRule
83
+
84
+
85
def parse_rule(spec: tuple) -> Rule:
    """Build a Rule object from a user-supplied tuple.

    Accepted forms:
        (str, str | callable)
        (re.Pattern, str | callable)
        (str, str | callable, dict) -- extra options forwarded
        (re.Pattern, str | callable, dict)
    """
    if len(spec) not in (2, 3):
        raise ValueError(f"Rule tuple must have 2 or 3 elements, got {len(spec)}")

    pattern, replacement = spec[0], spec[1]
    opts: dict = spec[2] if len(spec) == 3 else {}

    if isinstance(pattern, str):
        return StringRule(pattern=pattern, replacement=replacement)
    if isinstance(pattern, re.Pattern):
        return RegexRule(
            regex=pattern,
            replacement=replacement,
            _fallback_lookback=opts.get("lookback", 0),
        )

    raise TypeError(f"Pattern must be str or re.Pattern, got {type(pattern).__name__}")
@@ -0,0 +1,50 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+
5
+ try:
6
+ import re._parser as _sre_parse  # Python 3.11+
7
+ except ImportError:
8
+ import sre_parse as _sre_parse # type: ignore[no-redef]
9
+
10
+
11
+ def extract_literal_prefix(pattern: re.Pattern[str]) -> str:
12
+ """Extract the leading literal character sequence from a compiled regex.
13
+
14
+ Examples:
15
+ r"<think>[\\s\\S]*?</think>" -> "<think>"
16
+ r"hello\\s+world" -> "hello"
17
+ r"\\d{11}" -> ""
18
+ """
19
+ try:
20
+ parsed = _sre_parse.parse(pattern.pattern)
21
+ except Exception:
22
+ return ""
23
+
24
+ chars: list[str] = []
25
+ for op, av in parsed:
26
+ if op == _sre_parse.LITERAL:
27
+ chars.append(chr(av))
28
+ else:
29
+ break
30
+ return "".join(chars)
31
+
32
+
33
+ def find_suffix_prefix_overlap(text: str, pattern: str) -> int | None:
34
+ """Find the earliest position in *text* where a suffix of *text* equals
35
+ a proper prefix of *pattern*.
36
+
37
+ Returns the start index in *text*, or None if no overlap exists.
38
+
39
+ Example:
40
+ text="abc hel", pattern="hello" -> 4 (suffix "hel" is prefix of "hello")
41
+ """
42
+ if not pattern:
43
+ return None
44
+
45
+ max_check = min(len(text), len(pattern) - 1)
46
+ for length in range(max_check, 0, -1):
47
+ suffix = text[-length:]
48
+ if pattern[:length] == suffix:
49
+ return len(text) - length
50
+ return None
@@ -0,0 +1,126 @@
1
+ Metadata-Version: 2.4
2
+ Name: stream-replace
3
+ Version: 0.1.0
4
+ Summary: Streaming text replacement for AI token streams — handles partial matches across chunk boundaries
5
+ License-Expression: MIT
6
+ Classifier: License :: OSI Approved :: MIT License
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: Topic :: Text Processing :: Filters
9
+ Classifier: Typing :: Typed
10
+ Requires-Python: >=3.13
11
+ Description-Content-Type: text/markdown
12
+
13
+ # stream-replace
14
+
15
+ Streaming text replacement for AI token streams — correctly handles partial matches across chunk boundaries.
16
+
17
+ ## Install
18
+
19
+ ```bash
20
+ pip install stream-replace
21
+ ```
22
+
23
+ ## Quick Start
24
+
25
+ ```python
26
+ import re
27
+ from stream_replace import Replacer
28
+
29
+ r = Replacer([
30
+ ("敏感词", "***"), # string → string
31
+ ("secret", lambda s: s[0] + "***"), # string → callable
32
+ (re.compile(r"1[3-9]\d{9}"), "[PHONE]"), # regex → string
33
+ (re.compile(r"<think>[\s\S]*?</think>"), ""), # regex → remove
34
+ (re.compile(r"(\d+)"), lambda m: str(int(m.group()) * 2)), # regex → callable
35
+ ])
36
+
37
+ for chunk in ai_stream:
38
+ safe_text = r.feed(chunk)
39
+ print(safe_text, end="")
40
+
41
+ print(r.flush(), end="")
42
+ ```
43
+
44
+ ## Why?
45
+
46
+ AI models stream tokens incrementally. A word you want to replace may be split across chunks:
47
+
48
+ ```
49
+ chunk 1: "hel"
50
+ chunk 2: "lo world"
51
+ ```
52
+
53
+ Naive per-chunk replacement would miss `"hello"`. **stream-replace** buffers just enough text at chunk boundaries to detect partial matches, while emitting safe text as early as possible.
54
+
55
+ ## API
56
+
57
+ ### `Replacer(rules)`
58
+
59
+ Create a replacer with a list of `(pattern, replacement)` tuples.
60
+
61
+ | Pattern | Replacement | Description |
62
+ |---|---|---|
63
+ | `str` | `str` | Exact string replacement |
64
+ | `str` | `callable(matched_str) → str` | Dynamic string replacement |
65
+ | `re.Pattern` | `str` | Regex replacement (supports `\1` backrefs) |
66
+ | `re.Pattern` | `callable(re.Match) → str` | Dynamic regex replacement |
67
+
68
+ #### `r.feed(chunk: str) → str`
69
+
70
+ Process one incoming chunk. Returns text that is safe to emit (fully resolved, no pending partial matches).
71
+
72
+ #### `r.flush() → str`
73
+
74
+ Flush the internal buffer after the stream ends. Must be called once to get any remaining text.
75
+
76
+ #### `r.reset()`
77
+
78
+ Clear internal state so the replacer can be reused for another stream.
79
+
80
+ #### `r.wrap(iterable) → Iterable[str]`
81
+
82
+ Convenience wrapper for a sync chunk stream. Handles `feed` + `flush` automatically.
83
+
84
+ ```python
85
+ for text in r.wrap(chunks):
86
+ print(text, end="")
87
+ ```
88
+
89
+ #### `r.wrap_async(async_iterable) → AsyncIterable[str]`
90
+
91
+ Same as `wrap`, but for async iterables.
92
+
93
+ ```python
94
+ async for text in r.wrap_async(async_chunks):
95
+ print(text, end="")
96
+ ```
97
+
98
+ ### Functional API
99
+
100
+ For one-off use without creating a `Replacer` instance:
101
+
102
+ ```python
103
+ from stream_replace import stream_replace, astream_replace
104
+
105
+ # sync
106
+ for text in stream_replace(chunks, [("hello", "world")]):
107
+ print(text, end="")
108
+
109
+ # async
110
+ async for text in astream_replace(async_chunks, [("hello", "world")]):
111
+ print(text, end="")
112
+ ```
113
+
114
+ ## How It Works
115
+
116
+ 1. **Buffer**: Incoming chunks accumulate in an internal buffer.
117
+ 2. **Match**: On each `feed()`, the buffer is scanned for complete matches across all rules. The earliest match wins.
118
+ 3. **Replace**: Matched text is replaced; scanning continues from after the replacement.
119
+ 4. **Hold back**: After all matches, the buffer tail is checked for *potential* partial matches (a suffix that could be the start of a pattern). This tail is held back for the next `feed()`.
120
+ 5. **Flush**: On `flush()`, the remaining buffer is processed without holding anything back.
121
+
122
+ For regex rules, the library automatically extracts literal prefixes from the pattern (e.g., `"<think>"` from `r"<think>[\s\S]*?</think>"`) to detect both partial prefix matches and open-but-unclosed matches spanning multiple chunks.
123
+
124
+ ## License
125
+
126
+ MIT
@@ -0,0 +1,8 @@
1
+ stream_replace/__init__.py,sha256=9kR47PP6CpFrBOe53TxdjwfE3jELI_YnmfuAGsis8Vo,178
2
+ stream_replace/_functional.py,sha256=lpE5qWoGqPos3tiCB7i46rpFYFbSiYbkarc2EZUnAvw,626
3
+ stream_replace/_replacer.py,sha256=79wyOiywBWnm6dPOrOi4srD4IjDgisw4eQHENNTe3No,4753
4
+ stream_replace/_rule.py,sha256=USrAayLXt7bkatontMwmzFFHDAVcqfAml0UYZg2-tCA,3589
5
+ stream_replace/_utils.py,sha256=qRvEpeHuezRhyJZ-uHJWH1uw0Ylcx9Kg8Mxh2rkjF6g,1386
6
+ stream_replace-0.1.0.dist-info/METADATA,sha256=SfvlmHwZ4m3XBaDKzCkwhu7eA2sVrIsisAFAuECKifk,3902
7
+ stream_replace-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
8
+ stream_replace-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any