minder-cli 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- minder/__init__.py +12 -0
- minder/api/routers/prompts.py +177 -0
- minder/application/__init__.py +1 -0
- minder/application/admin/__init__.py +11 -0
- minder/application/admin/dto.py +453 -0
- minder/application/admin/jobs.py +327 -0
- minder/application/admin/use_cases.py +1895 -0
- minder/auth/__init__.py +12 -0
- minder/auth/context.py +26 -0
- minder/auth/middleware.py +70 -0
- minder/auth/principal.py +59 -0
- minder/auth/rate_limiter.py +89 -0
- minder/auth/rbac.py +60 -0
- minder/auth/service.py +541 -0
- minder/bootstrap/__init__.py +9 -0
- minder/bootstrap/providers.py +109 -0
- minder/bootstrap/transport.py +807 -0
- minder/cache/__init__.py +10 -0
- minder/cache/providers.py +140 -0
- minder/chunking/__init__.py +4 -0
- minder/chunking/code_splitter.py +184 -0
- minder/chunking/splitter.py +136 -0
- minder/cli.py +1542 -0
- minder/config.py +179 -0
- minder/continuity.py +363 -0
- minder/dev.py +160 -0
- minder/embedding/__init__.py +9 -0
- minder/embedding/base.py +7 -0
- minder/embedding/local.py +65 -0
- minder/embedding/openai.py +7 -0
- minder/graph/__init__.py +11 -0
- minder/graph/edges.py +13 -0
- minder/graph/executor.py +127 -0
- minder/graph/graph.py +263 -0
- minder/graph/nodes/__init__.py +27 -0
- minder/graph/nodes/evaluator.py +21 -0
- minder/graph/nodes/guard.py +64 -0
- minder/graph/nodes/llm.py +59 -0
- minder/graph/nodes/planning.py +30 -0
- minder/graph/nodes/reasoning.py +87 -0
- minder/graph/nodes/reranker.py +141 -0
- minder/graph/nodes/retriever.py +86 -0
- minder/graph/nodes/verification.py +230 -0
- minder/graph/nodes/workflow_planner.py +250 -0
- minder/graph/runtime.py +15 -0
- minder/graph/state.py +26 -0
- minder/llm/__init__.py +5 -0
- minder/llm/base.py +14 -0
- minder/llm/local.py +381 -0
- minder/llm/openai.py +89 -0
- minder/models/__init__.py +109 -0
- minder/models/base.py +10 -0
- minder/models/client.py +137 -0
- minder/models/document.py +34 -0
- minder/models/error.py +32 -0
- minder/models/graph.py +114 -0
- minder/models/history.py +32 -0
- minder/models/job.py +62 -0
- minder/models/prompt.py +41 -0
- minder/models/repository.py +62 -0
- minder/models/rule.py +68 -0
- minder/models/session.py +51 -0
- minder/models/skill.py +52 -0
- minder/models/user.py +41 -0
- minder/models/workflow.py +35 -0
- minder/observability/__init__.py +57 -0
- minder/observability/audit.py +243 -0
- minder/observability/logging.py +253 -0
- minder/observability/metrics.py +448 -0
- minder/observability/tracing.py +215 -0
- minder/presentation/__init__.py +1 -0
- minder/presentation/http/__init__.py +1 -0
- minder/presentation/http/admin/__init__.py +3 -0
- minder/presentation/http/admin/api.py +1309 -0
- minder/presentation/http/admin/context.py +94 -0
- minder/presentation/http/admin/dashboard.py +111 -0
- minder/presentation/http/admin/jobs.py +208 -0
- minder/presentation/http/admin/memories.py +185 -0
- minder/presentation/http/admin/prompts.py +219 -0
- minder/presentation/http/admin/routes.py +127 -0
- minder/presentation/http/admin/runtime.py +650 -0
- minder/presentation/http/admin/search.py +368 -0
- minder/presentation/http/admin/skills.py +230 -0
- minder/prompts/__init__.py +646 -0
- minder/prompts/formatter.py +142 -0
- minder/resources/__init__.py +318 -0
- minder/retrieval/__init__.py +5 -0
- minder/retrieval/hybrid.py +178 -0
- minder/retrieval/mmr.py +116 -0
- minder/retrieval/multi_hop.py +115 -0
- minder/runtime.py +15 -0
- minder/server.py +145 -0
- minder/store/__init__.py +64 -0
- minder/store/document.py +115 -0
- minder/store/error.py +82 -0
- minder/store/feedback.py +114 -0
- minder/store/graph.py +588 -0
- minder/store/history.py +57 -0
- minder/store/interfaces.py +512 -0
- minder/store/milvus/__init__.py +11 -0
- minder/store/milvus/client.py +26 -0
- minder/store/milvus/collections.py +15 -0
- minder/store/milvus/vector_store.py +232 -0
- minder/store/mongodb/__init__.py +11 -0
- minder/store/mongodb/client.py +49 -0
- minder/store/mongodb/indexes.py +90 -0
- minder/store/mongodb/operational_store.py +993 -0
- minder/store/relational.py +1087 -0
- minder/store/repo_state.py +58 -0
- minder/store/rule.py +93 -0
- minder/store/vector.py +79 -0
- minder/tools/__init__.py +47 -0
- minder/tools/auth.py +94 -0
- minder/tools/graph.py +839 -0
- minder/tools/ingest.py +353 -0
- minder/tools/memory.py +381 -0
- minder/tools/query.py +307 -0
- minder/tools/registry.py +269 -0
- minder/tools/repo_scanner.py +1266 -0
- minder/tools/search.py +15 -0
- minder/tools/session.py +316 -0
- minder/tools/skills.py +899 -0
- minder/tools/workflow.py +215 -0
- minder/transport/__init__.py +4 -0
- minder/transport/base.py +286 -0
- minder/transport/sse.py +252 -0
- minder/transport/stdio.py +29 -0
- minder_cli-0.2.0.dist-info/METADATA +318 -0
- minder_cli-0.2.0.dist-info/RECORD +132 -0
- minder_cli-0.2.0.dist-info/WHEEL +4 -0
- minder_cli-0.2.0.dist-info/entry_points.txt +2 -0
- minder_cli-0.2.0.dist-info/licenses/LICENSE +201 -0
minder/cache/__init__.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Redis Cache Provider — implements ICacheProvider for runtime cache/session layer.
|
|
3
|
+
|
|
4
|
+
Provides async cache operations backed by Redis. Supports key-value storage,
|
|
5
|
+
TTL, namespaced operations, and health checks.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import redis.asyncio as aioredis
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class RedisCacheProvider:
|
|
14
|
+
"""Async Redis cache provider implementing ICacheProvider."""
|
|
15
|
+
|
|
16
|
+
def __init__(
|
|
17
|
+
self,
|
|
18
|
+
uri: str = "redis://localhost:6379/0",
|
|
19
|
+
*,
|
|
20
|
+
prefix: str = "minder:",
|
|
21
|
+
default_ttl: int = 3600,
|
|
22
|
+
) -> None:
|
|
23
|
+
self._prefix = prefix
|
|
24
|
+
self._default_ttl = default_ttl
|
|
25
|
+
self._client: aioredis.Redis = aioredis.from_url( # type: ignore[type-arg]
|
|
26
|
+
uri,
|
|
27
|
+
decode_responses=True,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
def _key(self, key: str) -> str:
|
|
31
|
+
"""Apply namespace prefix to key."""
|
|
32
|
+
return f"{self._prefix}{key}"
|
|
33
|
+
|
|
34
|
+
async def get(self, key: str) -> str | None:
|
|
35
|
+
"""Get a value by key."""
|
|
36
|
+
result = await self._client.get(self._key(key))
|
|
37
|
+
if isinstance(result, bytes):
|
|
38
|
+
return result.decode("utf-8")
|
|
39
|
+
return result # type: ignore[return-value]
|
|
40
|
+
|
|
41
|
+
async def set(self, key: str, value: str, *, ttl: int | None = None) -> None:
|
|
42
|
+
"""Set a key-value pair with optional TTL."""
|
|
43
|
+
effective_ttl = ttl if ttl is not None else self._default_ttl
|
|
44
|
+
await self._client.set(self._key(key), value, ex=effective_ttl)
|
|
45
|
+
|
|
46
|
+
async def delete(self, key: str) -> None:
|
|
47
|
+
"""Delete a key."""
|
|
48
|
+
await self._client.delete(self._key(key))
|
|
49
|
+
|
|
50
|
+
async def exists(self, key: str) -> bool:
|
|
51
|
+
"""Check if a key exists."""
|
|
52
|
+
return bool(await self._client.exists(self._key(key)))
|
|
53
|
+
|
|
54
|
+
async def expire(self, key: str, ttl: int) -> None:
|
|
55
|
+
"""Set expiration on an existing key."""
|
|
56
|
+
await self._client.expire(self._key(key), ttl)
|
|
57
|
+
|
|
58
|
+
async def incr(self, key: str) -> int:
|
|
59
|
+
"""Increment an integer value."""
|
|
60
|
+
result = await self._client.incr(self._key(key))
|
|
61
|
+
return int(result)
|
|
62
|
+
|
|
63
|
+
async def keys(self, pattern: str) -> list[str]:
|
|
64
|
+
"""Get keys matching a pattern (within namespace)."""
|
|
65
|
+
full_pattern = self._key(pattern)
|
|
66
|
+
raw_keys: list[str] = await self._client.keys(full_pattern) # type: ignore[assignment]
|
|
67
|
+
prefix_len = len(self._prefix)
|
|
68
|
+
return [k[prefix_len:] if k.startswith(self._prefix) else k for k in raw_keys]
|
|
69
|
+
|
|
70
|
+
async def flush_namespace(self, namespace: str) -> None:
|
|
71
|
+
"""Delete all keys under a specific namespace prefix."""
|
|
72
|
+
pattern = self._key(f"{namespace}:*")
|
|
73
|
+
raw_keys = await self._client.keys(pattern)
|
|
74
|
+
if raw_keys:
|
|
75
|
+
await self._client.delete(*raw_keys)
|
|
76
|
+
|
|
77
|
+
async def health_check(self) -> bool:
|
|
78
|
+
"""Ping Redis to check connectivity."""
|
|
79
|
+
try:
|
|
80
|
+
return bool(await self._client.ping()) # type: ignore[misc]
|
|
81
|
+
except Exception:
|
|
82
|
+
return False
|
|
83
|
+
|
|
84
|
+
async def close(self) -> None:
|
|
85
|
+
"""Close the Redis connection."""
|
|
86
|
+
await self._client.aclose()
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
class LRUCacheProvider:
|
|
90
|
+
"""
|
|
91
|
+
In-memory LRU cache provider implementing ICacheProvider.
|
|
92
|
+
|
|
93
|
+
Used as zero-dependency fallback when Redis is not available.
|
|
94
|
+
"""
|
|
95
|
+
|
|
96
|
+
def __init__(self, *, max_size: int = 1000, default_ttl: int = 3600) -> None:
|
|
97
|
+
self._max_size = max_size
|
|
98
|
+
self._default_ttl = default_ttl
|
|
99
|
+
self._store: dict[str, str] = {}
|
|
100
|
+
|
|
101
|
+
async def get(self, key: str) -> str | None:
|
|
102
|
+
return self._store.get(key)
|
|
103
|
+
|
|
104
|
+
async def set(self, key: str, value: str, *, ttl: int | None = None) -> None:
|
|
105
|
+
if len(self._store) >= self._max_size:
|
|
106
|
+
# Evict oldest entry (FIFO as simple approximation)
|
|
107
|
+
oldest_key = next(iter(self._store))
|
|
108
|
+
del self._store[oldest_key]
|
|
109
|
+
self._store[key] = value
|
|
110
|
+
|
|
111
|
+
async def delete(self, key: str) -> None:
|
|
112
|
+
self._store.pop(key, None)
|
|
113
|
+
|
|
114
|
+
async def exists(self, key: str) -> bool:
|
|
115
|
+
return key in self._store
|
|
116
|
+
|
|
117
|
+
async def expire(self, key: str, ttl: int) -> None:
|
|
118
|
+
pass # No-op for in-memory store
|
|
119
|
+
|
|
120
|
+
async def incr(self, key: str) -> int:
|
|
121
|
+
current = int(self._store.get(key, "0"))
|
|
122
|
+
current += 1
|
|
123
|
+
self._store[key] = str(current)
|
|
124
|
+
return current
|
|
125
|
+
|
|
126
|
+
async def keys(self, pattern: str) -> list[str]:
|
|
127
|
+
import fnmatch
|
|
128
|
+
return [k for k in self._store if fnmatch.fnmatch(k, pattern)]
|
|
129
|
+
|
|
130
|
+
async def flush_namespace(self, namespace: str) -> None:
|
|
131
|
+
prefix = f"{namespace}:"
|
|
132
|
+
to_delete = [k for k in self._store if k.startswith(prefix)]
|
|
133
|
+
for k in to_delete:
|
|
134
|
+
del self._store[k]
|
|
135
|
+
|
|
136
|
+
async def health_check(self) -> bool:
|
|
137
|
+
return True
|
|
138
|
+
|
|
139
|
+
async def close(self) -> None:
|
|
140
|
+
self._store.clear()
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
"""
|
|
2
|
+
AST-aware code chunking.
|
|
3
|
+
|
|
4
|
+
Python: uses the standard-library ``ast`` module to split at top-level
|
|
5
|
+
``def`` / ``async def`` / ``class`` boundaries. Module-level import
|
|
6
|
+
statements are prepended to every chunk for self-containedness.
|
|
7
|
+
|
|
8
|
+
TypeScript / JavaScript / Java: falls back to a brace-depth (``{`` / ``}``)
|
|
9
|
+
line-based splitter that cuts at depth-0 boundaries.
|
|
10
|
+
|
|
11
|
+
Any other language: attempts Python AST first, then brace-depth.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import ast
|
|
17
|
+
from dataclasses import dataclass, field
|
|
18
|
+
|
|
19
|
+
PythonSymbol = ast.FunctionDef | ast.AsyncFunctionDef | ast.ClassDef
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
|
|
23
|
+
class CodeChunk:
|
|
24
|
+
content: str # Complete, self-contained chunk text
|
|
25
|
+
start_line: int # 1-indexed start line in the original source
|
|
26
|
+
end_line: int # 1-indexed end line in the original source
|
|
27
|
+
symbol_name: str | None # Function/class name, or None for file-level chunks
|
|
28
|
+
language: str
|
|
29
|
+
imports: str = field(default="") # Module-level imports prepended to content
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# ---------------------------------------------------------------------------
|
|
33
|
+
# CodeSplitter
|
|
34
|
+
# ---------------------------------------------------------------------------
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class CodeSplitter:
    """
    Split source code into logical chunks.

    Usage::

        splitter = CodeSplitter()
        chunks = splitter.split(source_code, language="python")
    """

    # Languages routed straight to the brace-depth splitter.
    _BRACE_LANGUAGES = frozenset({"typescript", "ts", "javascript", "js", "java"})

    def split(self, code: str, language: str = "python") -> list[CodeChunk]:
        """
        Split *code* into :class:`CodeChunk` objects.

        Args:
            code: source code text.
            language: one of ``"python"``, ``"typescript"``, ``"javascript"``,
                ``"ts"``, ``"js"``, ``"java"`` (case-insensitive). Anything
                else is attempted as Python first, then falls back to
                brace-depth splitting.

        Returns:
            Ordered list of chunks; empty when *code* is blank.

        Raises:
            SyntaxError: if *language* is ``"python"`` and *code* does not parse.
        """
        if not code.strip():
            return []

        lang = language.lower()
        if lang == "python":
            return self._split_python(code)
        if lang in self._BRACE_LANGUAGES:
            return self._split_by_brace_depth(code, language=language)

        # Unknown language: try Python AST, fall back to brace split.
        # ast.parse raises ValueError (not SyntaxError) for NUL bytes on
        # older interpreters, so both count as "not Python".
        try:
            return self._split_python(code)
        except (SyntaxError, ValueError):
            return self._split_by_brace_depth(code, language=language)

    # ------------------------------------------------------------------
    # Python (AST-aware)
    # ------------------------------------------------------------------

    @staticmethod
    def _symbol_span(node: PythonSymbol) -> tuple[int, int]:
        """Return the 1-indexed (first, last) source lines of *node*, decorators included."""
        # node.lineno points at the def/class keyword line; decorators sit
        # above it and are only reachable through decorator_list.
        first = min([node.lineno, *(d.lineno for d in node.decorator_list)])
        last = getattr(node, "end_lineno", None) or node.lineno
        return first, last

    def _split_python(self, code: str) -> list[CodeChunk]:
        """
        Split Python source at top-level ``def`` / ``async def`` / ``class``.

        Top-level import statements are collected and prepended to every
        chunk. Other top-level statements are not part of any chunk when at
        least one symbol exists; without symbols the whole file is one chunk.

        Raises:
            SyntaxError: if *code* is not valid Python.
        """
        lines = code.splitlines(keepends=True)
        tree = ast.parse(code)

        import_lines: list[str] = []
        top_symbols: list[PythonSymbol] = []

        for node in ast.iter_child_nodes(tree):
            if isinstance(node, (ast.Import, ast.ImportFrom)):
                last = getattr(node, "end_lineno", None) or node.lineno
                import_lines.extend(lines[node.lineno - 1 : last])
            elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
                top_symbols.append(node)

        imports_str = "".join(import_lines).rstrip()

        # No top-level symbols → whole file is one chunk
        if not top_symbols:
            return [
                CodeChunk(
                    content=code,
                    start_line=1,
                    end_line=len(lines),
                    symbol_name=None,
                    language="python",
                    imports=imports_str,
                )
            ]

        chunks: list[CodeChunk] = []
        for node in top_symbols:
            first, last = self._symbol_span(node)
            body = "".join(lines[first - 1 : last]).rstrip()
            content = f"{imports_str}\n\n{body}" if imports_str else body
            chunks.append(
                CodeChunk(
                    content=content,
                    start_line=first,
                    end_line=last,
                    symbol_name=node.name,
                    language="python",
                    imports=imports_str,
                )
            )

        return chunks

    # ------------------------------------------------------------------
    # Brace-depth (TypeScript / JavaScript / Java)
    # ------------------------------------------------------------------

    @staticmethod
    def _split_by_brace_depth(code: str, *, language: str) -> list[CodeChunk]:
        """
        Split at top-level brace-balanced block boundaries (depth 0 → 1 → 0).

        Braces are counted per line without any lexing, so braces inside
        strings or comments are counted too. Every line that ends at depth 0
        closes the current chunk; blank chunks are skipped.
        """
        lines = code.splitlines(keepends=True)
        chunks: list[CodeChunk] = []

        def emit(first: int, last: int) -> None:
            # first/last are 0-indexed, inclusive line positions.
            body = "".join(lines[first : last + 1]).strip()
            if body:
                chunks.append(
                    CodeChunk(
                        content=body,
                        start_line=first + 1,
                        end_line=last + 1,
                        symbol_name=None,
                        language=language,
                    )
                )

        depth = 0
        chunk_start = 0
        for i, line in enumerate(lines):
            depth += line.count("{") - line.count("}")
            if depth == 0:
                emit(chunk_start, i)
                chunk_start = i + 1
            elif depth < 0:
                # Guard against unbalanced braces
                depth = 0

        # Trailing content after the last depth-0 point
        if chunk_start < len(lines):
            emit(chunk_start, len(lines) - 1)

        # Fallback: return whole file as one chunk
        if not chunks:
            return [
                CodeChunk(
                    content=code,
                    start_line=1,
                    end_line=len(lines),
                    symbol_name=None,
                    language=language,
                )
            ]

        return chunks
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Text chunking with markdown-heading awareness.
|
|
3
|
+
|
|
4
|
+
Splits documents by heading boundaries first, then applies a sliding-window
|
|
5
|
+
fallback for sections that exceed chunk_size.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import re
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
_HEADING_RE = re.compile(r"^#{1,6}\s+.+$", re.MULTILINE)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass
class TextChunk:
    """A piece of text together with a pair of character offsets."""

    # The chunk text itself.
    content: str
    # Character offset where the chunk starts.
    start_char: int
    # Character offset where the chunk ends.
    end_char: int
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class TextSplitter:
    """
    Markdown-aware sliding-window text chunker.

    Every returned chunk satisfies ``text[start_char:end_char] == content``
    with respect to the text passed to :meth:`split`.

    Args:
        chunk_size: target chunk length in characters (default 512 ≈ ~128 tokens).
        overlap: character overlap between adjacent window chunks (default 64).

    Raises:
        ValueError: if ``chunk_size`` is not positive or ``overlap`` is
            outside ``[0, chunk_size)``.
    """

    def __init__(self, chunk_size: int = 512, overlap: int = 64) -> None:
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        if overlap < 0 or overlap >= chunk_size:
            raise ValueError(
                f"overlap must be in [0, chunk_size), got overlap={overlap} chunk_size={chunk_size}"
            )
        self._chunk_size = chunk_size
        self._overlap = overlap

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def split(self, text: str) -> list[TextChunk]:
        """
        Split *text* into chunks.

        Strategy:
          1. Attempt a split at markdown heading boundaries.
          2. Any resulting section that still exceeds chunk_size is
             sub-split using a sliding window.
          3. If the heading split yields at most one section, the whole text
             goes through the sliding window directly.

        Returns:
            Ordered list of :class:`TextChunk` objects.
        """
        if not text:
            return []

        heading_sections = self._split_at_headings(text)
        if len(heading_sections) <= 1:
            return self._sliding_window(text)

        result: list[TextChunk] = []
        for section in heading_sections:
            if len(section.content) <= self._chunk_size:
                result.append(section)
            else:
                # section.start_char is the exact offset of section.content,
                # so window offsets stay aligned with the original text.
                result.extend(
                    self._sliding_window(section.content, char_offset=section.start_char)
                )
        return result

    # ------------------------------------------------------------------
    # Internals
    # ------------------------------------------------------------------

    @staticmethod
    def _trimmed_chunk(text: str, start: int, end: int) -> TextChunk | None:
        """Return ``text[start:end]`` stripped of surrounding whitespace, with exact offsets.

        Returns ``None`` when the span contains only whitespace.
        """
        raw = text[start:end]
        stripped = raw.strip()
        if not stripped:
            return None
        # Shift the start past the leading whitespace that strip() removed.
        real_start = start + (len(raw) - len(raw.lstrip()))
        return TextChunk(
            content=stripped,
            start_char=real_start,
            end_char=real_start + len(stripped),
        )

    @staticmethod
    def _split_at_headings(text: str) -> list[TextChunk]:
        """
        Cut *text* at every heading match.

        Produces one section for the content before the first heading (if
        any) and one per heading, each running up to the next heading or the
        end of the text. Sections are whitespace-trimmed and their offsets
        describe the trimmed content.
        """
        whole = TextChunk(content=text, start_char=0, end_char=len(text))
        matches = list(_HEADING_RE.finditer(text))
        if not matches:
            return [whole]

        # 0 opens the pre-heading span; it is empty (and skipped) when the
        # text starts with a heading.
        boundaries = [0, *(m.start() for m in matches), len(text)]
        chunks: list[TextChunk] = []
        for start, end in zip(boundaries, boundaries[1:]):
            chunk = TextSplitter._trimmed_chunk(text, start, end)
            if chunk is not None:
                chunks.append(chunk)

        return chunks or [whole]

    def _sliding_window(self, text: str, *, char_offset: int = 0) -> list[TextChunk]:
        """
        Cut *text* into windows of ``chunk_size`` characters.

        Consecutive windows start ``chunk_size - overlap`` characters apart.
        Whitespace-only windows are skipped. ``char_offset`` is added to every
        reported offset so chunks of a sub-section can be expressed in the
        coordinates of the enclosing text.
        """
        chunks: list[TextChunk] = []
        step = self._chunk_size - self._overlap
        total = len(text)
        pos = 0
        while pos < total:
            end = min(pos + self._chunk_size, total)
            content = text[pos:end]
            if content.strip():
                chunks.append(
                    TextChunk(
                        content=content,
                        start_char=char_offset + pos,
                        end_char=char_offset + end,
                    )
                )
            if end == total:
                break
            pos += step

        # The windows tile the whole text (step <= chunk_size), so any
        # non-whitespace character is already inside some emitted chunk.
        return chunks
|