ai-pipeline-core 0.3.4-py3-none-any.whl → 0.4.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_pipeline_core/__init__.py +64 -158
- ai_pipeline_core/deployment/__init__.py +6 -18
- ai_pipeline_core/deployment/base.py +392 -212
- ai_pipeline_core/deployment/contract.py +6 -10
- ai_pipeline_core/{utils → deployment}/deploy.py +50 -69
- ai_pipeline_core/deployment/helpers.py +16 -17
- ai_pipeline_core/{progress.py → deployment/progress.py} +23 -24
- ai_pipeline_core/{utils/remote_deployment.py → deployment/remote.py} +11 -14
- ai_pipeline_core/docs_generator/__init__.py +54 -0
- ai_pipeline_core/docs_generator/__main__.py +5 -0
- ai_pipeline_core/docs_generator/cli.py +196 -0
- ai_pipeline_core/docs_generator/extractor.py +324 -0
- ai_pipeline_core/docs_generator/guide_builder.py +644 -0
- ai_pipeline_core/docs_generator/trimmer.py +35 -0
- ai_pipeline_core/docs_generator/validator.py +114 -0
- ai_pipeline_core/document_store/__init__.py +13 -0
- ai_pipeline_core/document_store/_summary.py +9 -0
- ai_pipeline_core/document_store/_summary_worker.py +170 -0
- ai_pipeline_core/document_store/clickhouse.py +492 -0
- ai_pipeline_core/document_store/factory.py +38 -0
- ai_pipeline_core/document_store/local.py +312 -0
- ai_pipeline_core/document_store/memory.py +85 -0
- ai_pipeline_core/document_store/protocol.py +68 -0
- ai_pipeline_core/documents/__init__.py +12 -14
- ai_pipeline_core/documents/_context_vars.py +85 -0
- ai_pipeline_core/documents/_hashing.py +52 -0
- ai_pipeline_core/documents/attachment.py +85 -0
- ai_pipeline_core/documents/context.py +128 -0
- ai_pipeline_core/documents/document.py +318 -1434
- ai_pipeline_core/documents/mime_type.py +11 -84
- ai_pipeline_core/documents/utils.py +4 -12
- ai_pipeline_core/exceptions.py +10 -62
- ai_pipeline_core/images/__init__.py +32 -85
- ai_pipeline_core/images/_processing.py +5 -11
- ai_pipeline_core/llm/__init__.py +6 -4
- ai_pipeline_core/llm/ai_messages.py +102 -90
- ai_pipeline_core/llm/client.py +229 -183
- ai_pipeline_core/llm/model_options.py +12 -84
- ai_pipeline_core/llm/model_response.py +53 -99
- ai_pipeline_core/llm/model_types.py +8 -23
- ai_pipeline_core/logging/__init__.py +2 -7
- ai_pipeline_core/logging/logging.yml +1 -1
- ai_pipeline_core/logging/logging_config.py +27 -37
- ai_pipeline_core/logging/logging_mixin.py +15 -41
- ai_pipeline_core/observability/__init__.py +32 -0
- ai_pipeline_core/observability/_debug/__init__.py +30 -0
- ai_pipeline_core/observability/_debug/_auto_summary.py +94 -0
- ai_pipeline_core/{debug/config.py → observability/_debug/_config.py} +11 -7
- ai_pipeline_core/{debug/content.py → observability/_debug/_content.py} +133 -75
- ai_pipeline_core/{debug/processor.py → observability/_debug/_processor.py} +16 -17
- ai_pipeline_core/{debug/summary.py → observability/_debug/_summary.py} +113 -37
- ai_pipeline_core/observability/_debug/_types.py +75 -0
- ai_pipeline_core/{debug/writer.py → observability/_debug/_writer.py} +126 -196
- ai_pipeline_core/observability/_document_tracking.py +146 -0
- ai_pipeline_core/observability/_initialization.py +194 -0
- ai_pipeline_core/observability/_logging_bridge.py +57 -0
- ai_pipeline_core/observability/_summary.py +81 -0
- ai_pipeline_core/observability/_tracking/__init__.py +6 -0
- ai_pipeline_core/observability/_tracking/_client.py +178 -0
- ai_pipeline_core/observability/_tracking/_internal.py +28 -0
- ai_pipeline_core/observability/_tracking/_models.py +138 -0
- ai_pipeline_core/observability/_tracking/_processor.py +158 -0
- ai_pipeline_core/observability/_tracking/_service.py +311 -0
- ai_pipeline_core/observability/_tracking/_writer.py +229 -0
- ai_pipeline_core/{tracing.py → observability/tracing.py} +139 -335
- ai_pipeline_core/pipeline/__init__.py +10 -0
- ai_pipeline_core/pipeline/decorators.py +915 -0
- ai_pipeline_core/pipeline/options.py +16 -0
- ai_pipeline_core/prompt_manager.py +16 -102
- ai_pipeline_core/settings.py +26 -31
- ai_pipeline_core/testing.py +9 -0
- ai_pipeline_core-0.4.0.dist-info/METADATA +807 -0
- ai_pipeline_core-0.4.0.dist-info/RECORD +76 -0
- ai_pipeline_core/debug/__init__.py +0 -26
- ai_pipeline_core/documents/document_list.py +0 -420
- ai_pipeline_core/documents/flow_document.py +0 -112
- ai_pipeline_core/documents/task_document.py +0 -117
- ai_pipeline_core/documents/temporary_document.py +0 -74
- ai_pipeline_core/flow/__init__.py +0 -9
- ai_pipeline_core/flow/config.py +0 -494
- ai_pipeline_core/flow/options.py +0 -75
- ai_pipeline_core/pipeline.py +0 -718
- ai_pipeline_core/prefect.py +0 -63
- ai_pipeline_core/prompt_builder/__init__.py +0 -5
- ai_pipeline_core/prompt_builder/documents_prompt.jinja2 +0 -23
- ai_pipeline_core/prompt_builder/global_cache.py +0 -78
- ai_pipeline_core/prompt_builder/new_core_documents_prompt.jinja2 +0 -6
- ai_pipeline_core/prompt_builder/prompt_builder.py +0 -253
- ai_pipeline_core/prompt_builder/system_prompt.jinja2 +0 -41
- ai_pipeline_core/storage/__init__.py +0 -8
- ai_pipeline_core/storage/storage.py +0 -628
- ai_pipeline_core/utils/__init__.py +0 -8
- ai_pipeline_core-0.3.4.dist-info/METADATA +0 -569
- ai_pipeline_core-0.3.4.dist-info/RECORD +0 -57
- {ai_pipeline_core-0.3.4.dist-info → ai_pipeline_core-0.4.0.dist-info}/WHEEL +0 -0
- {ai_pipeline_core-0.3.4.dist-info → ai_pipeline_core-0.4.0.dist-info}/licenses/LICENSE +0 -0
ai_pipeline_core/document_store/local.py (new file)
@@ -0,0 +1,312 @@
+"""Local filesystem document store for CLI/debug mode.
+
+Layout:
+    {base_path}/{canonical_name}/{filename}           <- raw content
+    {base_path}/{canonical_name}/{filename}.meta.json <- metadata
+    {base_path}/{canonical_name}/{filename}.att/      <- attachments directory
+"""
+
+import asyncio
+import json
+from pathlib import Path
+from typing import Any
+
+from ai_pipeline_core.document_store._summary import SummaryGenerator
+from ai_pipeline_core.document_store._summary_worker import SummaryWorker
+from ai_pipeline_core.documents._context_vars import suppress_registration
+from ai_pipeline_core.documents._hashing import compute_content_sha256, compute_document_sha256
+from ai_pipeline_core.documents.attachment import Attachment
+from ai_pipeline_core.documents.document import Document
+from ai_pipeline_core.logging import get_pipeline_logger
+
+logger = get_pipeline_logger(__name__)
+
+
+class LocalDocumentStore:
+    """Filesystem-backed document store for local development and debugging.
+
+    Documents are stored as browsable files organized by canonical type name.
+    Write order (content before meta) ensures crash safety — load() ignores
+    content files without a valid .meta.json.
+    """
+
+    def __init__(
+        self,
+        base_path: Path | None = None,
+        *,
+        summary_generator: SummaryGenerator | None = None,
+    ) -> None:
+        self._base_path = base_path or Path.cwd()
+        self._meta_path_cache: dict[str, Path] = {}  # "{run_scope}:{sha256}" -> meta file path
+        self._summary_worker: SummaryWorker | None = None
+        if summary_generator:
+            self._summary_worker = SummaryWorker(
+                generator=summary_generator,
+                update_fn=self.update_summary,
+            )
+            self._summary_worker.start()
+
+    @property
+    def base_path(self) -> Path:
+        """Root directory for all stored documents."""
+        return self._base_path
+
+    async def save(self, document: Document, run_scope: str) -> None:
+        """Save a document to disk. Idempotent — same SHA256 is a no-op."""
+        written = await asyncio.to_thread(self._save_sync, document, run_scope)
+        if written and self._summary_worker:
+            self._summary_worker.schedule(run_scope, document)
+
+    async def save_batch(self, documents: list[Document], run_scope: str) -> None:
+        """Save multiple documents sequentially."""
+        for doc in documents:
+            await self.save(doc, run_scope)
+
+    async def load(self, run_scope: str, document_types: list[type[Document]]) -> list[Document]:
+        """Load documents by type from the run scope directory."""
+        return await asyncio.to_thread(self._load_sync, run_scope, document_types)
+
+    async def has_documents(self, run_scope: str, document_type: type[Document]) -> bool:
+        """Check for meta files in the type's directory without loading content."""
+        return await asyncio.to_thread(self._has_documents_sync, run_scope, document_type)
+
+    async def check_existing(self, sha256s: list[str]) -> set[str]:
+        """Scan all meta files to find matching document_sha256 values."""
+        return await asyncio.to_thread(self._check_existing_sync, sha256s)
+
+    async def update_summary(self, run_scope: str, document_sha256: str, summary: str) -> None:
+        """Update summary in the document's .meta.json file."""
+        await asyncio.to_thread(self._update_summary_sync, run_scope, document_sha256, summary)
+
+    async def load_summaries(self, run_scope: str, document_sha256s: list[str]) -> dict[str, str]:
+        """Load summaries from .meta.json files."""
+        return await asyncio.to_thread(self._load_summaries_sync, run_scope, document_sha256s)
+
+    def flush(self) -> None:
+        """Block until all pending document summaries are processed."""
+        if self._summary_worker:
+            self._summary_worker.flush()
+
+    def shutdown(self) -> None:
+        """Flush pending summaries and stop the summary worker."""
+        if self._summary_worker:
+            self._summary_worker.shutdown()
+
+    # --- Sync implementation (called via asyncio.to_thread) ---
+
+    def _scope_path(self, run_scope: str) -> Path:
+        return self._base_path / run_scope
+
+    def _save_sync(self, document: Document, run_scope: str) -> bool:
+        canonical = document.canonical_name()
+        doc_dir = self._scope_path(run_scope) / canonical
+        doc_dir.mkdir(parents=True, exist_ok=True)
+
+        content_path = doc_dir / document.name
+        if not content_path.resolve().is_relative_to(doc_dir.resolve()):
+            raise ValueError(f"Path traversal detected: document name '{document.name}' escapes store directory")
+        meta_path = doc_dir / f"{document.name}.meta.json"
+
+        doc_sha256 = compute_document_sha256(document)
+        content_sha256 = compute_content_sha256(document.content)
+
+        # Check for concurrent access: if meta exists with different SHA256, log warning
+        if meta_path.exists():
+            existing_meta = self._read_meta(meta_path)
+            if existing_meta and existing_meta.get("document_sha256") == doc_sha256:
+                # Populate cache even for idempotent saves
+                self._meta_path_cache[f"{run_scope}:{doc_sha256}"] = meta_path
+                return False  # Idempotent — same document already saved
+            if existing_meta:
+                logger.warning(
+                    f"Overwriting document '{document.name}' in '{canonical}': "
+                    f"existing SHA256 {existing_meta.get('document_sha256', '?')[:12]}... "
+                    f"differs from new {doc_sha256[:12]}..."
+                )
+
+        # Write content before meta (crash safety)
+        content_path.write_bytes(document.content)
+
+        # Write attachments
+        att_meta_list: list[dict[str, Any]] = []
+        if document.attachments:
+            att_dir = doc_dir / f"{document.name}.att"
+            att_dir.mkdir(exist_ok=True)
+            for att in document.attachments:
+                att_path = att_dir / att.name
+                if not att_path.resolve().is_relative_to(att_dir.resolve()):
+                    raise ValueError(f"Path traversal detected: attachment name '{att.name}' escapes store directory")
+                att_path.write_bytes(att.content)
+                att_meta_list.append({
+                    "name": att.name,
+                    "description": att.description,
+                    "sha256": compute_content_sha256(att.content),
+                })
+
+        # Write meta last (crash safety — content is already on disk)
+        meta = {
+            "document_sha256": doc_sha256,
+            "content_sha256": content_sha256,
+            "class_name": document.__class__.__name__,
+            "description": document.description,
+            "sources": list(document.sources),
+            "origins": list(document.origins),
+            "mime_type": document.mime_type,
+            "attachments": att_meta_list,
+        }
+        meta_path.write_text(json.dumps(meta, indent=2), encoding="utf-8")
+
+        # Cache meta path for summary updates
+        self._meta_path_cache[f"{run_scope}:{doc_sha256}"] = meta_path
+        return True
+
+    def _load_sync(self, run_scope: str, document_types: list[type[Document]]) -> list[Document]:
+        scope_path = self._scope_path(run_scope)
+        if not scope_path.exists():
+            return []
+
+        # Build reverse map: canonical_name -> document type
+        type_by_canonical: dict[str, type[Document]] = {}
+        for doc_type in document_types:
+            cn = doc_type.canonical_name()
+            type_by_canonical[cn] = doc_type
+
+        documents: list[Document] = []
+
+        with suppress_registration():
+            for canonical, doc_type in type_by_canonical.items():
+                type_dir = scope_path / canonical
+                if not type_dir.is_dir():
+                    continue
+                self._load_type_dir(type_dir, doc_type, documents)
+
+        return documents
+
+    def _load_type_dir(self, type_dir: Path, doc_type: type[Document], out: list[Document]) -> None:
+        """Load all documents of a single type from its directory."""
+        for meta_path in type_dir.glob("*.meta.json"):
+            meta = self._read_meta(meta_path)
+            if meta is None:
+                continue
+
+            content_name = meta_path.name.removesuffix(".meta.json")
+            content_path = type_dir / content_name
+
+            if not content_path.exists():
+                logger.warning(f"Meta file {meta_path} has no corresponding content file, skipping")
+                continue
+
+            content = content_path.read_bytes()
+
+            # Load attachments
+            attachments: tuple[Attachment, ...] = ()
+            att_meta_list = meta.get("attachments", [])
+            if att_meta_list:
+                att_dir = type_dir / f"{content_name}.att"
+                att_list: list[Attachment] = []
+                for att_meta in att_meta_list:
+                    att_path = att_dir / att_meta["name"]
+                    if not att_path.exists():
+                        logger.warning(f"Attachment file {att_path} missing, skipping")
+                        continue
+                    att_list.append(
+                        Attachment(
+                            name=att_meta["name"],
+                            content=att_path.read_bytes(),
+                            description=att_meta.get("description"),
+                        )
+                    )
+                attachments = tuple(att_list)
+
+            doc = doc_type(
+                name=content_name,
+                content=content,
+                description=meta.get("description"),
+                sources=tuple(meta.get("sources", ())),
+                origins=tuple(meta.get("origins", ())),
+                attachments=attachments or None,
+            )
+            out.append(doc)
+
+    def _has_documents_sync(self, run_scope: str, document_type: type[Document]) -> bool:
+        """Check for meta files in the type's directory without loading content."""
+        scope_path = self._scope_path(run_scope)
+        canonical = document_type.canonical_name()
+        type_dir = scope_path / canonical
+        if not type_dir.is_dir():
+            return False
+        return any(type_dir.glob("*.meta.json"))
+
+    def _check_existing_sync(self, sha256s: list[str]) -> set[str]:
+        """Scan all meta files to find matching document_sha256 values."""
+        target = set(sha256s)
+        found: set[str] = set()
+        if not self._base_path.exists():
+            return found
+
+        for meta_path in self._base_path.rglob("*.meta.json"):
+            meta = self._read_meta(meta_path)
+            if meta and meta.get("document_sha256") in target:
+                found.add(meta["document_sha256"])
+                if found == target:
+                    break
+        return found
+
+    def _update_summary_sync(self, run_scope: str, document_sha256: str, summary: str) -> None:
+        """Update summary in the document's .meta.json file."""
+        cache_key = f"{run_scope}:{document_sha256}"
+        meta_path = self._meta_path_cache.get(cache_key)
+
+        # Fallback: scan for the meta file if not cached
+        if meta_path is None or not meta_path.exists():
+            meta_path = self._find_meta_by_sha256(run_scope, document_sha256)
+            if meta_path is None:
+                return
+
+        meta = self._read_meta(meta_path)
+        if meta is None:
+            return
+
+        meta["summary"] = summary
+        meta_path.write_text(json.dumps(meta, indent=2), encoding="utf-8")
+        self._meta_path_cache[cache_key] = meta_path
+
+    def _load_summaries_sync(self, run_scope: str, document_sha256s: list[str]) -> dict[str, str]:
+        """Scan meta files for matching sha256s and return their summaries."""
+        target = set(document_sha256s)
+        result: dict[str, str] = {}
+        scope_path = self._scope_path(run_scope)
+        if not scope_path.exists():
+            return result
+
+        for meta_path in scope_path.rglob("*.meta.json"):
+            meta = self._read_meta(meta_path)
+            if meta is None:
+                continue
+            sha = meta.get("document_sha256")
+            summary = meta.get("summary")
+            if sha in target and summary:
+                result[sha] = summary
+                if len(result) == len(target):
+                    break
+        return result
+
+    def _find_meta_by_sha256(self, run_scope: str, document_sha256: str) -> Path | None:
+        """Scan meta files in run_scope to find one matching the given sha256."""
+        scope_path = self._scope_path(run_scope)
+        if not scope_path.exists():
+            return None
+        for meta_path in scope_path.rglob("*.meta.json"):
+            meta = self._read_meta(meta_path)
+            if meta and meta.get("document_sha256") == document_sha256:
+                return meta_path
+        return None
+
+    @staticmethod
+    def _read_meta(meta_path: Path) -> dict[str, Any] | None:
+        """Read and parse a meta.json file, returning None on any error."""
+        try:
+            return json.loads(meta_path.read_text(encoding="utf-8"))
+        except (json.JSONDecodeError, OSError) as e:
+            logger.warning(f"Failed to read meta file {meta_path}: {e}")
+            return None
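To make the new store's flow concrete, here is a minimal usage sketch. It assumes a Document subclass can be declared directly and constructed with name and content keyword arguments, as the loader in this hunk does; NotesDocument and the ./store path are hypothetical.

```python
# Hypothetical usage sketch; NotesDocument and ./store are illustrative names.
import asyncio
from pathlib import Path

from ai_pipeline_core.document_store.local import LocalDocumentStore
from ai_pipeline_core.documents.document import Document


class NotesDocument(Document):  # assumes Document subclasses can be declared this way
    pass


async def main() -> None:
    store = LocalDocumentStore(base_path=Path("./store"))
    doc = NotesDocument(name="notes.txt", content=b"hello")
    await store.save(doc, run_scope="run-001")  # writes content, then .meta.json
    await store.save(doc, run_scope="run-001")  # idempotent: same SHA256 is a no-op
    loaded = await store.load("run-001", [NotesDocument])
    assert loaded[0].content == b"hello"  # stored under {base_path}/run-001/{canonical_name}/
    store.shutdown()  # flush the summary worker, if one was configured


asyncio.run(main())
```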
ai_pipeline_core/document_store/memory.py (new file)
@@ -0,0 +1,85 @@
+"""In-memory document store for testing.
+
+Simple dict-based storage implementing the full DocumentStore protocol.
+Not for production use — all data is lost when the process exits.
+"""
+
+from ai_pipeline_core.document_store._summary import SummaryGenerator
+from ai_pipeline_core.document_store._summary_worker import SummaryWorker
+from ai_pipeline_core.documents.document import Document
+
+
+class MemoryDocumentStore:
+    """Dict-based document store for unit tests.
+
+    Storage layout: dict[run_scope, dict[document_sha256, Document]].
+    """
+
+    def __init__(
+        self,
+        *,
+        summary_generator: SummaryGenerator | None = None,
+    ) -> None:
+        self._data: dict[str, dict[str, Document]] = {}
+        self._summaries: dict[str, dict[str, str]] = {}  # run_scope -> sha256 -> summary
+        self._summary_worker: SummaryWorker | None = None
+        if summary_generator:
+            self._summary_worker = SummaryWorker(
+                generator=summary_generator,
+                update_fn=self.update_summary,
+            )
+            self._summary_worker.start()
+
+    async def save(self, document: Document, run_scope: str) -> None:
+        """Store document in memory, keyed by SHA256."""
+        scope = self._data.setdefault(run_scope, {})
+        if document.sha256 in scope:
+            return  # Idempotent — same document already saved
+        scope[document.sha256] = document
+        if self._summary_worker:
+            self._summary_worker.schedule(run_scope, document)
+
+    async def save_batch(self, documents: list[Document], run_scope: str) -> None:
+        """Save multiple documents sequentially."""
+        for doc in documents:
+            await self.save(doc, run_scope)
+
+    async def load(self, run_scope: str, document_types: list[type[Document]]) -> list[Document]:
+        """Return all documents matching the given types from a run scope."""
+        scope = self._data.get(run_scope, {})
+        type_tuple = tuple(document_types)
+        return [doc for doc in scope.values() if isinstance(doc, type_tuple)]
+
+    async def has_documents(self, run_scope: str, document_type: type[Document]) -> bool:
+        """Check if any documents of this type exist in the run scope."""
+        scope = self._data.get(run_scope, {})
+        return any(isinstance(doc, document_type) for doc in scope.values())
+
+    async def check_existing(self, sha256s: list[str]) -> set[str]:
+        """Return the subset of sha256s that exist across all scopes."""
+        all_hashes: set[str] = set()
+        for scope in self._data.values():
+            all_hashes.update(scope.keys())
+        return all_hashes & set(sha256s)
+
+    async def update_summary(self, run_scope: str, document_sha256: str, summary: str) -> None:
+        """Update summary for a stored document. No-op if document doesn't exist."""
+        scope = self._data.get(run_scope, {})
+        if document_sha256 not in scope:
+            return
+        self._summaries.setdefault(run_scope, {})[document_sha256] = summary
+
+    async def load_summaries(self, run_scope: str, document_sha256s: list[str]) -> dict[str, str]:
+        """Load summaries by SHA256."""
+        scope_summaries = self._summaries.get(run_scope, {})
+        return {sha: scope_summaries[sha] for sha in document_sha256s if sha in scope_summaries}
+
+    def flush(self) -> None:
+        """Block until all pending document summaries are processed."""
+        if self._summary_worker:
+            self._summary_worker.flush()
+
+    def shutdown(self) -> None:
+        """Flush pending summaries and stop the summary worker."""
+        if self._summary_worker:
+            self._summary_worker.shutdown()
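A sketch of how the memory store might be exercised in a test. It assumes pytest with the pytest-asyncio plugin; ReportDocument is a hypothetical Document subclass, and sha256 is the document property the store keys on above.

```python
# Illustrative pytest-style test; ReportDocument is a hypothetical subclass.
import pytest

from ai_pipeline_core.document_store.memory import MemoryDocumentStore
from ai_pipeline_core.documents.document import Document


class ReportDocument(Document):
    pass


@pytest.mark.asyncio  # assumes the pytest-asyncio plugin is installed
async def test_memory_store_roundtrip() -> None:
    store = MemoryDocumentStore()
    doc = ReportDocument(name="report.md", content=b"# Q1")

    await store.save(doc, run_scope="test-run")
    assert await store.has_documents("test-run", ReportDocument)
    assert await store.check_existing([doc.sha256]) == {doc.sha256}
    assert (await store.load("test-run", [ReportDocument]))[0] is doc
```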
ai_pipeline_core/document_store/protocol.py (new file)
@@ -0,0 +1,68 @@
+"""Document store protocol and singleton management.
+
+Defines the DocumentStore protocol that all storage backends must implement,
+along with get/set helpers for the process-global singleton.
+"""
+
+from typing import Protocol, runtime_checkable
+
+from ai_pipeline_core.documents.document import Document
+
+
+@runtime_checkable
+class DocumentStore(Protocol):
+    """Protocol for document storage backends.
+
+    Implementations: ClickHouseDocumentStore (production), LocalDocumentStore (CLI/debug),
+    MemoryDocumentStore (testing).
+    """
+
+    async def save(self, document: Document, run_scope: str) -> None:
+        """Save a single document to the store. Idempotent — same SHA256 is a no-op."""
+        ...
+
+    async def save_batch(self, documents: list[Document], run_scope: str) -> None:
+        """Save multiple documents. Dependencies must be sorted (caller's responsibility)."""
+        ...
+
+    async def load(self, run_scope: str, document_types: list[type[Document]]) -> list[Document]:
+        """Load all documents of the given types from a run scope."""
+        ...
+
+    async def has_documents(self, run_scope: str, document_type: type[Document]) -> bool:
+        """Check if any documents of this type exist in the run scope."""
+        ...
+
+    async def check_existing(self, sha256s: list[str]) -> set[str]:
+        """Return the subset of sha256s that already exist in the store."""
+        ...
+
+    async def update_summary(self, run_scope: str, document_sha256: str, summary: str) -> None:
+        """Update summary for a stored document. No-op if document doesn't exist."""
+        ...
+
+    async def load_summaries(self, run_scope: str, document_sha256s: list[str]) -> dict[str, str]:
+        """Load summaries by SHA256. Returns {sha256: summary} for docs that have summaries."""
+        ...
+
+    def flush(self) -> None:
+        """Block until all pending background work (summaries) is processed."""
+        ...
+
+    def shutdown(self) -> None:
+        """Flush pending work and stop background workers."""
+        ...
+
+
+_document_store: DocumentStore | None = None
+
+
+def get_document_store() -> DocumentStore | None:
+    """Get the process-global document store singleton."""
+    return _document_store
+
+
+def set_document_store(store: DocumentStore | None) -> None:
+    """Set the process-global document store singleton."""
+    global _document_store
+    _document_store = store
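Because the protocol is @runtime_checkable, isinstance checks only that the methods exist, not their signatures. A short sketch of the singleton wiring, using the MemoryDocumentStore from the previous hunk:

```python
from ai_pipeline_core.document_store.memory import MemoryDocumentStore
from ai_pipeline_core.document_store.protocol import (
    DocumentStore,
    get_document_store,
    set_document_store,
)

store = MemoryDocumentStore()
# Structural check: passes because all protocol methods are present.
assert isinstance(store, DocumentStore)

set_document_store(store)  # install the process-global singleton
assert get_document_store() is store

set_document_store(None)  # tear down, e.g. at process exit
```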
ai_pipeline_core/documents/__init__.py
@@ -1,26 +1,24 @@
-"""Document
+"""Document system for AI pipeline flows.
 
-
-
-
-various content types in AI pipelines, including text, images, PDFs, and other
-binary data with automatic MIME type detection.
+Provides the Document base class (immutable, content-addressed), Attachment for
+binary sub-documents, and RunContext/TaskDocumentContext for document lifecycle
+management within pipeline tasks.
 """
 
+from .attachment import Attachment
+from .context import RunContext, TaskDocumentContext, get_run_context, reset_run_context, set_run_context
 from .document import Document
-from .document_list import DocumentList
-from .flow_document import FlowDocument
-from .task_document import TaskDocument
-from .temporary_document import TemporaryDocument
 from .utils import canonical_name_key, is_document_sha256, sanitize_url
 
 __all__ = [
+    "Attachment",
     "Document",
-    "DocumentList",
-    "FlowDocument",
-    "TaskDocument",
-    "TemporaryDocument",
+    "RunContext",
+    "TaskDocumentContext",
     "canonical_name_key",
+    "get_run_context",
    "is_document_sha256",
+    "reset_run_context",
     "sanitize_url",
+    "set_run_context",
 ]
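The re-export changes summarize the 0.4.0 API break: DocumentList, FlowDocument, TaskDocument, and TemporaryDocument are gone in favor of a single Document plus Attachment and the run-context helpers. A before/after import sketch (the 0.3.4 names are taken from the removed lines above):

```python
# 0.3.4 (these names were removed in 0.4.0):
# from ai_pipeline_core.documents import DocumentList, FlowDocument, TaskDocument

# 0.4.0:
from ai_pipeline_core.documents import (
    Attachment,
    Document,
    RunContext,
    get_run_context,
    set_run_context,
)
```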
ai_pipeline_core/documents/_context_vars.py (new file)
@@ -0,0 +1,85 @@
+"""Low-level ContextVar declarations for document registration and task context.
+
+Extracted into a separate module to break the circular dependency between
+document.py (which needs suppression/task-context checks) and context.py
+(which defines the full TaskDocumentContext that depends on Document).
+"""
+
+from collections.abc import Iterator
+from contextlib import contextmanager
+from contextvars import ContextVar, Token
+from dataclasses import dataclass
+
+# --- Run context ---
+
+
+@dataclass(frozen=True, slots=True)
+class RunContext:
+    """Immutable context for a pipeline run, carried via ContextVar."""
+
+    run_scope: str
+
+
+_run_context: ContextVar[RunContext | None] = ContextVar("_run_context", default=None)
+
+
+def get_run_context() -> RunContext | None:
+    """Get the current run context, or None if not set."""
+    return _run_context.get()
+
+
+def set_run_context(ctx: RunContext) -> Token[RunContext | None]:
+    """Set the run context. Returns a token for restoring the previous value."""
+    return _run_context.set(ctx)
+
+
+def reset_run_context(token: Token[RunContext | None]) -> None:
+    """Reset the run context to its previous value using a token from set_run_context."""
+    _run_context.reset(token)
+
+
+# --- Suppression flag ---
+
+_suppression_flag: ContextVar[bool] = ContextVar("_document_registration_suppressed", default=False)
+
+
+def is_registration_suppressed() -> bool:
+    """Check if document registration is currently suppressed."""
+    return _suppression_flag.get()
+
+
+@contextmanager
+def suppress_registration() -> Iterator[None]:
+    """Context manager that suppresses Document registration with TaskDocumentContext.
+
+    Used during model_validate() and other internal Pydantic operations that
+    construct intermediate Document objects that should not be tracked.
+    """
+    token = _suppression_flag.set(True)
+    try:
+        yield
+    finally:
+        _suppression_flag.reset(token)
+
+
+# --- Task document context ContextVar ---
+
+# Forward reference: the actual TaskDocumentContext class lives in context.py.
+# Here we only manage the ContextVar holding it.
+
+_task_context: ContextVar[object | None] = ContextVar("_task_context", default=None)
+
+
+def get_task_context() -> object | None:
+    """Get the current task document context, or None if not inside a pipeline task."""
+    return _task_context.get()
+
+
+def set_task_context(ctx: object) -> Token[object | None]:
+    """Set the task document context. Returns a token for restoring the previous value."""
+    return _task_context.set(ctx)
+
+
+def reset_task_context(token: Token[object | None]) -> None:
+    """Reset the task document context to its previous value."""
+    _task_context.reset(token)
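The set/reset pairs follow the standard ContextVar token discipline, so nesting restores the outer value on exit. A sketch of scoping a run context and suppressing registration, using only functions defined in this hunk:

```python
from ai_pipeline_core.documents._context_vars import (
    RunContext,
    get_run_context,
    reset_run_context,
    set_run_context,
    suppress_registration,
)

token = set_run_context(RunContext(run_scope="run-001"))
try:
    ctx = get_run_context()
    assert ctx is not None and ctx.run_scope == "run-001"
    with suppress_registration():
        # Documents constructed here are not registered with the task
        # document context (e.g. intermediates from model_validate()).
        ...
finally:
    reset_run_context(token)  # restores whatever was set before
```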
ai_pipeline_core/documents/_hashing.py (new file)
@@ -0,0 +1,52 @@
+"""Document hashing utilities for store implementations.
+
+Computes document_sha256 and content_sha256 as defined in the document store
+design: length-prefixed fields with null-byte separators, BASE32 encoded.
+"""
+
+import hashlib
+from base64 import b32encode
+from typing import Any, Protocol
+
+
+class _Hashable(Protocol):
+    """Protocol for objects whose identity hash can be computed."""
+
+    name: str
+    content: bytes
+    attachments: Any
+
+
+def compute_document_sha256(doc: _Hashable) -> str:
+    """Compute the document identity hash: hash(name + content + sorted_attachments).
+
+    Uses length-prefixed fields with null-byte separators for collision resistance.
+    Attachments are sorted by name. Result is BASE32 encoded (uppercase, no padding),
+    consistent with Document.sha256.
+
+    Excluded from hash: description, sources, origins, class_name.
+    """
+    h = hashlib.sha256()
+
+    name_bytes = doc.name.encode("utf-8")
+    _hash_field(h, name_bytes)
+    _hash_field(h, doc.content)
+
+    for att in sorted(doc.attachments, key=lambda a: a.name):
+        att_name_bytes = att.name.encode("utf-8")
+        _hash_field(h, att_name_bytes)
+        _hash_field(h, att.content)
+
+    return b32encode(h.digest()).decode("ascii").upper().rstrip("=")
+
+
+def compute_content_sha256(content: bytes) -> str:
+    """Compute SHA256 of raw content bytes, BASE32 encoded."""
+    return b32encode(hashlib.sha256(content).digest()).decode("ascii").upper().rstrip("=")
+
+
+def _hash_field(h: Any, data: bytes) -> None:
+    """Append a length-prefixed, null-separated field to the hash."""
+    h.update(str(len(data)).encode("ascii"))
+    h.update(b"\x00")
+    h.update(data)
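The field framing is easy to verify by hand: each field contributes its ASCII decimal length, a null byte, then its bytes, so a document named a.txt with content hi and no attachments hashes the stream b"5\x00a.txt2\x00hi". A worked check (the SimpleNamespace stand-in for a document is illustrative):

```python
import hashlib
from base64 import b32encode
from types import SimpleNamespace

from ai_pipeline_core.documents._hashing import compute_document_sha256

# A minimal object satisfying the _Hashable protocol (illustrative stand-in).
doc = SimpleNamespace(name="a.txt", content=b"hi", attachments=[])

# Equivalent manual framing: "<len>\x00<bytes>" per field, attachments sorted by name.
stream = b"5\x00a.txt" + b"2\x00hi"
expected = b32encode(hashlib.sha256(stream).digest()).decode("ascii").rstrip("=")

assert compute_document_sha256(doc) == expected
```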