ai-pipeline-core 0.2.6__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. ai_pipeline_core/__init__.py +78 -125
  2. ai_pipeline_core/deployment/__init__.py +34 -0
  3. ai_pipeline_core/deployment/base.py +861 -0
  4. ai_pipeline_core/deployment/contract.py +80 -0
  5. ai_pipeline_core/deployment/deploy.py +561 -0
  6. ai_pipeline_core/deployment/helpers.py +97 -0
  7. ai_pipeline_core/deployment/progress.py +126 -0
  8. ai_pipeline_core/deployment/remote.py +116 -0
  9. ai_pipeline_core/docs_generator/__init__.py +54 -0
  10. ai_pipeline_core/docs_generator/__main__.py +5 -0
  11. ai_pipeline_core/docs_generator/cli.py +196 -0
  12. ai_pipeline_core/docs_generator/extractor.py +324 -0
  13. ai_pipeline_core/docs_generator/guide_builder.py +644 -0
  14. ai_pipeline_core/docs_generator/trimmer.py +35 -0
  15. ai_pipeline_core/docs_generator/validator.py +114 -0
  16. ai_pipeline_core/document_store/__init__.py +13 -0
  17. ai_pipeline_core/document_store/_summary.py +9 -0
  18. ai_pipeline_core/document_store/_summary_worker.py +170 -0
  19. ai_pipeline_core/document_store/clickhouse.py +492 -0
  20. ai_pipeline_core/document_store/factory.py +38 -0
  21. ai_pipeline_core/document_store/local.py +312 -0
  22. ai_pipeline_core/document_store/memory.py +85 -0
  23. ai_pipeline_core/document_store/protocol.py +68 -0
  24. ai_pipeline_core/documents/__init__.py +12 -14
  25. ai_pipeline_core/documents/_context_vars.py +85 -0
  26. ai_pipeline_core/documents/_hashing.py +52 -0
  27. ai_pipeline_core/documents/attachment.py +85 -0
  28. ai_pipeline_core/documents/context.py +128 -0
  29. ai_pipeline_core/documents/document.py +318 -1434
  30. ai_pipeline_core/documents/mime_type.py +37 -82
  31. ai_pipeline_core/documents/utils.py +4 -12
  32. ai_pipeline_core/exceptions.py +10 -62
  33. ai_pipeline_core/images/__init__.py +309 -0
  34. ai_pipeline_core/images/_processing.py +151 -0
  35. ai_pipeline_core/llm/__init__.py +6 -4
  36. ai_pipeline_core/llm/ai_messages.py +130 -81
  37. ai_pipeline_core/llm/client.py +327 -193
  38. ai_pipeline_core/llm/model_options.py +14 -86
  39. ai_pipeline_core/llm/model_response.py +60 -103
  40. ai_pipeline_core/llm/model_types.py +16 -34
  41. ai_pipeline_core/logging/__init__.py +2 -7
  42. ai_pipeline_core/logging/logging.yml +1 -1
  43. ai_pipeline_core/logging/logging_config.py +27 -37
  44. ai_pipeline_core/logging/logging_mixin.py +15 -41
  45. ai_pipeline_core/observability/__init__.py +32 -0
  46. ai_pipeline_core/observability/_debug/__init__.py +30 -0
  47. ai_pipeline_core/observability/_debug/_auto_summary.py +94 -0
  48. ai_pipeline_core/observability/_debug/_config.py +95 -0
  49. ai_pipeline_core/observability/_debug/_content.py +764 -0
  50. ai_pipeline_core/observability/_debug/_processor.py +98 -0
  51. ai_pipeline_core/observability/_debug/_summary.py +312 -0
  52. ai_pipeline_core/observability/_debug/_types.py +75 -0
  53. ai_pipeline_core/observability/_debug/_writer.py +843 -0
  54. ai_pipeline_core/observability/_document_tracking.py +146 -0
  55. ai_pipeline_core/observability/_initialization.py +194 -0
  56. ai_pipeline_core/observability/_logging_bridge.py +57 -0
  57. ai_pipeline_core/observability/_summary.py +81 -0
  58. ai_pipeline_core/observability/_tracking/__init__.py +6 -0
  59. ai_pipeline_core/observability/_tracking/_client.py +178 -0
  60. ai_pipeline_core/observability/_tracking/_internal.py +28 -0
  61. ai_pipeline_core/observability/_tracking/_models.py +138 -0
  62. ai_pipeline_core/observability/_tracking/_processor.py +158 -0
  63. ai_pipeline_core/observability/_tracking/_service.py +311 -0
  64. ai_pipeline_core/observability/_tracking/_writer.py +229 -0
  65. ai_pipeline_core/{tracing.py → observability/tracing.py} +139 -283
  66. ai_pipeline_core/pipeline/__init__.py +10 -0
  67. ai_pipeline_core/pipeline/decorators.py +915 -0
  68. ai_pipeline_core/pipeline/options.py +16 -0
  69. ai_pipeline_core/prompt_manager.py +16 -102
  70. ai_pipeline_core/settings.py +26 -31
  71. ai_pipeline_core/testing.py +9 -0
  72. ai_pipeline_core-0.4.1.dist-info/METADATA +807 -0
  73. ai_pipeline_core-0.4.1.dist-info/RECORD +76 -0
  74. {ai_pipeline_core-0.2.6.dist-info → ai_pipeline_core-0.4.1.dist-info}/WHEEL +1 -1
  75. ai_pipeline_core/documents/document_list.py +0 -420
  76. ai_pipeline_core/documents/flow_document.py +0 -112
  77. ai_pipeline_core/documents/task_document.py +0 -117
  78. ai_pipeline_core/documents/temporary_document.py +0 -74
  79. ai_pipeline_core/flow/__init__.py +0 -9
  80. ai_pipeline_core/flow/config.py +0 -483
  81. ai_pipeline_core/flow/options.py +0 -75
  82. ai_pipeline_core/pipeline.py +0 -718
  83. ai_pipeline_core/prefect.py +0 -63
  84. ai_pipeline_core/simple_runner/__init__.py +0 -14
  85. ai_pipeline_core/simple_runner/cli.py +0 -254
  86. ai_pipeline_core/simple_runner/simple_runner.py +0 -247
  87. ai_pipeline_core/storage/__init__.py +0 -8
  88. ai_pipeline_core/storage/storage.py +0 -628
  89. ai_pipeline_core/utils/__init__.py +0 -8
  90. ai_pipeline_core/utils/deploy.py +0 -373
  91. ai_pipeline_core/utils/remote_deployment.py +0 -269
  92. ai_pipeline_core-0.2.6.dist-info/METADATA +0 -500
  93. ai_pipeline_core-0.2.6.dist-info/RECORD +0 -41
  94. {ai_pipeline_core-0.2.6.dist-info → ai_pipeline_core-0.4.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,312 @@
+ """Local filesystem document store for CLI/debug mode.
+
+ Layout:
+     {base_path}/{canonical_name}/{filename}            <- raw content
+     {base_path}/{canonical_name}/{filename}.meta.json  <- metadata
+     {base_path}/{canonical_name}/{filename}.att/       <- attachments directory
+ """
+
+ import asyncio
+ import json
+ from pathlib import Path
+ from typing import Any
+
+ from ai_pipeline_core.document_store._summary import SummaryGenerator
+ from ai_pipeline_core.document_store._summary_worker import SummaryWorker
+ from ai_pipeline_core.documents._context_vars import suppress_registration
+ from ai_pipeline_core.documents._hashing import compute_content_sha256, compute_document_sha256
+ from ai_pipeline_core.documents.attachment import Attachment
+ from ai_pipeline_core.documents.document import Document
+ from ai_pipeline_core.logging import get_pipeline_logger
+
+ logger = get_pipeline_logger(__name__)
+
+
+ class LocalDocumentStore:
+     """Filesystem-backed document store for local development and debugging.
+
+     Documents are stored as browsable files organized by canonical type name.
+     Write order (content before meta) ensures crash safety — load() ignores
+     content files without a valid .meta.json.
+     """
+
+     def __init__(
+         self,
+         base_path: Path | None = None,
+         *,
+         summary_generator: SummaryGenerator | None = None,
+     ) -> None:
+         self._base_path = base_path or Path.cwd()
+         self._meta_path_cache: dict[str, Path] = {}  # "{run_scope}:{sha256}" -> meta file path
+         self._summary_worker: SummaryWorker | None = None
+         if summary_generator:
+             self._summary_worker = SummaryWorker(
+                 generator=summary_generator,
+                 update_fn=self.update_summary,
+             )
+             self._summary_worker.start()
+
+     @property
+     def base_path(self) -> Path:
+         """Root directory for all stored documents."""
+         return self._base_path
+
+     async def save(self, document: Document, run_scope: str) -> None:
+         """Save a document to disk. Idempotent — same SHA256 is a no-op."""
+         written = await asyncio.to_thread(self._save_sync, document, run_scope)
+         if written and self._summary_worker:
+             self._summary_worker.schedule(run_scope, document)
+
+     async def save_batch(self, documents: list[Document], run_scope: str) -> None:
+         """Save multiple documents sequentially."""
+         for doc in documents:
+             await self.save(doc, run_scope)
+
+     async def load(self, run_scope: str, document_types: list[type[Document]]) -> list[Document]:
+         """Load documents by type from the run scope directory."""
+         return await asyncio.to_thread(self._load_sync, run_scope, document_types)
+
+     async def has_documents(self, run_scope: str, document_type: type[Document]) -> bool:
+         """Check for meta files in the type's directory without loading content."""
+         return await asyncio.to_thread(self._has_documents_sync, run_scope, document_type)
+
+     async def check_existing(self, sha256s: list[str]) -> set[str]:
+         """Scan all meta files to find matching document_sha256 values."""
+         return await asyncio.to_thread(self._check_existing_sync, sha256s)
+
+     async def update_summary(self, run_scope: str, document_sha256: str, summary: str) -> None:
+         """Update summary in the document's .meta.json file."""
+         await asyncio.to_thread(self._update_summary_sync, run_scope, document_sha256, summary)
+
+     async def load_summaries(self, run_scope: str, document_sha256s: list[str]) -> dict[str, str]:
+         """Load summaries from .meta.json files."""
+         return await asyncio.to_thread(self._load_summaries_sync, run_scope, document_sha256s)
+
+     def flush(self) -> None:
+         """Block until all pending document summaries are processed."""
+         if self._summary_worker:
+             self._summary_worker.flush()
+
+     def shutdown(self) -> None:
+         """Flush pending summaries and stop the summary worker."""
+         if self._summary_worker:
+             self._summary_worker.shutdown()
+
+     # --- Sync implementation (called via asyncio.to_thread) ---
+
+     def _scope_path(self, run_scope: str) -> Path:
+         return self._base_path / run_scope
+
+     def _save_sync(self, document: Document, run_scope: str) -> bool:
+         canonical = document.canonical_name()
+         doc_dir = self._scope_path(run_scope) / canonical
+         doc_dir.mkdir(parents=True, exist_ok=True)
+
+         content_path = doc_dir / document.name
+         if not content_path.resolve().is_relative_to(doc_dir.resolve()):
+             raise ValueError(f"Path traversal detected: document name '{document.name}' escapes store directory")
+         meta_path = doc_dir / f"{document.name}.meta.json"
+
+         doc_sha256 = compute_document_sha256(document)
+         content_sha256 = compute_content_sha256(document.content)
+
+         # Check for concurrent access: if meta exists with different SHA256, log warning
+         if meta_path.exists():
+             existing_meta = self._read_meta(meta_path)
+             if existing_meta and existing_meta.get("document_sha256") == doc_sha256:
+                 # Populate cache even for idempotent saves
+                 self._meta_path_cache[f"{run_scope}:{doc_sha256}"] = meta_path
+                 return False  # Idempotent — same document already saved
+             if existing_meta:
+                 logger.warning(
+                     f"Overwriting document '{document.name}' in '{canonical}': "
+                     f"existing SHA256 {existing_meta.get('document_sha256', '?')[:12]}... "
+                     f"differs from new {doc_sha256[:12]}..."
+                 )
+
+         # Write content before meta (crash safety)
+         content_path.write_bytes(document.content)
+
+         # Write attachments
+         att_meta_list: list[dict[str, Any]] = []
+         if document.attachments:
+             att_dir = doc_dir / f"{document.name}.att"
+             att_dir.mkdir(exist_ok=True)
+             for att in document.attachments:
+                 att_path = att_dir / att.name
+                 if not att_path.resolve().is_relative_to(att_dir.resolve()):
+                     raise ValueError(f"Path traversal detected: attachment name '{att.name}' escapes store directory")
+                 att_path.write_bytes(att.content)
+                 att_meta_list.append({
+                     "name": att.name,
+                     "description": att.description,
+                     "sha256": compute_content_sha256(att.content),
+                 })
+
+         # Write meta last (crash safety — content is already on disk)
+         meta = {
+             "document_sha256": doc_sha256,
+             "content_sha256": content_sha256,
+             "class_name": document.__class__.__name__,
+             "description": document.description,
+             "sources": list(document.sources),
+             "origins": list(document.origins),
+             "mime_type": document.mime_type,
+             "attachments": att_meta_list,
+         }
+         meta_path.write_text(json.dumps(meta, indent=2), encoding="utf-8")
+
+         # Cache meta path for summary updates
+         self._meta_path_cache[f"{run_scope}:{doc_sha256}"] = meta_path
+         return True
+
+     def _load_sync(self, run_scope: str, document_types: list[type[Document]]) -> list[Document]:
+         scope_path = self._scope_path(run_scope)
+         if not scope_path.exists():
+             return []
+
+         # Build reverse map: canonical_name -> document type
+         type_by_canonical: dict[str, type[Document]] = {}
+         for doc_type in document_types:
+             cn = doc_type.canonical_name()
+             type_by_canonical[cn] = doc_type
+
+         documents: list[Document] = []
+
+         with suppress_registration():
+             for canonical, doc_type in type_by_canonical.items():
+                 type_dir = scope_path / canonical
+                 if not type_dir.is_dir():
+                     continue
+                 self._load_type_dir(type_dir, doc_type, documents)
+
+         return documents
+
+     def _load_type_dir(self, type_dir: Path, doc_type: type[Document], out: list[Document]) -> None:
+         """Load all documents of a single type from its directory."""
+         for meta_path in type_dir.glob("*.meta.json"):
+             meta = self._read_meta(meta_path)
+             if meta is None:
+                 continue
+
+             content_name = meta_path.name.removesuffix(".meta.json")
+             content_path = type_dir / content_name
+
+             if not content_path.exists():
+                 logger.warning(f"Meta file {meta_path} has no corresponding content file, skipping")
+                 continue
+
+             content = content_path.read_bytes()
+
+             # Load attachments
+             attachments: tuple[Attachment, ...] = ()
+             att_meta_list = meta.get("attachments", [])
+             if att_meta_list:
+                 att_dir = type_dir / f"{content_name}.att"
+                 att_list: list[Attachment] = []
+                 for att_meta in att_meta_list:
+                     att_path = att_dir / att_meta["name"]
+                     if not att_path.exists():
+                         logger.warning(f"Attachment file {att_path} missing, skipping")
+                         continue
+                     att_list.append(
+                         Attachment(
+                             name=att_meta["name"],
+                             content=att_path.read_bytes(),
+                             description=att_meta.get("description"),
+                         )
+                     )
+                 attachments = tuple(att_list)
+
+             doc = doc_type(
+                 name=content_name,
+                 content=content,
+                 description=meta.get("description"),
+                 sources=tuple(meta.get("sources", ())),
+                 origins=tuple(meta.get("origins", ())),
+                 attachments=attachments or None,
+             )
+             out.append(doc)
+
+     def _has_documents_sync(self, run_scope: str, document_type: type[Document]) -> bool:
+         """Check for meta files in the type's directory without loading content."""
+         scope_path = self._scope_path(run_scope)
+         canonical = document_type.canonical_name()
+         type_dir = scope_path / canonical
+         if not type_dir.is_dir():
+             return False
+         return any(type_dir.glob("*.meta.json"))
+
+     def _check_existing_sync(self, sha256s: list[str]) -> set[str]:
+         """Scan all meta files to find matching document_sha256 values."""
+         target = set(sha256s)
+         found: set[str] = set()
+         if not self._base_path.exists():
+             return found
+
+         for meta_path in self._base_path.rglob("*.meta.json"):
+             meta = self._read_meta(meta_path)
+             if meta and meta.get("document_sha256") in target:
+                 found.add(meta["document_sha256"])
+                 if found == target:
+                     break
+         return found
+
+     def _update_summary_sync(self, run_scope: str, document_sha256: str, summary: str) -> None:
+         """Update summary in the document's .meta.json file."""
+         cache_key = f"{run_scope}:{document_sha256}"
+         meta_path = self._meta_path_cache.get(cache_key)
+
+         # Fallback: scan for the meta file if not cached
+         if meta_path is None or not meta_path.exists():
+             meta_path = self._find_meta_by_sha256(run_scope, document_sha256)
+             if meta_path is None:
+                 return
+
+         meta = self._read_meta(meta_path)
+         if meta is None:
+             return
+
+         meta["summary"] = summary
+         meta_path.write_text(json.dumps(meta, indent=2), encoding="utf-8")
+         self._meta_path_cache[cache_key] = meta_path
+
+     def _load_summaries_sync(self, run_scope: str, document_sha256s: list[str]) -> dict[str, str]:
+         """Scan meta files for matching sha256s and return their summaries."""
+         target = set(document_sha256s)
+         result: dict[str, str] = {}
+         scope_path = self._scope_path(run_scope)
+         if not scope_path.exists():
+             return result
+
+         for meta_path in scope_path.rglob("*.meta.json"):
+             meta = self._read_meta(meta_path)
+             if meta is None:
+                 continue
+             sha = meta.get("document_sha256")
+             summary = meta.get("summary")
+             if sha in target and summary:
+                 result[sha] = summary
+                 if len(result) == len(target):
+                     break
+         return result
+
+     def _find_meta_by_sha256(self, run_scope: str, document_sha256: str) -> Path | None:
+         """Scan meta files in run_scope to find one matching the given sha256."""
+         scope_path = self._scope_path(run_scope)
+         if not scope_path.exists():
+             return None
+         for meta_path in scope_path.rglob("*.meta.json"):
+             meta = self._read_meta(meta_path)
+             if meta and meta.get("document_sha256") == document_sha256:
+                 return meta_path
+         return None
+
+     @staticmethod
+     def _read_meta(meta_path: Path) -> dict[str, Any] | None:
+         """Read and parse a meta.json file, returning None on any error."""
+         try:
+             return json.loads(meta_path.read_text(encoding="utf-8"))
+         except (json.JSONDecodeError, OSError) as e:
+             logger.warning(f"Failed to read meta file {meta_path}: {e}")
+             return None
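
A minimal usage sketch of the new LocalDocumentStore, inferred from the signatures above; ExampleDoc, the store/ directory, and the exact Document constructor arguments are illustrative assumptions, not part of the diff:

    import asyncio
    from pathlib import Path

    from ai_pipeline_core.document_store.local import LocalDocumentStore
    from ai_pipeline_core.documents.document import Document


    class ExampleDoc(Document):  # hypothetical subclass for illustration
        pass


    async def main() -> None:
        store = LocalDocumentStore(base_path=Path("store"))  # no summary generator configured
        doc = ExampleDoc(name="notes.txt", content=b"hello")  # assumes name/content keywords suffice
        await store.save(doc, run_scope="run-1")        # writes notes.txt and notes.txt.meta.json
        docs = await store.load("run-1", [ExampleDoc])  # re-reads documents via their .meta.json
        print([d.name for d in docs])
        store.shutdown()


    asyncio.run(main())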
@@ -0,0 +1,85 @@
+ """In-memory document store for testing.
+
+ Simple dict-based storage implementing the full DocumentStore protocol.
+ Not for production use — all data is lost when the process exits.
+ """
+
+ from ai_pipeline_core.document_store._summary import SummaryGenerator
+ from ai_pipeline_core.document_store._summary_worker import SummaryWorker
+ from ai_pipeline_core.documents.document import Document
+
+
+ class MemoryDocumentStore:
+     """Dict-based document store for unit tests.
+
+     Storage layout: dict[run_scope, dict[document_sha256, Document]].
+     """
+
+     def __init__(
+         self,
+         *,
+         summary_generator: SummaryGenerator | None = None,
+     ) -> None:
+         self._data: dict[str, dict[str, Document]] = {}
+         self._summaries: dict[str, dict[str, str]] = {}  # run_scope -> sha256 -> summary
+         self._summary_worker: SummaryWorker | None = None
+         if summary_generator:
+             self._summary_worker = SummaryWorker(
+                 generator=summary_generator,
+                 update_fn=self.update_summary,
+             )
+             self._summary_worker.start()
+
+     async def save(self, document: Document, run_scope: str) -> None:
+         """Store document in memory, keyed by SHA256."""
+         scope = self._data.setdefault(run_scope, {})
+         if document.sha256 in scope:
+             return  # Idempotent — same document already saved
+         scope[document.sha256] = document
+         if self._summary_worker:
+             self._summary_worker.schedule(run_scope, document)
+
+     async def save_batch(self, documents: list[Document], run_scope: str) -> None:
+         """Save multiple documents sequentially."""
+         for doc in documents:
+             await self.save(doc, run_scope)
+
+     async def load(self, run_scope: str, document_types: list[type[Document]]) -> list[Document]:
+         """Return all documents matching the given types from a run scope."""
+         scope = self._data.get(run_scope, {})
+         type_tuple = tuple(document_types)
+         return [doc for doc in scope.values() if isinstance(doc, type_tuple)]
+
+     async def has_documents(self, run_scope: str, document_type: type[Document]) -> bool:
+         """Check if any documents of this type exist in the run scope."""
+         scope = self._data.get(run_scope, {})
+         return any(isinstance(doc, document_type) for doc in scope.values())
+
+     async def check_existing(self, sha256s: list[str]) -> set[str]:
+         """Return the subset of sha256s that exist across all scopes."""
+         all_hashes: set[str] = set()
+         for scope in self._data.values():
+             all_hashes.update(scope.keys())
+         return all_hashes & set(sha256s)
+
+     async def update_summary(self, run_scope: str, document_sha256: str, summary: str) -> None:
+         """Update summary for a stored document. No-op if document doesn't exist."""
+         scope = self._data.get(run_scope, {})
+         if document_sha256 not in scope:
+             return
+         self._summaries.setdefault(run_scope, {})[document_sha256] = summary
+
+     async def load_summaries(self, run_scope: str, document_sha256s: list[str]) -> dict[str, str]:
+         """Load summaries by SHA256."""
+         scope_summaries = self._summaries.get(run_scope, {})
+         return {sha: scope_summaries[sha] for sha in document_sha256s if sha in scope_summaries}
+
+     def flush(self) -> None:
+         """Block until all pending document summaries are processed."""
+         if self._summary_worker:
+             self._summary_worker.flush()
+
+     def shutdown(self) -> None:
+         """Flush pending summaries and stop the summary worker."""
+         if self._summary_worker:
+             self._summary_worker.shutdown()
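
A hedged test sketch against MemoryDocumentStore; it assumes pytest with pytest-asyncio and an example_doc fixture providing a Document instance, neither of which is defined in this diff:

    import pytest

    from ai_pipeline_core.document_store.memory import MemoryDocumentStore


    @pytest.mark.asyncio
    async def test_memory_store_roundtrip(example_doc):  # example_doc: hypothetical Document fixture
        store = MemoryDocumentStore()
        await store.save(example_doc, run_scope="test")
        assert await store.has_documents("test", type(example_doc))
        assert await store.check_existing([example_doc.sha256]) == {example_doc.sha256}
        assert await store.load("test", [type(example_doc)]) == [example_doc]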
@@ -0,0 +1,68 @@
+ """Document store protocol and singleton management.
+
+ Defines the DocumentStore protocol that all storage backends must implement,
+ along with get/set helpers for the process-global singleton.
+ """
+
+ from typing import Protocol, runtime_checkable
+
+ from ai_pipeline_core.documents.document import Document
+
+
+ @runtime_checkable
+ class DocumentStore(Protocol):
+     """Protocol for document storage backends.
+
+     Implementations: ClickHouseDocumentStore (production), LocalDocumentStore (CLI/debug),
+     MemoryDocumentStore (testing).
+     """
+
+     async def save(self, document: Document, run_scope: str) -> None:
+         """Save a single document to the store. Idempotent — same SHA256 is a no-op."""
+         ...
+
+     async def save_batch(self, documents: list[Document], run_scope: str) -> None:
+         """Save multiple documents. Dependencies must be sorted (caller's responsibility)."""
+         ...
+
+     async def load(self, run_scope: str, document_types: list[type[Document]]) -> list[Document]:
+         """Load all documents of the given types from a run scope."""
+         ...
+
+     async def has_documents(self, run_scope: str, document_type: type[Document]) -> bool:
+         """Check if any documents of this type exist in the run scope."""
+         ...
+
+     async def check_existing(self, sha256s: list[str]) -> set[str]:
+         """Return the subset of sha256s that already exist in the store."""
+         ...
+
+     async def update_summary(self, run_scope: str, document_sha256: str, summary: str) -> None:
+         """Update summary for a stored document. No-op if document doesn't exist."""
+         ...
+
+     async def load_summaries(self, run_scope: str, document_sha256s: list[str]) -> dict[str, str]:
+         """Load summaries by SHA256. Returns {sha256: summary} for docs that have summaries."""
+         ...
+
+     def flush(self) -> None:
+         """Block until all pending background work (summaries) is processed."""
+         ...
+
+     def shutdown(self) -> None:
+         """Flush pending work and stop background workers."""
+         ...
+
+
+ _document_store: DocumentStore | None = None
+
+
+ def get_document_store() -> DocumentStore | None:
+     """Get the process-global document store singleton."""
+     return _document_store
+
+
+ def set_document_store(store: DocumentStore | None) -> None:
+     """Set the process-global document store singleton."""
+     global _document_store
+     _document_store = store
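
How the singleton helpers above are likely meant to be wired, shown here with the in-memory backend; this is a sketch of the pattern, not an excerpt from the package:

    from ai_pipeline_core.document_store.memory import MemoryDocumentStore
    from ai_pipeline_core.document_store.protocol import (
        DocumentStore,
        get_document_store,
        set_document_store,
    )

    # Install a backend once per process; pipeline code can then look it up lazily.
    set_document_store(MemoryDocumentStore())

    store = get_document_store()
    assert store is not None
    assert isinstance(store, DocumentStore)  # runtime_checkable Protocol: structural check only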
@@ -1,26 +1,24 @@
- """Document abstraction system for AI pipeline flows.
+ """Document system for AI pipeline flows.
  
- @public
-
- The documents package provides immutable, type-safe data structures for handling
- various content types in AI pipelines, including text, images, PDFs, and other
- binary data with automatic MIME type detection.
+ Provides the Document base class (immutable, content-addressed), Attachment for
+ binary sub-documents, and RunContext/TaskDocumentContext for document lifecycle
+ management within pipeline tasks.
  """
  
+ from .attachment import Attachment
+ from .context import RunContext, TaskDocumentContext, get_run_context, reset_run_context, set_run_context
  from .document import Document
- from .document_list import DocumentList
- from .flow_document import FlowDocument
- from .task_document import TaskDocument
- from .temporary_document import TemporaryDocument
  from .utils import canonical_name_key, is_document_sha256, sanitize_url
  
  __all__ = [
+     "Attachment",
      "Document",
-     "DocumentList",
-     "FlowDocument",
-     "TaskDocument",
-     "TemporaryDocument",
+     "RunContext",
+     "TaskDocumentContext",
      "canonical_name_key",
+     "get_run_context",
      "is_document_sha256",
+     "reset_run_context",
      "sanitize_url",
+     "set_run_context",
  ]
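
Based on the export changes above, downstream imports shift roughly as sketched below; the commented 0.2.6 line is only a reminder of the removed names:

    # 0.2.6: FlowDocument, TaskDocument, TemporaryDocument and DocumentList were exported here.
    # from ai_pipeline_core.documents import FlowDocument, DocumentList

    # 0.4.1: a single Document base plus Attachment and the run-context helpers.
    from ai_pipeline_core.documents import (
        Attachment,
        Document,
        RunContext,
        get_run_context,
        set_run_context,
    )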
@@ -0,0 +1,85 @@
+ """Low-level ContextVar declarations for document registration and task context.
+
+ Extracted into a separate module to break the circular dependency between
+ document.py (which needs suppression/task-context checks) and context.py
+ (which defines the full TaskDocumentContext that depends on Document).
+ """
+
+ from collections.abc import Iterator
+ from contextlib import contextmanager
+ from contextvars import ContextVar, Token
+ from dataclasses import dataclass
+
+ # --- Run context ---
+
+
+ @dataclass(frozen=True, slots=True)
+ class RunContext:
+     """Immutable context for a pipeline run, carried via ContextVar."""
+
+     run_scope: str
+
+
+ _run_context: ContextVar[RunContext | None] = ContextVar("_run_context", default=None)
+
+
+ def get_run_context() -> RunContext | None:
+     """Get the current run context, or None if not set."""
+     return _run_context.get()
+
+
+ def set_run_context(ctx: RunContext) -> Token[RunContext | None]:
+     """Set the run context. Returns a token for restoring the previous value."""
+     return _run_context.set(ctx)
+
+
+ def reset_run_context(token: Token[RunContext | None]) -> None:
+     """Reset the run context to its previous value using a token from set_run_context."""
+     _run_context.reset(token)
+
+
+ # --- Suppression flag ---
+
+ _suppression_flag: ContextVar[bool] = ContextVar("_document_registration_suppressed", default=False)
+
+
+ def is_registration_suppressed() -> bool:
+     """Check if document registration is currently suppressed."""
+     return _suppression_flag.get()
+
+
+ @contextmanager
+ def suppress_registration() -> Iterator[None]:
+     """Context manager that suppresses Document registration with TaskDocumentContext.
+
+     Used during model_validate() and other internal Pydantic operations that
+     construct intermediate Document objects that should not be tracked.
+     """
+     token = _suppression_flag.set(True)
+     try:
+         yield
+     finally:
+         _suppression_flag.reset(token)
+
+
+ # --- Task document context ContextVar ---
+
+ # Forward reference: the actual TaskDocumentContext class lives in context.py.
+ # Here we only manage the ContextVar holding it.
+
+ _task_context: ContextVar[object | None] = ContextVar("_task_context", default=None)
+
+
+ def get_task_context() -> object | None:
+     """Get the current task document context, or None if not inside a pipeline task."""
+     return _task_context.get()
+
+
+ def set_task_context(ctx: object) -> Token[object | None]:
+     """Set the task document context. Returns a token for restoring the previous value."""
+     return _task_context.set(ctx)
+
+
+ def reset_task_context(token: Token[object | None]) -> None:
+     """Reset the task document context to its previous value."""
+     _task_context.reset(token)
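
A short sketch of the token-based set/reset pattern these helpers expose, combined with suppress_registration; the body of the with-block is illustrative only:

    from ai_pipeline_core.documents._context_vars import (
        RunContext,
        get_run_context,
        reset_run_context,
        set_run_context,
        suppress_registration,
    )

    token = set_run_context(RunContext(run_scope="run-1"))
    try:
        assert get_run_context() is not None
        with suppress_registration():
            # Intermediate Document objects built here are not registered
            # with the current TaskDocumentContext.
            ...
    finally:
        reset_run_context(token)  # restore the previous value, even if it was None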
@@ -0,0 +1,52 @@
+ """Document hashing utilities for store implementations.
+
+ Computes document_sha256 and content_sha256 as defined in the document store
+ design: length-prefixed fields with null-byte separators, BASE32 encoded.
+ """
+
+ import hashlib
+ from base64 import b32encode
+ from typing import Any, Protocol
+
+
+ class _Hashable(Protocol):
+     """Protocol for objects whose identity hash can be computed."""
+
+     name: str
+     content: bytes
+     attachments: Any
+
+
+ def compute_document_sha256(doc: _Hashable) -> str:
+     """Compute the document identity hash: hash(name + content + sorted_attachments).
+
+     Uses length-prefixed fields with null-byte separators for collision resistance.
+     Attachments are sorted by name. Result is BASE32 encoded (uppercase, no padding),
+     consistent with Document.sha256.
+
+     Excluded from hash: description, sources, origins, class_name.
+     """
+     h = hashlib.sha256()
+
+     name_bytes = doc.name.encode("utf-8")
+     _hash_field(h, name_bytes)
+     _hash_field(h, doc.content)
+
+     for att in sorted(doc.attachments, key=lambda a: a.name):
+         att_name_bytes = att.name.encode("utf-8")
+         _hash_field(h, att_name_bytes)
+         _hash_field(h, att.content)
+
+     return b32encode(h.digest()).decode("ascii").upper().rstrip("=")
+
+
+ def compute_content_sha256(content: bytes) -> str:
+     """Compute SHA256 of raw content bytes, BASE32 encoded."""
+     return b32encode(hashlib.sha256(content).digest()).decode("ascii").upper().rstrip("=")
+
+
+ def _hash_field(h: Any, data: bytes) -> None:
+     """Append a length-prefixed, null-separated field to the hash."""
+     h.update(str(len(data)).encode("ascii"))
+     h.update(b"\x00")
+     h.update(data)
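
A small worked example of the hashing scheme defined above: the content hash is plain SHA-256 re-encoded as unpadded BASE32, and each field of the document hash is framed as a decimal length, a null byte, then the bytes:

    import hashlib
    from base64 import b32encode

    from ai_pipeline_core.documents._hashing import compute_content_sha256

    # Content hash: SHA-256 digest, BASE32 encoded, trailing '=' padding stripped.
    digest = hashlib.sha256(b"hello").digest()
    assert compute_content_sha256(b"hello") == b32encode(digest).decode("ascii").rstrip("=")

    # Field framing used by compute_document_sha256: "<decimal length>\x00<bytes>".
    # A name of "a.txt" with content b"hi" contributes b"5\x00a.txt" followed by b"2\x00hi".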