ai-pipeline-core 0.2.6__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_pipeline_core/__init__.py +78 -125
- ai_pipeline_core/deployment/__init__.py +34 -0
- ai_pipeline_core/deployment/base.py +861 -0
- ai_pipeline_core/deployment/contract.py +80 -0
- ai_pipeline_core/deployment/deploy.py +561 -0
- ai_pipeline_core/deployment/helpers.py +97 -0
- ai_pipeline_core/deployment/progress.py +126 -0
- ai_pipeline_core/deployment/remote.py +116 -0
- ai_pipeline_core/docs_generator/__init__.py +54 -0
- ai_pipeline_core/docs_generator/__main__.py +5 -0
- ai_pipeline_core/docs_generator/cli.py +196 -0
- ai_pipeline_core/docs_generator/extractor.py +324 -0
- ai_pipeline_core/docs_generator/guide_builder.py +644 -0
- ai_pipeline_core/docs_generator/trimmer.py +35 -0
- ai_pipeline_core/docs_generator/validator.py +114 -0
- ai_pipeline_core/document_store/__init__.py +13 -0
- ai_pipeline_core/document_store/_summary.py +9 -0
- ai_pipeline_core/document_store/_summary_worker.py +170 -0
- ai_pipeline_core/document_store/clickhouse.py +492 -0
- ai_pipeline_core/document_store/factory.py +38 -0
- ai_pipeline_core/document_store/local.py +312 -0
- ai_pipeline_core/document_store/memory.py +85 -0
- ai_pipeline_core/document_store/protocol.py +68 -0
- ai_pipeline_core/documents/__init__.py +12 -14
- ai_pipeline_core/documents/_context_vars.py +85 -0
- ai_pipeline_core/documents/_hashing.py +52 -0
- ai_pipeline_core/documents/attachment.py +85 -0
- ai_pipeline_core/documents/context.py +128 -0
- ai_pipeline_core/documents/document.py +318 -1434
- ai_pipeline_core/documents/mime_type.py +37 -82
- ai_pipeline_core/documents/utils.py +4 -12
- ai_pipeline_core/exceptions.py +10 -62
- ai_pipeline_core/images/__init__.py +309 -0
- ai_pipeline_core/images/_processing.py +151 -0
- ai_pipeline_core/llm/__init__.py +6 -4
- ai_pipeline_core/llm/ai_messages.py +130 -81
- ai_pipeline_core/llm/client.py +327 -193
- ai_pipeline_core/llm/model_options.py +14 -86
- ai_pipeline_core/llm/model_response.py +60 -103
- ai_pipeline_core/llm/model_types.py +16 -34
- ai_pipeline_core/logging/__init__.py +2 -7
- ai_pipeline_core/logging/logging.yml +1 -1
- ai_pipeline_core/logging/logging_config.py +27 -37
- ai_pipeline_core/logging/logging_mixin.py +15 -41
- ai_pipeline_core/observability/__init__.py +32 -0
- ai_pipeline_core/observability/_debug/__init__.py +30 -0
- ai_pipeline_core/observability/_debug/_auto_summary.py +94 -0
- ai_pipeline_core/observability/_debug/_config.py +95 -0
- ai_pipeline_core/observability/_debug/_content.py +764 -0
- ai_pipeline_core/observability/_debug/_processor.py +98 -0
- ai_pipeline_core/observability/_debug/_summary.py +312 -0
- ai_pipeline_core/observability/_debug/_types.py +75 -0
- ai_pipeline_core/observability/_debug/_writer.py +843 -0
- ai_pipeline_core/observability/_document_tracking.py +146 -0
- ai_pipeline_core/observability/_initialization.py +194 -0
- ai_pipeline_core/observability/_logging_bridge.py +57 -0
- ai_pipeline_core/observability/_summary.py +81 -0
- ai_pipeline_core/observability/_tracking/__init__.py +6 -0
- ai_pipeline_core/observability/_tracking/_client.py +178 -0
- ai_pipeline_core/observability/_tracking/_internal.py +28 -0
- ai_pipeline_core/observability/_tracking/_models.py +138 -0
- ai_pipeline_core/observability/_tracking/_processor.py +158 -0
- ai_pipeline_core/observability/_tracking/_service.py +311 -0
- ai_pipeline_core/observability/_tracking/_writer.py +229 -0
- ai_pipeline_core/{tracing.py → observability/tracing.py} +139 -283
- ai_pipeline_core/pipeline/__init__.py +10 -0
- ai_pipeline_core/pipeline/decorators.py +915 -0
- ai_pipeline_core/pipeline/options.py +16 -0
- ai_pipeline_core/prompt_manager.py +16 -102
- ai_pipeline_core/settings.py +26 -31
- ai_pipeline_core/testing.py +9 -0
- ai_pipeline_core-0.4.1.dist-info/METADATA +807 -0
- ai_pipeline_core-0.4.1.dist-info/RECORD +76 -0
- {ai_pipeline_core-0.2.6.dist-info → ai_pipeline_core-0.4.1.dist-info}/WHEEL +1 -1
- ai_pipeline_core/documents/document_list.py +0 -420
- ai_pipeline_core/documents/flow_document.py +0 -112
- ai_pipeline_core/documents/task_document.py +0 -117
- ai_pipeline_core/documents/temporary_document.py +0 -74
- ai_pipeline_core/flow/__init__.py +0 -9
- ai_pipeline_core/flow/config.py +0 -483
- ai_pipeline_core/flow/options.py +0 -75
- ai_pipeline_core/pipeline.py +0 -718
- ai_pipeline_core/prefect.py +0 -63
- ai_pipeline_core/simple_runner/__init__.py +0 -14
- ai_pipeline_core/simple_runner/cli.py +0 -254
- ai_pipeline_core/simple_runner/simple_runner.py +0 -247
- ai_pipeline_core/storage/__init__.py +0 -8
- ai_pipeline_core/storage/storage.py +0 -628
- ai_pipeline_core/utils/__init__.py +0 -8
- ai_pipeline_core/utils/deploy.py +0 -373
- ai_pipeline_core/utils/remote_deployment.py +0 -269
- ai_pipeline_core-0.2.6.dist-info/METADATA +0 -500
- ai_pipeline_core-0.2.6.dist-info/RECORD +0 -41
- {ai_pipeline_core-0.2.6.dist-info → ai_pipeline_core-0.4.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
"""Lightweight attachment model for multi-part documents."""
|
|
2
|
+
|
|
3
|
+
import base64
|
|
4
|
+
from functools import cached_property
|
|
5
|
+
|
|
6
|
+
from pydantic import BaseModel, ConfigDict, field_serializer, field_validator
|
|
7
|
+
|
|
8
|
+
from ai_pipeline_core.exceptions import DocumentNameError
|
|
9
|
+
|
|
10
|
+
from .mime_type import (
|
|
11
|
+
detect_mime_type,
|
|
12
|
+
is_image_mime_type,
|
|
13
|
+
is_pdf_mime_type,
|
|
14
|
+
is_text_mime_type,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class Attachment(BaseModel):
    """Immutable binary payload carried alongside a multi-part document.

    Holds supplementary binary data (screenshots, PDFs, extra files) without
    pulling in the full Document machinery.

    Note: ``mime_type`` is a ``cached_property``, so it is not part of
    ``model_dump()`` output.
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    name: str  # file-like name; validated below
    content: bytes  # raw binary payload
    description: str | None = None  # optional human-readable description

    @field_validator("name")
    @classmethod
    def validate_name(cls, value: str) -> str:
        """Validate the attachment name.

        Raises:
            DocumentNameError: if the name ends with a reserved sidecar
                suffix, contains path-traversal characters, is empty, or
                has leading/trailing spaces.
        """
        # Reserved suffixes would collide with generated sidecar metadata files.
        for reserved in (".description.md", ".sources.json", ".attachments.json"):
            if value.endswith(reserved):
                raise DocumentNameError(f"Attachment names cannot end with {reserved}: {value}")
        # Reject anything that could escape the storage directory.
        if any(token in value for token in ("..", "\\", "/")):
            raise DocumentNameError(f"Invalid attachment name - contains path traversal characters: {value}")
        # Empty names and surrounding spaces are malformed.
        if not value or value != value.strip(" "):
            raise DocumentNameError(f"Invalid attachment name format: {value}")
        return value

    @field_serializer("content")
    def serialize_content(self, raw: bytes) -> str:  # noqa: PLR6301
        """Serialize content: valid UTF-8 stays readable text, binary becomes base64."""
        try:
            return raw.decode("utf-8")
        except UnicodeDecodeError:
            return base64.b64encode(raw).decode("ascii")

    @cached_property
    def mime_type(self) -> str:
        """MIME type detected from content and filename (computed once, then cached)."""
        return detect_mime_type(self.content, self.name)

    @property
    def is_image(self) -> bool:
        """Whether the detected MIME type is an image type (``image/*``)."""
        return is_image_mime_type(self.mime_type)

    @property
    def is_pdf(self) -> bool:
        """Whether the detected MIME type is ``application/pdf``."""
        return is_pdf_mime_type(self.mime_type)

    @property
    def is_text(self) -> bool:
        """Whether the detected MIME type indicates textual content."""
        return is_text_mime_type(self.mime_type)

    @property
    def size(self) -> int:
        """Number of bytes in ``content``."""
        return len(self.content)

    @property
    def text(self) -> str:
        """Content decoded as UTF-8.

        Raises:
            ValueError: if the attachment's MIME type is not textual.
        """
        if not self.is_text:
            raise ValueError(f"Attachment is not text: {self.name}")
        return self.content.decode("utf-8")
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
"""Run context and task document context for document lifecycle management.
|
|
2
|
+
|
|
3
|
+
RunContext tracks the current run scope via ContextVar.
|
|
4
|
+
TaskDocumentContext tracks document creation within a pipeline task/flow,
|
|
5
|
+
providing provenance validation, finalize checks, and deduplication.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from dataclasses import dataclass, field
|
|
9
|
+
|
|
10
|
+
from ai_pipeline_core.documents._context_vars import (
|
|
11
|
+
RunContext,
|
|
12
|
+
get_run_context,
|
|
13
|
+
get_task_context,
|
|
14
|
+
is_registration_suppressed,
|
|
15
|
+
reset_run_context,
|
|
16
|
+
reset_task_context,
|
|
17
|
+
set_run_context,
|
|
18
|
+
set_task_context,
|
|
19
|
+
suppress_registration,
|
|
20
|
+
)
|
|
21
|
+
from ai_pipeline_core.documents.document import Document
|
|
22
|
+
from ai_pipeline_core.documents.utils import is_document_sha256
|
|
23
|
+
|
|
24
|
+
# Re-export everything from _context_vars so existing imports from context.py keep working.
__all__ = [
    "RunContext",
    "TaskDocumentContext",
    "get_run_context",
    "get_task_context",
    "is_registration_suppressed",
    "reset_run_context",
    "reset_task_context",
    "set_run_context",
    "set_task_context",
    "suppress_registration",
]
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass
class TaskDocumentContext:
    """Tracks documents created within a single pipeline task or flow execution.

    The @pipeline_task and @pipeline_flow decorators use this context to:
    - verify that every source/origin SHA256 reference points to a pre-existing document
    - detect same-task interdependencies (doc B referencing doc A created in the same task)
    - warn about documents with no provenance (neither sources nor origins)
    - detect documents that were created but never returned (orphaned)
    - deduplicate returned documents by SHA256
    """

    # SHA256 hashes of every document registered during this task/flow.
    created: set[str] = field(default_factory=set)

    def register_created(self, doc: Document) -> None:
        """Record *doc* as created within this task/flow context."""
        self.created.add(doc.sha256)

    def _check_reference(
        self,
        doc_name: str,
        kind: str,
        ref: str,
        existing_sha256s: set[str],
    ) -> str | None:
        """Return a warning for a SHA256 reference, or None if it is valid.

        *kind* is "source" or "origin"; a reference is flagged when it was
        created in this same context (same-task interdependency) or is absent
        from *existing_sha256s* (unknown document).
        """
        if ref in self.created:
            return (
                f"Document '{doc_name}' references {kind} {ref[:12]}... "
                f"created in the same task (same-task interdependency)"
            )
        if ref not in existing_sha256s:
            return f"Document '{doc_name}' references {kind} {ref[:12]}... which does not exist in the store"
        return None

    def validate_provenance(
        self,
        documents: list[Document],
        existing_sha256s: set[str],
        *,
        check_created: bool = False,
    ) -> list[str]:
        """Validate provenance (sources and origins) for returned documents.

        Checks performed per document:
        1. Every SHA256-formatted source reference exists in *existing_sha256s*.
        2. Every origin reference exists in *existing_sha256s*.
        3. No same-task interdependencies: a returned document must not
           reference (via source or origin SHA256) another document created
           in this same context.
        4. Documents with neither sources nor origins produce a warning.
        5. With ``check_created=True``, each returned document must have been
           registered in this context. This applies only to @pipeline_task —
           flows delegate creation to nested tasks, whose documents register
           in the task's own context.

        Only SHA256-formatted sources are validated; URLs and other reference
        strings in sources are skipped. Initial pipeline inputs (no provenance)
        are acceptable and merely warned about for awareness.

        Returns:
            A list of warning messages; empty when everything is valid.
        """
        issues: list[str] = []
        for doc in documents:
            # Task-only: a returned document must have been registered here.
            if check_created and doc.sha256 not in self.created:
                issues.append(f"Document '{doc.name}' was not created in this task — only newly created documents should be returned")

            # Sources: only SHA256-shaped entries are checked.
            for src in doc.sources:
                if not is_document_sha256(src):
                    continue
                warning = self._check_reference(doc.name, "source", src, existing_sha256s)
                if warning is not None:
                    issues.append(warning)

            # Origins are always treated as SHA256 references.
            for origin in doc.origins:
                warning = self._check_reference(doc.name, "origin", origin, existing_sha256s)
                if warning is not None:
                    issues.append(warning)

            # Flag documents that carry no provenance at all.
            if not doc.sources and not doc.origins:
                issues.append(f"Document '{doc.name}' has no sources and no origins (no provenance)")

        return issues

    def finalize(self, returned_docs: list[Document]) -> list[str]:
        """Warn about documents registered here but absent from the returned result.

        Returns a list of warning messages for orphaned documents — those
        registered via Document.__init__ but not present in *returned_docs*.
        """
        returned_hashes = {doc.sha256 for doc in returned_docs}
        return [
            f"Document {sha[:12]}... was created but not returned"
            for sha in sorted(self.created - returned_hashes)
        ]

    @staticmethod
    def deduplicate(documents: list[Document]) -> list[Document]:
        """Drop duplicate documents (by SHA256), keeping first-occurrence order."""
        unique: list[Document] = []
        seen_hashes: set[str] = set()
        for doc in documents:
            if doc.sha256 in seen_hashes:
                continue
            seen_hashes.add(doc.sha256)
            unique.append(doc)
        return unique
|