ai-pipeline-core 0.3.4__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. ai_pipeline_core/__init__.py +64 -158
  2. ai_pipeline_core/deployment/__init__.py +6 -18
  3. ai_pipeline_core/deployment/base.py +392 -212
  4. ai_pipeline_core/deployment/contract.py +6 -10
  5. ai_pipeline_core/{utils → deployment}/deploy.py +50 -69
  6. ai_pipeline_core/deployment/helpers.py +16 -17
  7. ai_pipeline_core/{progress.py → deployment/progress.py} +23 -24
  8. ai_pipeline_core/{utils/remote_deployment.py → deployment/remote.py} +11 -14
  9. ai_pipeline_core/docs_generator/__init__.py +54 -0
  10. ai_pipeline_core/docs_generator/__main__.py +5 -0
  11. ai_pipeline_core/docs_generator/cli.py +196 -0
  12. ai_pipeline_core/docs_generator/extractor.py +324 -0
  13. ai_pipeline_core/docs_generator/guide_builder.py +644 -0
  14. ai_pipeline_core/docs_generator/trimmer.py +35 -0
  15. ai_pipeline_core/docs_generator/validator.py +114 -0
  16. ai_pipeline_core/document_store/__init__.py +13 -0
  17. ai_pipeline_core/document_store/_summary.py +9 -0
  18. ai_pipeline_core/document_store/_summary_worker.py +170 -0
  19. ai_pipeline_core/document_store/clickhouse.py +492 -0
  20. ai_pipeline_core/document_store/factory.py +38 -0
  21. ai_pipeline_core/document_store/local.py +312 -0
  22. ai_pipeline_core/document_store/memory.py +85 -0
  23. ai_pipeline_core/document_store/protocol.py +68 -0
  24. ai_pipeline_core/documents/__init__.py +12 -14
  25. ai_pipeline_core/documents/_context_vars.py +85 -0
  26. ai_pipeline_core/documents/_hashing.py +52 -0
  27. ai_pipeline_core/documents/attachment.py +85 -0
  28. ai_pipeline_core/documents/context.py +128 -0
  29. ai_pipeline_core/documents/document.py +318 -1434
  30. ai_pipeline_core/documents/mime_type.py +11 -84
  31. ai_pipeline_core/documents/utils.py +4 -12
  32. ai_pipeline_core/exceptions.py +10 -62
  33. ai_pipeline_core/images/__init__.py +32 -85
  34. ai_pipeline_core/images/_processing.py +5 -11
  35. ai_pipeline_core/llm/__init__.py +6 -4
  36. ai_pipeline_core/llm/ai_messages.py +102 -90
  37. ai_pipeline_core/llm/client.py +229 -183
  38. ai_pipeline_core/llm/model_options.py +12 -84
  39. ai_pipeline_core/llm/model_response.py +53 -99
  40. ai_pipeline_core/llm/model_types.py +8 -23
  41. ai_pipeline_core/logging/__init__.py +2 -7
  42. ai_pipeline_core/logging/logging.yml +1 -1
  43. ai_pipeline_core/logging/logging_config.py +27 -37
  44. ai_pipeline_core/logging/logging_mixin.py +15 -41
  45. ai_pipeline_core/observability/__init__.py +32 -0
  46. ai_pipeline_core/observability/_debug/__init__.py +30 -0
  47. ai_pipeline_core/observability/_debug/_auto_summary.py +94 -0
  48. ai_pipeline_core/{debug/config.py → observability/_debug/_config.py} +11 -7
  49. ai_pipeline_core/{debug/content.py → observability/_debug/_content.py} +133 -75
  50. ai_pipeline_core/{debug/processor.py → observability/_debug/_processor.py} +16 -17
  51. ai_pipeline_core/{debug/summary.py → observability/_debug/_summary.py} +113 -37
  52. ai_pipeline_core/observability/_debug/_types.py +75 -0
  53. ai_pipeline_core/{debug/writer.py → observability/_debug/_writer.py} +126 -196
  54. ai_pipeline_core/observability/_document_tracking.py +146 -0
  55. ai_pipeline_core/observability/_initialization.py +194 -0
  56. ai_pipeline_core/observability/_logging_bridge.py +57 -0
  57. ai_pipeline_core/observability/_summary.py +81 -0
  58. ai_pipeline_core/observability/_tracking/__init__.py +6 -0
  59. ai_pipeline_core/observability/_tracking/_client.py +178 -0
  60. ai_pipeline_core/observability/_tracking/_internal.py +28 -0
  61. ai_pipeline_core/observability/_tracking/_models.py +138 -0
  62. ai_pipeline_core/observability/_tracking/_processor.py +158 -0
  63. ai_pipeline_core/observability/_tracking/_service.py +311 -0
  64. ai_pipeline_core/observability/_tracking/_writer.py +229 -0
  65. ai_pipeline_core/{tracing.py → observability/tracing.py} +139 -335
  66. ai_pipeline_core/pipeline/__init__.py +10 -0
  67. ai_pipeline_core/pipeline/decorators.py +915 -0
  68. ai_pipeline_core/pipeline/options.py +16 -0
  69. ai_pipeline_core/prompt_manager.py +16 -102
  70. ai_pipeline_core/settings.py +26 -31
  71. ai_pipeline_core/testing.py +9 -0
  72. ai_pipeline_core-0.4.0.dist-info/METADATA +807 -0
  73. ai_pipeline_core-0.4.0.dist-info/RECORD +76 -0
  74. ai_pipeline_core/debug/__init__.py +0 -26
  75. ai_pipeline_core/documents/document_list.py +0 -420
  76. ai_pipeline_core/documents/flow_document.py +0 -112
  77. ai_pipeline_core/documents/task_document.py +0 -117
  78. ai_pipeline_core/documents/temporary_document.py +0 -74
  79. ai_pipeline_core/flow/__init__.py +0 -9
  80. ai_pipeline_core/flow/config.py +0 -494
  81. ai_pipeline_core/flow/options.py +0 -75
  82. ai_pipeline_core/pipeline.py +0 -718
  83. ai_pipeline_core/prefect.py +0 -63
  84. ai_pipeline_core/prompt_builder/__init__.py +0 -5
  85. ai_pipeline_core/prompt_builder/documents_prompt.jinja2 +0 -23
  86. ai_pipeline_core/prompt_builder/global_cache.py +0 -78
  87. ai_pipeline_core/prompt_builder/new_core_documents_prompt.jinja2 +0 -6
  88. ai_pipeline_core/prompt_builder/prompt_builder.py +0 -253
  89. ai_pipeline_core/prompt_builder/system_prompt.jinja2 +0 -41
  90. ai_pipeline_core/storage/__init__.py +0 -8
  91. ai_pipeline_core/storage/storage.py +0 -628
  92. ai_pipeline_core/utils/__init__.py +0 -8
  93. ai_pipeline_core-0.3.4.dist-info/METADATA +0 -569
  94. ai_pipeline_core-0.3.4.dist-info/RECORD +0 -57
  95. {ai_pipeline_core-0.3.4.dist-info → ai_pipeline_core-0.4.0.dist-info}/WHEEL +0 -0
  96. {ai_pipeline_core-0.3.4.dist-info → ai_pipeline_core-0.4.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,85 @@
1
+ """Lightweight attachment model for multi-part documents."""
2
+
3
+ import base64
4
+ from functools import cached_property
5
+
6
+ from pydantic import BaseModel, ConfigDict, field_serializer, field_validator
7
+
8
+ from ai_pipeline_core.exceptions import DocumentNameError
9
+
10
+ from .mime_type import (
11
+ detect_mime_type,
12
+ is_image_mime_type,
13
+ is_pdf_mime_type,
14
+ is_text_mime_type,
15
+ )
16
+
17
+
18
class Attachment(BaseModel):
    """Immutable binary attachment for multi-part documents.

    Carries binary content (screenshots, PDFs, supplementary files) without full Document machinery.
    ``mime_type`` is a cached_property — not included in ``model_dump()`` output.
    """

    # frozen=True makes instances immutable; extra="forbid" rejects unknown fields.
    model_config = ConfigDict(frozen=True, extra="forbid")

    name: str  # file-like name, checked by validate_name
    content: bytes  # raw payload
    description: str | None = None  # optional human-readable note

    @field_validator("name")
    @classmethod
    def validate_name(cls, v: str) -> str:
        """Validate the attachment name and return it unchanged.

        Raises DocumentNameError when the name ends with a reserved suffix,
        contains ``..``, a backslash or a forward slash, is empty, or starts
        or ends with a space.
        """
        # Reserved suffixes are checked first, in this order, so the error
        # raised for a name that violates several rules stays the same.
        for reserved in (".description.md", ".sources.json", ".attachments.json"):
            if v.endswith(reserved):
                raise DocumentNameError(f"Attachment names cannot end with {reserved}: {v}")

        if any(token in v for token in ("..", "\\", "/")):
            raise DocumentNameError(f"Invalid attachment name - contains path traversal characters: {v}")

        # Only the space character is stripped here: other whitespace is not rejected.
        if not v or v != v.strip(" "):
            raise DocumentNameError(f"Invalid attachment name format: {v}")

        return v

    @field_serializer("content")
    def serialize_content(self, v: bytes) -> str:  # noqa: PLR6301
        """Serialize content as UTF-8 text when it decodes, otherwise as base64."""
        try:
            decoded = v.decode("utf-8")
        except UnicodeDecodeError:
            # Not valid UTF-8: fall back to an ASCII-safe base64 representation.
            return base64.b64encode(v).decode("ascii")
        return decoded

    @cached_property
    def mime_type(self) -> str:
        """MIME type reported by ``detect_mime_type`` for the content and name (computed once)."""
        return detect_mime_type(self.content, self.name)

    @property
    def is_image(self) -> bool:
        """Whether ``is_image_mime_type`` accepts the detected MIME type."""
        return is_image_mime_type(self.mime_type)

    @property
    def is_pdf(self) -> bool:
        """Whether ``is_pdf_mime_type`` accepts the detected MIME type."""
        return is_pdf_mime_type(self.mime_type)

    @property
    def is_text(self) -> bool:
        """Whether ``is_text_mime_type`` accepts the detected MIME type."""
        return is_text_mime_type(self.mime_type)

    @property
    def size(self) -> int:
        """Number of bytes in ``content``."""
        return len(self.content)

    @property
    def text(self) -> str:
        """Return the content decoded as UTF-8.

        Raises ValueError when the attachment is not detected as text.
        """
        if self.is_text:
            return self.content.decode("utf-8")
        raise ValueError(f"Attachment is not text: {self.name}")
@@ -0,0 +1,128 @@
1
+ """Run context and task document context for document lifecycle management.
2
+
3
+ RunContext tracks the current run scope via ContextVar.
4
+ TaskDocumentContext tracks document creation within a pipeline task/flow,
5
+ providing provenance validation, finalize checks, and deduplication.
6
+ """
7
+
8
+ from dataclasses import dataclass, field
9
+
10
+ from ai_pipeline_core.documents._context_vars import (
11
+ RunContext,
12
+ get_run_context,
13
+ get_task_context,
14
+ is_registration_suppressed,
15
+ reset_run_context,
16
+ reset_task_context,
17
+ set_run_context,
18
+ set_task_context,
19
+ suppress_registration,
20
+ )
21
+ from ai_pipeline_core.documents.document import Document
22
+ from ai_pipeline_core.documents.utils import is_document_sha256
23
+
24
+ # Re-export the _context_vars helpers alongside TaskDocumentContext (defined below) so existing imports from context.py keep working
25
+ __all__ = [
26
+ "RunContext",
27
+ "TaskDocumentContext",
28
+ "get_run_context",
29
+ "get_task_context",
30
+ "is_registration_suppressed",
31
+ "reset_run_context",
32
+ "reset_task_context",
33
+ "set_run_context",
34
+ "set_task_context",
35
+ "suppress_registration",
36
+ ]
37
+
38
+
39
@dataclass
class TaskDocumentContext:
    """Record of the documents created during one pipeline task or flow run.

    The @pipeline_task and @pipeline_flow decorators use it to:
    - Check that source/origin SHA256 references point to pre-existing documents
    - Flag same-task interdependencies (doc B referencing doc A from the same task)
    - Warn about documents that carry no provenance (no sources, no origins)
    - Report documents that were created but never returned (orphaned)
    - Deduplicate returned documents by SHA256
    """

    # SHA256 values of every document registered in this context.
    created: set[str] = field(default_factory=set)

    def register_created(self, doc: Document) -> None:
        """Record ``doc`` (by SHA256) as created in this task/flow context."""
        self.created.add(doc.sha256)

    def validate_provenance(
        self,
        documents: list[Document],
        existing_sha256s: set[str],
        *,
        check_created: bool = False,
    ) -> list[str]:
        """Validate the sources and origins of returned documents.

        For each document, in order:
        1. (Only when check_created=True) warn if the document was not created
           in this context. Intended for @pipeline_task; flows delegate creation
           to nested tasks, whose documents register in the task's own context.
        2. For every SHA256-formatted source, warn if it was created in this
           same context (same-task interdependency), otherwise warn if it is
           missing from ``existing_sha256s``. Sources that are not
           SHA256-formatted (URLs and other reference strings) are skipped.
        3. Apply the same two checks to every origin.
        4. Warn if the document has neither sources nor origins. Initial
           pipeline inputs are acceptable; the warning is for awareness.

        Returns the collected warning messages (empty when nothing was flagged).
        """
        messages: list[str] = []

        for doc in documents:
            if check_created and doc.sha256 not in self.created:
                messages.append(f"Document '{doc.name}' was not created in this task — only newly created documents should be returned")

            # Sources first, then origins, so messages keep their original order.
            references = [("source", ref) for ref in doc.sources if is_document_sha256(ref)]
            references.extend(("origin", ref) for ref in doc.origins)

            for kind, ref in references:
                if ref in self.created:
                    messages.append(f"Document '{doc.name}' references {kind} {ref[:12]}... created in the same task (same-task interdependency)")
                elif ref not in existing_sha256s:
                    messages.append(f"Document '{doc.name}' references {kind} {ref[:12]}... which does not exist in the store")

            if not (doc.sources or doc.origins):
                messages.append(f"Document '{doc.name}' has no sources and no origins (no provenance)")

        return messages

    def finalize(self, returned_docs: list[Document]) -> list[str]:
        """Report documents that were created here but are absent from the result.

        Returns one warning per orphaned document, ordered by SHA256.
        """
        leftover = self.created.difference(doc.sha256 for doc in returned_docs)
        return [f"Document {digest[:12]}... was created but not returned" for digest in sorted(leftover)]

    @staticmethod
    def deduplicate(documents: list[Document]) -> list[Document]:
        """Drop repeated SHA256s, keeping the first occurrence and its position."""
        first_by_sha: dict[str, Document] = {}
        for doc in documents:
            # setdefault stores only the first document seen for each SHA256.
            first_by_sha.setdefault(doc.sha256, doc)
        return list(first_by_sha.values())