ai-pipeline-core 0.3.3__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. ai_pipeline_core/__init__.py +70 -144
  2. ai_pipeline_core/deployment/__init__.py +6 -18
  3. ai_pipeline_core/deployment/base.py +392 -212
  4. ai_pipeline_core/deployment/contract.py +6 -10
  5. ai_pipeline_core/{utils → deployment}/deploy.py +50 -69
  6. ai_pipeline_core/deployment/helpers.py +16 -17
  7. ai_pipeline_core/{progress.py → deployment/progress.py} +23 -24
  8. ai_pipeline_core/{utils/remote_deployment.py → deployment/remote.py} +11 -14
  9. ai_pipeline_core/docs_generator/__init__.py +54 -0
  10. ai_pipeline_core/docs_generator/__main__.py +5 -0
  11. ai_pipeline_core/docs_generator/cli.py +196 -0
  12. ai_pipeline_core/docs_generator/extractor.py +324 -0
  13. ai_pipeline_core/docs_generator/guide_builder.py +644 -0
  14. ai_pipeline_core/docs_generator/trimmer.py +35 -0
  15. ai_pipeline_core/docs_generator/validator.py +114 -0
  16. ai_pipeline_core/document_store/__init__.py +13 -0
  17. ai_pipeline_core/document_store/_summary.py +9 -0
  18. ai_pipeline_core/document_store/_summary_worker.py +170 -0
  19. ai_pipeline_core/document_store/clickhouse.py +492 -0
  20. ai_pipeline_core/document_store/factory.py +38 -0
  21. ai_pipeline_core/document_store/local.py +312 -0
  22. ai_pipeline_core/document_store/memory.py +85 -0
  23. ai_pipeline_core/document_store/protocol.py +68 -0
  24. ai_pipeline_core/documents/__init__.py +12 -14
  25. ai_pipeline_core/documents/_context_vars.py +85 -0
  26. ai_pipeline_core/documents/_hashing.py +52 -0
  27. ai_pipeline_core/documents/attachment.py +85 -0
  28. ai_pipeline_core/documents/context.py +128 -0
  29. ai_pipeline_core/documents/document.py +318 -1434
  30. ai_pipeline_core/documents/mime_type.py +37 -82
  31. ai_pipeline_core/documents/utils.py +4 -12
  32. ai_pipeline_core/exceptions.py +10 -62
  33. ai_pipeline_core/images/__init__.py +32 -85
  34. ai_pipeline_core/images/_processing.py +5 -11
  35. ai_pipeline_core/llm/__init__.py +6 -4
  36. ai_pipeline_core/llm/ai_messages.py +106 -81
  37. ai_pipeline_core/llm/client.py +267 -158
  38. ai_pipeline_core/llm/model_options.py +12 -84
  39. ai_pipeline_core/llm/model_response.py +53 -99
  40. ai_pipeline_core/llm/model_types.py +8 -23
  41. ai_pipeline_core/logging/__init__.py +2 -7
  42. ai_pipeline_core/logging/logging.yml +1 -1
  43. ai_pipeline_core/logging/logging_config.py +27 -37
  44. ai_pipeline_core/logging/logging_mixin.py +15 -41
  45. ai_pipeline_core/observability/__init__.py +32 -0
  46. ai_pipeline_core/observability/_debug/__init__.py +30 -0
  47. ai_pipeline_core/observability/_debug/_auto_summary.py +94 -0
  48. ai_pipeline_core/{debug/config.py → observability/_debug/_config.py} +11 -7
  49. ai_pipeline_core/{debug/content.py → observability/_debug/_content.py} +134 -75
  50. ai_pipeline_core/{debug/processor.py → observability/_debug/_processor.py} +16 -17
  51. ai_pipeline_core/{debug/summary.py → observability/_debug/_summary.py} +113 -37
  52. ai_pipeline_core/observability/_debug/_types.py +75 -0
  53. ai_pipeline_core/{debug/writer.py → observability/_debug/_writer.py} +126 -196
  54. ai_pipeline_core/observability/_document_tracking.py +146 -0
  55. ai_pipeline_core/observability/_initialization.py +194 -0
  56. ai_pipeline_core/observability/_logging_bridge.py +57 -0
  57. ai_pipeline_core/observability/_summary.py +81 -0
  58. ai_pipeline_core/observability/_tracking/__init__.py +6 -0
  59. ai_pipeline_core/observability/_tracking/_client.py +178 -0
  60. ai_pipeline_core/observability/_tracking/_internal.py +28 -0
  61. ai_pipeline_core/observability/_tracking/_models.py +138 -0
  62. ai_pipeline_core/observability/_tracking/_processor.py +158 -0
  63. ai_pipeline_core/observability/_tracking/_service.py +311 -0
  64. ai_pipeline_core/observability/_tracking/_writer.py +229 -0
  65. ai_pipeline_core/{tracing.py → observability/tracing.py} +139 -335
  66. ai_pipeline_core/pipeline/__init__.py +10 -0
  67. ai_pipeline_core/pipeline/decorators.py +915 -0
  68. ai_pipeline_core/pipeline/options.py +16 -0
  69. ai_pipeline_core/prompt_manager.py +16 -102
  70. ai_pipeline_core/settings.py +26 -31
  71. ai_pipeline_core/testing.py +9 -0
  72. ai_pipeline_core-0.4.0.dist-info/METADATA +807 -0
  73. ai_pipeline_core-0.4.0.dist-info/RECORD +76 -0
  74. ai_pipeline_core/debug/__init__.py +0 -26
  75. ai_pipeline_core/documents/document_list.py +0 -420
  76. ai_pipeline_core/documents/flow_document.py +0 -112
  77. ai_pipeline_core/documents/task_document.py +0 -117
  78. ai_pipeline_core/documents/temporary_document.py +0 -74
  79. ai_pipeline_core/flow/__init__.py +0 -9
  80. ai_pipeline_core/flow/config.py +0 -494
  81. ai_pipeline_core/flow/options.py +0 -75
  82. ai_pipeline_core/pipeline.py +0 -718
  83. ai_pipeline_core/prefect.py +0 -63
  84. ai_pipeline_core/prompt_builder/__init__.py +0 -5
  85. ai_pipeline_core/prompt_builder/documents_prompt.jinja2 +0 -23
  86. ai_pipeline_core/prompt_builder/global_cache.py +0 -78
  87. ai_pipeline_core/prompt_builder/new_core_documents_prompt.jinja2 +0 -6
  88. ai_pipeline_core/prompt_builder/prompt_builder.py +0 -253
  89. ai_pipeline_core/prompt_builder/system_prompt.jinja2 +0 -41
  90. ai_pipeline_core/storage/__init__.py +0 -8
  91. ai_pipeline_core/storage/storage.py +0 -628
  92. ai_pipeline_core/utils/__init__.py +0 -8
  93. ai_pipeline_core-0.3.3.dist-info/METADATA +0 -569
  94. ai_pipeline_core-0.3.3.dist-info/RECORD +0 -57
  95. {ai_pipeline_core-0.3.3.dist-info → ai_pipeline_core-0.4.0.dist-info}/WHEEL +0 -0
  96. {ai_pipeline_core-0.3.3.dist-info → ai_pipeline_core-0.4.0.dist-info}/licenses/LICENSE +0 -0
ai_pipeline_core/docs_generator/validator.py
@@ -0,0 +1,114 @@
+ """Validation utilities for AI documentation freshness, completeness, and size."""
+
+ import ast
+ import hashlib
+ from dataclasses import dataclass
+ from pathlib import Path
+
+ from ai_pipeline_core.docs_generator.extractor import is_public_name
+ from ai_pipeline_core.docs_generator.trimmer import MAX_GUIDE_SIZE
+
+ HASH_FILE = ".hash"
+ # Generic entry-point names that are not part of the public API
+ _EXCLUDED_SYMBOLS: frozenset[str] = frozenset({"main"})
+
+
+ @dataclass(frozen=True)
+ class ValidationResult:
+     """Aggregated validation result across all checks."""
+
+     is_fresh: bool
+     missing_symbols: tuple[str, ...]
+     size_violations: tuple[tuple[str, int], ...]
+
+     @property
+     def is_valid(self) -> bool:
+         """Hard validations pass (freshness + completeness). Size is warning-only."""
+         return self.is_fresh and not self.missing_symbols
+
+
+ def compute_source_hash(source_dir: Path, tests_dir: Path) -> str:
+     """SHA256 hash of all .py files (sorted by relative path) under source and test dirs."""
+     repo_root = source_dir.parent
+     all_files: list[Path] = []
+     for directory in (source_dir, tests_dir):
+         if directory.is_dir():
+             all_files.extend(directory.rglob("*.py"))
+
+     sha = hashlib.sha256()
+     for path in sorted(all_files, key=lambda p: p.relative_to(repo_root)):
+         rel = str(path.relative_to(repo_root))
+         sha.update(rel.encode())
+         sha.update(path.read_bytes())
+     return sha.hexdigest()
+
+
+ def validate_freshness(ai_docs_dir: Path, source_dir: Path, tests_dir: Path) -> bool:
+     """Check whether .hash matches current source state."""
+     hash_file = ai_docs_dir / HASH_FILE
+     if not hash_file.exists():
+         return False
+     stored = hash_file.read_text().strip()
+     return stored == compute_source_hash(source_dir, tests_dir)
+
+
+ def validate_completeness(ai_docs_dir: Path, source_dir: Path, excluded_modules: frozenset[str] = frozenset()) -> list[str]:
+     """Return public symbols (by naming convention) not found in any guide file."""
+     public_symbols = _find_public_symbols(source_dir, excluded_modules)
+     guide_content = _read_all_guides(ai_docs_dir)
+     return [symbol for symbol in sorted(public_symbols) if f"class {symbol}" not in guide_content and f"def {symbol}" not in guide_content]
+
+
+ def validate_size(ai_docs_dir: Path, max_size: int = MAX_GUIDE_SIZE) -> list[tuple[str, int]]:
+     """Return guide files exceeding max_size bytes."""
+     violations: list[tuple[str, int]] = []
+     if not ai_docs_dir.is_dir():
+         return violations
+     for guide in sorted(ai_docs_dir.glob("*.md")):
+         size = len(guide.read_bytes())
+         if size > max_size:
+             violations.append((guide.name, size))
+     return violations
+
+
+ def validate_all(
+     ai_docs_dir: Path,
+     source_dir: Path,
+     tests_dir: Path,
+     excluded_modules: frozenset[str] = frozenset(),
+ ) -> ValidationResult:
+     """Run all validation checks and return aggregated result."""
+     return ValidationResult(
+         is_fresh=validate_freshness(ai_docs_dir, source_dir, tests_dir),
+         missing_symbols=tuple(validate_completeness(ai_docs_dir, source_dir, excluded_modules)),
+         size_violations=tuple(validate_size(ai_docs_dir)),
+     )
+
+
+ def _find_public_symbols(source_dir: Path, excluded_modules: frozenset[str] = frozenset()) -> set[str]:
+     """Find all public symbols via naming convention in non-private modules."""
+     symbols: set[str] = set()
+     for py_file in sorted(source_dir.rglob("*.py")):
+         if py_file.name.startswith("_") and py_file.name != "__init__.py":
+             continue
+         relative = py_file.relative_to(source_dir)
+         top_module = relative.parts[0] if len(relative.parts) > 1 else relative.stem
+         if top_module in excluded_modules:
+             continue
+         try:
+             tree = ast.parse(py_file.read_text(encoding="utf-8"))
+         except SyntaxError:
+             continue
+         for node in tree.body:
+             if not isinstance(node, (ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef)):
+                 continue
+             if is_public_name(node.name) and node.name not in _EXCLUDED_SYMBOLS:
+                 symbols.add(node.name)
+     return symbols
+
+
+ def _read_all_guides(ai_docs_dir: Path) -> str:
+     """Concatenate all .md guide files into a single string for searching."""
+     if not ai_docs_dir.is_dir():
+         return ""
+     return "\n".join([guide.read_text() for guide in sorted(ai_docs_dir.glob("*.md"))])
ai_pipeline_core/document_store/__init__.py
@@ -0,0 +1,13 @@
+ """Document store protocol and backends for AI pipeline flows."""
+
+ from ._summary import SummaryGenerator
+ from .factory import create_document_store
+ from .protocol import DocumentStore, get_document_store, set_document_store
+
+ __all__ = [
+     "DocumentStore",
+     "SummaryGenerator",
+     "create_document_store",
+     "get_document_store",
+     "set_document_store",
+ ]
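
Neither factory.py nor protocol.py is reproduced on this page, so the exact signatures behind these re-exports are not visible here. As a rough sketch of how the exports are presumably meant to compose (the arguments to create_document_store are hypothetical):

from ai_pipeline_core.document_store import (
    create_document_store,
    get_document_store,
    set_document_store,
)

# Assumed usage: build a backend (local, memory, or clickhouse, per the new
# modules in this release), install it, and retrieve it elsewhere.
store = create_document_store()  # hypothetical call; real arguments not shown in this diff
set_document_store(store)
assert get_document_store() is store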
ai_pipeline_core/document_store/_summary.py
@@ -0,0 +1,9 @@
+ """Summary generation types and constants for document stores."""
+
+ from collections.abc import Callable, Coroutine
+
+ type SummaryGenerator = Callable[[str, str], Coroutine[None, None, str]]
+ """Async callable: (document_name, content_excerpt) -> summary string.
+ Returns empty string on failure. Must handle recursion prevention internally."""
+
+ SUMMARY_EXCERPT_CHARS: int = 5_000
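
The alias fully specifies the generator contract: async, takes the document name and a content excerpt, returns a summary string, and returns an empty string on failure. A conforming stub is therefore easy to sketch; the body below is a toy heuristic, not the library's own generator:

async def naive_summary(document_name: str, content_excerpt: str) -> str:
    """Toy generator satisfying the SummaryGenerator contract."""
    try:
        first_line = content_excerpt.strip().splitlines()[0]
        return f"{document_name}: {first_line[:200]}"
    except Exception:
        return ""  # contract: empty string on failure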
ai_pipeline_core/document_store/_summary_worker.py
@@ -0,0 +1,170 @@
+ """Background worker for asynchronous document summary generation."""
+
+ import asyncio
+ import contextlib
+ from collections.abc import Callable, Coroutine
+ from dataclasses import dataclass, field
+ from threading import Event, Thread
+
+ from lmnr.opentelemetry_lib.tracing import context as laminar_context
+ from opentelemetry import context as otel_context
+ from opentelemetry.context import Context
+
+ from ai_pipeline_core.document_store._summary import SUMMARY_EXCERPT_CHARS, SummaryGenerator
+ from ai_pipeline_core.documents.document import Document
+ from ai_pipeline_core.logging import get_pipeline_logger
+
+ logger = get_pipeline_logger(__name__)
+
+ _SENTINEL = object()
+
+
+ @dataclass(frozen=True, slots=True)
+ class _SummaryItem:
+     run_scope: str
+     sha256: str
+     name: str
+     excerpt: str
+     parent_otel_context: Context | None = field(default=None, hash=False, compare=False)
+     parent_laminar_context: Context | None = field(default=None, hash=False, compare=False)
+
+
+ class SummaryWorker:
+     """Background daemon thread that generates summaries and writes them back to the store.
+
+     Processes jobs in parallel on its own asyncio event loop. Thread-safe scheduling
+     via ``loop.call_soon_threadsafe()``. Best-effort — failures are logged and skipped.
+     """
+
+     def __init__(
+         self,
+         *,
+         generator: SummaryGenerator,
+         update_fn: Callable[[str, str, str], Coroutine[None, None, None]],
+     ) -> None:
+         self._generator = generator
+         self._update_fn = update_fn
+         self._inflight: set[tuple[str, str]] = set()  # (run_scope, sha256)
+         self._loop: asyncio.AbstractEventLoop | None = None
+         self._queue: asyncio.Queue[_SummaryItem | object] | None = None
+         self._thread: Thread | None = None
+         self._ready = Event()
+
+     def start(self) -> None:
+         """Start the background daemon thread for summary generation."""
+         if self._thread is not None:
+             return
+         self._thread = Thread(target=self._thread_main, name="summary-worker", daemon=True)
+         self._thread.start()
+         if not self._ready.wait(timeout=5.0):
+             logger.warning("Summary worker thread did not start within 5 seconds")
+
+     def _thread_main(self) -> None:
+         self._loop = asyncio.new_event_loop()
+         self._queue = asyncio.Queue()
+         self._ready.set()
+         try:
+             self._loop.run_until_complete(self._run())
+         finally:
+             self._loop.close()
+             self._loop = None
+
+     async def _run(self) -> None:
+         assert self._queue is not None
+         while True:
+             item = await self._queue.get()
+             if item is _SENTINEL:
+                 break
+             if isinstance(item, Event):
+                 item.set()
+                 continue
+             assert isinstance(item, _SummaryItem)
+
+             # Collect all immediately available items into a batch
+             batch: list[_SummaryItem] = [item]
+             sentinel_seen = False
+             flush_events: list[Event] = []
+
+             while not self._queue.empty():
+                 try:
+                     next_item = self._queue.get_nowait()
+                 except asyncio.QueueEmpty:
+                     break
+                 if next_item is _SENTINEL:
+                     sentinel_seen = True
+                     break
+                 if isinstance(next_item, Event):
+                     flush_events.append(next_item)
+                     break
+                 assert isinstance(next_item, _SummaryItem)
+                 batch.append(next_item)
+
+             await asyncio.gather(*[self._process_one(i) for i in batch])
+
+             for event in flush_events:
+                 event.set()
+             if sentinel_seen:
+                 break
+
+     async def _process_one(self, item: _SummaryItem) -> None:
+         try:
+             otel_token = otel_context.attach(item.parent_otel_context) if item.parent_otel_context is not None else None
+             laminar_token = laminar_context.attach_context(item.parent_laminar_context) if item.parent_laminar_context is not None else None
+             try:
+                 summary = await self._generator(item.name, item.excerpt)
+             finally:
+                 if laminar_token is not None:
+                     laminar_context.detach_context(laminar_token)
+                 if otel_token is not None:
+                     otel_context.detach(otel_token)
+             if summary:
+                 await self._update_fn(item.run_scope, item.sha256, summary)
+         except Exception as e:
+             logger.warning(f"Summary generation failed for '{item.name}': {e}")
+         finally:
+             self._inflight.discard((item.run_scope, item.sha256))
+
+     def schedule(self, run_scope: str, document: Document) -> None:
+         """Schedule summary generation for a document. Thread-safe, non-blocking."""
+         if self._loop is None or self._queue is None:
+             return
+         key = (run_scope, document.sha256)
+         if key in self._inflight:
+             return
+         self._inflight.add(key)
+         if document.is_text:
+             excerpt = document.text[:SUMMARY_EXCERPT_CHARS]
+         else:
+             excerpt = f"[Binary document: {document.mime_type}, {len(document.content)} bytes]"
+         item = _SummaryItem(
+             run_scope=run_scope,
+             sha256=document.sha256,
+             name=document.name,
+             excerpt=excerpt,
+             parent_otel_context=otel_context.get_current(),
+             parent_laminar_context=laminar_context.get_current_context(),
+         )
+         try:
+             self._loop.call_soon_threadsafe(self._queue.put_nowait, item)
+         except RuntimeError:
+             self._inflight.discard(key)
+
+     def flush(self, timeout: float = 60.0) -> None:
+         """Block until all queued items are processed."""
+         if self._loop is None or self._queue is None:
+             return
+         barrier = Event()
+         try:
+             self._loop.call_soon_threadsafe(self._queue.put_nowait, barrier)
+         except RuntimeError:
+             return
+         if not barrier.wait(timeout=timeout):
+             logger.warning("Summary worker flush timed out after %.0fs — some summaries may still be processing", timeout)
+
+     def shutdown(self, timeout: float = 60.0) -> None:
+         """Send stop sentinel and join the worker thread. Pending items are drained before stop."""
+         if self._loop is not None and self._queue is not None:
+             with contextlib.suppress(RuntimeError):
+                 self._loop.call_soon_threadsafe(self._queue.put_nowait, _SENTINEL)
+         if self._thread is not None:
+             self._thread.join(timeout=timeout)
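
A rough wiring sketch, under stated assumptions: the generator and write-back below are stubs that merely match the constructor's declared types, and Document construction is elided because the reworked Document API is not shown in this hunk. In the package itself the store presumably wires the worker internally; importing the private module directly is only for illustration:

from ai_pipeline_core.document_store._summary_worker import SummaryWorker

async def generate(document_name: str, content_excerpt: str) -> str:
    # Stand-in generator; a real one would call an LLM via the llm client.
    return f"Summary of {document_name}"

async def persist(run_scope: str, sha256: str, summary: str) -> None:
    # Stand-in for the store's write-back; matches the update_fn signature.
    print(run_scope, sha256, summary)

worker = SummaryWorker(generator=generate, update_fn=persist)
worker.start()
# worker.schedule(run_scope, document)  # document: a Document instance (construction not shown here)
worker.flush(timeout=10.0)  # block until queued summaries are written back
worker.shutdown()

The in-flight key (run_scope, sha256) deduplicates repeated schedule() calls for the same document within a run, and the flush() barrier Event gives callers a synchronous join point without stopping the worker.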