ai-pipeline-core 0.1.12__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_pipeline_core/__init__.py +83 -119
- ai_pipeline_core/deployment/__init__.py +34 -0
- ai_pipeline_core/deployment/base.py +861 -0
- ai_pipeline_core/deployment/contract.py +80 -0
- ai_pipeline_core/deployment/deploy.py +561 -0
- ai_pipeline_core/deployment/helpers.py +97 -0
- ai_pipeline_core/deployment/progress.py +126 -0
- ai_pipeline_core/deployment/remote.py +116 -0
- ai_pipeline_core/docs_generator/__init__.py +54 -0
- ai_pipeline_core/docs_generator/__main__.py +5 -0
- ai_pipeline_core/docs_generator/cli.py +196 -0
- ai_pipeline_core/docs_generator/extractor.py +324 -0
- ai_pipeline_core/docs_generator/guide_builder.py +644 -0
- ai_pipeline_core/docs_generator/trimmer.py +35 -0
- ai_pipeline_core/docs_generator/validator.py +114 -0
- ai_pipeline_core/document_store/__init__.py +13 -0
- ai_pipeline_core/document_store/_summary.py +9 -0
- ai_pipeline_core/document_store/_summary_worker.py +170 -0
- ai_pipeline_core/document_store/clickhouse.py +492 -0
- ai_pipeline_core/document_store/factory.py +38 -0
- ai_pipeline_core/document_store/local.py +312 -0
- ai_pipeline_core/document_store/memory.py +85 -0
- ai_pipeline_core/document_store/protocol.py +68 -0
- ai_pipeline_core/documents/__init__.py +14 -15
- ai_pipeline_core/documents/_context_vars.py +85 -0
- ai_pipeline_core/documents/_hashing.py +52 -0
- ai_pipeline_core/documents/attachment.py +85 -0
- ai_pipeline_core/documents/context.py +128 -0
- ai_pipeline_core/documents/document.py +349 -1062
- ai_pipeline_core/documents/mime_type.py +40 -85
- ai_pipeline_core/documents/utils.py +62 -7
- ai_pipeline_core/exceptions.py +10 -62
- ai_pipeline_core/images/__init__.py +309 -0
- ai_pipeline_core/images/_processing.py +151 -0
- ai_pipeline_core/llm/__init__.py +5 -3
- ai_pipeline_core/llm/ai_messages.py +284 -73
- ai_pipeline_core/llm/client.py +462 -209
- ai_pipeline_core/llm/model_options.py +86 -53
- ai_pipeline_core/llm/model_response.py +187 -241
- ai_pipeline_core/llm/model_types.py +34 -54
- ai_pipeline_core/logging/__init__.py +2 -9
- ai_pipeline_core/logging/logging.yml +1 -1
- ai_pipeline_core/logging/logging_config.py +27 -43
- ai_pipeline_core/logging/logging_mixin.py +17 -51
- ai_pipeline_core/observability/__init__.py +32 -0
- ai_pipeline_core/observability/_debug/__init__.py +30 -0
- ai_pipeline_core/observability/_debug/_auto_summary.py +94 -0
- ai_pipeline_core/observability/_debug/_config.py +95 -0
- ai_pipeline_core/observability/_debug/_content.py +764 -0
- ai_pipeline_core/observability/_debug/_processor.py +98 -0
- ai_pipeline_core/observability/_debug/_summary.py +312 -0
- ai_pipeline_core/observability/_debug/_types.py +75 -0
- ai_pipeline_core/observability/_debug/_writer.py +843 -0
- ai_pipeline_core/observability/_document_tracking.py +146 -0
- ai_pipeline_core/observability/_initialization.py +194 -0
- ai_pipeline_core/observability/_logging_bridge.py +57 -0
- ai_pipeline_core/observability/_summary.py +81 -0
- ai_pipeline_core/observability/_tracking/__init__.py +6 -0
- ai_pipeline_core/observability/_tracking/_client.py +178 -0
- ai_pipeline_core/observability/_tracking/_internal.py +28 -0
- ai_pipeline_core/observability/_tracking/_models.py +138 -0
- ai_pipeline_core/observability/_tracking/_processor.py +158 -0
- ai_pipeline_core/observability/_tracking/_service.py +311 -0
- ai_pipeline_core/observability/_tracking/_writer.py +229 -0
- ai_pipeline_core/observability/tracing.py +640 -0
- ai_pipeline_core/pipeline/__init__.py +10 -0
- ai_pipeline_core/pipeline/decorators.py +915 -0
- ai_pipeline_core/pipeline/options.py +16 -0
- ai_pipeline_core/prompt_manager.py +26 -105
- ai_pipeline_core/settings.py +41 -32
- ai_pipeline_core/testing.py +9 -0
- ai_pipeline_core-0.4.1.dist-info/METADATA +807 -0
- ai_pipeline_core-0.4.1.dist-info/RECORD +76 -0
- {ai_pipeline_core-0.1.12.dist-info → ai_pipeline_core-0.4.1.dist-info}/WHEEL +1 -1
- ai_pipeline_core/documents/document_list.py +0 -240
- ai_pipeline_core/documents/flow_document.py +0 -128
- ai_pipeline_core/documents/task_document.py +0 -133
- ai_pipeline_core/documents/temporary_document.py +0 -95
- ai_pipeline_core/flow/__init__.py +0 -9
- ai_pipeline_core/flow/config.py +0 -314
- ai_pipeline_core/flow/options.py +0 -75
- ai_pipeline_core/pipeline.py +0 -717
- ai_pipeline_core/prefect.py +0 -54
- ai_pipeline_core/simple_runner/__init__.py +0 -24
- ai_pipeline_core/simple_runner/cli.py +0 -255
- ai_pipeline_core/simple_runner/simple_runner.py +0 -385
- ai_pipeline_core/tracing.py +0 -475
- ai_pipeline_core-0.1.12.dist-info/METADATA +0 -450
- ai_pipeline_core-0.1.12.dist-info/RECORD +0 -36
- {ai_pipeline_core-0.1.12.dist-info → ai_pipeline_core-0.4.1.dist-info}/licenses/LICENSE +0 -0

ai_pipeline_core/docs_generator/validator.py
@@ -0,0 +1,114 @@

"""Validation utilities for AI documentation freshness, completeness, and size."""

import ast
import hashlib
from dataclasses import dataclass
from pathlib import Path

from ai_pipeline_core.docs_generator.extractor import is_public_name
from ai_pipeline_core.docs_generator.trimmer import MAX_GUIDE_SIZE

HASH_FILE = ".hash"
# Generic entry-point names that are not part of the public API
_EXCLUDED_SYMBOLS: frozenset[str] = frozenset({"main"})


@dataclass(frozen=True)
class ValidationResult:
    """Aggregated validation result across all checks."""

    is_fresh: bool
    missing_symbols: tuple[str, ...]
    size_violations: tuple[tuple[str, int], ...]

    @property
    def is_valid(self) -> bool:
        """Hard validations pass (freshness + completeness). Size is warning-only."""
        return self.is_fresh and not self.missing_symbols


def compute_source_hash(source_dir: Path, tests_dir: Path) -> str:
    """SHA256 hash of all .py files (sorted by relative path) under source and test dirs."""
    repo_root = source_dir.parent
    all_files: list[Path] = []
    for directory in (source_dir, tests_dir):
        if directory.is_dir():
            all_files.extend(directory.rglob("*.py"))

    sha = hashlib.sha256()
    for path in sorted(all_files, key=lambda p: p.relative_to(repo_root)):
        rel = str(path.relative_to(repo_root))
        sha.update(rel.encode())
        sha.update(path.read_bytes())
    return sha.hexdigest()


def validate_freshness(ai_docs_dir: Path, source_dir: Path, tests_dir: Path) -> bool:
    """Check whether .hash matches current source state."""
    hash_file = ai_docs_dir / HASH_FILE
    if not hash_file.exists():
        return False
    stored = hash_file.read_text().strip()
    return stored == compute_source_hash(source_dir, tests_dir)


def validate_completeness(ai_docs_dir: Path, source_dir: Path, excluded_modules: frozenset[str] = frozenset()) -> list[str]:
    """Return public symbols (by naming convention) not found in any guide file."""
    public_symbols = _find_public_symbols(source_dir, excluded_modules)
    guide_content = _read_all_guides(ai_docs_dir)
    return [symbol for symbol in sorted(public_symbols) if f"class {symbol}" not in guide_content and f"def {symbol}" not in guide_content]


def validate_size(ai_docs_dir: Path, max_size: int = MAX_GUIDE_SIZE) -> list[tuple[str, int]]:
    """Return guide files exceeding max_size bytes."""
    violations: list[tuple[str, int]] = []
    if not ai_docs_dir.is_dir():
        return violations
    for guide in sorted(ai_docs_dir.glob("*.md")):
        size = len(guide.read_bytes())
        if size > max_size:
            violations.append((guide.name, size))
    return violations


def validate_all(
    ai_docs_dir: Path,
    source_dir: Path,
    tests_dir: Path,
    excluded_modules: frozenset[str] = frozenset(),
) -> ValidationResult:
    """Run all validation checks and return aggregated result."""
    return ValidationResult(
        is_fresh=validate_freshness(ai_docs_dir, source_dir, tests_dir),
        missing_symbols=tuple(validate_completeness(ai_docs_dir, source_dir, excluded_modules)),
        size_violations=tuple(validate_size(ai_docs_dir)),
    )


def _find_public_symbols(source_dir: Path, excluded_modules: frozenset[str] = frozenset()) -> set[str]:
    """Find all public symbols via naming convention in non-private modules."""
    symbols: set[str] = set()
    for py_file in sorted(source_dir.rglob("*.py")):
        if py_file.name.startswith("_") and py_file.name != "__init__.py":
            continue
        relative = py_file.relative_to(source_dir)
        top_module = relative.parts[0] if len(relative.parts) > 1 else relative.stem
        if top_module in excluded_modules:
            continue
        try:
            tree = ast.parse(py_file.read_text(encoding="utf-8"))
        except SyntaxError:
            continue
        for node in tree.body:
            if not isinstance(node, (ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef)):
                continue
            if is_public_name(node.name) and node.name not in _EXCLUDED_SYMBOLS:
                symbols.add(node.name)
    return symbols


def _read_all_guides(ai_docs_dir: Path) -> str:
    """Concatenate all .md guide files into a single string for searching."""
    if not ai_docs_dir.is_dir():
        return ""
    return "\n".join([guide.read_text() for guide in sorted(ai_docs_dir.glob("*.md"))])
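
Not part of the diff: a minimal sketch of how these checks could be driven from a repository script. The directory names (ai_docs, ai_pipeline_core, tests) and the hash-refresh step are assumptions for illustration, not behavior defined by validator.py.

# Hypothetical driver script; directory names are illustrative only.
from pathlib import Path

from ai_pipeline_core.docs_generator.validator import compute_source_hash, validate_all

repo = Path(".")
ai_docs, source, tests = repo / "ai_docs", repo / "ai_pipeline_core", repo / "tests"

result = validate_all(ai_docs_dir=ai_docs, source_dir=source, tests_dir=tests)

if not result.is_fresh:
    # After regenerating the guides, store the new hash so the next run passes freshness.
    (ai_docs / ".hash").write_text(compute_source_hash(source, tests))
for name, size in result.size_violations:
    print(f"warning: {name} is {size} bytes (exceeds guide size limit)")  # warning-only
if result.missing_symbols:
    raise SystemExit(f"public symbols missing from guides: {', '.join(result.missing_symbols)}")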

ai_pipeline_core/document_store/__init__.py
@@ -0,0 +1,13 @@

"""Document store protocol and backends for AI pipeline flows."""

from ._summary import SummaryGenerator
from .factory import create_document_store
from .protocol import DocumentStore, get_document_store, set_document_store

__all__ = [
    "DocumentStore",
    "SummaryGenerator",
    "create_document_store",
    "get_document_store",
    "set_document_store",
]

ai_pipeline_core/document_store/_summary.py
@@ -0,0 +1,9 @@

"""Summary generation types and constants for document stores."""

from collections.abc import Callable, Coroutine

type SummaryGenerator = Callable[[str, str], Coroutine[None, None, str]]
"""Async callable: (document_name, content_excerpt) -> summary string.
Returns empty string on failure. Must handle recursion prevention internally."""

SUMMARY_EXCERPT_CHARS: int = 5_000
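
Not part of the diff: any async function taking (document_name, content_excerpt) and returning a string satisfies this alias. A deliberately naive, hypothetical generator for illustration (the package's real generator presumably calls the LLM client, which this hunk does not show):

async def naive_summary(document_name: str, content_excerpt: str) -> str:
    # Hypothetical stand-in: return the first line of the excerpt, or an empty
    # string on failure, as the SummaryGenerator contract requires.
    try:
        lines = content_excerpt.strip().splitlines()
        return f"{document_name}: {lines[0][:200]}" if lines else ""
    except Exception:
        return ""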

ai_pipeline_core/document_store/_summary_worker.py
@@ -0,0 +1,170 @@

"""Background worker for asynchronous document summary generation."""

import asyncio
import contextlib
from collections.abc import Callable, Coroutine
from dataclasses import dataclass, field
from threading import Event, Thread

from lmnr.opentelemetry_lib.tracing import context as laminar_context
from opentelemetry import context as otel_context
from opentelemetry.context import Context

from ai_pipeline_core.document_store._summary import SUMMARY_EXCERPT_CHARS, SummaryGenerator
from ai_pipeline_core.documents.document import Document
from ai_pipeline_core.logging import get_pipeline_logger

logger = get_pipeline_logger(__name__)

_SENTINEL = object()


@dataclass(frozen=True, slots=True)
class _SummaryItem:
    run_scope: str
    sha256: str
    name: str
    excerpt: str
    parent_otel_context: Context | None = field(default=None, hash=False, compare=False)
    parent_laminar_context: Context | None = field(default=None, hash=False, compare=False)


class SummaryWorker:
    """Background daemon thread that generates summaries and writes them back to the store.

    Processes jobs in parallel on its own asyncio event loop. Thread-safe scheduling
    via ``loop.call_soon_threadsafe()``. Best-effort — failures are logged and skipped.
    """

    def __init__(
        self,
        *,
        generator: SummaryGenerator,
        update_fn: Callable[[str, str, str], Coroutine[None, None, None]],
    ) -> None:
        self._generator = generator
        self._update_fn = update_fn
        self._inflight: set[tuple[str, str]] = set()  # (run_scope, sha256)
        self._loop: asyncio.AbstractEventLoop | None = None
        self._queue: asyncio.Queue[_SummaryItem | object] | None = None
        self._thread: Thread | None = None
        self._ready = Event()

    def start(self) -> None:
        """Start the background daemon thread for summary generation."""
        if self._thread is not None:
            return
        self._thread = Thread(target=self._thread_main, name="summary-worker", daemon=True)
        self._thread.start()
        if not self._ready.wait(timeout=5.0):
            logger.warning("Summary worker thread did not start within 5 seconds")

    def _thread_main(self) -> None:
        self._loop = asyncio.new_event_loop()
        self._queue = asyncio.Queue()
        self._ready.set()
        try:
            self._loop.run_until_complete(self._run())
        finally:
            self._loop.close()
            self._loop = None

    async def _run(self) -> None:
        assert self._queue is not None
        while True:
            item = await self._queue.get()
            if item is _SENTINEL:
                break
            if isinstance(item, Event):
                item.set()
                continue
            assert isinstance(item, _SummaryItem)

            # Collect all immediately available items into a batch
            batch: list[_SummaryItem] = [item]
            sentinel_seen = False
            flush_events: list[Event] = []

            while not self._queue.empty():
                try:
                    next_item = self._queue.get_nowait()
                except asyncio.QueueEmpty:
                    break
                if next_item is _SENTINEL:
                    sentinel_seen = True
                    break
                if isinstance(next_item, Event):
                    flush_events.append(next_item)
                    break
                assert isinstance(next_item, _SummaryItem)
                batch.append(next_item)

            await asyncio.gather(*[self._process_one(i) for i in batch])

            for event in flush_events:
                event.set()
            if sentinel_seen:
                break

    async def _process_one(self, item: _SummaryItem) -> None:
        try:
            otel_token = otel_context.attach(item.parent_otel_context) if item.parent_otel_context is not None else None
            laminar_token = laminar_context.attach_context(item.parent_laminar_context) if item.parent_laminar_context is not None else None
            try:
                summary = await self._generator(item.name, item.excerpt)
            finally:
                if laminar_token is not None:
                    laminar_context.detach_context(laminar_token)
                if otel_token is not None:
                    otel_context.detach(otel_token)
            if summary:
                await self._update_fn(item.run_scope, item.sha256, summary)
        except Exception as e:
            logger.warning(f"Summary generation failed for '{item.name}': {e}")
        finally:
            self._inflight.discard((item.run_scope, item.sha256))

    def schedule(self, run_scope: str, document: Document) -> None:
        """Schedule summary generation for a document. Thread-safe, non-blocking."""
        if self._loop is None or self._queue is None:
            return
        key = (run_scope, document.sha256)
        if key in self._inflight:
            return
        self._inflight.add(key)
        if document.is_text:
            excerpt = document.text[:SUMMARY_EXCERPT_CHARS]
        else:
            excerpt = f"[Binary document: {document.mime_type}, {len(document.content)} bytes]"
        item = _SummaryItem(
            run_scope=run_scope,
            sha256=document.sha256,
            name=document.name,
            excerpt=excerpt,
            parent_otel_context=otel_context.get_current(),
            parent_laminar_context=laminar_context.get_current_context(),
        )
        try:
            self._loop.call_soon_threadsafe(self._queue.put_nowait, item)
        except RuntimeError:
            self._inflight.discard(key)

    def flush(self, timeout: float = 60.0) -> None:
        """Block until all queued items are processed."""
        if self._loop is None or self._queue is None:
            return
        barrier = Event()
        try:
            self._loop.call_soon_threadsafe(self._queue.put_nowait, barrier)
        except RuntimeError:
            return
        if not barrier.wait(timeout=timeout):
            logger.warning("Summary worker flush timed out after %.0fs — some summaries may still be processing", timeout)

    def shutdown(self, timeout: float = 60.0) -> None:
        """Send stop sentinel and join the worker thread. Pending items are drained before stop."""
        if self._loop is not None and self._queue is not None:
            with contextlib.suppress(RuntimeError):
                self._loop.call_soon_threadsafe(self._queue.put_nowait, _SENTINEL)
        if self._thread is not None:
            self._thread.join(timeout=timeout)