ai-pipeline-core 0.3.4-py3-none-any.whl → 0.4.0-py3-none-any.whl

Files changed (96)
  1. ai_pipeline_core/__init__.py +64 -158
  2. ai_pipeline_core/deployment/__init__.py +6 -18
  3. ai_pipeline_core/deployment/base.py +392 -212
  4. ai_pipeline_core/deployment/contract.py +6 -10
  5. ai_pipeline_core/{utils → deployment}/deploy.py +50 -69
  6. ai_pipeline_core/deployment/helpers.py +16 -17
  7. ai_pipeline_core/{progress.py → deployment/progress.py} +23 -24
  8. ai_pipeline_core/{utils/remote_deployment.py → deployment/remote.py} +11 -14
  9. ai_pipeline_core/docs_generator/__init__.py +54 -0
  10. ai_pipeline_core/docs_generator/__main__.py +5 -0
  11. ai_pipeline_core/docs_generator/cli.py +196 -0
  12. ai_pipeline_core/docs_generator/extractor.py +324 -0
  13. ai_pipeline_core/docs_generator/guide_builder.py +644 -0
  14. ai_pipeline_core/docs_generator/trimmer.py +35 -0
  15. ai_pipeline_core/docs_generator/validator.py +114 -0
  16. ai_pipeline_core/document_store/__init__.py +13 -0
  17. ai_pipeline_core/document_store/_summary.py +9 -0
  18. ai_pipeline_core/document_store/_summary_worker.py +170 -0
  19. ai_pipeline_core/document_store/clickhouse.py +492 -0
  20. ai_pipeline_core/document_store/factory.py +38 -0
  21. ai_pipeline_core/document_store/local.py +312 -0
  22. ai_pipeline_core/document_store/memory.py +85 -0
  23. ai_pipeline_core/document_store/protocol.py +68 -0
  24. ai_pipeline_core/documents/__init__.py +12 -14
  25. ai_pipeline_core/documents/_context_vars.py +85 -0
  26. ai_pipeline_core/documents/_hashing.py +52 -0
  27. ai_pipeline_core/documents/attachment.py +85 -0
  28. ai_pipeline_core/documents/context.py +128 -0
  29. ai_pipeline_core/documents/document.py +318 -1434
  30. ai_pipeline_core/documents/mime_type.py +11 -84
  31. ai_pipeline_core/documents/utils.py +4 -12
  32. ai_pipeline_core/exceptions.py +10 -62
  33. ai_pipeline_core/images/__init__.py +32 -85
  34. ai_pipeline_core/images/_processing.py +5 -11
  35. ai_pipeline_core/llm/__init__.py +6 -4
  36. ai_pipeline_core/llm/ai_messages.py +102 -90
  37. ai_pipeline_core/llm/client.py +229 -183
  38. ai_pipeline_core/llm/model_options.py +12 -84
  39. ai_pipeline_core/llm/model_response.py +53 -99
  40. ai_pipeline_core/llm/model_types.py +8 -23
  41. ai_pipeline_core/logging/__init__.py +2 -7
  42. ai_pipeline_core/logging/logging.yml +1 -1
  43. ai_pipeline_core/logging/logging_config.py +27 -37
  44. ai_pipeline_core/logging/logging_mixin.py +15 -41
  45. ai_pipeline_core/observability/__init__.py +32 -0
  46. ai_pipeline_core/observability/_debug/__init__.py +30 -0
  47. ai_pipeline_core/observability/_debug/_auto_summary.py +94 -0
  48. ai_pipeline_core/{debug/config.py → observability/_debug/_config.py} +11 -7
  49. ai_pipeline_core/{debug/content.py → observability/_debug/_content.py} +133 -75
  50. ai_pipeline_core/{debug/processor.py → observability/_debug/_processor.py} +16 -17
  51. ai_pipeline_core/{debug/summary.py → observability/_debug/_summary.py} +113 -37
  52. ai_pipeline_core/observability/_debug/_types.py +75 -0
  53. ai_pipeline_core/{debug/writer.py → observability/_debug/_writer.py} +126 -196
  54. ai_pipeline_core/observability/_document_tracking.py +146 -0
  55. ai_pipeline_core/observability/_initialization.py +194 -0
  56. ai_pipeline_core/observability/_logging_bridge.py +57 -0
  57. ai_pipeline_core/observability/_summary.py +81 -0
  58. ai_pipeline_core/observability/_tracking/__init__.py +6 -0
  59. ai_pipeline_core/observability/_tracking/_client.py +178 -0
  60. ai_pipeline_core/observability/_tracking/_internal.py +28 -0
  61. ai_pipeline_core/observability/_tracking/_models.py +138 -0
  62. ai_pipeline_core/observability/_tracking/_processor.py +158 -0
  63. ai_pipeline_core/observability/_tracking/_service.py +311 -0
  64. ai_pipeline_core/observability/_tracking/_writer.py +229 -0
  65. ai_pipeline_core/{tracing.py → observability/tracing.py} +139 -335
  66. ai_pipeline_core/pipeline/__init__.py +10 -0
  67. ai_pipeline_core/pipeline/decorators.py +915 -0
  68. ai_pipeline_core/pipeline/options.py +16 -0
  69. ai_pipeline_core/prompt_manager.py +16 -102
  70. ai_pipeline_core/settings.py +26 -31
  71. ai_pipeline_core/testing.py +9 -0
  72. ai_pipeline_core-0.4.0.dist-info/METADATA +807 -0
  73. ai_pipeline_core-0.4.0.dist-info/RECORD +76 -0
  74. ai_pipeline_core/debug/__init__.py +0 -26
  75. ai_pipeline_core/documents/document_list.py +0 -420
  76. ai_pipeline_core/documents/flow_document.py +0 -112
  77. ai_pipeline_core/documents/task_document.py +0 -117
  78. ai_pipeline_core/documents/temporary_document.py +0 -74
  79. ai_pipeline_core/flow/__init__.py +0 -9
  80. ai_pipeline_core/flow/config.py +0 -494
  81. ai_pipeline_core/flow/options.py +0 -75
  82. ai_pipeline_core/pipeline.py +0 -718
  83. ai_pipeline_core/prefect.py +0 -63
  84. ai_pipeline_core/prompt_builder/__init__.py +0 -5
  85. ai_pipeline_core/prompt_builder/documents_prompt.jinja2 +0 -23
  86. ai_pipeline_core/prompt_builder/global_cache.py +0 -78
  87. ai_pipeline_core/prompt_builder/new_core_documents_prompt.jinja2 +0 -6
  88. ai_pipeline_core/prompt_builder/prompt_builder.py +0 -253
  89. ai_pipeline_core/prompt_builder/system_prompt.jinja2 +0 -41
  90. ai_pipeline_core/storage/__init__.py +0 -8
  91. ai_pipeline_core/storage/storage.py +0 -628
  92. ai_pipeline_core/utils/__init__.py +0 -8
  93. ai_pipeline_core-0.3.4.dist-info/METADATA +0 -569
  94. ai_pipeline_core-0.3.4.dist-info/RECORD +0 -57
  95. {ai_pipeline_core-0.3.4.dist-info → ai_pipeline_core-0.4.0.dist-info}/WHEEL +0 -0
  96. {ai_pipeline_core-0.3.4.dist-info → ai_pipeline_core-0.4.0.dist-info}/licenses/LICENSE +0 -0
ai_pipeline_core/observability/_document_tracking.py
@@ -0,0 +1,146 @@
+ """Document tracking helpers for pipeline instrumentation.
+
+ Emits document lifecycle events and sets OTel span attributes for
+ document lineage. All functions are no-ops when tracking is not initialized.
+ """
+
+ from typing import cast
+
+ from opentelemetry import trace as otel_trace
+
+ from ai_pipeline_core.documents import Document
+ from ai_pipeline_core.logging import get_pipeline_logger
+ from ai_pipeline_core.observability._initialization import TrackingServiceProtocol, get_tracking_service
+ from ai_pipeline_core.observability._tracking._models import ATTR_INPUT_DOCUMENT_SHA256S, ATTR_OUTPUT_DOCUMENT_SHA256S, DocumentEventType
+
+ logger = get_pipeline_logger(__name__)
+
+
+ def get_current_span_id() -> str:
+     """Return the current OTel span ID as hex, or empty string."""
+     span = otel_trace.get_current_span()
+     ctx = span.get_span_context()
+     if ctx and ctx.span_id:
+         return format(ctx.span_id, "016x")
+     return ""
+
+
+ def _get_tracking_service() -> TrackingServiceProtocol | None:
+     """Return the global tracking service, or None if not initialized."""
+     return get_tracking_service()
+
+
+ def track_task_io(task_name: str, args: tuple[object, ...], kwargs: dict[str, object], result: object) -> None:  # noqa: ARG001
+     """Track input/output documents for a pipeline task."""
+     service = _get_tracking_service()
+     if service is None:
+         return
+
+     span_id = get_current_span_id()
+     input_sha256s: list[str] = []
+     output_sha256s: list[str] = []
+
+     # Track input documents
+     for arg in (*args, *kwargs.values()):
+         if isinstance(arg, Document):
+             input_sha256s.append(arg.sha256)
+             service.track_document_event(
+                 document_sha256=arg.sha256,
+                 span_id=span_id,
+                 event_type=DocumentEventType.TASK_INPUT,
+             )
+         elif isinstance(arg, list) and arg and isinstance(arg[0], Document):
+             for doc in cast(list[Document], arg):
+                 input_sha256s.append(doc.sha256)
+                 service.track_document_event(
+                     document_sha256=doc.sha256,
+                     span_id=span_id,
+                     event_type=DocumentEventType.TASK_INPUT,
+                 )
+
+     # Track output documents
+     if isinstance(result, Document):
+         output_sha256s.append(result.sha256)
+         service.track_document_event(
+             document_sha256=result.sha256,
+             span_id=span_id,
+             event_type=DocumentEventType.TASK_OUTPUT,
+         )
+     elif isinstance(result, list) and result and isinstance(result[0], Document):
+         for doc in cast(list[Document], result):
+             output_sha256s.append(doc.sha256)
+             service.track_document_event(
+                 document_sha256=doc.sha256,
+                 span_id=span_id,
+                 event_type=DocumentEventType.TASK_OUTPUT,
+             )
+
+     # Set span attributes for TrackingSpanProcessor to populate tracked_spans columns
+     if input_sha256s or output_sha256s:
+         span = otel_trace.get_current_span()
+         if input_sha256s:
+             span.set_attribute(ATTR_INPUT_DOCUMENT_SHA256S, input_sha256s)
+         if output_sha256s:
+             span.set_attribute(ATTR_OUTPUT_DOCUMENT_SHA256S, output_sha256s)
+
+
+ def track_flow_io(flow_name: str, input_documents: list[Document], output_documents: list[Document]) -> None:  # noqa: ARG001
+     """Track input/output documents for a pipeline flow."""
+     service = _get_tracking_service()
+     if service is None:
+         return
+
+     span_id = get_current_span_id()
+     input_sha256s: list[str] = []
+     output_sha256s: list[str] = []
+
+     for doc in input_documents:
+         input_sha256s.append(doc.sha256)
+         service.track_document_event(
+             document_sha256=doc.sha256,
+             span_id=span_id,
+             event_type=DocumentEventType.FLOW_INPUT,
+         )
+
+     for doc in output_documents:
+         output_sha256s.append(doc.sha256)
+         service.track_document_event(
+             document_sha256=doc.sha256,
+             span_id=span_id,
+             event_type=DocumentEventType.FLOW_OUTPUT,
+         )
+
+     if input_sha256s or output_sha256s:
+         span = otel_trace.get_current_span()
+         if input_sha256s:
+             span.set_attribute(ATTR_INPUT_DOCUMENT_SHA256S, input_sha256s)
+         if output_sha256s:
+             span.set_attribute(ATTR_OUTPUT_DOCUMENT_SHA256S, output_sha256s)
+
+
+ def track_llm_documents(context: object | None, messages: object | None) -> None:
+     """Track documents used in LLM calls (context and messages)."""
+     service = _get_tracking_service()
+     if service is None:
+         return
+
+     span_id = get_current_span_id()
+
+     if context is not None:
+         _track_docs_from_messages(service, context, span_id, DocumentEventType.LLM_CONTEXT)
+
+     if messages is not None:
+         _track_docs_from_messages(service, messages, span_id, DocumentEventType.LLM_MESSAGE)
+
+
+ def _track_docs_from_messages(service: TrackingServiceProtocol, messages: object, span_id: str, event_type: DocumentEventType) -> None:
+     """Extract and track documents from AIMessages or similar containers."""
+     if not isinstance(messages, list):
+         return
+     for item in cast(list[object], messages):
+         if isinstance(item, Document):
+             service.track_document_event(
+                 document_sha256=item.sha256,
+                 span_id=span_id,
+                 event_type=event_type,
+             )
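Usage sketch (not part of the diff): the span attribution these helpers rely on can be exercised directly with the OpenTelemetry SDK. A minimal check of get_current_span_id(), assuming ai-pipeline-core and opentelemetry-sdk are installed:

from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider

from ai_pipeline_core.observability._document_tracking import get_current_span_id

trace.set_tracer_provider(TracerProvider())  # no-op (with a warning) if a provider is already set
tracer = trace.get_tracer(__name__)

print(get_current_span_id())  # "" outside any span
with tracer.start_as_current_span("demo"):
    print(get_current_span_id())  # 16-hex-char span id

Document events recorded inside that span carry the same span_id, which is how rows in document_events line up with input_document_sha256s and output_document_sha256s on tracked_spans.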
ai_pipeline_core/observability/_initialization.py
@@ -0,0 +1,194 @@
+ """Observability system initialization.
+
+ Provides ``initialize_observability()`` as the single entry point for
+ setting up Laminar and ClickHouse tracking.
+ """
+
+ import importlib
+ from typing import Any, Protocol
+ from uuid import UUID
+
+ from lmnr import Laminar
+ from opentelemetry import trace as otel_trace
+ from pydantic import BaseModel, ConfigDict
+
+ from ai_pipeline_core.logging import get_pipeline_logger
+ from ai_pipeline_core.observability._tracking._client import ClickHouseClient
+ from ai_pipeline_core.observability._tracking._models import DocumentEventType, RunStatus
+ from ai_pipeline_core.observability._tracking._processor import TrackingSpanProcessor
+ from ai_pipeline_core.observability._tracking._service import TrackingService
+ from ai_pipeline_core.settings import settings
+
+ logger = get_pipeline_logger(__name__)
+
+
+ class TrackingServiceProtocol(Protocol):
+     """Protocol for the tracking service methods used by deployment, decorators, and document tracking."""
+
+     # Run lifecycle
+     def set_run_context(self, *, run_id: UUID, project_name: str, flow_name: str, run_scope: str = "") -> None:
+         """Store run metadata in context vars for downstream span attribution."""
+         ...
+
+     def track_run_start(self, *, run_id: UUID, project_name: str, flow_name: str, run_scope: str = "") -> None:
+         """Record a pipeline run start event to ClickHouse."""
+         ...
+
+     def track_run_end(
+         self,
+         *,
+         run_id: UUID,
+         status: RunStatus,
+         total_cost: float = ...,
+         total_tokens: int = ...,
+         metadata: dict[str, object] | None = ...,
+     ) -> None:
+         """Record a pipeline run completion event with final metrics."""
+         ...
+
+     def clear_run_context(self) -> None:
+         """Reset run-scoped context vars after a run finishes."""
+         ...
+
+     # Document tracking
+     def track_document_event(
+         self,
+         *,
+         document_sha256: str,
+         span_id: str,
+         event_type: DocumentEventType,
+         metadata: dict[str, str] | None = ...,
+     ) -> None:
+         """Record a document lifecycle event (created, read, transformed)."""
+         ...
+
+     # Summaries
+     def schedule_summary(self, span_id: str, label: str, output_hint: str) -> None:
+         """Queue an LLM-generated summary for a span's output."""
+         ...
+
+     # Lifecycle
+     def flush(self, timeout: float = 30.0) -> None:
+         """Flush all pending tracking events to ClickHouse."""
+         ...
+
+     def shutdown(self, timeout: float = 30.0) -> None:
+         """Flush pending events and release tracking resources."""
+         ...
+
+
+ _tracking_service: TrackingServiceProtocol | None = None
+
+
+ def get_tracking_service() -> TrackingServiceProtocol | None:
+     """Return the global TrackingService instance, or None if not initialized."""
+     return _tracking_service
+
+
+ class ObservabilityConfig(BaseModel):
+     """Configuration for the observability system."""
+
+     model_config = ConfigDict(frozen=True)
+
+     # Laminar
+     lmnr_project_api_key: str = ""
+     lmnr_debug: str = ""
+
+     # ClickHouse tracking
+     clickhouse_host: str = ""
+     clickhouse_port: int = 8443
+     clickhouse_database: str = "default"
+     clickhouse_user: str = "default"
+     clickhouse_password: str = ""
+     clickhouse_secure: bool = True
+
+     # Tracking behavior
+     tracking_enabled: bool = True
+     tracking_summary_model: str = "gemini-3-flash"
+
+     @property
+     def has_clickhouse(self) -> bool:
+         """Whether ClickHouse is configured."""
+         return bool(self.clickhouse_host)
+
+     @property
+     def has_lmnr(self) -> bool:
+         """Whether Laminar is configured."""
+         return bool(self.lmnr_project_api_key)
+
+
+ def _build_config_from_settings() -> ObservabilityConfig:
+     """Build ObservabilityConfig from framework Settings."""
+     return ObservabilityConfig(
+         lmnr_project_api_key=getattr(settings, "lmnr_project_api_key", ""),
+         lmnr_debug=getattr(settings, "lmnr_debug", ""),
+         clickhouse_host=getattr(settings, "clickhouse_host", ""),
+         clickhouse_port=getattr(settings, "clickhouse_port", 8443),
+         clickhouse_database=getattr(settings, "clickhouse_database", "default"),
+         clickhouse_user=getattr(settings, "clickhouse_user", "default"),
+         clickhouse_password=getattr(settings, "clickhouse_password", ""),
+         clickhouse_secure=getattr(settings, "clickhouse_secure", True),
+         tracking_enabled=getattr(settings, "tracking_enabled", True),
+         tracking_summary_model=getattr(settings, "tracking_summary_model", "gemini-3-flash"),
+     )
+
+
+ def _setup_tracking(config: ObservabilityConfig) -> TrackingServiceProtocol | None:
+     """Set up ClickHouse tracking if configured. Returns TrackingService or None."""
+     if not config.has_clickhouse or not config.tracking_enabled:
+         return None
+
+     client = ClickHouseClient(
+         host=config.clickhouse_host,
+         port=config.clickhouse_port,
+         database=config.clickhouse_database,
+         username=config.clickhouse_user,
+         password=config.clickhouse_password,
+         secure=config.clickhouse_secure,
+     )
+     summary_mod = importlib.import_module("ai_pipeline_core.observability._summary")
+     service = TrackingService(
+         client,
+         summary_model=config.tracking_summary_model,
+         span_summary_fn=summary_mod.generate_span_summary,
+     )
+
+     # Register span processor with OTel
+     try:
+         provider: Any = otel_trace.get_tracer_provider()
+         if hasattr(provider, "add_span_processor"):
+             processor = TrackingSpanProcessor(service)
+             provider.add_span_processor(processor)
+             logger.info("ClickHouse tracking initialized")
+     except Exception as e:
+         logger.warning(f"Failed to register TrackingSpanProcessor: {e}")
+
+     return service
+
+
+ def initialize_observability(config: ObservabilityConfig | None = None) -> None:
+     """Initialize the full observability stack.
+
+     Call once at pipeline startup. Safe to call multiple times (idempotent
+     for Laminar). Reads from Settings if no config provided.
+     """
+     global _tracking_service  # noqa: PLW0603
+
+     if _tracking_service is not None:
+         return  # Already initialized
+
+     if config is None:
+         config = _build_config_from_settings()
+
+     # 1. Laminar
+     if config.has_lmnr:
+         try:
+             Laminar.initialize(project_api_key=config.lmnr_project_api_key, export_timeout_seconds=15)
+             logger.info("Laminar initialized")
+         except Exception as e:
+             logger.warning(f"Laminar initialization failed: {e}")
+
+     # 2. ClickHouse tracking
+     _tracking_service = _setup_tracking(config)
+
+     # 3. Logging bridge — attached per-logger in get_pipeline_logger(), nothing to do here.
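Usage sketch (not part of the diff): wiring this up at process start, passing config explicitly instead of relying on env-based Settings. The host and password are placeholders; with no lmnr_project_api_key the Laminar step is skipped, and with no ClickHouse host the tracking service stays None.

from ai_pipeline_core.observability._initialization import (
    ObservabilityConfig,
    get_tracking_service,
    initialize_observability,
)

initialize_observability(
    ObservabilityConfig(
        clickhouse_host="clickhouse.internal.example",  # placeholder
        clickhouse_password="...",                      # placeholder
    )
)

service = get_tracking_service()
if service is not None:
    # The ClickHouse connection is lazy (writer thread), so connectivity
    # problems surface on flush/shutdown rather than at initialization.
    service.flush(timeout=10.0)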
ai_pipeline_core/observability/_logging_bridge.py
@@ -0,0 +1,57 @@
+ """Logging bridge — captures Python log records as OTel span events.
+
+ Attaches a singleton ``SpanEventLoggingHandler`` to every logger created
+ via ``get_pipeline_logger()``. The handler is safe to attach eagerly
+ because ``emit()`` is a no-op when no OTel span is recording.
+
+ This is the only module that legitimately needs ``import logging`` directly
+ to subclass ``logging.Handler``. The ruff ban on ``import logging``
+ (pyproject.toml) is suppressed with ``# noqa: TID251``.
+ """
+
+ import contextlib
+ import logging  # noqa: TID251
+
+ from opentelemetry import trace as otel_trace
+
+ _MIN_LEVEL = logging.INFO
+
+
+ class SpanEventLoggingHandler(logging.Handler):
+     """Logging handler that writes log records as OTel span events.
+
+     Attached to each logger returned by ``get_pipeline_logger()``.
+     Only captures records at INFO level and above. Each record becomes
+     a span event with ``log.level`` and ``log.message`` attributes.
+     """
+
+     def __init__(self) -> None:
+         super().__init__(level=_MIN_LEVEL)
+
+     def emit(self, record: logging.LogRecord) -> None:
+         """Write a log record as an OTel span event."""
+         with contextlib.suppress(Exception):
+             # Prevent duplicate events when handler is on both parent and child logger
+             if getattr(record, "_span_event_logged", False):
+                 return
+             span = otel_trace.get_current_span()
+             if not span.is_recording():
+                 return
+             span.add_event(
+                 name="log",
+                 attributes={
+                     "log.level": record.levelname,
+                     "log.message": self.format(record),
+                     "log.logger": record.name,
+                 },
+             )
+             record._span_event_logged = True
+
+
+ # Module-level singleton — safe because emit() checks is_recording().
+ _bridge_handler = SpanEventLoggingHandler()
+
+
+ def get_bridge_handler() -> SpanEventLoggingHandler:
+     """Return the singleton bridge handler for attaching to pipeline loggers."""
+     return _bridge_handler
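Usage sketch (not part of the diff): get_pipeline_logger() already attaches the handler, so manual wiring is only needed for loggers created outside the framework; "my_app" below is a hypothetical logger name.

import logging

from ai_pipeline_core.observability._logging_bridge import get_bridge_handler

log = logging.getLogger("my_app")
log.setLevel(logging.INFO)
log.addHandler(get_bridge_handler())  # singleton, harmless to attach eagerly

log.info("bridged to a span event only while a span is recording")
log.debug("never bridged: the handler floor is INFO")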
ai_pipeline_core/observability/_summary.py
@@ -0,0 +1,81 @@
+ """LLM-powered summary generation for tracked spans and documents."""
+
+ from pydantic import BaseModel, Field
+
+ from ai_pipeline_core.llm import generate_structured
+ from ai_pipeline_core.llm.model_options import ModelOptions
+ from ai_pipeline_core.logging import get_pipeline_logger
+ from ai_pipeline_core.observability._tracking._internal import internal_tracking_context
+
+ logger = get_pipeline_logger(__name__)
+
+ _SPAN_SUMMARY_SYSTEM_PROMPT = (
+     "You summarize AI pipeline task results for non-technical users "
+     "monitoring a research pipeline.\n"
+     "Rules:\n"
+     "- Describe the action and outcome, not the content\n"
+     "- No internal names, function names, or technical details\n"
+     "- No sensitive data (URLs, personal names, company details) from the output\n"
+     "- Use present perfect tense"
+ )
+
+ _DOC_SUMMARY_SYSTEM_PROMPT = (
+     "You generate metadata for documents in a research pipeline dashboard.\n"
+     "Rules:\n"
+     "- No sensitive data (URLs, personal names, company details)\n"
+     "- Describe purpose and content type, not the content itself\n"
+     "- For website documents: short_title must be 'domain.com: Page Title' (shorten title if needed to fit 50 chars)"
+ )
+
+
+ class SpanSummary(BaseModel):
+     """Structured output for span/task summaries."""
+
+     summary: str = Field(description="1-2 sentences (max 50 words) describing what the task accomplished in present perfect tense")
+
+
+ class DocumentSummary(BaseModel):
+     """Structured output for document summaries."""
+
+     short_title: str = Field(description="Document title proposition based on content, max 50 characters")
+     summary: str = Field(description="1-2 sentences (max 50 words) describing the document's purpose and content type")
+
+
+ async def generate_span_summary(label: str, output_hint: str, model: str = "gemini-3-flash") -> str:
+     """Generate a human-readable summary for a span/task output.
+
+     Returns plain summary string (stored in tracked_spans.user_summary).
+     """
+     try:
+         with internal_tracking_context():
+             result = await generate_structured(
+                 model=model,
+                 response_format=SpanSummary,
+                 messages=f"Task: {label}\nResult: {output_hint}",
+                 options=ModelOptions(system_prompt=_SPAN_SUMMARY_SYSTEM_PROMPT, cache_ttl=None, retries=3, timeout=30),
+                 purpose=f"span_summary: {label}",
+             )
+         return result.parsed.summary
+     except Exception as e:
+         logger.warning(f"Span summary failed for '{label}': {e}")
+         return ""
+
+
+ async def generate_document_summary(name: str, excerpt: str, model: str = "gemini-3-flash") -> str:
+     """Generate structured metadata for a document.
+
+     Returns JSON-serialized DocumentSummary (stored in document_index.summary).
+     """
+     try:
+         with internal_tracking_context():
+             result = await generate_structured(
+                 model=model,
+                 response_format=DocumentSummary,
+                 messages=f"Document: {name}\nContent excerpt:\n{excerpt}",
+                 options=ModelOptions(system_prompt=_DOC_SUMMARY_SYSTEM_PROMPT, cache_ttl=None, retries=3, timeout=30),
+                 purpose=f"document_summary: {name}",
+             )
+         return result.parsed.model_dump_json()
+     except Exception as e:
+         logger.warning(f"Document summary failed for '{name}': {e}")
+         return ""
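Usage sketch (not part of the diff): calling the span summarizer directly. It needs a configured LLM backend; per the except branch above, any failure degrades to an empty string instead of raising. The label and output hint are hypothetical values.

import asyncio

from ai_pipeline_core.observability._summary import generate_span_summary

async def main() -> None:
    summary = await generate_span_summary(
        label="fetch_sources",
        output_hint="12 pages fetched, 2 skipped",
    )
    print(summary or "<no summary: LLM unavailable>")

asyncio.run(main())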
ai_pipeline_core/observability/_tracking/__init__.py
@@ -0,0 +1,6 @@
+ """ClickHouse-based tracking system for pipeline observability.
+
+ Import from submodules directly:
+ - ``from ai_pipeline_core.observability._tracking._models import ...``
+ - ``from ai_pipeline_core.observability._tracking._service import ...``
+ """
ai_pipeline_core/observability/_tracking/_client.py
@@ -0,0 +1,178 @@
+ """ClickHouse client with lazy connection and table management."""
+
+ import clickhouse_connect
+ from pydantic import BaseModel
+
+ from ai_pipeline_core.logging import get_pipeline_logger
+
+ from ._models import (
+     TABLE_DOCUMENT_EVENTS,
+     TABLE_PIPELINE_RUNS,
+     TABLE_SPAN_EVENTS,
+     TABLE_TRACKED_SPANS,
+     TrackedSpanRow,
+ )
+
+ logger = get_pipeline_logger(__name__)
+
+ # SQL statements for table creation
+ _CREATE_TABLES_SQL = [
+     f"""
+     CREATE TABLE IF NOT EXISTS {TABLE_PIPELINE_RUNS}
+     (
+         run_id UUID,
+         project_name LowCardinality(String),
+         flow_name LowCardinality(String),
+         run_scope String DEFAULT '',
+         status LowCardinality(String),
+         start_time DateTime64(3, 'UTC'),
+         end_time Nullable(DateTime64(3, 'UTC')),
+         total_cost Float64 DEFAULT 0,
+         total_tokens UInt64 DEFAULT 0,
+         metadata String DEFAULT '{{}}' CODEC(ZSTD(3)),
+         version UInt64 DEFAULT 1
+     )
+     ENGINE = ReplacingMergeTree(version)
+     PARTITION BY toYYYYMM(start_time)
+     ORDER BY (run_id)
+     SETTINGS index_granularity = 8192
+     """,
+     f"""
+     CREATE TABLE IF NOT EXISTS {TABLE_TRACKED_SPANS}
+     (
+         span_id String,
+         trace_id String,
+         run_id UUID,
+         parent_span_id Nullable(String),
+         name String,
+         span_type LowCardinality(String),
+         status LowCardinality(String),
+         start_time DateTime64(3, 'UTC'),
+         end_time Nullable(DateTime64(3, 'UTC')),
+         duration_ms UInt64 DEFAULT 0,
+         cost Float64 DEFAULT 0,
+         tokens_input UInt64 DEFAULT 0,
+         tokens_output UInt64 DEFAULT 0,
+         llm_model LowCardinality(Nullable(String)),
+         user_summary Nullable(String) CODEC(ZSTD(3)),
+         user_visible Bool DEFAULT false,
+         user_label Nullable(String),
+         input_document_sha256s Array(String),
+         output_document_sha256s Array(String),
+         version UInt64 DEFAULT 1,
+         INDEX idx_trace trace_id TYPE bloom_filter GRANULARITY 1
+     )
+     ENGINE = ReplacingMergeTree(version)
+     PARTITION BY toYYYYMM(start_time)
+     ORDER BY (run_id, span_id)
+     SETTINGS index_granularity = 8192
+     """,
+     f"""
+     CREATE TABLE IF NOT EXISTS {TABLE_DOCUMENT_EVENTS}
+     (
+         event_id UUID,
+         run_id UUID,
+         document_sha256 String,
+         span_id String,
+         event_type LowCardinality(String),
+         timestamp DateTime64(3, 'UTC'),
+         metadata String DEFAULT '{{}}' CODEC(ZSTD(3))
+     )
+     ENGINE = MergeTree
+     PARTITION BY toYYYYMM(timestamp)
+     ORDER BY (run_id, document_sha256, timestamp)
+     SETTINGS index_granularity = 8192
+     """,
+     f"""
+     CREATE TABLE IF NOT EXISTS {TABLE_SPAN_EVENTS}
+     (
+         event_id UUID,
+         run_id UUID,
+         span_id String,
+         name String,
+         timestamp DateTime64(3, 'UTC'),
+         attributes String DEFAULT '{{}}' CODEC(ZSTD(3)),
+         level LowCardinality(Nullable(String))
+     )
+     ENGINE = MergeTree
+     PARTITION BY toYYYYMM(timestamp)
+     ORDER BY (run_id, span_id, timestamp)
+     SETTINGS index_granularity = 8192
+     """,
+ ]
+
+
+ class ClickHouseClient:
+     """Synchronous ClickHouse client with lazy connection.
+
+     All methods are synchronous and must be called from the writer background
+     thread — never from the async event loop. Connection is deferred to
+     ``connect()`` which is called from the writer thread's ``_run()`` startup.
+     """
+
+     def __init__(
+         self,
+         *,
+         host: str,
+         port: int = 8443,
+         database: str = "default",
+         username: str = "default",
+         password: str = "",
+         secure: bool = True,
+     ) -> None:
+         """Store connection params. Does NOT connect yet."""
+         self._params = {
+             "host": host,
+             "port": port,
+             "database": database,
+             "username": username,
+             "password": password,
+             "secure": secure,
+         }
+         self._client: object | None = None
+         self._tables_initialized = False
+
+     def connect(self) -> None:
+         """Connect to ClickHouse. Call from writer thread, not async context."""
+         self._client = clickhouse_connect.get_client(**self._params)  # pyright: ignore[reportArgumentType, reportUnknownMemberType]
+         logger.info(f"Connected to ClickHouse at {self._params['host']}:{self._params['port']}")
+
+     def ensure_tables(self) -> None:
+         """Create tables if they don't exist. Call after connect()."""
+         if self._client is None:
+             raise RuntimeError("Not connected — call connect() first")
+         if self._tables_initialized:
+             return
+         for sql in _CREATE_TABLES_SQL:
+             self._client.command(sql)  # type: ignore[union-attr]
+
+         self._tables_initialized = True
+         logger.info("ClickHouse tables verified/created")
+
+     def _insert_rows(self, table: str, rows: list[BaseModel]) -> None:
+         """Insert rows into a table using columnar format."""
+         if not rows or self._client is None:
+             return
+         column_names = list(type(rows[0]).model_fields.keys())
+         data = [[getattr(row, col) for row in rows] for col in column_names]
+         self._client.insert(table, data, column_names=column_names, column_oriented=True)  # type: ignore[union-attr]
+
+     def insert_runs(self, rows: list[BaseModel]) -> None:
+         """Insert pipeline run rows."""
+         self._insert_rows(TABLE_PIPELINE_RUNS, rows)
+
+     def insert_spans(self, rows: list[BaseModel]) -> None:
+         """Insert tracked span rows."""
+         self._insert_rows(TABLE_TRACKED_SPANS, rows)
+
+     def insert_document_events(self, rows: list[BaseModel]) -> None:
+         """Insert document event rows."""
+         self._insert_rows(TABLE_DOCUMENT_EVENTS, rows)
+
+     def insert_span_events(self, rows: list[BaseModel]) -> None:
+         """Insert span event rows."""
+         self._insert_rows(TABLE_SPAN_EVENTS, rows)
+
+     def update_span(self, row: TrackedSpanRow) -> None:
+         """Insert a single replacement span row (ReplacingMergeTree update)."""
+         self.insert_spans([row])
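For reference, the column-oriented layout that _insert_rows builds, shown with a stand-in pydantic model (the real row models live in _models.py and are not reproduced here); this snippet runs with pydantic alone:

from pydantic import BaseModel

class DemoRow(BaseModel):  # hypothetical row model for illustration
    run_id: str
    name: str

rows = [DemoRow(run_id="r1", name="a"), DemoRow(run_id="r1", name="b")]
column_names = list(DemoRow.model_fields.keys())  # ["run_id", "name"]
data = [[getattr(row, col) for row in rows] for col in column_names]
assert data == [["r1", "r1"], ["a", "b"]]  # one inner list per column, not per row
# client.insert(table, data, column_names=column_names, column_oriented=True)

Column-oriented inserts let clickhouse-connect skip the row-to-column transposition it would otherwise do internally.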
ai_pipeline_core/observability/_tracking/_internal.py
@@ -0,0 +1,28 @@
+ """Thread-local flag to prevent tracking recursion.
+
+ When summary generation calls ``llm.generate()``, the resulting span must NOT
+ be tracked again (infinite loop). The flag is checked by
+ ``TrackingSpanProcessor.on_end()``.
+ """
+
+ import threading
+ from collections.abc import Generator
+ from contextlib import contextmanager
+
+ _internal = threading.local()
+
+
+ def is_internal_tracking() -> bool:
+     """Return True if the current thread is inside a tracking-internal LLM call."""
+     return getattr(_internal, "active", False)
+
+
+ @contextmanager
+ def internal_tracking_context() -> Generator[None, None, None]:
+     """Mark the current thread as performing internal tracking work."""
+     prev = getattr(_internal, "active", False)
+     _internal.active = True
+     try:
+         yield
+     finally:
+         _internal.active = prev
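Usage sketch (not part of the diff): the guard is re-entrant and restores the previous value on exit, so nested summary calls behave correctly:

from ai_pipeline_core.observability._tracking._internal import (
    internal_tracking_context,
    is_internal_tracking,
)

assert is_internal_tracking() is False
with internal_tracking_context():
    assert is_internal_tracking() is True       # spans created here are skipped
    with internal_tracking_context():
        assert is_internal_tracking() is True   # nesting keeps the flag set
assert is_internal_tracking() is False          # restored after exit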