ai-pipeline-core 0.1.13__tar.gz → 0.2.0__tar.gz
This diff compares the contents of two publicly released versions of this package, as published to a supported public registry. It is provided for informational purposes only.
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.2.0}/.gitignore +1 -0
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.2.0}/PKG-INFO +60 -23
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.2.0}/README.md +58 -22
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.2.0}/ai_pipeline_core/__init__.py +25 -14
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.2.0}/ai_pipeline_core/documents/__init__.py +2 -1
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.2.0}/ai_pipeline_core/documents/document.py +317 -49
- ai_pipeline_core-0.2.0/ai_pipeline_core/documents/document_list.py +343 -0
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.2.0}/ai_pipeline_core/documents/flow_document.py +8 -29
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.2.0}/ai_pipeline_core/documents/task_document.py +6 -27
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.2.0}/ai_pipeline_core/documents/temporary_document.py +6 -27
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.2.0}/ai_pipeline_core/documents/utils.py +64 -1
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.2.0}/ai_pipeline_core/flow/config.py +174 -5
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.2.0}/ai_pipeline_core/flow/options.py +2 -2
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.2.0}/ai_pipeline_core/llm/__init__.py +6 -1
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.2.0}/ai_pipeline_core/llm/ai_messages.py +14 -7
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.2.0}/ai_pipeline_core/llm/client.py +143 -55
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.2.0}/ai_pipeline_core/llm/model_options.py +20 -5
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.2.0}/ai_pipeline_core/llm/model_response.py +77 -29
- ai_pipeline_core-0.2.0/ai_pipeline_core/llm/model_types.py +82 -0
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.2.0}/ai_pipeline_core/logging/__init__.py +0 -2
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.2.0}/ai_pipeline_core/logging/logging_config.py +0 -6
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.2.0}/ai_pipeline_core/logging/logging_mixin.py +2 -10
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.2.0}/ai_pipeline_core/pipeline.py +68 -65
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.2.0}/ai_pipeline_core/prefect.py +12 -3
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.2.0}/ai_pipeline_core/prompt_manager.py +6 -7
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.2.0}/ai_pipeline_core/settings.py +13 -5
- ai_pipeline_core-0.2.0/ai_pipeline_core/simple_runner/__init__.py +14 -0
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.2.0}/ai_pipeline_core/simple_runner/cli.py +13 -12
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.2.0}/ai_pipeline_core/simple_runner/simple_runner.py +34 -172
- ai_pipeline_core-0.2.0/ai_pipeline_core/storage/__init__.py +8 -0
- ai_pipeline_core-0.2.0/ai_pipeline_core/storage/storage.py +628 -0
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.2.0}/ai_pipeline_core/tracing.py +110 -26
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.2.0}/pyproject.toml +4 -2
- ai_pipeline_core-0.1.13/ai_pipeline_core/documents/document_list.py +0 -240
- ai_pipeline_core-0.1.13/ai_pipeline_core/llm/model_types.py +0 -84
- ai_pipeline_core-0.1.13/ai_pipeline_core/simple_runner/__init__.py +0 -24
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.2.0}/LICENSE +0 -0
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.2.0}/ai_pipeline_core/documents/mime_type.py +0 -0
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.2.0}/ai_pipeline_core/exceptions.py +0 -0
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.2.0}/ai_pipeline_core/flow/__init__.py +0 -0
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.2.0}/ai_pipeline_core/logging/logging.yml +0 -0
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.2.0}/ai_pipeline_core/py.typed +0 -0
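
The most consequential change in this release is that `@pipeline_flow` now takes the flow's `FlowConfig` as a decorator argument instead of the flow instantiating its config internally (see the README and PKG-INFO hunks below). A minimal before/after sketch, reusing the `InputDoc`/`OutputDoc`/`AnalysisConfig` names from the README example; the loop body is illustrative, not code shipped in the package:

```python
from ai_pipeline_core import (
    DocumentList,
    FlowConfig,
    FlowDocument,
    FlowOptions,
    pipeline_flow,
)


class InputDoc(FlowDocument):
    """Input document (illustrative)."""


class OutputDoc(FlowDocument):
    """Output document (illustrative)."""


class AnalysisConfig(FlowConfig):
    INPUT_DOCUMENT_TYPES = [InputDoc]
    OUTPUT_DOCUMENT_TYPE = OutputDoc  # must differ from every input type


# 0.1.13: `@pipeline_flow` was applied bare and the flow built its own
# `AnalysisConfig()` instance. 0.2.0: the config is passed to the decorator.
@pipeline_flow(config=AnalysisConfig)
async def analyze_flow(
    project_name: str,
    documents: DocumentList,
    flow_options: FlowOptions,
) -> DocumentList:
    outputs = [OutputDoc.create(name=doc.name, content=doc.text) for doc in documents]
    return AnalysisConfig.create_and_validate_output(outputs)
```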
--- ai_pipeline_core-0.1.13/PKG-INFO
+++ ai_pipeline_core-0.2.0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ai-pipeline-core
-Version: 0.1.13
+Version: 0.2.0
 Summary: Core utilities for AI-powered processing pipelines using prefect
 Project-URL: Homepage, https://github.com/bbarwik/ai-pipeline-core
 Project-URL: Repository, https://github.com/bbarwik/ai-pipeline-core
@@ -22,6 +22,7 @@ Requires-Dist: httpx>=0.28.1
 Requires-Dist: jinja2>=3.1.6
 Requires-Dist: lmnr>=0.7.6
 Requires-Dist: openai>=1.99.9
+Requires-Dist: prefect-gcp[cloud-storage]>=0.6.10
 Requires-Dist: prefect>=3.4.13
 Requires-Dist: pydantic-settings>=2.10.1
 Requires-Dist: pydantic>=2.11.7
@@ -57,11 +58,11 @@ AI Pipeline Core is a production-ready framework that combines document processi
 
 ### Key Features
 
-- **Document Processing**: Type-safe handling of text, JSON, YAML, PDFs, and images with automatic MIME type detection
-- **LLM Integration**: Unified interface to any model via LiteLLM proxy with
+- **Document Processing**: Type-safe handling of text, JSON, YAML, PDFs, and images with automatic MIME type detection and provenance tracking
+- **LLM Integration**: Unified interface to any model via LiteLLM proxy with configurable context caching
 - **Structured Output**: Type-safe generation with Pydantic model validation
 - **Workflow Orchestration**: Prefect-based flows and tasks with automatic retries
-- **Observability**: Built-in distributed tracing via Laminar (LMNR) for debugging and monitoring
+- **Observability**: Built-in distributed tracing via Laminar (LMNR) with cost tracking for debugging and monitoring
 - **Local Development**: Simple runner for testing pipelines without infrastructure
 
 ## Installation
@@ -111,15 +112,13 @@ class AnalysisConfig(FlowConfig):
     INPUT_DOCUMENT_TYPES = [InputDoc]
     OUTPUT_DOCUMENT_TYPE = OutputDoc
 
-# Create pipeline flow
-@pipeline_flow
+# Create pipeline flow with required config
+@pipeline_flow(config=AnalysisConfig)
 async def analyze_flow(
     project_name: str,
     documents: DocumentList,
     flow_options: FlowOptions
 ) -> DocumentList:
-    config = AnalysisConfig()
-
     # Process documents
     outputs = []
     for doc in documents:
@@ -136,7 +135,7 @@ async def analyze_flow(
         outputs.append(output)
 
     # RECOMMENDED: Always validate output
-    return
+    return AnalysisConfig.create_and_validate_output(outputs)
 ```
 
 ### Structured Output
@@ -178,6 +177,19 @@ doc = MyDocument.create(
 # Parse back to original type
 data = doc.parse(dict) # Returns {"key": "value"}
 
+# Document provenance tracking (new in v0.1.14)
+doc_with_sources = MyDocument.create(
+    name="derived.json",
+    content={"result": "processed"},
+    sources=[source_doc.sha256, "https://api.example.com/data"]
+)
+
+# Check provenance
+for hash in doc_with_sources.get_source_documents():
+    print(f"Derived from document: {hash}")
+for ref in doc_with_sources.get_source_references():
+    print(f"External source: {ref}")
+
 # Temporary documents (never persisted)
 temp = TemporaryDocument.create(
     name="api_response.json",
@@ -211,6 +223,10 @@ if doc.is_text:
 
 # Parse structured data
 data = doc.as_json() # or as_yaml(), as_pydantic_model()
+
+# Enhanced filtering (new in v0.1.14)
+filtered = documents.filter_by([Doc1, Doc2, Doc3]) # Multiple types
+named = documents.filter_by(["file1.txt", "file2.txt"]) # Multiple names
 ```
 
 ### LLM Integration
@@ -233,7 +249,7 @@ static_context = AIMessages([large_document])
 # First call: caches context
 r1 = await llm.generate(
     model="gpt-5",
-    context=static_context, # Cached for 120 seconds
+    context=static_context, # Cached for 120 seconds by default
     messages="Summarize" # Dynamic query
 )
 
@@ -243,6 +259,22 @@ r2 = await llm.generate(
     context=static_context, # Reused from cache!
     messages="Key points?" # Different query
 )
+
+# Custom cache TTL (new in v0.1.14)
+response = await llm.generate(
+    model="gpt-5",
+    context=static_context,
+    messages="Analyze",
+    options=ModelOptions(cache_ttl="300s") # Cache for 5 minutes
+)
+
+# Disable caching for dynamic contexts
+response = await llm.generate(
+    model="gpt-5",
+    context=dynamic_context,
+    messages="Process",
+    options=ModelOptions(cache_ttl=None) # No caching
+)
 ```
 
 ### Flow Configuration
@@ -256,15 +288,15 @@ class ProcessingConfig(FlowConfig):
     INPUT_DOCUMENT_TYPES = [RawDataDocument]
     OUTPUT_DOCUMENT_TYPE = ProcessedDocument # Must be different!
 
-
-
-
-
-
-
-
-
-
+# Use in flows for validation
+@pipeline_flow(config=ProcessingConfig)
+async def process(
+    project_name: str,
+    documents: DocumentList,
+    flow_options: FlowOptions
+) -> DocumentList:
+    # ... processing logic ...
+    return ProcessingConfig.create_and_validate_output(outputs)
 ```
 
 ### Pipeline Decorators
@@ -272,13 +304,15 @@ class ProcessingConfig(FlowConfig):
 Enhanced decorators with built-in tracing and monitoring:
 
 ```python
-from ai_pipeline_core import pipeline_flow, pipeline_task
+from ai_pipeline_core import pipeline_flow, pipeline_task, set_trace_cost
 
 @pipeline_task # Automatic retry, tracing, and monitoring
 async def process_chunk(data: str) -> str:
-
+    result = await transform(data)
+    set_trace_cost(0.05) # Track costs (new in v0.1.14)
+    return result
 
-@pipeline_flow # Full observability and orchestration
+@pipeline_flow(config=MyFlowConfig) # Full observability and orchestration
 async def main_flow(
     project_name: str,
     documents: DocumentList,
@@ -304,6 +338,9 @@ LMNR_DEBUG=true # Enable debug traces
 # Optional: Orchestration
 PREFECT_API_URL=http://localhost:4200/api
 PREFECT_API_KEY=your-prefect-key
+
+# Optional: Storage (for Google Cloud Storage)
+GCS_SERVICE_ACCOUNT_FILE=/path/to/service-account.json # GCS auth file
 ```
 
 ### Settings Management
@@ -331,7 +368,7 @@ print(settings.app_name)
 
 ### Framework Rules (90% Use Cases)
 
-1. **Decorators**: Use `@
+1. **Decorators**: Use `@pipeline_task` WITHOUT parameters, `@pipeline_flow` WITH config
 2. **Logging**: Use `get_pipeline_logger(__name__)` - NEVER `print()` or `logging` module
 3. **LLM calls**: Use `AIMessages` or `str`. Wrap Documents in `AIMessages`
 4. **Options**: Omit `ModelOptions` unless specifically needed (defaults are optimal)
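
Taken together, the provenance additions above compose as in the following sketch. It uses only calls shown in the diff (`create(..., sources=...)`, the `sha256` property, `get_source_documents()`, `get_source_references()`); `ReportDoc` and the URL are illustrative placeholders:

```python
from ai_pipeline_core import FlowDocument


class ReportDoc(FlowDocument):
    """Derived report document (illustrative)."""


raw = ReportDoc.create(name="raw.json", content={"rows": [1, 2, 3]})

# `sources` may mix sha256 hashes of other documents with external references.
report = ReportDoc.create(
    name="report.json",
    content={"summary": "3 rows"},
    sources=[raw.sha256, "https://api.example.com/data"],
)

assert raw.sha256 in report.get_source_documents()  # document hashes only
assert "https://api.example.com/data" in report.get_source_references()  # external refs
```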
--- ai_pipeline_core-0.1.13/README.md
+++ ai_pipeline_core-0.2.0/README.md
@@ -13,11 +13,11 @@ AI Pipeline Core is a production-ready framework that combines document processi
 
 ### Key Features
 
-- **Document Processing**: Type-safe handling of text, JSON, YAML, PDFs, and images with automatic MIME type detection
-- **LLM Integration**: Unified interface to any model via LiteLLM proxy with
+- **Document Processing**: Type-safe handling of text, JSON, YAML, PDFs, and images with automatic MIME type detection and provenance tracking
+- **LLM Integration**: Unified interface to any model via LiteLLM proxy with configurable context caching
 - **Structured Output**: Type-safe generation with Pydantic model validation
 - **Workflow Orchestration**: Prefect-based flows and tasks with automatic retries
-- **Observability**: Built-in distributed tracing via Laminar (LMNR) for debugging and monitoring
+- **Observability**: Built-in distributed tracing via Laminar (LMNR) with cost tracking for debugging and monitoring
 - **Local Development**: Simple runner for testing pipelines without infrastructure
 
 ## Installation
@@ -67,15 +67,13 @@ class AnalysisConfig(FlowConfig):
     INPUT_DOCUMENT_TYPES = [InputDoc]
     OUTPUT_DOCUMENT_TYPE = OutputDoc
 
-# Create pipeline flow
-@pipeline_flow
+# Create pipeline flow with required config
+@pipeline_flow(config=AnalysisConfig)
 async def analyze_flow(
     project_name: str,
     documents: DocumentList,
     flow_options: FlowOptions
 ) -> DocumentList:
-    config = AnalysisConfig()
-
     # Process documents
     outputs = []
     for doc in documents:
@@ -92,7 +90,7 @@ async def analyze_flow(
         outputs.append(output)
 
     # RECOMMENDED: Always validate output
-    return
+    return AnalysisConfig.create_and_validate_output(outputs)
 ```
 
 ### Structured Output
@@ -134,6 +132,19 @@ doc = MyDocument.create(
 # Parse back to original type
 data = doc.parse(dict) # Returns {"key": "value"}
 
+# Document provenance tracking (new in v0.1.14)
+doc_with_sources = MyDocument.create(
+    name="derived.json",
+    content={"result": "processed"},
+    sources=[source_doc.sha256, "https://api.example.com/data"]
+)
+
+# Check provenance
+for hash in doc_with_sources.get_source_documents():
+    print(f"Derived from document: {hash}")
+for ref in doc_with_sources.get_source_references():
+    print(f"External source: {ref}")
+
 # Temporary documents (never persisted)
 temp = TemporaryDocument.create(
     name="api_response.json",
@@ -167,6 +178,10 @@ if doc.is_text:
 
 # Parse structured data
 data = doc.as_json() # or as_yaml(), as_pydantic_model()
+
+# Enhanced filtering (new in v0.1.14)
+filtered = documents.filter_by([Doc1, Doc2, Doc3]) # Multiple types
+named = documents.filter_by(["file1.txt", "file2.txt"]) # Multiple names
 ```
 
 ### LLM Integration
@@ -189,7 +204,7 @@ static_context = AIMessages([large_document])
 # First call: caches context
 r1 = await llm.generate(
     model="gpt-5",
-    context=static_context, # Cached for 120 seconds
+    context=static_context, # Cached for 120 seconds by default
     messages="Summarize" # Dynamic query
 )
 
@@ -199,6 +214,22 @@ r2 = await llm.generate(
     context=static_context, # Reused from cache!
     messages="Key points?" # Different query
 )
+
+# Custom cache TTL (new in v0.1.14)
+response = await llm.generate(
+    model="gpt-5",
+    context=static_context,
+    messages="Analyze",
+    options=ModelOptions(cache_ttl="300s") # Cache for 5 minutes
+)
+
+# Disable caching for dynamic contexts
+response = await llm.generate(
+    model="gpt-5",
+    context=dynamic_context,
+    messages="Process",
+    options=ModelOptions(cache_ttl=None) # No caching
+)
 ```
 
 ### Flow Configuration
@@ -212,15 +243,15 @@ class ProcessingConfig(FlowConfig):
     INPUT_DOCUMENT_TYPES = [RawDataDocument]
     OUTPUT_DOCUMENT_TYPE = ProcessedDocument # Must be different!
 
-
-
-
-
-
-
-
-
-
+# Use in flows for validation
+@pipeline_flow(config=ProcessingConfig)
+async def process(
+    project_name: str,
+    documents: DocumentList,
+    flow_options: FlowOptions
+) -> DocumentList:
+    # ... processing logic ...
+    return ProcessingConfig.create_and_validate_output(outputs)
 ```
 
 ### Pipeline Decorators
@@ -228,13 +259,15 @@ class ProcessingConfig(FlowConfig):
 Enhanced decorators with built-in tracing and monitoring:
 
 ```python
-from ai_pipeline_core import pipeline_flow, pipeline_task
+from ai_pipeline_core import pipeline_flow, pipeline_task, set_trace_cost
 
 @pipeline_task # Automatic retry, tracing, and monitoring
 async def process_chunk(data: str) -> str:
-
+    result = await transform(data)
+    set_trace_cost(0.05) # Track costs (new in v0.1.14)
+    return result
 
-@pipeline_flow # Full observability and orchestration
+@pipeline_flow(config=MyFlowConfig) # Full observability and orchestration
 async def main_flow(
     project_name: str,
     documents: DocumentList,
@@ -260,6 +293,9 @@ LMNR_DEBUG=true # Enable debug traces
 # Optional: Orchestration
 PREFECT_API_URL=http://localhost:4200/api
 PREFECT_API_KEY=your-prefect-key
+
+# Optional: Storage (for Google Cloud Storage)
+GCS_SERVICE_ACCOUNT_FILE=/path/to/service-account.json # GCS auth file
 ```
 
 ### Settings Management
@@ -287,7 +323,7 @@ print(settings.app_name)
 
 ### Framework Rules (90% Use Cases)
 
-1. **Decorators**: Use `@
+1. **Decorators**: Use `@pipeline_task` WITHOUT parameters, `@pipeline_flow` WITH config
 2. **Logging**: Use `get_pipeline_logger(__name__)` - NEVER `print()` or `logging` module
 3. **LLM calls**: Use `AIMessages` or `str`. Wrap Documents in `AIMessages`
 4. **Options**: Omit `ModelOptions` unless specifically needed (defaults are optimal)
--- ai_pipeline_core-0.1.13/ai_pipeline_core/__init__.py
+++ ai_pipeline_core-0.2.0/ai_pipeline_core/__init__.py
@@ -7,7 +7,7 @@ It combines document processing, LLM integration, and workflow orchestration int
 system designed for production use.
 
 The framework enforces best practices through strong typing (Pydantic), automatic retries,
-cost tracking, and distributed tracing. All I/O operations are async for maximum throughput.
+and cost tracking. All I/O operations are async for maximum throughput.
 
 **CRITICAL IMPORT RULE**:
 Always import from the top-level package:
@@ -18,12 +18,12 @@ cost tracking, and distributed tracing. All I/O operations are async for maximum
     from ai_pipeline_core.llm import generate # NO!
     from ai_pipeline_core.documents import FlowDocument # NO!
 
-FRAMEWORK RULES (
-1. Decorators: Use @
+FRAMEWORK RULES (Use by default, unless instructed otherwise):
+1. Decorators: Use @pipeline_task WITHOUT parameters, @pipeline_flow WITH config
 2. Logging: Use get_pipeline_logger(__name__) - NEVER print() or logging module
 3. LLM calls: Use AIMessages or str. Wrap Documents in AIMessages; do not call .text yourself
-4. Options:
-5. Documents: Create with just name and content - skip description
+4. Options: DO NOT use options parameter - omit it entirely (defaults are optimal)
+5. Documents: Create with just name and content - skip description unless needed
 6. FlowConfig: OUTPUT_DOCUMENT_TYPE must differ from all INPUT_DOCUMENT_TYPES
 7. Initialization: PromptManager and logger at module scope, not in functions
 8. DocumentList: Use default constructor - no validation flags needed
@@ -36,18 +36,22 @@ Core Capabilities:
 - **LLM Integration**: Unified interface to any model via LiteLLM with caching
 - **Structured Output**: Type-safe generation with Pydantic model validation
 - **Workflow Orchestration**: Prefect-based flows and tasks with retries
-- **Observability**:
+- **Observability**: Built-in monitoring and debugging capabilities
 - **Local Development**: Simple runner for testing without infrastructure
 
 Quick Start:
     >>> from ai_pipeline_core import (
-    ...     pipeline_flow, FlowDocument, DocumentList, FlowOptions, llm, AIMessages
+    ...     pipeline_flow, FlowDocument, DocumentList, FlowOptions, FlowConfig, llm, AIMessages
     ... )
     >>>
     >>> class OutputDoc(FlowDocument):
     ...     '''Analysis result document.'''
     >>>
-    >>>
+    >>> class MyFlowConfig(FlowConfig):
+    ...     INPUT_DOCUMENT_TYPES = []
+    ...     OUTPUT_DOCUMENT_TYPE = OutputDoc
+    >>>
+    >>> @pipeline_flow(config=MyFlowConfig)
     >>> async def analyze_flow(
     ...     project_name: str,
     ...     documents: DocumentList,
@@ -55,7 +59,7 @@ Quick Start:
     ... ) -> DocumentList:
     ...     # Messages accept AIMessages or str. Wrap documents: AIMessages([doc])
     ...     response = await llm.generate(
-    ...
+    ...         "gpt-5",
     ...         messages=AIMessages([documents[0]])
     ...     )
     ...     result = OutputDoc.create(
@@ -76,8 +80,6 @@ Optional Environment Variables:
     - PREFECT_API_KEY: Prefect API authentication key
     - LMNR_PROJECT_API_KEY: Laminar (LMNR) API key for tracing
     - LMNR_DEBUG: Set to "true" to enable debug-level traces
-    - LMNR_SESSION_ID: Default session ID for traces
-    - LMNR_USER_ID: Default user ID for traces
 """
 
 from . import llm
@@ -88,6 +90,7 @@ from .documents import (
    TaskDocument,
     TemporaryDocument,
     canonical_name_key,
+    is_document_sha256,
     sanitize_url,
 )
 from .flow import FlowConfig, FlowOptions
@@ -98,6 +101,8 @@ from .llm import (
     ModelOptions,
     ModelResponse,
     StructuredModelResponse,
+    generate,
+    generate_structured,
 )
 from .logging import (
     LoggerMixin,
@@ -111,9 +116,9 @@ from .pipeline import pipeline_flow, pipeline_task
 from .prefect import disable_run_logger, prefect_test_harness
 from .prompt_manager import PromptManager
 from .settings import Settings
-from .tracing import TraceInfo, TraceLevel, trace
+from .tracing import TraceInfo, TraceLevel, set_trace_cost, trace
 
-__version__ = "0.1.13"
+__version__ = "0.2.0"
 
 __all__ = [
     # Config/Settings
@@ -132,6 +137,7 @@ __all__ = [
     "TaskDocument",
     "TemporaryDocument",
     "canonical_name_key",
+    "is_document_sha256",
     "sanitize_url",
     # Flow/Task
     "FlowConfig",
@@ -143,7 +149,9 @@ __all__ = [
     "prefect_test_harness",
     "disable_run_logger",
     # LLM
-    "llm",
+    "llm", # for backward compatibility
+    "generate",
+    "generate_structured",
     "ModelName",
     "ModelOptions",
     "ModelResponse",
@@ -154,6 +162,9 @@ __all__ = [
     "trace",
     "TraceLevel",
     "TraceInfo",
+    "set_trace_cost",
     # Utils
     "PromptManager",
+    "generate",
+    "generate_structured",
 ]
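
With `generate` and `generate_structured` now re-exported from the package root (and the `llm` module kept for backward compatibility), LLM calls can follow the package's own critical import rule without reaching into `ai_pipeline_core.llm`. A sketch, assuming the top-level `generate` shares the `llm.generate` call shape shown in the README:

```python
from ai_pipeline_core import generate


async def summarize(text: str):
    # Same call shape as the README's `llm.generate(...)`; only the import changed.
    return await generate("gpt-5", messages=text)
```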
--- ai_pipeline_core-0.1.13/ai_pipeline_core/documents/__init__.py
+++ ai_pipeline_core-0.2.0/ai_pipeline_core/documents/__init__.py
@@ -12,7 +12,7 @@ from .document_list import DocumentList
 from .flow_document import FlowDocument
 from .task_document import TaskDocument
 from .temporary_document import TemporaryDocument
-from .utils import canonical_name_key, sanitize_url
+from .utils import canonical_name_key, is_document_sha256, sanitize_url
 
 __all__ = [
     "Document",
@@ -21,5 +21,6 @@ __all__ = [
     "TaskDocument",
     "TemporaryDocument",
     "canonical_name_key",
+    "is_document_sha256",
     "sanitize_url",
 ]
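
Finally, the newly exported `is_document_sha256` pairs naturally with the `sources` field: it distinguishes document hashes from external references in a mixed list. The str-to-bool signature here is an assumption inferred from the name and from how `get_source_documents()`/`get_source_references()` split such lists:

```python
from ai_pipeline_core import TemporaryDocument, is_document_sha256

raw = TemporaryDocument.create(name="raw.json", content={"rows": 3})
sources = [raw.sha256, "https://api.example.com/data"]  # mixed, as in the README

doc_hashes = [s for s in sources if is_document_sha256(s)]      # [raw.sha256]
references = [s for s in sources if not is_document_sha256(s)]  # [the URL]
```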