ai-pipeline-core 0.1.10__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. ai_pipeline_core/__init__.py +84 -4
  2. ai_pipeline_core/documents/__init__.py +9 -0
  3. ai_pipeline_core/documents/document.py +1044 -152
  4. ai_pipeline_core/documents/document_list.py +147 -38
  5. ai_pipeline_core/documents/flow_document.py +112 -11
  6. ai_pipeline_core/documents/mime_type.py +173 -15
  7. ai_pipeline_core/documents/task_document.py +117 -12
  8. ai_pipeline_core/documents/temporary_document.py +84 -5
  9. ai_pipeline_core/documents/utils.py +41 -9
  10. ai_pipeline_core/exceptions.py +47 -11
  11. ai_pipeline_core/flow/__init__.py +2 -0
  12. ai_pipeline_core/flow/config.py +236 -27
  13. ai_pipeline_core/flow/options.py +50 -1
  14. ai_pipeline_core/llm/__init__.py +6 -0
  15. ai_pipeline_core/llm/ai_messages.py +125 -27
  16. ai_pipeline_core/llm/client.py +278 -26
  17. ai_pipeline_core/llm/model_options.py +130 -1
  18. ai_pipeline_core/llm/model_response.py +239 -35
  19. ai_pipeline_core/llm/model_types.py +67 -0
  20. ai_pipeline_core/logging/__init__.py +13 -0
  21. ai_pipeline_core/logging/logging_config.py +72 -20
  22. ai_pipeline_core/logging/logging_mixin.py +38 -32
  23. ai_pipeline_core/pipeline.py +363 -60
  24. ai_pipeline_core/prefect.py +48 -1
  25. ai_pipeline_core/prompt_manager.py +209 -24
  26. ai_pipeline_core/settings.py +108 -4
  27. ai_pipeline_core/simple_runner/__init__.py +5 -0
  28. ai_pipeline_core/simple_runner/cli.py +96 -11
  29. ai_pipeline_core/simple_runner/simple_runner.py +237 -4
  30. ai_pipeline_core/tracing.py +253 -30
  31. ai_pipeline_core-0.1.12.dist-info/METADATA +450 -0
  32. ai_pipeline_core-0.1.12.dist-info/RECORD +36 -0
  33. ai_pipeline_core-0.1.10.dist-info/METADATA +0 -538
  34. ai_pipeline_core-0.1.10.dist-info/RECORD +0 -36
  35. {ai_pipeline_core-0.1.10.dist-info → ai_pipeline_core-0.1.12.dist-info}/WHEEL +0 -0
  36. {ai_pipeline_core-0.1.10.dist-info → ai_pipeline_core-0.1.12.dist-info}/licenses/LICENSE +0 -0
@@ -1,27 +1,130 @@
1
- """Flow configuration base class."""
1
+ """Flow configuration system for type-safe pipeline definitions.
2
+
3
+ @public
4
+
5
+ This module provides the FlowConfig abstract base class that enforces
6
+ type safety for flow inputs and outputs in the pipeline system.
7
+
8
+ Best Practice:
9
+ Always finish @pipeline_flow functions with create_and_validate_output()
10
+ to ensure type safety and proper validation of output documents.
11
+ """
2
12
 
3
13
  from abc import ABC
4
- from typing import Any, ClassVar
14
+ from typing import Any, ClassVar, Iterable
5
15
 
6
16
  from ai_pipeline_core.documents import DocumentList, FlowDocument
17
+ from ai_pipeline_core.exceptions import DocumentValidationError
7
18
 
8
19
 
9
20
  class FlowConfig(ABC):
10
- """
11
- Configuration for a flow. It makes flow easier to implement and test.
21
+ """Abstract base class for type-safe flow configuration.
22
+
23
+ @public
24
+
25
+ FlowConfig defines the contract for flow inputs and outputs, ensuring
26
+ type safety and preventing circular dependencies in pipeline flows.
27
+ Each flow must have a corresponding FlowConfig subclass that specifies
28
+ its input document types and output document type.
29
+
30
+ CRITICAL RULE: OUTPUT_DOCUMENT_TYPE must NEVER be in INPUT_DOCUMENT_TYPES!
31
+ This prevents circular dependencies as flows chain together.
32
+ Each flow transforms input types to a DIFFERENT output type.
33
+
34
+ Class Variables:
35
+ INPUT_DOCUMENT_TYPES: List of FlowDocument types this flow accepts
36
+ OUTPUT_DOCUMENT_TYPE: Single FlowDocument type this flow produces
37
+
38
+ Validation Rules:
39
+ - INPUT_DOCUMENT_TYPES and OUTPUT_DOCUMENT_TYPE must be defined
40
+ - OUTPUT_DOCUMENT_TYPE cannot be in INPUT_DOCUMENT_TYPES (prevents cycles)
41
+ - Field names must be exact (common typos are detected)
42
+
43
+ Why this matters:
44
+ Flows connect in pipelines where one flow's output becomes another's input.
45
+ Same input/output types would create infinite loops or circular dependencies.
46
+
47
+ Example:
48
+ >>> # CORRECT - Different output type from inputs
49
+ >>> class ProcessingFlowConfig(FlowConfig):
50
+ ... INPUT_DOCUMENT_TYPES = [RawDataDocument]
51
+ ... OUTPUT_DOCUMENT_TYPE = ProcessedDocument # Different type!
52
+ >>>
53
+ >>> # Use in @pipeline_flow - RECOMMENDED PATTERN
54
+ >>> @pipeline_flow(name="processing")
55
+ >>> async def process(config: ProcessingFlowConfig, docs: DocumentList) -> DocumentList:
56
+ ... outputs = []
57
+ ... # ... processing logic ...
58
+ ... return config.create_and_validate_output(outputs)
59
+
60
+ >>> # WRONG - Will raise TypeError
61
+ >>> class BadConfig(FlowConfig):
62
+ ... INPUT_DOCUMENT_TYPES = [DataDocument]
63
+ ... OUTPUT_DOCUMENT_TYPE = DataDocument # SAME TYPE - NOT ALLOWED!
64
+
65
+ Note:
66
+ - Validation happens at class definition time
67
+ - Helps catch configuration errors early
68
+ - Used by simple_runner to manage document flow
12
69
  """
13
70
 
14
71
  INPUT_DOCUMENT_TYPES: ClassVar[list[type[FlowDocument]]]
15
72
  OUTPUT_DOCUMENT_TYPE: ClassVar[type[FlowDocument]]
16
73
 
17
74
  def __init_subclass__(cls, **kwargs: Any):
18
- """Validate that OUTPUT_DOCUMENT_TYPE is not in INPUT_DOCUMENT_TYPES."""
75
+ """Validate flow configuration at subclass definition time.
76
+
77
+ Performs comprehensive validation when a FlowConfig subclass is defined:
78
+ 1. Checks for common field name mistakes (typos)
79
+ 2. Ensures required fields are defined
80
+ 3. Prevents circular dependencies (output != input)
81
+
82
+ Args:
83
+ **kwargs: Additional arguments for parent __init_subclass__.
84
+
85
+ Raises:
86
+ TypeError: If configuration violates any validation rules:
87
+ - Missing required fields
88
+ - Incorrect field names
89
+ - Circular dependency detected
90
+
91
+ Note:
92
+ This runs at class definition time, not instantiation,
93
+ providing immediate feedback during development.
94
+ """
19
95
  super().__init_subclass__(**kwargs)
20
96
 
21
97
  # Skip validation for the abstract base class itself
22
98
  if cls.__name__ == "FlowConfig":
23
99
  return
24
100
 
101
+ # Check for invalid field names (common mistakes)
102
+ allowed_fields = {"INPUT_DOCUMENT_TYPES", "OUTPUT_DOCUMENT_TYPE"}
103
+ class_attrs = {name for name in dir(cls) if not name.startswith("_") and name.isupper()}
104
+
105
+ # Find fields that look like they might be mistakes
106
+ suspicious_fields = class_attrs - allowed_fields
107
+ common_mistakes = {
108
+ "OUTPUT_DOCUMENT_TYPES": "OUTPUT_DOCUMENT_TYPE",
109
+ "INPUT_DOCUMENT_TYPE": "INPUT_DOCUMENT_TYPES",
110
+ }
111
+
112
+ for field in suspicious_fields:
113
+ # Skip inherited attributes from parent classes
114
+ if any(hasattr(base, field) for base in cls.__bases__):
115
+ continue
116
+
117
+ if field in common_mistakes:
118
+ raise TypeError(
119
+ f"FlowConfig {cls.__name__}: Found '{field}' but expected "
120
+ f"'{common_mistakes[field]}'. Please use the correct field name."
121
+ )
122
+ elif "DOCUMENT" in field:
123
+ raise TypeError(
124
+ f"FlowConfig {cls.__name__}: Invalid field '{field}'. "
125
+ f"Only 'INPUT_DOCUMENT_TYPES' and 'OUTPUT_DOCUMENT_TYPE' are allowed."
126
+ )
127
+
25
128
  # Ensure required attributes are defined
26
129
  if not hasattr(cls, "INPUT_DOCUMENT_TYPES"):
27
130
  raise TypeError(f"FlowConfig {cls.__name__} must define INPUT_DOCUMENT_TYPES")
@@ -37,22 +140,55 @@ class FlowConfig(ABC):
37
140
 
38
141
  @classmethod
39
142
  def get_input_document_types(cls) -> list[type[FlowDocument]]:
40
- """
41
- Get the input document types for the flow.
143
+ """Get the list of input document types this flow accepts.
144
+
145
+ Returns:
146
+ List of FlowDocument subclasses that this flow requires
147
+ as input.
148
+
149
+ Example:
150
+ >>> types = MyFlowConfig.get_input_document_types()
151
+ >>> print([t.__name__ for t in types])
152
+ ['InputDoc', 'ConfigDoc']
42
153
  """
43
154
  return cls.INPUT_DOCUMENT_TYPES
44
155
 
45
156
  @classmethod
46
157
  def get_output_document_type(cls) -> type[FlowDocument]:
47
- """
48
- Get the output document type for the flow.
158
+ """Get the output document type this flow produces.
159
+
160
+ Returns:
161
+ Single FlowDocument subclass that this flow outputs.
162
+
163
+ Example:
164
+ >>> output_type = MyFlowConfig.get_output_document_type()
165
+ >>> print(output_type.__name__)
166
+ 'ProcessedDataDocument'
49
167
  """
50
168
  return cls.OUTPUT_DOCUMENT_TYPE
51
169
 
52
170
  @classmethod
53
171
  def has_input_documents(cls, documents: DocumentList) -> bool:
54
- """
55
- Check if the flow has all required input documents.
172
+ """Check if all required input documents are present.
173
+
174
+ Verifies that the document list contains at least one instance
175
+ of each required input document type.
176
+
177
+ Args:
178
+ documents: DocumentList to check for required inputs.
179
+
180
+ Returns:
181
+ True if all required document types are present,
182
+ False if any are missing.
183
+
184
+ Example:
185
+ >>> docs = DocumentList([input_doc, config_doc])
186
+ >>> if MyFlowConfig.has_input_documents(docs):
187
+ ... # Safe to proceed with flow
188
+ ... pass
189
+
190
+ Note:
191
+ Use this before get_input_documents() to avoid exceptions.
56
192
  """
57
193
  for doc_cls in cls.INPUT_DOCUMENT_TYPES:
58
194
  if not any(isinstance(doc, doc_cls) for doc in documents):
@@ -61,8 +197,29 @@ class FlowConfig(ABC):
61
197
 
62
198
  @classmethod
63
199
  def get_input_documents(cls, documents: DocumentList) -> DocumentList:
64
- """
65
- Get the input documents for the flow.
200
+ """Extract and return all required input documents.
201
+
202
+ Filters the provided document list to return only documents
203
+ matching the required input types. Returns all matching documents,
204
+ not just the first of each type.
205
+
206
+ Args:
207
+ documents: DocumentList containing mixed document types.
208
+
209
+ Returns:
210
+ DocumentList containing only the required input documents.
211
+
212
+ Raises:
213
+ ValueError: If any required document type is missing.
214
+
215
+ Example:
216
+ >>> all_docs = DocumentList([input1, input2, other_doc])
217
+ >>> input_docs = MyFlowConfig.get_input_documents(all_docs)
218
+ >>> len(input_docs) # Contains only input1 and input2
219
+ 2
220
+
221
+ Note:
222
+ Call has_input_documents() first to check availability.
66
223
  """
67
224
  input_documents = DocumentList()
68
225
  for doc_cls in cls.INPUT_DOCUMENT_TYPES:
@@ -73,25 +230,77 @@ class FlowConfig(ABC):
73
230
  return input_documents
74
231
 
75
232
  @classmethod
76
- def validate_output_documents(cls, documents: DocumentList) -> None:
77
- """
78
- Validate the output documents of the flow.
233
+ def validate_output_documents(cls, documents: Any) -> None:
234
+ """Validate that output documents match the expected type.
235
+
236
+ Ensures all documents in the list are instances of the
237
+ declared OUTPUT_DOCUMENT_TYPE.
238
+
239
+ Args:
240
+ documents: DocumentList to validate.
241
+
242
+ Raises:
243
+ DocumentValidationError: If documents is not a DocumentList or if any
244
+ document has incorrect type.
245
+
246
+ Example:
247
+ >>> output = DocumentList([ProcessedDoc(...)])
248
+ >>> MyFlowConfig.validate_output_documents(output)
249
+ >>> # No exception means valid
250
+
251
+ Note:
252
+ Used internally by create_and_validate_output().
253
+ Uses explicit exceptions for validation (works with python -O).
79
254
  """
80
- assert isinstance(documents, DocumentList), "Documents must be a DocumentList"
255
+ if not isinstance(documents, DocumentList):
256
+ raise DocumentValidationError("Documents must be a DocumentList")
257
+
81
258
  output_document_class = cls.get_output_document_type()
82
259
 
83
- invalid = [type(d).__name__ for d in documents if not isinstance(d, output_document_class)]
84
- assert not invalid, (
85
- "Documents must be of the correct type. "
86
- f"Expected: {output_document_class.__name__}, Got invalid: {invalid}"
87
- )
260
+ for doc in documents:
261
+ if not isinstance(doc, output_document_class):
262
+ raise DocumentValidationError(
263
+ f"Document '{doc.name}' has incorrect type. "
264
+ f"Expected: {output_document_class.__name__}, "
265
+ f"Got: {type(doc).__name__}"
266
+ )
88
267
 
89
268
  @classmethod
90
269
  def create_and_validate_output(
91
- cls, output: FlowDocument | list[FlowDocument] | DocumentList
270
+ cls, output: FlowDocument | Iterable[FlowDocument] | DocumentList
92
271
  ) -> DocumentList:
93
- """
94
- Create the output documents for the flow.
272
+ """Create and validate flow output documents.
273
+
274
+ @public
275
+
276
+ RECOMMENDED: Always use this method at the end of @pipeline_flow functions
277
+ to ensure type safety and proper output validation.
278
+
279
+ Convenience method that wraps output in a DocumentList if needed
280
+ and validates it matches the expected OUTPUT_DOCUMENT_TYPE.
281
+
282
+ Args:
283
+ output: Single document, iterable of documents, or DocumentList.
284
+
285
+ Returns:
286
+ Validated DocumentList containing the output documents.
287
+
288
+ Raises:
289
+ DocumentValidationError: If output type doesn't match OUTPUT_DOCUMENT_TYPE.
290
+
291
+ Example:
292
+ >>> @pipeline_flow(name="my_flow")
293
+ >>> async def process_flow(config: MyFlowConfig, ...) -> DocumentList:
294
+ >>> outputs = []
295
+ >>> # ... processing logic ...
296
+ >>> outputs.append(OutputDoc(...))
297
+ >>>
298
+ >>> # Always finish with this validation
299
+ >>> return config.create_and_validate_output(outputs)
300
+
301
+ Note:
302
+ This is the recommended pattern for all @pipeline_flow functions.
303
+ It ensures type safety and catches output errors immediately.
95
304
  """
96
305
  documents: DocumentList
97
306
  if isinstance(output, FlowDocument):
@@ -99,7 +308,7 @@ class FlowConfig(ABC):
99
308
  elif isinstance(output, DocumentList):
100
309
  documents = output
101
310
  else:
102
- assert isinstance(output, list)
103
- documents = DocumentList(output) # type: ignore[arg-type]
311
+ # Handle any iterable of FlowDocuments
312
+ documents = DocumentList(list(output)) # type: ignore[arg-type]
104
313
  cls.validate_output_documents(documents)
105
314
  return documents
@@ -1,3 +1,11 @@
1
+ """Flow options configuration for pipeline execution.
2
+
3
+ @public
4
+
5
+ Provides base configuration settings for AI pipeline flows,
6
+ including model selection and runtime parameters.
7
+ """
8
+
1
9
  from typing import TypeVar
2
10
 
3
11
  from pydantic import Field
@@ -9,7 +17,48 @@ T = TypeVar("T", bound="FlowOptions")
9
17
 
10
18
 
11
19
  class FlowOptions(BaseSettings):
12
- """Base configuration for AI Pipeline flows."""
20
+ """Base configuration settings for AI pipeline flows.
21
+
22
+ @public
23
+
24
+ FlowOptions provides runtime configuration for pipeline flows,
25
+ including model selection and other parameters. It uses pydantic-settings
26
+ to support environment variable overrides and is immutable (frozen) by default.
27
+
28
+ This class is designed to be subclassed for flow-specific configuration:
29
+
30
+ Example:
31
+ >>> class MyFlowOptions(FlowOptions):
32
+ ... temperature: float = Field(0.7, ge=0, le=2)
33
+ ... batch_size: int = Field(10, gt=0)
34
+ ... custom_param: str = "default"
35
+
36
+ >>> # Use in CLI with run_cli:
37
+ >>> run_cli(
38
+ ... flows=[my_flow],
39
+ ... options_cls=MyFlowOptions # Will parse CLI args
40
+ ... )
41
+
42
+ >>> # Or create programmatically:
43
+ >>> options = MyFlowOptions(
44
+ ... core_model="gemini-2.5-pro",
45
+ ... temperature=0.9
46
+ ... )
47
+
48
+ Attributes:
49
+ core_model: Primary LLM for complex tasks (default: gpt-5)
50
+ small_model: Fast model for simple tasks (default: gpt-5-mini)
51
+
52
+ Configuration:
53
+ - Frozen (immutable) after creation
54
+ - Extra fields ignored (not strict)
55
+ - Can be populated from environment variables
56
+ - Used by simple_runner.cli for command-line parsing
57
+
58
+ Note:
59
+ The base class provides model selection. Subclasses should
60
+ add flow-specific parameters with appropriate validation.
61
+ """
13
62
 
14
63
  core_model: ModelName | str = Field(
15
64
  default="gpt-5",
@@ -1,3 +1,9 @@
1
+ """Large Language Model integration via LiteLLM proxy.
2
+
3
+ This package provides OpenAI API-compatible LLM interactions with built-in retry logic,
4
+ LMNR tracing, and structured output generation using Pydantic models.
5
+ """
6
+
1
7
  from .ai_messages import AIMessages, AIMessageType
2
8
  from .client import (
3
9
  generate,
@@ -1,3 +1,11 @@
1
+ """AI message handling for LLM interactions.
2
+
3
+ @public
4
+
5
+ Provides AIMessages container for managing conversations with mixed content types
6
+ including text, documents, and model responses.
7
+ """
8
+
1
9
  import base64
2
10
  import hashlib
3
11
  import json
@@ -13,13 +21,82 @@ from ai_pipeline_core.documents import Document
13
21
  from .model_response import ModelResponse
14
22
 
15
23
  AIMessageType = str | Document | ModelResponse
24
+ """Type for messages in AIMessages container.
25
+
26
+ @public
27
+
28
+ Represents the allowed types for conversation messages:
29
+ - str: Plain text messages
30
+ - Document: Structured document content
31
+ - ModelResponse: LLM generation responses
32
+ """
16
33
 
17
34
 
18
35
  class AIMessages(list[AIMessageType]):
36
+ """Container for AI conversation messages supporting mixed types.
37
+
38
+ @public
39
+
40
+ This class extends list to manage conversation messages between user
41
+ and AI, supporting text, Document objects, and ModelResponse instances.
42
+ Messages are converted to OpenAI-compatible format for LLM interactions.
43
+
44
+ Conversion Rules:
45
+ - str: Becomes {"role": "user", "content": text}
46
+ - Document: Becomes {"role": "user", "content": document_content}
47
+ (automatically handles text, images, PDFs based on MIME type)
48
+ - ModelResponse: Becomes {"role": "assistant", "content": response.content}
49
+
50
+ Note: Document conversion is automatic. Text content becomes user text messages.
51
+ Images are sent to vision-capable models (non-vision models will raise ValueError).
52
+ PDFs are attached when supported by the model, otherwise a text extraction
53
+ fallback is used. LiteLLM proxy handles the specific encoding requirements
54
+ for each provider.
55
+
56
+ IMPORTANT: Although AIMessages can contain Document entries, the LLM client functions
57
+ expect `messages` to be `AIMessages` or `str`. If you start from a Document or a list
58
+ of Documents, build AIMessages first (e.g., `AIMessages([doc])` or `AIMessages(docs)`).
59
+
60
+ Example:
61
+ >>> from ai_pipeline_core import llm
62
+ >>> messages = AIMessages()
63
+ >>> messages.append("What is the capital of France?")
64
+ >>> response = await llm.generate("gpt-5", messages=messages)
65
+ >>> messages.append(response) # Add the actual response
66
+ >>> prompt = messages.get_last_message_as_str() # Get the last message as a string
67
+ """
68
+
19
69
  def get_last_message(self) -> AIMessageType:
70
+ """Get the last message in the conversation.
71
+
72
+ Returns:
73
+ The last message in the conversation, which can be a string,
74
+ Document, or ModelResponse.
75
+ """
20
76
  return self[-1]
21
77
 
22
78
  def get_last_message_as_str(self) -> str:
79
+ """Get the last message as a string, raising if not a string.
80
+
81
+ @public
82
+
83
+ Returns:
84
+ The last message as a string.
85
+
86
+ Raises:
87
+ ValueError: If the last message is not a string.
88
+
89
+ Safer Pattern:
90
+ Instead of catching ValueError, check type first:
91
+ >>> messages = AIMessages([user_msg, response, followup])
92
+ >>> last = messages.get_last_message()
93
+ >>> if isinstance(last, str):
94
+ ... text = last
95
+ >>> elif isinstance(last, ModelResponse):
96
+ ... text = last.content
97
+ >>> elif isinstance(last, Document):
98
+ ... text = last.text if last.is_text else "<binary>"
99
+ """
23
100
  last_message = self.get_last_message()
24
101
  if isinstance(last_message, str):
25
102
  return last_message
@@ -28,8 +105,25 @@ class AIMessages(list[AIMessageType]):
28
105
  def to_prompt(self) -> list[ChatCompletionMessageParam]:
29
106
  """Convert AIMessages to OpenAI-compatible format.
30
107
 
108
+ Transforms the message list into the format expected by OpenAI API.
109
+ Each message type is converted according to its role and content.
110
+
31
111
  Returns:
32
- List of ChatCompletionMessageParam for OpenAI API
112
+ List of ChatCompletionMessageParam dicts (from openai.types.chat)
113
+ with 'role' and 'content' keys. Ready to be passed to generate()
114
+ or OpenAI API directly.
115
+
116
+ Raises:
117
+ ValueError: If message type is not supported.
118
+
119
+ Example:
120
+ >>> messages = AIMessages(["Hello", response, "Follow up"])
121
+ >>> prompt = messages.to_prompt()
122
+ >>> # Result: [
123
+ >>> # {"role": "user", "content": "Hello"},
124
+ >>> # {"role": "assistant", "content": "..."},
125
+ >>> # {"role": "user", "content": "Follow up"}
126
+ >>> # ]
33
127
  """
34
128
  messages: list[ChatCompletionMessageParam] = []
35
129
 
@@ -46,7 +140,11 @@ class AIMessages(list[AIMessageType]):
46
140
  return messages
47
141
 
48
142
  def to_tracing_log(self) -> list[str]:
49
- """Convert AIMessages to a list of strings for tracing."""
143
+ """Convert AIMessages to a list of strings for tracing.
144
+
145
+ Returns:
146
+ List of string representations for tracing logs.
147
+ """
50
148
  messages: list[str] = []
51
149
  for message in self:
52
150
  if isinstance(message, Document):
@@ -61,20 +159,27 @@ class AIMessages(list[AIMessageType]):
61
159
  return messages
62
160
 
63
161
  def get_prompt_cache_key(self, system_prompt: str | None = None) -> str:
162
+ """Generate cache key for message set.
163
+
164
+ Args:
165
+ system_prompt: Optional system prompt to include in cache key.
166
+
167
+ Returns:
168
+ SHA256 hash as hex string for cache key.
169
+ """
64
170
  if not system_prompt:
65
171
  system_prompt = ""
66
172
  return hashlib.sha256((system_prompt + json.dumps(self.to_prompt())).encode()).hexdigest()
67
173
 
68
174
  @staticmethod
69
175
  def document_to_prompt(document: Document) -> list[ChatCompletionContentPartParam]:
70
- """
71
- Convert a document to prompt format for LLM consumption.
176
+ """Convert a document to prompt format for LLM consumption.
72
177
 
73
178
  Args:
74
- document: The document to convert
179
+ document: The document to convert.
75
180
 
76
181
  Returns:
77
- List of chat completion content parts for the prompt
182
+ List of chat completion content parts for the prompt.
78
183
  """
79
184
  prompt: list[ChatCompletionContentPartParam] = []
80
185
 
@@ -88,9 +193,8 @@ class AIMessages(list[AIMessageType]):
88
193
 
89
194
  # Handle text documents
90
195
  if document.is_text:
91
- content_text = (
92
- f"{header_text}<content>\n{document.as_text()}\n</content>\n</document>\n"
93
- )
196
+ text_content = document.content.decode("utf-8")
197
+ content_text = f"{header_text}<content>\n{text_content}\n</content>\n</document>\n"
94
198
  prompt.append({"type": "text", "text": content_text})
95
199
  return prompt
96
200
 
@@ -102,12 +206,10 @@ class AIMessages(list[AIMessageType]):
102
206
  return []
103
207
 
104
208
  # Add header for binary content
105
- prompt.append(
106
- {
107
- "type": "text",
108
- "text": f"{header_text}<content>\n",
109
- }
110
- )
209
+ prompt.append({
210
+ "type": "text",
211
+ "text": f"{header_text}<content>\n",
212
+ })
111
213
 
112
214
  # Encode binary content
113
215
  base64_content = base64.b64encode(document.content).decode("utf-8")
@@ -115,19 +217,15 @@ class AIMessages(list[AIMessageType]):
115
217
 
116
218
  # Add appropriate content type
117
219
  if document.is_pdf:
118
- prompt.append(
119
- {
120
- "type": "file",
121
- "file": {"file_data": data_uri},
122
- }
123
- )
220
+ prompt.append({
221
+ "type": "file",
222
+ "file": {"file_data": data_uri},
223
+ })
124
224
  else: # is_image
125
- prompt.append(
126
- {
127
- "type": "image_url",
128
- "image_url": {"url": data_uri, "detail": "high"},
129
- }
130
- )
225
+ prompt.append({
226
+ "type": "image_url",
227
+ "image_url": {"url": data_uri, "detail": "high"},
228
+ })
131
229
 
132
230
  # Close the document tag
133
231
  prompt.append({"type": "text", "text": "</content>\n</document>\n"})