ai-pipeline-core 0.1.12__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. ai_pipeline_core/__init__.py +83 -119
  2. ai_pipeline_core/deployment/__init__.py +34 -0
  3. ai_pipeline_core/deployment/base.py +861 -0
  4. ai_pipeline_core/deployment/contract.py +80 -0
  5. ai_pipeline_core/deployment/deploy.py +561 -0
  6. ai_pipeline_core/deployment/helpers.py +97 -0
  7. ai_pipeline_core/deployment/progress.py +126 -0
  8. ai_pipeline_core/deployment/remote.py +116 -0
  9. ai_pipeline_core/docs_generator/__init__.py +54 -0
  10. ai_pipeline_core/docs_generator/__main__.py +5 -0
  11. ai_pipeline_core/docs_generator/cli.py +196 -0
  12. ai_pipeline_core/docs_generator/extractor.py +324 -0
  13. ai_pipeline_core/docs_generator/guide_builder.py +644 -0
  14. ai_pipeline_core/docs_generator/trimmer.py +35 -0
  15. ai_pipeline_core/docs_generator/validator.py +114 -0
  16. ai_pipeline_core/document_store/__init__.py +13 -0
  17. ai_pipeline_core/document_store/_summary.py +9 -0
  18. ai_pipeline_core/document_store/_summary_worker.py +170 -0
  19. ai_pipeline_core/document_store/clickhouse.py +492 -0
  20. ai_pipeline_core/document_store/factory.py +38 -0
  21. ai_pipeline_core/document_store/local.py +312 -0
  22. ai_pipeline_core/document_store/memory.py +85 -0
  23. ai_pipeline_core/document_store/protocol.py +68 -0
  24. ai_pipeline_core/documents/__init__.py +14 -15
  25. ai_pipeline_core/documents/_context_vars.py +85 -0
  26. ai_pipeline_core/documents/_hashing.py +52 -0
  27. ai_pipeline_core/documents/attachment.py +85 -0
  28. ai_pipeline_core/documents/context.py +128 -0
  29. ai_pipeline_core/documents/document.py +349 -1062
  30. ai_pipeline_core/documents/mime_type.py +40 -85
  31. ai_pipeline_core/documents/utils.py +62 -7
  32. ai_pipeline_core/exceptions.py +10 -62
  33. ai_pipeline_core/images/__init__.py +309 -0
  34. ai_pipeline_core/images/_processing.py +151 -0
  35. ai_pipeline_core/llm/__init__.py +5 -3
  36. ai_pipeline_core/llm/ai_messages.py +284 -73
  37. ai_pipeline_core/llm/client.py +462 -209
  38. ai_pipeline_core/llm/model_options.py +86 -53
  39. ai_pipeline_core/llm/model_response.py +187 -241
  40. ai_pipeline_core/llm/model_types.py +34 -54
  41. ai_pipeline_core/logging/__init__.py +2 -9
  42. ai_pipeline_core/logging/logging.yml +1 -1
  43. ai_pipeline_core/logging/logging_config.py +27 -43
  44. ai_pipeline_core/logging/logging_mixin.py +17 -51
  45. ai_pipeline_core/observability/__init__.py +32 -0
  46. ai_pipeline_core/observability/_debug/__init__.py +30 -0
  47. ai_pipeline_core/observability/_debug/_auto_summary.py +94 -0
  48. ai_pipeline_core/observability/_debug/_config.py +95 -0
  49. ai_pipeline_core/observability/_debug/_content.py +764 -0
  50. ai_pipeline_core/observability/_debug/_processor.py +98 -0
  51. ai_pipeline_core/observability/_debug/_summary.py +312 -0
  52. ai_pipeline_core/observability/_debug/_types.py +75 -0
  53. ai_pipeline_core/observability/_debug/_writer.py +843 -0
  54. ai_pipeline_core/observability/_document_tracking.py +146 -0
  55. ai_pipeline_core/observability/_initialization.py +194 -0
  56. ai_pipeline_core/observability/_logging_bridge.py +57 -0
  57. ai_pipeline_core/observability/_summary.py +81 -0
  58. ai_pipeline_core/observability/_tracking/__init__.py +6 -0
  59. ai_pipeline_core/observability/_tracking/_client.py +178 -0
  60. ai_pipeline_core/observability/_tracking/_internal.py +28 -0
  61. ai_pipeline_core/observability/_tracking/_models.py +138 -0
  62. ai_pipeline_core/observability/_tracking/_processor.py +158 -0
  63. ai_pipeline_core/observability/_tracking/_service.py +311 -0
  64. ai_pipeline_core/observability/_tracking/_writer.py +229 -0
  65. ai_pipeline_core/observability/tracing.py +640 -0
  66. ai_pipeline_core/pipeline/__init__.py +10 -0
  67. ai_pipeline_core/pipeline/decorators.py +915 -0
  68. ai_pipeline_core/pipeline/options.py +16 -0
  69. ai_pipeline_core/prompt_manager.py +26 -105
  70. ai_pipeline_core/settings.py +41 -32
  71. ai_pipeline_core/testing.py +9 -0
  72. ai_pipeline_core-0.4.1.dist-info/METADATA +807 -0
  73. ai_pipeline_core-0.4.1.dist-info/RECORD +76 -0
  74. {ai_pipeline_core-0.1.12.dist-info → ai_pipeline_core-0.4.1.dist-info}/WHEEL +1 -1
  75. ai_pipeline_core/documents/document_list.py +0 -240
  76. ai_pipeline_core/documents/flow_document.py +0 -128
  77. ai_pipeline_core/documents/task_document.py +0 -133
  78. ai_pipeline_core/documents/temporary_document.py +0 -95
  79. ai_pipeline_core/flow/__init__.py +0 -9
  80. ai_pipeline_core/flow/config.py +0 -314
  81. ai_pipeline_core/flow/options.py +0 -75
  82. ai_pipeline_core/pipeline.py +0 -717
  83. ai_pipeline_core/prefect.py +0 -54
  84. ai_pipeline_core/simple_runner/__init__.py +0 -24
  85. ai_pipeline_core/simple_runner/cli.py +0 -255
  86. ai_pipeline_core/simple_runner/simple_runner.py +0 -385
  87. ai_pipeline_core/tracing.py +0 -475
  88. ai_pipeline_core-0.1.12.dist-info/METADATA +0 -450
  89. ai_pipeline_core-0.1.12.dist-info/RECORD +0 -36
  90. {ai_pipeline_core-0.1.12.dist-info → ai_pipeline_core-0.4.1.dist-info}/licenses/LICENSE +0 -0
@@ -24,6 +24,8 @@ EXTENSION_MIME_MAP = {
24
24
  "gif": "image/gif",
25
25
  "bmp": "image/bmp",
26
26
  "webp": "image/webp",
27
+ "heic": "image/heic",
28
+ "heif": "image/heif",
27
29
  "json": "application/json",
28
30
  "yaml": "application/yaml",
29
31
  "yml": "application/yaml",
@@ -43,7 +45,7 @@ def detect_mime_type(content: bytes, name: str) -> str:
43
45
  r"""Detect MIME type from document content and filename.
44
46
 
45
47
  Uses a multi-stage detection strategy for maximum accuracy:
46
- 1. Returns 'application/x-empty' for empty content
48
+ 1. Returns 'text/plain' for empty content
47
49
  2. Uses extension-based detection for known formats (most reliable)
48
50
  3. Falls back to python-magic content analysis
49
51
  4. Final fallback to extension or 'application/octet-stream'
@@ -57,7 +59,7 @@ def detect_mime_type(content: bytes, name: str) -> str:
57
59
  Never returns None or empty string.
58
60
 
59
61
  Fallback behavior:
60
- - Empty content: 'application/x-empty'
62
+ - Empty content: 'text/plain'
61
63
  - Unknown extension with binary content: 'application/octet-stream'
62
64
  - Magic library failure: Falls back to extension or 'application/octet-stream'
63
65
 
@@ -65,23 +67,12 @@ def detect_mime_type(content: bytes, name: str) -> str:
65
67
  Only the first 1024 bytes are analyzed for content detection.
66
68
  Extension-based detection is O(1) lookup.
67
69
 
68
- Note:
69
- Extension-based detection is preferred for text formats as
70
- content analysis can sometimes misidentify structured text.
71
-
72
- Example:
73
- >>> detect_mime_type(b'{"key": "value"}', "data.json")
74
- 'application/json'
75
- >>> detect_mime_type(b'Hello World', "text.txt")
76
- 'text/plain'
77
- >>> detect_mime_type(b'', "empty.txt")
78
- 'application/x-empty'
79
- >>> detect_mime_type(b'\\x89PNG', "image.xyz")
80
- 'image/png' # Magic detects PNG despite wrong extension
70
+ Extension-based detection is preferred for text formats as
71
+ content analysis can sometimes misidentify structured text.
81
72
  """
82
73
  # Check for empty content
83
74
  if len(content) == 0:
84
- return "application/x-empty"
75
+ return "text/plain"
85
76
 
86
77
  # Try extension-based detection first for known formats
87
78
  # This is more reliable for text formats that magic might misidentify
@@ -97,40 +88,13 @@ def detect_mime_type(content: bytes, name: str) -> str:
97
88
  return mime
98
89
  except (AttributeError, OSError, magic.MagicException) as e:
99
90
  logger.warning(f"MIME detection failed for {name}: {e}")
100
- except Exception as e:
101
- logger.error(f"Unexpected error in MIME detection for {name}: {e}")
91
+ except Exception:
92
+ logger.exception(f"Unexpected error in MIME detection for {name}")
102
93
 
103
94
  # Final fallback based on extension or default
104
95
  return EXTENSION_MIME_MAP.get(ext, "application/octet-stream")
105
96
 
106
97
 
107
- def mime_type_from_extension(name: str) -> str:
108
- """Get MIME type based solely on file extension.
109
-
110
- Simple extension-based MIME type detection without content analysis.
111
- This is a legacy function maintained for backward compatibility.
112
-
113
- Args:
114
- name: Filename with extension.
115
-
116
- Returns:
117
- MIME type based on extension, or 'application/octet-stream'
118
- if extension is unknown.
119
-
120
- Note:
121
- Prefer detect_mime_type() for more accurate detection.
122
- This function only checks the file extension.
123
-
124
- Example:
125
- >>> mime_type_from_extension("document.pdf")
126
- 'application/pdf'
127
- >>> mime_type_from_extension("unknown.xyz")
128
- 'application/octet-stream'
129
- """
130
- ext = name.lower().split(".")[-1] if "." in name else ""
131
- return EXTENSION_MIME_MAP.get(ext, "application/octet-stream")
132
-
133
-
134
98
  def is_text_mime_type(mime_type: str) -> bool:
135
99
  """Check if MIME type represents text-based content.
136
100
 
@@ -151,13 +115,6 @@ def is_text_mime_type(mime_type: str) -> bool:
151
115
  - application/yaml
152
116
  - application/x-yaml
153
117
 
154
- Example:
155
- >>> is_text_mime_type('text/plain')
156
- True
157
- >>> is_text_mime_type('application/json')
158
- True
159
- >>> is_text_mime_type('image/png')
160
- False
161
118
  """
162
119
  text_types = [
163
120
  "text/",
@@ -179,15 +136,8 @@ def is_json_mime_type(mime_type: str) -> bool:
179
136
  Returns:
180
137
  True if MIME type is 'application/json', False otherwise.
181
138
 
182
- Note:
183
- Only matches exact 'application/json', not variants like
184
- 'application/ld+json' or 'application/vnd.api+json'.
185
-
186
- Example:
187
- >>> is_json_mime_type('application/json')
188
- True
189
- >>> is_json_mime_type('text/json') # Not standard JSON MIME
190
- False
139
+ Only matches exact 'application/json', not variants like
140
+ 'application/ld+json' or 'application/vnd.api+json'.
191
141
  """
192
142
  return mime_type == "application/json"
193
143
 
@@ -207,13 +157,8 @@ def is_yaml_mime_type(mime_type: str) -> bool:
207
157
  - application/yaml (standard)
208
158
  - application/x-yaml (legacy)
209
159
 
210
- Example:
211
- >>> is_yaml_mime_type('application/yaml')
212
- True
213
- >>> is_yaml_mime_type('application/x-yaml')
214
- True
215
160
  """
216
- return mime_type == "application/yaml" or mime_type == "application/x-yaml"
161
+ return mime_type in {"application/yaml", "application/x-yaml"}
217
162
 
218
163
 
219
164
  def is_pdf_mime_type(mime_type: str) -> bool:
@@ -225,15 +170,8 @@ def is_pdf_mime_type(mime_type: str) -> bool:
225
170
  Returns:
226
171
  True if MIME type is 'application/pdf', False otherwise.
227
172
 
228
- Note:
229
- PDF documents require special handling in the LLM module
230
- and are supported by certain vision-capable models.
231
-
232
- Example:
233
- >>> is_pdf_mime_type('application/pdf')
234
- True
235
- >>> is_pdf_mime_type('text/plain')
236
- False
173
+ PDF documents require special handling in the LLM module
174
+ and are supported by certain vision-capable models.
237
175
  """
238
176
  return mime_type == "application/pdf"
239
177
 
@@ -255,14 +193,31 @@ def is_image_mime_type(mime_type: str) -> bool:
255
193
  - image/webp
256
194
  - image/svg+xml
257
195
 
258
- Note:
259
- Image documents are automatically encoded for vision-capable
260
- LLM models in the AIMessages.document_to_prompt() method.
261
-
262
- Example:
263
- >>> is_image_mime_type('image/png')
264
- True
265
- >>> is_image_mime_type('application/pdf')
266
- False
196
+ Image documents are automatically encoded for vision-capable
197
+ LLM models in the AIMessages.document_to_prompt() method.
267
198
  """
268
199
  return mime_type.startswith("image/")
200
+
201
+
202
+ LLM_SUPPORTED_IMAGE_MIME_TYPES: frozenset[str] = frozenset({
203
+ "image/png",
204
+ "image/jpeg",
205
+ "image/webp",
206
+ "image/heic",
207
+ "image/heif",
208
+ })
209
+
210
+
211
+ def is_llm_supported_image(mime_type: str) -> bool:
212
+ """Check if MIME type is an image format directly supported by LLMs.
213
+
214
+ Unsupported image formats (gif, bmp, tiff, svg, etc.) need conversion
215
+ to PNG before sending to the LLM.
216
+
217
+ Args:
218
+ mime_type: MIME type string to check.
219
+
220
+ Returns:
221
+ True if the image format is natively supported by LLMs.
222
+ """
223
+ return mime_type in LLM_SUPPORTED_IMAGE_MIME_TYPES
@@ -1,19 +1,18 @@
1
1
  """Utility functions for document handling.
2
2
 
3
3
  Provides helper functions for URL sanitization, naming conventions,
4
- and canonical key generation used throughout the document system.
4
+ canonical key generation, and hash validation used throughout the document system.
5
5
  """
6
6
 
7
7
  import re
8
- from typing import Any, Iterable, Type
8
+ from collections.abc import Iterable
9
+ from typing import Any
9
10
  from urllib.parse import urlparse
10
11
 
11
12
 
12
13
  def sanitize_url(url: str) -> str:
13
14
  """Sanitize URL or query string for use in filenames.
14
15
 
15
- @public
16
-
17
16
  Removes or replaces characters that are invalid in filenames.
18
17
 
19
18
  Args:
@@ -63,15 +62,13 @@ def camel_to_snake(name: str) -> str:
63
62
 
64
63
 
65
64
  def canonical_name_key(
66
- obj_or_name: Type[Any] | str,
65
+ obj_or_name: type[Any] | str,
67
66
  *,
68
67
  max_parent_suffixes: int = 3,
69
68
  extra_suffixes: Iterable[str] = (),
70
69
  ) -> str:
71
70
  """Produce a canonical snake_case key from a class or name.
72
71
 
73
- @public
74
-
75
72
  Process:
76
73
  1) Starting with the class name (or given string),
77
74
  2) Stripping any trailing parent class names (up to `max_parent_suffixes` from the MRO),
@@ -115,3 +112,61 @@ def canonical_name_key(
115
112
  break
116
113
 
117
114
  return camel_to_snake(name)
115
+
116
+
117
+ def is_document_sha256(value: str) -> bool:
118
+ """Check if a string is a valid base32-encoded SHA256 hash with proper entropy.
119
+
120
+ This function validates that a string is not just formatted like a SHA256 hash,
121
+ but actually has the entropy characteristics of a real hash. It checks:
122
+ 1. Correct length (52 characters without padding)
123
+ 2. Valid base32 characters (A-Z, 2-7)
124
+ 3. Sufficient entropy (at least 8 unique characters)
125
+
126
+ The entropy check prevents false positives like 'AAAAAAA...AAA' from being
127
+ identified as valid document hashes.
128
+
129
+ Args:
130
+ value: String to check if it's a document SHA256 hash.
131
+
132
+ Returns:
133
+ True if the string appears to be a real base32-encoded SHA256 hash,
134
+ False otherwise.
135
+
136
+ Examples:
137
+ >>> # Real SHA256 hash
138
+ >>> is_document_sha256("P3AEMA2PSYILKFYVBUALJLMIYWVZIS2QDI3S5VTMD2X7SOODF2YQ")
139
+ True
140
+
141
+ >>> # Too uniform - lacks entropy
142
+ >>> is_document_sha256("A" * 52)
143
+ False
144
+
145
+ >>> # Wrong length
146
+ >>> is_document_sha256("ABC123")
147
+ False
148
+
149
+ >>> # Invalid characters
150
+ >>> is_document_sha256("a" * 52) # lowercase
151
+ False
152
+ """
153
+ # Check basic format: exactly 52 uppercase base32 characters
154
+ try:
155
+ if not value or len(value) != 52:
156
+ return False
157
+ except (TypeError, AttributeError):
158
+ return False
159
+
160
+ # Check if all characters are valid base32 (A-Z, 2-7)
161
+ try:
162
+ if not re.match(r"^[A-Z2-7]{52}$", value):
163
+ return False
164
+ except TypeError:
165
+ # re.match raises TypeError for non-string types like bytes
166
+ return False
167
+
168
+ # Check entropy: real SHA256 hashes have high entropy
169
+ # Require at least 8 unique characters (out of 32 possible in base32)
170
+ # This prevents patterns like "AAAAAAA..." from being identified as real hashes
171
+ unique_chars = len(set(value))
172
+ return unique_chars >= 8
@@ -1,97 +1,45 @@
1
1
  """Exception hierarchy for AI Pipeline Core.
2
2
 
3
- @public
4
-
5
3
  This module defines the exception hierarchy used throughout the AI Pipeline Core library.
6
4
  All exceptions inherit from PipelineCoreError, providing a consistent error handling interface.
7
5
  """
8
6
 
9
7
 
10
8
  class PipelineCoreError(Exception):
11
- """Base exception for all AI Pipeline Core errors.
12
-
13
- @public
14
- """
15
-
16
- pass
9
+ """Base exception for all AI Pipeline Core errors."""
17
10
 
18
11
 
19
12
  class DocumentError(PipelineCoreError):
20
- """Base exception for document-related errors.
21
-
22
- @public
23
- """
24
-
25
- pass
13
+ """Base exception for document-related errors."""
26
14
 
27
15
 
28
16
  class DocumentValidationError(DocumentError):
29
- """Raised when document validation fails.
30
-
31
- @public
32
- """
33
-
34
- pass
17
+ """Raised when document validation fails."""
35
18
 
36
19
 
37
20
  class DocumentSizeError(DocumentValidationError):
38
- """Raised when document content exceeds MAX_CONTENT_SIZE limit.
39
-
40
- @public
41
- """
42
-
43
- pass
21
+ """Raised when document content exceeds MAX_CONTENT_SIZE limit."""
44
22
 
45
23
 
46
24
  class DocumentNameError(DocumentValidationError):
47
- """Raised when document name contains invalid characters or patterns.
48
-
49
- @public
50
- """
51
-
52
- pass
25
+ """Raised when document name contains invalid characters or patterns."""
53
26
 
54
27
 
55
28
  class LLMError(PipelineCoreError):
56
- """Raised when LLM generation fails after all retries.
57
-
58
- @public
59
- """
60
-
61
- pass
29
+ """Raised when LLM generation fails after all retries."""
62
30
 
63
31
 
64
32
  class PromptError(PipelineCoreError):
65
- """Base exception for prompt template errors.
66
-
67
- @public
68
- """
69
-
70
- pass
33
+ """Base exception for prompt template errors."""
71
34
 
72
35
 
73
36
  class PromptRenderError(PromptError):
74
- """Raised when Jinja2 template rendering fails.
75
-
76
- @public
77
- """
78
-
79
- pass
37
+ """Raised when Jinja2 template rendering fails."""
80
38
 
81
39
 
82
40
  class PromptNotFoundError(PromptError):
83
- """Raised when prompt template file is not found in search paths.
84
-
85
- @public
86
- """
87
-
88
- pass
41
+ """Raised when prompt template file is not found in search paths."""
89
42
 
90
43
 
91
44
  class MimeTypeError(DocumentError):
92
- """Raised when MIME type detection or validation fails.
93
-
94
- @public
95
- """
96
-
97
- pass
45
+ """Raised when MIME type detection or validation fails."""