opik 1.9.39__py3-none-any.whl → 1.9.86__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (195) hide show
  1. opik/api_objects/attachment/attachment_context.py +36 -0
  2. opik/api_objects/attachment/attachments_extractor.py +153 -0
  3. opik/api_objects/attachment/client.py +1 -0
  4. opik/api_objects/attachment/converters.py +2 -0
  5. opik/api_objects/attachment/decoder.py +18 -0
  6. opik/api_objects/attachment/decoder_base64.py +83 -0
  7. opik/api_objects/attachment/decoder_helpers.py +137 -0
  8. opik/api_objects/constants.py +2 -0
  9. opik/api_objects/dataset/dataset.py +133 -40
  10. opik/api_objects/dataset/rest_operations.py +2 -0
  11. opik/api_objects/experiment/experiment.py +6 -0
  12. opik/api_objects/helpers.py +8 -4
  13. opik/api_objects/local_recording.py +6 -5
  14. opik/api_objects/observation_data.py +101 -0
  15. opik/api_objects/opik_client.py +78 -45
  16. opik/api_objects/opik_query_language.py +9 -3
  17. opik/api_objects/prompt/chat/chat_prompt.py +18 -1
  18. opik/api_objects/prompt/client.py +8 -1
  19. opik/api_objects/span/span_data.py +3 -88
  20. opik/api_objects/threads/threads_client.py +7 -4
  21. opik/api_objects/trace/trace_data.py +3 -74
  22. opik/api_objects/validation_helpers.py +3 -3
  23. opik/cli/exports/__init__.py +131 -0
  24. opik/cli/exports/dataset.py +278 -0
  25. opik/cli/exports/experiment.py +784 -0
  26. opik/cli/exports/project.py +685 -0
  27. opik/cli/exports/prompt.py +578 -0
  28. opik/cli/exports/utils.py +406 -0
  29. opik/cli/harbor.py +39 -0
  30. opik/cli/imports/__init__.py +439 -0
  31. opik/cli/imports/dataset.py +143 -0
  32. opik/cli/imports/experiment.py +1192 -0
  33. opik/cli/imports/project.py +262 -0
  34. opik/cli/imports/prompt.py +177 -0
  35. opik/cli/imports/utils.py +280 -0
  36. opik/cli/main.py +14 -12
  37. opik/config.py +12 -1
  38. opik/datetime_helpers.py +12 -0
  39. opik/decorator/arguments_helpers.py +4 -1
  40. opik/decorator/base_track_decorator.py +111 -37
  41. opik/decorator/context_manager/span_context_manager.py +5 -1
  42. opik/decorator/generator_wrappers.py +5 -4
  43. opik/decorator/span_creation_handler.py +13 -4
  44. opik/evaluation/engine/engine.py +111 -28
  45. opik/evaluation/engine/evaluation_tasks_executor.py +71 -19
  46. opik/evaluation/evaluator.py +12 -0
  47. opik/evaluation/metrics/conversation/llm_judges/conversational_coherence/metric.py +3 -1
  48. opik/evaluation/metrics/conversation/llm_judges/session_completeness/metric.py +3 -1
  49. opik/evaluation/metrics/conversation/llm_judges/user_frustration/metric.py +3 -1
  50. opik/evaluation/metrics/heuristics/equals.py +11 -7
  51. opik/evaluation/metrics/llm_judges/answer_relevance/metric.py +3 -1
  52. opik/evaluation/metrics/llm_judges/context_precision/metric.py +3 -1
  53. opik/evaluation/metrics/llm_judges/context_recall/metric.py +3 -1
  54. opik/evaluation/metrics/llm_judges/factuality/metric.py +1 -1
  55. opik/evaluation/metrics/llm_judges/g_eval/metric.py +3 -1
  56. opik/evaluation/metrics/llm_judges/hallucination/metric.py +3 -1
  57. opik/evaluation/metrics/llm_judges/moderation/metric.py +3 -1
  58. opik/evaluation/metrics/llm_judges/structure_output_compliance/metric.py +3 -1
  59. opik/evaluation/metrics/llm_judges/syc_eval/metric.py +4 -2
  60. opik/evaluation/metrics/llm_judges/trajectory_accuracy/metric.py +3 -1
  61. opik/evaluation/metrics/llm_judges/usefulness/metric.py +3 -1
  62. opik/evaluation/metrics/ragas_metric.py +43 -23
  63. opik/evaluation/models/litellm/litellm_chat_model.py +7 -2
  64. opik/evaluation/models/litellm/util.py +4 -20
  65. opik/evaluation/models/models_factory.py +19 -5
  66. opik/evaluation/rest_operations.py +3 -3
  67. opik/evaluation/threads/helpers.py +3 -2
  68. opik/file_upload/file_uploader.py +13 -0
  69. opik/file_upload/upload_options.py +2 -0
  70. opik/integrations/adk/legacy_opik_tracer.py +9 -11
  71. opik/integrations/adk/opik_tracer.py +2 -2
  72. opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +2 -2
  73. opik/integrations/dspy/callback.py +100 -14
  74. opik/integrations/dspy/parsers.py +168 -0
  75. opik/integrations/harbor/__init__.py +17 -0
  76. opik/integrations/harbor/experiment_service.py +269 -0
  77. opik/integrations/harbor/opik_tracker.py +528 -0
  78. opik/integrations/haystack/opik_tracer.py +2 -2
  79. opik/integrations/langchain/__init__.py +15 -2
  80. opik/integrations/langchain/langgraph_tracer_injector.py +88 -0
  81. opik/integrations/langchain/opik_tracer.py +258 -160
  82. opik/integrations/langchain/provider_usage_extractors/langchain_run_helpers/helpers.py +7 -4
  83. opik/integrations/llama_index/callback.py +43 -6
  84. opik/integrations/openai/agents/opik_tracing_processor.py +8 -10
  85. opik/integrations/openai/opik_tracker.py +99 -4
  86. opik/integrations/openai/videos/__init__.py +9 -0
  87. opik/integrations/openai/videos/binary_response_write_to_file_decorator.py +88 -0
  88. opik/integrations/openai/videos/videos_create_decorator.py +159 -0
  89. opik/integrations/openai/videos/videos_download_decorator.py +110 -0
  90. opik/message_processing/batching/base_batcher.py +14 -21
  91. opik/message_processing/batching/batch_manager.py +22 -10
  92. opik/message_processing/batching/batchers.py +32 -40
  93. opik/message_processing/batching/flushing_thread.py +0 -3
  94. opik/message_processing/emulation/emulator_message_processor.py +36 -1
  95. opik/message_processing/emulation/models.py +21 -0
  96. opik/message_processing/messages.py +9 -0
  97. opik/message_processing/preprocessing/__init__.py +0 -0
  98. opik/message_processing/preprocessing/attachments_preprocessor.py +70 -0
  99. opik/message_processing/preprocessing/batching_preprocessor.py +53 -0
  100. opik/message_processing/preprocessing/constants.py +1 -0
  101. opik/message_processing/preprocessing/file_upload_preprocessor.py +38 -0
  102. opik/message_processing/preprocessing/preprocessor.py +36 -0
  103. opik/message_processing/processors/__init__.py +0 -0
  104. opik/message_processing/processors/attachments_extraction_processor.py +146 -0
  105. opik/message_processing/{message_processors.py → processors/message_processors.py} +15 -1
  106. opik/message_processing/{message_processors_chain.py → processors/message_processors_chain.py} +3 -2
  107. opik/message_processing/{online_message_processor.py → processors/online_message_processor.py} +11 -9
  108. opik/message_processing/queue_consumer.py +4 -2
  109. opik/message_processing/streamer.py +71 -33
  110. opik/message_processing/streamer_constructors.py +36 -8
  111. opik/plugins/pytest/experiment_runner.py +1 -1
  112. opik/plugins/pytest/hooks.py +5 -3
  113. opik/rest_api/__init__.py +42 -0
  114. opik/rest_api/datasets/client.py +321 -123
  115. opik/rest_api/datasets/raw_client.py +470 -145
  116. opik/rest_api/experiments/client.py +26 -0
  117. opik/rest_api/experiments/raw_client.py +26 -0
  118. opik/rest_api/llm_provider_key/client.py +4 -4
  119. opik/rest_api/llm_provider_key/raw_client.py +4 -4
  120. opik/rest_api/llm_provider_key/types/provider_api_key_write_provider.py +2 -1
  121. opik/rest_api/manual_evaluation/client.py +101 -0
  122. opik/rest_api/manual_evaluation/raw_client.py +172 -0
  123. opik/rest_api/optimizations/client.py +0 -166
  124. opik/rest_api/optimizations/raw_client.py +0 -248
  125. opik/rest_api/projects/client.py +9 -0
  126. opik/rest_api/projects/raw_client.py +13 -0
  127. opik/rest_api/projects/types/project_metric_request_public_metric_type.py +4 -0
  128. opik/rest_api/prompts/client.py +130 -2
  129. opik/rest_api/prompts/raw_client.py +175 -0
  130. opik/rest_api/traces/client.py +101 -0
  131. opik/rest_api/traces/raw_client.py +120 -0
  132. opik/rest_api/types/__init__.py +50 -0
  133. opik/rest_api/types/audio_url.py +19 -0
  134. opik/rest_api/types/audio_url_public.py +19 -0
  135. opik/rest_api/types/audio_url_write.py +19 -0
  136. opik/rest_api/types/automation_rule_evaluator.py +38 -2
  137. opik/rest_api/types/automation_rule_evaluator_object_object_public.py +33 -2
  138. opik/rest_api/types/automation_rule_evaluator_public.py +33 -2
  139. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python.py +22 -0
  140. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_public.py +22 -0
  141. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_write.py +22 -0
  142. opik/rest_api/types/automation_rule_evaluator_update.py +27 -1
  143. opik/rest_api/types/automation_rule_evaluator_update_span_user_defined_metric_python.py +22 -0
  144. opik/rest_api/types/automation_rule_evaluator_write.py +27 -1
  145. opik/rest_api/types/dataset.py +2 -0
  146. opik/rest_api/types/dataset_item.py +1 -1
  147. opik/rest_api/types/dataset_item_batch.py +4 -0
  148. opik/rest_api/types/dataset_item_changes_public.py +5 -0
  149. opik/rest_api/types/dataset_item_compare.py +1 -1
  150. opik/rest_api/types/dataset_item_filter.py +4 -0
  151. opik/rest_api/types/dataset_item_page_compare.py +0 -1
  152. opik/rest_api/types/dataset_item_page_public.py +0 -1
  153. opik/rest_api/types/dataset_item_public.py +1 -1
  154. opik/rest_api/types/dataset_public.py +2 -0
  155. opik/rest_api/types/dataset_version_public.py +10 -0
  156. opik/rest_api/types/dataset_version_summary.py +46 -0
  157. opik/rest_api/types/dataset_version_summary_public.py +46 -0
  158. opik/rest_api/types/experiment.py +9 -0
  159. opik/rest_api/types/experiment_public.py +9 -0
  160. opik/rest_api/types/group_content_with_aggregations.py +1 -0
  161. opik/rest_api/types/llm_as_judge_message_content.py +2 -0
  162. opik/rest_api/types/llm_as_judge_message_content_public.py +2 -0
  163. opik/rest_api/types/llm_as_judge_message_content_write.py +2 -0
  164. opik/rest_api/types/manual_evaluation_request_entity_type.py +1 -1
  165. opik/rest_api/types/project.py +1 -0
  166. opik/rest_api/types/project_detailed.py +1 -0
  167. opik/rest_api/types/project_metric_response_public_metric_type.py +4 -0
  168. opik/rest_api/types/project_reference.py +31 -0
  169. opik/rest_api/types/project_reference_public.py +31 -0
  170. opik/rest_api/types/project_stats_summary_item.py +1 -0
  171. opik/rest_api/types/prompt_version.py +1 -0
  172. opik/rest_api/types/prompt_version_detail.py +1 -0
  173. opik/rest_api/types/prompt_version_page_public.py +5 -0
  174. opik/rest_api/types/prompt_version_public.py +1 -0
  175. opik/rest_api/types/prompt_version_update.py +33 -0
  176. opik/rest_api/types/provider_api_key.py +5 -1
  177. opik/rest_api/types/provider_api_key_provider.py +2 -1
  178. opik/rest_api/types/provider_api_key_public.py +5 -1
  179. opik/rest_api/types/provider_api_key_public_provider.py +2 -1
  180. opik/rest_api/types/service_toggles_config.py +11 -1
  181. opik/rest_api/types/span_user_defined_metric_python_code.py +20 -0
  182. opik/rest_api/types/span_user_defined_metric_python_code_public.py +20 -0
  183. opik/rest_api/types/span_user_defined_metric_python_code_write.py +20 -0
  184. opik/types.py +36 -0
  185. opik/validation/chat_prompt_messages.py +241 -0
  186. opik/validation/feedback_score.py +3 -3
  187. opik/validation/validator.py +28 -0
  188. {opik-1.9.39.dist-info → opik-1.9.86.dist-info}/METADATA +7 -7
  189. {opik-1.9.39.dist-info → opik-1.9.86.dist-info}/RECORD +193 -142
  190. opik/cli/export.py +0 -791
  191. opik/cli/import_command.py +0 -575
  192. {opik-1.9.39.dist-info → opik-1.9.86.dist-info}/WHEEL +0 -0
  193. {opik-1.9.39.dist-info → opik-1.9.86.dist-info}/entry_points.txt +0 -0
  194. {opik-1.9.39.dist-info → opik-1.9.86.dist-info}/licenses/LICENSE +0 -0
  195. {opik-1.9.39.dist-info → opik-1.9.86.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,36 @@
1
+ import dataclasses
2
+ from typing import Literal
3
+
4
+ from . import attachment
5
+
6
+
7
@dataclasses.dataclass
class AttachmentWithContext:
    """
    An attachment bundled with the identity of the entity that owns it.

    Instances tie a single attachment to the span or trace it was found in,
    together with the project that entity lives in and a short label saying
    where the attachment came from.

    Attributes:
        attachment_data: The attachment object carrying the actual data.
        entity_type: Kind of entity the attachment is linked to; either
            "span" or "trace".
        entity_id: Unique identifier of that entity.
        project_name: Name of the project the attachment and its entity
            belong to.
        context: Short description of the attachment's origin or purpose.
    """

    attachment_data: attachment.Attachment
    entity_type: Literal["span", "trace"]
    entity_id: str
    project_name: str
    context: str
@@ -0,0 +1,153 @@
1
+ import re
2
+ from typing import Dict, Any, Literal, List, NamedTuple
3
+
4
+ from . import attachment, attachment_context, decoder_base64
5
+
6
+
7
class ExtractionResult(NamedTuple):
    """Outcome of scanning one value for embedded attachments.

    Fields:
        attachments: Attachments that were decoded out of the value.
        sanitized_data: The value with each decoded attachment replaced by a
            placeholder; the unchanged value when nothing was extracted.
    """

    attachments: List[attachment.Attachment]
    sanitized_data: Any
10
+
11
+
12
class AttachmentsExtractor:
    """
    Extracts and processes attachments embedded as Base64 strings within data structures.

    The extractor walks strings, dictionaries and lists, searches every string
    for Base64 runs of at least a configured minimum length, tries to decode
    each run into an attachment and, when that succeeds, replaces the run with
    a ``[<file_name>]`` placeholder in the sanitized copy of the data.
    """

    def __init__(self, min_attachment_size: int):
        """
        Store the minimum attachment size and compile the Base64 search pattern.

        Args:
            min_attachment_size: The minimum size of the attachment in characters
                for it to be considered valid. This ensures that only large enough
                base64 strings are matched to minimize false positives.
        """
        self._min_attachment_size = min_attachment_size
        self.decoder = decoder_base64.Base64AttachmentDecoder()

        # Pattern to match base64 strings (can be embedded in text).
        # Base64 is made of 4-character groups; require at least one group so
        # the pattern can never match the empty string at every position.
        min_base64_groups = max(1, int(min_attachment_size // 4))
        base64_pattern = (
            r"(?:[A-Za-z0-9+/]{4}){"
            + str(min_base64_groups)
            + ",}(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?"
        )
        self.pattern = re.compile(base64_pattern)

    def extract_and_replace(
        self,
        data: Dict[str, Any],
        entity_type: Literal["span", "trace"],
        entity_id: str,
        project_name: str,
        context: Literal["input", "output", "metadata"],
    ) -> List[attachment_context.AttachmentWithContext]:
        """
        Extract attachments from every top-level value of ``data``.

        ``data`` is modified in place: a value is overwritten with its
        sanitized version whenever at least one attachment was extracted
        from it. Values without attachments are left untouched.

        Args:
            data: Mapping whose values are scanned (recursively) for attachments.
            entity_type: Type of the entity owning the data ("span" or "trace").
            entity_id: Identifier of the owning entity.
            project_name: Name of the project the entity belongs to.
            context: Where the data comes from (input, output, or metadata).

        Returns:
            The extracted attachments, each paired with the entity information.
        """
        attachments: List[attachment_context.AttachmentWithContext] = []
        # Only existing keys are reassigned, so mutating during iteration is safe.
        for key, value in data.items():
            extraction_result = self._try_extract_attachments(value, context)
            if not extraction_result.attachments:
                continue

            # replace the original value with the sanitized one and collect attachments
            data[key] = extraction_result.sanitized_data
            attachments.extend(
                attachment_context.AttachmentWithContext(
                    attachment_data=extracted_attachment,
                    entity_type=entity_type,
                    entity_id=entity_id,
                    project_name=project_name,
                    context=context,
                )
                for extracted_attachment in extraction_result.attachments
            )

        return attachments

    def _try_extract_attachments(
        self, data: Any, context: Literal["input", "output", "metadata"]
    ) -> ExtractionResult:
        """
        Recursively extract attachments from data that can be a string, dict, list, or other type.

        Args:
            data: The data to process (can be str, dict, list, or other types)
            context: The context where the data is located (input, output, or metadata)

        Returns:
            ExtractionResult with extracted attachments and sanitized data
        """
        if isinstance(data, str):
            return self._extract_from_string(data, context)
        if isinstance(data, dict):
            return self._extract_from_dict(data, context)
        if isinstance(data, list):
            return self._extract_from_list(data, context)

        # For other types (int, bool, None, etc.), return as-is
        return ExtractionResult(attachments=[], sanitized_data=data)

    def _extract_from_string(
        self, data: str, context: Literal["input", "output", "metadata"]
    ) -> ExtractionResult:
        """
        Extract attachments from a string value.

        Every Base64 run that decodes into an attachment is replaced by its
        own ``[<file_name>]`` placeholder. Replacement is done by match
        position rather than by text, so repeated payloads each keep a
        placeholder pointing at their own attachment, and a payload that
        also occurs inside another, longer run cannot corrupt that run.
        """
        if len(data) < self._min_attachment_size:
            # skip short strings
            return ExtractionResult(attachments=[], sanitized_data=data)

        attachments: List[attachment.Attachment] = []
        pieces: List[str] = []
        cursor = 0
        for match in self.pattern.finditer(data):
            decoded_attachment = self.decoder.decode(match.group(), context)
            if decoded_attachment is None:
                # not an attachment - the text stays in place
                continue

            attachments.append(decoded_attachment)
            pieces.append(data[cursor : match.start()])
            pieces.append(f"[{decoded_attachment.file_name}]")
            cursor = match.end()

        if not attachments:
            return ExtractionResult(attachments=[], sanitized_data=data)

        pieces.append(data[cursor:])
        return ExtractionResult(
            attachments=attachments, sanitized_data="".join(pieces)
        )

    def _extract_from_dict(
        self, data: Dict[str, Any], context: Literal["input", "output", "metadata"]
    ) -> ExtractionResult:
        """Recursively extract attachments from a dictionary, returning a sanitized copy."""
        all_attachments: List[attachment.Attachment] = []
        sanitized_dict = {}

        for key, value in data.items():
            result = self._try_extract_attachments(value, context)
            sanitized_dict[key] = result.sanitized_data
            all_attachments.extend(result.attachments)

        return ExtractionResult(
            attachments=all_attachments, sanitized_data=sanitized_dict
        )

    def _extract_from_list(
        self, data: List[Any], context: Literal["input", "output", "metadata"]
    ) -> ExtractionResult:
        """Recursively extract attachments from a list, returning a sanitized copy."""
        all_attachments: List[attachment.Attachment] = []
        sanitized_list = []

        for item in data:
            result = self._try_extract_attachments(item, context)
            sanitized_list.append(result.sanitized_data)
            all_attachments.extend(result.attachments)

        return ExtractionResult(
            attachments=all_attachments, sanitized_data=sanitized_list
        )
@@ -206,6 +206,7 @@ class AttachmentClient:
206
206
  entity_id=entity_id,
207
207
  project_name=project_name,
208
208
  encoded_url_override=encoded_url_override,
209
+ delete_after_upload=False,
209
210
  )
210
211
 
211
212
  file_uploader.upload_attachment(
@@ -13,6 +13,7 @@ def attachment_to_message(
13
13
  entity_id: str,
14
14
  project_name: str,
15
15
  url_override: str,
16
+ delete_after_upload: bool = False,
16
17
  ) -> messages.CreateAttachmentMessage:
17
18
  if attachment_data.data is None:
18
19
  raise ValueError("Attachment data cannot be None")
@@ -32,6 +33,7 @@ def attachment_to_message(
32
33
  entity_id=entity_id,
33
34
  project_name=project_name,
34
35
  encoded_url_override=base_url_path,
36
+ delete_after_upload=delete_after_upload,
35
37
  )
36
38
 
37
39
 
@@ -0,0 +1,18 @@
1
+ import abc
2
+ from typing import Any, Optional
3
+
4
+ from . import attachment
5
+
6
+
7
class AttachmentDecoder(abc.ABC):
    """
    Interface for turning raw attachment data into an `Attachment`.

    Concrete decoders subclass this and implement `decode` with the logic
    for the specific encoding or format they support.
    """

    @abc.abstractmethod
    def decode(self, raw_data: str, **kwargs: Any) -> Optional[attachment.Attachment]:
        """Decode `raw_data` into an `Attachment`; the result may be None."""
@@ -0,0 +1,83 @@
1
+ import base64
2
+ import binascii
3
+ import logging
4
+ import tempfile
5
+ from typing import Any, Optional, Literal
6
+
7
+ from . import attachment, decoder, decoder_helpers
8
+
9
+ LOGGER = logging.getLogger(__name__)
10
+
11
+
12
class Base64AttachmentDecoder(decoder.AttachmentDecoder):
    """Decodes base64 encoded attachment data.

    This decoder decodes base64 strings, detects MIME types from content, and creates Attachment objects.
    """

    def decode(
        self,
        raw_data: str,
        context: Literal["input", "output", "metadata"] = "input",
        **kwargs: Any,
    ) -> Optional[attachment.Attachment]:
        """Decode base64 encoded data into an Attachment object.

        The decoded bytes are written to a temporary file that is NOT deleted
        by this method; the returned Attachment points at that file.

        Args:
            raw_data: Base64 encoded string data
            context: Context string for filename generation.

        Returns:
            Attachment object with decoded data, or None if decoding fails or type is not recognizable
        """
        if not isinstance(raw_data, str):
            LOGGER.warning("Attachment data is not a string, skipping.")
            return None

        try:
            # Decode base64 string to bytes
            decoded_bytes = base64.b64decode(raw_data, validate=True)

            # Detect MIME type from content
            mime_type = decoder_helpers.detect_mime_type(decoded_bytes)

            # Skip if not a recognizable file type
            if not mime_type or mime_type in ("application/octet-stream", "text/plain"):
                LOGGER.debug("Attachment type is not recognized, skipping.")
                return None

            # Get file extension from the MIME type
            extension = decoder_helpers.get_file_extension(mime_type)

            # Generate filename
            file_name = decoder_helpers.create_attachment_filename(
                context, extension=extension
            )

            # Save decoded bytes to a temporary file
            temp_path = self._write_temp_file(decoded_bytes, extension)

            # Return Attachment object with a file path
            return attachment.Attachment(
                data=temp_path, file_name=file_name, content_type=mime_type
            )

        except (ValueError, binascii.Error) as e:
            LOGGER.debug(
                "Failed to decode attachment data, reason: invalid base64. Reason: %s",
                e,
                exc_info=True,
            )
            # Not valid base64, return None
            return None
        except Exception as ex:
            LOGGER.warning(
                "Failed to decode attachment data, reason: %s", ex, exc_info=True
            )
            # Unexpected error, return None to avoid crashing the pipeline
            return None

    @staticmethod
    def _write_temp_file(content: bytes, extension: str) -> str:
        """Write `content` to a persistent temporary file and return its path."""
        # Imported locally so this block does not depend on the module's import list.
        import os

        # tempfile does not insert a dot before the suffix, and the extension
        # comes without one, so add it here to get "tmpXXXX.png".
        temp_file = tempfile.NamedTemporaryFile(
            mode="wb", delete=False, suffix=f".{extension}"
        )
        try:
            # The context manager flushes and closes the handle even on error.
            with temp_file:
                temp_file.write(content)
        except Exception:
            # Do not leave a partially written file behind; cleanup is
            # best-effort and the original error is what gets reported.
            try:
                os.unlink(temp_file.name)
            except OSError:
                pass
            raise

        return temp_file.name
@@ -0,0 +1,137 @@
1
+ import mimetypes
2
+ import random
3
+ import time
4
+ from typing import Optional
5
+
6
+
7
# Matches attachment file names of the form
# "<input|output|metadata>-attachment-<digits>-<digits>-sdk.<extension>".
ATTACHMENT_FILE_NAME_REGEX = r"(?:input|output|metadata)-attachment-\d+-\d+-sdk\.\w+"
# Matches the same file name wrapped in square brackets; group 1 captures
# the bare file name without the brackets.
ATTACHMENT_FILE_NAME_PLACEHOLDER_REGEX = r"\[(" + ATTACHMENT_FILE_NAME_REGEX + r")\]"
12
+
13
+
14
def get_file_extension(mime_type: str) -> str:
    """Convert MIME type to file extension.

    Mirrors the Java getFileExtension() method in AttachmentStripperService.

    The `mimetypes` registry is consulted first; when it has no entry, the
    extension is derived from the MIME subtype. The result is never empty:
    "bin" is returned whenever no usable extension can be determined.

    Args:
        mime_type: The MIME type (e.g., "image/png", "application/pdf")

    Returns:
        File extension without a leading dot (e.g., "png", "pdf")
    """
    if not mime_type:
        return "bin"

    # Try to get extension from mimetypes module
    extension = mimetypes.guess_extension(mime_type, strict=False)
    if extension:
        # Remove the leading dot
        extension = extension.lstrip(".")
        # Handle special cases where mimetypes returns less common extensions
        if mime_type == "image/jpeg" and extension == "jpe":
            return "jpg"
        return extension

    # Fallback: extract from the MIME type (e.g., "image/png" -> "png")
    if "/" in mime_type:
        subtype = mime_type.split("/")[1]
        # Drop a structured-syntax suffix ("svg+xml" -> "svg") and then any
        # parameters ("jpeg; charset=utf-8" -> "jpeg").
        subtype = subtype.split("+")[0].split(";")[0].strip()
        # An empty subtype (e.g. "image/") would yield a file name without an
        # extension, so fall through to the generic default instead.
        if subtype:
            return subtype

    return "bin"
50
+
51
+
52
def detect_mime_type(data: bytes) -> Optional[str]:
    """Detect MIME type from byte content using magic bytes.

    This provides basic MIME type detection similar to Apache Tika in the Java implementation.
    It checks common file format magic bytes, in this order: PNG, JPEG, GIF,
    PDF, WebP, SVG, MP4, JSON.

    Args:
        data: The byte data to analyze

    Returns:
        Detected MIME type string, or "application/octet-stream" if unknown
        (including any input shorter than 4 bytes)
    """
    # Imported locally so this block does not depend on the module's import list.
    import codecs

    if len(data) < 4:
        return "application/octet-stream"

    # PNG
    if data[:8] == b"\x89PNG\r\n\x1a\n":
        return "image/png"

    # JPEG (start-of-image and end-of-image markers)
    if data[:2] == b"\xff\xd8" and data[-2:] == b"\xff\xd9":
        return "image/jpeg"

    # GIF
    if data[:6] in (b"GIF87a", b"GIF89a"):
        return "image/gif"

    # PDF
    if data[:4] == b"%PDF":
        return "application/pdf"

    # WebP
    if data[:4] == b"RIFF" and data[8:12] == b"WEBP":
        return "image/webp"

    # SVG (XML-based, check for SVG tag); undecodable bytes are ignored,
    # so this decode cannot fail
    if "<svg" in data[:1024].decode("utf-8", errors="ignore").lower():
        return "image/svg+xml"

    # MP4
    if len(data) >= 12 and data[4:8] == b"ftyp":
        return "video/mp4"

    # JSON: the first 100 bytes must be valid UTF-8 starting with "{" or "[".
    # The 100-byte cut may land inside a multi-byte character; an incremental
    # decoder keeps the check strict while tolerating such a truncated tail.
    head = data[:100]
    try:
        text = codecs.getincrementaldecoder("utf-8")("strict").decode(
            head, final=len(data) <= len(head)
        )
    except UnicodeDecodeError:
        text = ""
    if text.strip().startswith(("{", "[")):
        return "application/json"

    # Default to octet-stream for unknown types
    return "application/octet-stream"
110
+
111
+
112
def create_attachment_filename(context: str, extension: str) -> str:
    """
    Build an attachment filename from a context label and a file extension.

    The name combines the context, a random number, the current time in
    milliseconds and a fixed "-sdk" marker, followed by the extension.

    Args:
        context: Label used as the first part of the name (e.g., "input",
            "output", or "metadata").
        extension: File extension placed after the final dot (e.g., "png",
            "jpg", "txt").

    Returns:
        A filename string in the format
        "{context}-attachment-{random_prefix}-{timestamp}-sdk.{extension}".
    """
    # The backend has the following naming convention: r"\\[((?:input|output|metadata)-attachment-\\d+-\\d+\\.\\w+)\\]"
    # Example: [input-attachment-1-1704067200000.png]

    millis_since_epoch = int(round(time.time() * 1000))
    # a wide random range keeps names generated in the same millisecond apart
    unique_number = random.randint(1, 99999999)
    return "{}-attachment-{}-{}-sdk.{}".format(
        context, unique_number, millis_since_epoch, extension
    )
@@ -5,3 +5,5 @@ FEEDBACK_SCORES_MAX_BATCH_SIZE = 1000
5
5
  EXPERIMENT_ITEMS_MAX_BATCH_SIZE = 1000
6
6
  DATASET_ITEMS_MAX_BATCH_SIZE = 1000
7
7
  DELETE_TRACE_BATCH_SIZE = 1000
8
+
9
+ DATASET_STREAM_BATCH_SIZE = 2000