opik 1.9.41__py3-none-any.whl → 1.9.86__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opik/api_objects/attachment/attachment_context.py +36 -0
- opik/api_objects/attachment/attachments_extractor.py +153 -0
- opik/api_objects/attachment/client.py +1 -0
- opik/api_objects/attachment/converters.py +2 -0
- opik/api_objects/attachment/decoder.py +18 -0
- opik/api_objects/attachment/decoder_base64.py +83 -0
- opik/api_objects/attachment/decoder_helpers.py +137 -0
- opik/api_objects/constants.py +2 -0
- opik/api_objects/dataset/dataset.py +133 -40
- opik/api_objects/dataset/rest_operations.py +2 -0
- opik/api_objects/experiment/experiment.py +6 -0
- opik/api_objects/helpers.py +8 -4
- opik/api_objects/local_recording.py +6 -5
- opik/api_objects/observation_data.py +101 -0
- opik/api_objects/opik_client.py +78 -45
- opik/api_objects/opik_query_language.py +9 -3
- opik/api_objects/prompt/chat/chat_prompt.py +18 -1
- opik/api_objects/prompt/client.py +8 -1
- opik/api_objects/span/span_data.py +3 -88
- opik/api_objects/threads/threads_client.py +7 -4
- opik/api_objects/trace/trace_data.py +3 -74
- opik/api_objects/validation_helpers.py +3 -3
- opik/cli/exports/__init__.py +131 -0
- opik/cli/exports/dataset.py +278 -0
- opik/cli/exports/experiment.py +784 -0
- opik/cli/exports/project.py +685 -0
- opik/cli/exports/prompt.py +578 -0
- opik/cli/exports/utils.py +406 -0
- opik/cli/harbor.py +39 -0
- opik/cli/imports/__init__.py +439 -0
- opik/cli/imports/dataset.py +143 -0
- opik/cli/imports/experiment.py +1192 -0
- opik/cli/imports/project.py +262 -0
- opik/cli/imports/prompt.py +177 -0
- opik/cli/imports/utils.py +280 -0
- opik/cli/main.py +14 -12
- opik/config.py +12 -1
- opik/datetime_helpers.py +12 -0
- opik/decorator/arguments_helpers.py +4 -1
- opik/decorator/base_track_decorator.py +111 -37
- opik/decorator/context_manager/span_context_manager.py +5 -1
- opik/decorator/generator_wrappers.py +5 -4
- opik/decorator/span_creation_handler.py +13 -4
- opik/evaluation/engine/engine.py +111 -28
- opik/evaluation/engine/evaluation_tasks_executor.py +71 -19
- opik/evaluation/evaluator.py +12 -0
- opik/evaluation/metrics/conversation/llm_judges/conversational_coherence/metric.py +3 -1
- opik/evaluation/metrics/conversation/llm_judges/session_completeness/metric.py +3 -1
- opik/evaluation/metrics/conversation/llm_judges/user_frustration/metric.py +3 -1
- opik/evaluation/metrics/heuristics/equals.py +11 -7
- opik/evaluation/metrics/llm_judges/answer_relevance/metric.py +3 -1
- opik/evaluation/metrics/llm_judges/context_precision/metric.py +3 -1
- opik/evaluation/metrics/llm_judges/context_recall/metric.py +3 -1
- opik/evaluation/metrics/llm_judges/factuality/metric.py +1 -1
- opik/evaluation/metrics/llm_judges/g_eval/metric.py +3 -1
- opik/evaluation/metrics/llm_judges/hallucination/metric.py +3 -1
- opik/evaluation/metrics/llm_judges/moderation/metric.py +3 -1
- opik/evaluation/metrics/llm_judges/structure_output_compliance/metric.py +3 -1
- opik/evaluation/metrics/llm_judges/syc_eval/metric.py +4 -2
- opik/evaluation/metrics/llm_judges/trajectory_accuracy/metric.py +3 -1
- opik/evaluation/metrics/llm_judges/usefulness/metric.py +3 -1
- opik/evaluation/metrics/ragas_metric.py +43 -23
- opik/evaluation/models/litellm/litellm_chat_model.py +7 -2
- opik/evaluation/models/litellm/util.py +4 -20
- opik/evaluation/models/models_factory.py +19 -5
- opik/evaluation/rest_operations.py +3 -3
- opik/evaluation/threads/helpers.py +3 -2
- opik/file_upload/file_uploader.py +13 -0
- opik/file_upload/upload_options.py +2 -0
- opik/integrations/adk/legacy_opik_tracer.py +9 -11
- opik/integrations/adk/opik_tracer.py +2 -2
- opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +2 -2
- opik/integrations/dspy/callback.py +100 -14
- opik/integrations/dspy/parsers.py +168 -0
- opik/integrations/harbor/__init__.py +17 -0
- opik/integrations/harbor/experiment_service.py +269 -0
- opik/integrations/harbor/opik_tracker.py +528 -0
- opik/integrations/haystack/opik_tracer.py +2 -2
- opik/integrations/langchain/__init__.py +15 -2
- opik/integrations/langchain/langgraph_tracer_injector.py +88 -0
- opik/integrations/langchain/opik_tracer.py +258 -160
- opik/integrations/langchain/provider_usage_extractors/langchain_run_helpers/helpers.py +7 -4
- opik/integrations/llama_index/callback.py +43 -6
- opik/integrations/openai/agents/opik_tracing_processor.py +8 -10
- opik/integrations/openai/opik_tracker.py +99 -4
- opik/integrations/openai/videos/__init__.py +9 -0
- opik/integrations/openai/videos/binary_response_write_to_file_decorator.py +88 -0
- opik/integrations/openai/videos/videos_create_decorator.py +159 -0
- opik/integrations/openai/videos/videos_download_decorator.py +110 -0
- opik/message_processing/batching/base_batcher.py +14 -21
- opik/message_processing/batching/batch_manager.py +22 -10
- opik/message_processing/batching/batchers.py +32 -40
- opik/message_processing/batching/flushing_thread.py +0 -3
- opik/message_processing/emulation/emulator_message_processor.py +36 -1
- opik/message_processing/emulation/models.py +21 -0
- opik/message_processing/messages.py +9 -0
- opik/message_processing/preprocessing/__init__.py +0 -0
- opik/message_processing/preprocessing/attachments_preprocessor.py +70 -0
- opik/message_processing/preprocessing/batching_preprocessor.py +53 -0
- opik/message_processing/preprocessing/constants.py +1 -0
- opik/message_processing/preprocessing/file_upload_preprocessor.py +38 -0
- opik/message_processing/preprocessing/preprocessor.py +36 -0
- opik/message_processing/processors/__init__.py +0 -0
- opik/message_processing/processors/attachments_extraction_processor.py +146 -0
- opik/message_processing/{message_processors.py → processors/message_processors.py} +15 -1
- opik/message_processing/{message_processors_chain.py → processors/message_processors_chain.py} +3 -2
- opik/message_processing/{online_message_processor.py → processors/online_message_processor.py} +11 -9
- opik/message_processing/queue_consumer.py +4 -2
- opik/message_processing/streamer.py +71 -33
- opik/message_processing/streamer_constructors.py +36 -8
- opik/plugins/pytest/experiment_runner.py +1 -1
- opik/plugins/pytest/hooks.py +5 -3
- opik/rest_api/__init__.py +38 -0
- opik/rest_api/datasets/client.py +249 -148
- opik/rest_api/datasets/raw_client.py +356 -217
- opik/rest_api/experiments/client.py +26 -0
- opik/rest_api/experiments/raw_client.py +26 -0
- opik/rest_api/llm_provider_key/client.py +4 -4
- opik/rest_api/llm_provider_key/raw_client.py +4 -4
- opik/rest_api/llm_provider_key/types/provider_api_key_write_provider.py +2 -1
- opik/rest_api/manual_evaluation/client.py +101 -0
- opik/rest_api/manual_evaluation/raw_client.py +172 -0
- opik/rest_api/optimizations/client.py +0 -166
- opik/rest_api/optimizations/raw_client.py +0 -248
- opik/rest_api/projects/client.py +9 -0
- opik/rest_api/projects/raw_client.py +13 -0
- opik/rest_api/projects/types/project_metric_request_public_metric_type.py +4 -0
- opik/rest_api/prompts/client.py +130 -2
- opik/rest_api/prompts/raw_client.py +175 -0
- opik/rest_api/traces/client.py +101 -0
- opik/rest_api/traces/raw_client.py +120 -0
- opik/rest_api/types/__init__.py +46 -0
- opik/rest_api/types/audio_url.py +19 -0
- opik/rest_api/types/audio_url_public.py +19 -0
- opik/rest_api/types/audio_url_write.py +19 -0
- opik/rest_api/types/automation_rule_evaluator.py +38 -2
- opik/rest_api/types/automation_rule_evaluator_object_object_public.py +33 -2
- opik/rest_api/types/automation_rule_evaluator_public.py +33 -2
- opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_public.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_write.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_update.py +27 -1
- opik/rest_api/types/automation_rule_evaluator_update_span_user_defined_metric_python.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_write.py +27 -1
- opik/rest_api/types/dataset_item.py +1 -1
- opik/rest_api/types/dataset_item_batch.py +4 -0
- opik/rest_api/types/dataset_item_changes_public.py +5 -0
- opik/rest_api/types/dataset_item_compare.py +1 -1
- opik/rest_api/types/dataset_item_filter.py +4 -0
- opik/rest_api/types/dataset_item_page_compare.py +0 -1
- opik/rest_api/types/dataset_item_page_public.py +0 -1
- opik/rest_api/types/dataset_item_public.py +1 -1
- opik/rest_api/types/dataset_version_public.py +5 -0
- opik/rest_api/types/dataset_version_summary.py +5 -0
- opik/rest_api/types/dataset_version_summary_public.py +5 -0
- opik/rest_api/types/experiment.py +9 -0
- opik/rest_api/types/experiment_public.py +9 -0
- opik/rest_api/types/llm_as_judge_message_content.py +2 -0
- opik/rest_api/types/llm_as_judge_message_content_public.py +2 -0
- opik/rest_api/types/llm_as_judge_message_content_write.py +2 -0
- opik/rest_api/types/manual_evaluation_request_entity_type.py +1 -1
- opik/rest_api/types/project.py +1 -0
- opik/rest_api/types/project_detailed.py +1 -0
- opik/rest_api/types/project_metric_response_public_metric_type.py +4 -0
- opik/rest_api/types/project_reference.py +31 -0
- opik/rest_api/types/project_reference_public.py +31 -0
- opik/rest_api/types/project_stats_summary_item.py +1 -0
- opik/rest_api/types/prompt_version.py +1 -0
- opik/rest_api/types/prompt_version_detail.py +1 -0
- opik/rest_api/types/prompt_version_page_public.py +5 -0
- opik/rest_api/types/prompt_version_public.py +1 -0
- opik/rest_api/types/prompt_version_update.py +33 -0
- opik/rest_api/types/provider_api_key.py +5 -1
- opik/rest_api/types/provider_api_key_provider.py +2 -1
- opik/rest_api/types/provider_api_key_public.py +5 -1
- opik/rest_api/types/provider_api_key_public_provider.py +2 -1
- opik/rest_api/types/service_toggles_config.py +11 -1
- opik/rest_api/types/span_user_defined_metric_python_code.py +20 -0
- opik/rest_api/types/span_user_defined_metric_python_code_public.py +20 -0
- opik/rest_api/types/span_user_defined_metric_python_code_write.py +20 -0
- opik/types.py +36 -0
- opik/validation/chat_prompt_messages.py +241 -0
- opik/validation/feedback_score.py +3 -3
- opik/validation/validator.py +28 -0
- {opik-1.9.41.dist-info → opik-1.9.86.dist-info}/METADATA +5 -5
- {opik-1.9.41.dist-info → opik-1.9.86.dist-info}/RECORD +190 -141
- opik/cli/export.py +0 -791
- opik/cli/import_command.py +0 -575
- {opik-1.9.41.dist-info → opik-1.9.86.dist-info}/WHEEL +0 -0
- {opik-1.9.41.dist-info → opik-1.9.86.dist-info}/entry_points.txt +0 -0
- {opik-1.9.41.dist-info → opik-1.9.86.dist-info}/licenses/LICENSE +0 -0
- {opik-1.9.41.dist-info → opik-1.9.86.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import dataclasses
|
|
2
|
+
from typing import Literal
|
|
3
|
+
|
|
4
|
+
from . import attachment
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclasses.dataclass
class AttachmentWithContext:
    """
    Represents an attachment along with its associated context.

    This class pairs an attachment with the information needed to tie it
    back to the entity it belongs to: the entity type, entity ID, project
    name, and a context label. It is used for attachments related to spans
    or traces.

    Attributes:
        attachment_data: The actual attachment object containing the
            associated data.
        entity_type: The type of entity the attachment is associated with.
            It must be either "span" or "trace".
        entity_id: The unique identifier of the related entity.
        project_name: The name of the project to which the attachment and
            its entity belong.
        context: A label describing where the attachment comes from. The
            attachments extractor passes "input", "output" or "metadata"
            here; the field itself accepts any string.
    """

    attachment_data: attachment.Attachment
    entity_type: Literal["span", "trace"]
    entity_id: str
    project_name: str
    context: str
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from typing import Dict, Any, Literal, List, NamedTuple
|
|
3
|
+
|
|
4
|
+
from . import attachment, attachment_context, decoder_base64
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class ExtractionResult(NamedTuple):
    """Outcome of scanning a single value for embedded attachments."""

    # Attachments decoded from the scanned value; empty when nothing was found.
    attachments: List[attachment.Attachment]
    # The scanned value with every extracted attachment replaced by a
    # "[file_name]" placeholder; the untouched value when nothing was found.
    sanitized_data: Any
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class AttachmentsExtractor:
    """
    Extracts and processes attachments embedded as Base64 strings within data structures.

    The extractor walks strings, dictionaries and lists, looks for Base64 runs
    of at least a configured length, asks the decoder to turn each run into an
    attachment, and replaces every successfully decoded run with a
    ``[file_name]`` placeholder.
    """

    def __init__(self, min_attachment_size: int):
        """
        Configure the extractor and compile the Base64 search pattern.

        Args:
            min_attachment_size: The minimum size of the attachment in characters
                for it to be considered. Shorter strings are skipped entirely and
                shorter Base64 runs are not matched, which reduces false positives.
        """
        self._min_attachment_size = min_attachment_size
        self.decoder = decoder_base64.Base64AttachmentDecoder()

        # Base64 text comes in groups of four characters. At least one group is
        # required: a "{0,}" quantifier would match the empty string at every
        # position of the scanned text and trigger a pointless decode attempt
        # for each of them.
        min_base64_groups = max(1, int(min_attachment_size // 4))
        base64_pattern = (
            r"(?:[A-Za-z0-9+/]{4}){"
            + str(min_base64_groups)
            + ",}(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?"
        )
        self.pattern = re.compile(base64_pattern)

    def extract_and_replace(
        self,
        data: Dict[str, Any],
        entity_type: Literal["span", "trace"],
        entity_id: str,
        project_name: str,
        context: Literal["input", "output", "metadata"],
    ) -> List[attachment_context.AttachmentWithContext]:
        """
        Extract attachments from every value of ``data`` and sanitize it in place.

        Values in which at least one attachment was found are replaced inside
        ``data`` by their sanitized version; other values are left untouched.

        Args:
            data: Mapping to scan. It is mutated in place.
            entity_type: Type of the entity the data belongs to.
            entity_id: Identifier of that entity.
            project_name: Name of the project the entity belongs to.
            context: Which part of the entity ``data`` represents.

        Returns:
            The extracted attachments, each paired with the entity information.
        """
        attachments: List[attachment_context.AttachmentWithContext] = []
        # Only existing keys are reassigned, so the dictionary size never changes
        # while it is being iterated.
        for key, value in data.items():
            extraction_result = self._try_extract_attachments(value, context)
            if not extraction_result.attachments:
                continue

            data[key] = extraction_result.sanitized_data
            attachments.extend(
                attachment_context.AttachmentWithContext(
                    attachment_data=extracted_attachment,
                    entity_type=entity_type,
                    entity_id=entity_id,
                    project_name=project_name,
                    context=context,
                )
                for extracted_attachment in extraction_result.attachments
            )

        return attachments

    def _try_extract_attachments(
        self, data: Any, context: Literal["input", "output", "metadata"]
    ) -> ExtractionResult:
        """
        Recursively extract attachments from data that can be a string, dict, list, or other type.

        Args:
            data: The data to process (can be str, dict, list, or other types)
            context: The context where the data is located (input, output, or metadata)

        Returns:
            ExtractionResult with extracted attachments and sanitized data
        """
        if isinstance(data, str):
            return self._extract_from_string(data, context)
        if isinstance(data, dict):
            return self._extract_from_dict(data, context)
        if isinstance(data, list):
            return self._extract_from_list(data, context)

        # Any other type (int, bool, None, ...) cannot carry an attachment.
        return ExtractionResult(attachments=[], sanitized_data=data)

    def _extract_from_string(
        self, data: str, context: Literal["input", "output", "metadata"]
    ) -> ExtractionResult:
        """
        Extract attachments from a string value.

        Every Base64 run that the decoder accepts is replaced, at the position
        where it was found, by a ``[file_name]`` placeholder. Runs the decoder
        rejects are left exactly as they were.
        """
        if len(data) < self._min_attachment_size:
            # skip short strings
            return ExtractionResult(attachments=[], sanitized_data=data)

        attachments: List[attachment.Attachment] = []

        def _replace_match(match):
            # Called once per match, left to right. Substituting by position
            # (instead of str.replace on the matched text) guarantees that each
            # decoded attachment gets exactly one placeholder, even when the
            # same payload occurs several times, and that text outside the
            # match is never rewritten.
            to_decode = match.group()
            decoded_attachment = self.decoder.decode(to_decode, context)
            if decoded_attachment is None:
                return to_decode
            attachments.append(decoded_attachment)
            return f"[{decoded_attachment.file_name}]"

        sanitized_data = self.pattern.sub(_replace_match, data)
        return ExtractionResult(attachments=attachments, sanitized_data=sanitized_data)

    def _extract_from_dict(
        self, data: Dict[str, Any], context: Literal["input", "output", "metadata"]
    ) -> ExtractionResult:
        """Recursively extract attachments from a dictionary, returning a sanitized copy."""
        all_attachments: List[attachment.Attachment] = []
        sanitized_dict = {}

        for key, value in data.items():
            result = self._try_extract_attachments(value, context)
            sanitized_dict[key] = result.sanitized_data
            all_attachments.extend(result.attachments)

        return ExtractionResult(
            attachments=all_attachments, sanitized_data=sanitized_dict
        )

    def _extract_from_list(
        self, data: List[Any], context: Literal["input", "output", "metadata"]
    ) -> ExtractionResult:
        """Recursively extract attachments from a list, returning a sanitized copy."""
        all_attachments: List[attachment.Attachment] = []
        sanitized_list = []

        for item in data:
            result = self._try_extract_attachments(item, context)
            sanitized_list.append(result.sanitized_data)
            all_attachments.extend(result.attachments)

        return ExtractionResult(
            attachments=all_attachments, sanitized_data=sanitized_list
        )
|
|
@@ -13,6 +13,7 @@ def attachment_to_message(
|
|
|
13
13
|
entity_id: str,
|
|
14
14
|
project_name: str,
|
|
15
15
|
url_override: str,
|
|
16
|
+
delete_after_upload: bool = False,
|
|
16
17
|
) -> messages.CreateAttachmentMessage:
|
|
17
18
|
if attachment_data.data is None:
|
|
18
19
|
raise ValueError("Attachment data cannot be None")
|
|
@@ -32,6 +33,7 @@ def attachment_to_message(
|
|
|
32
33
|
entity_id=entity_id,
|
|
33
34
|
project_name=project_name,
|
|
34
35
|
encoded_url_override=base_url_path,
|
|
36
|
+
delete_after_upload=delete_after_upload,
|
|
35
37
|
)
|
|
36
38
|
|
|
37
39
|
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import abc
|
|
2
|
+
from typing import Any, Optional
|
|
3
|
+
|
|
4
|
+
from . import attachment
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class AttachmentDecoder(abc.ABC):
    """
    Abstract base class for decoding file attachments.

    This class serves as an interface for decoding raw attachment data into
    an `Attachment` object. Implementing classes should define the specific
    logic to handle various attachment decoding formats.
    """

    @abc.abstractmethod
    def decode(self, raw_data: str, **kwargs: Any) -> Optional[attachment.Attachment]:
        """Decode ``raw_data`` into an attachment.

        Args:
            raw_data: The encoded attachment content.
            **kwargs: Decoder-specific options defined by the implementation.

        Returns:
            The decoded attachment, or None when the implementation cannot
            produce one from ``raw_data``.
        """
        pass
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
import base64
|
|
2
|
+
import binascii
|
|
3
|
+
import logging
|
|
4
|
+
import tempfile
|
|
5
|
+
from typing import Any, Optional, Literal
|
|
6
|
+
|
|
7
|
+
from . import attachment, decoder, decoder_helpers
|
|
8
|
+
|
|
9
|
+
LOGGER = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class Base64AttachmentDecoder(decoder.AttachmentDecoder):
    """Decodes base64 encoded attachment data.

    This decoder decodes base64 strings, detects MIME types from content, and creates Attachment objects.
    """

    def decode(
        self,
        raw_data: str,
        context: Literal["input", "output", "metadata"] = "input",
        **kwargs: Any,
    ) -> Optional[attachment.Attachment]:
        """Decode base64 encoded data into an Attachment object.

        The decoded bytes are written to a temporary file that is NOT deleted
        by this method; the returned attachment refers to it by path.

        Args:
            raw_data: Base64 encoded string data
            context: Context string for filename generation.

        Returns:
            Attachment object with decoded data, or None if decoding fails or type is not recognizable
        """
        import os  # local import: only needed to clean up after a failed write

        if not isinstance(raw_data, str):
            LOGGER.warning("Attachment data is not a string, skipping.")
            return None

        try:
            # validate=True rejects any character outside the base64 alphabet
            decoded_bytes = base64.b64decode(raw_data, validate=True)

            mime_type = decoder_helpers.detect_mime_type(decoded_bytes)

            # Skip if not a recognizable file type
            if not mime_type or mime_type in ("application/octet-stream", "text/plain"):
                LOGGER.debug("Attachment type is not recognized, skipping.")
                return None

            extension = decoder_helpers.get_file_extension(mime_type)
            file_name = decoder_helpers.create_attachment_filename(
                context, extension=extension
            )

            # delete=False: the file must outlive this call because the
            # attachment points at it. The extension has no leading dot and
            # tempfile does not insert one, so it is added here.
            temp_file = tempfile.NamedTemporaryFile(
                mode="wb", delete=False, suffix=f".{extension}"
            )
            try:
                # The context manager closes the handle on success and on error.
                with temp_file:
                    temp_file.write(decoded_bytes)
            except Exception:
                # Do not leave a partially written file behind.
                try:
                    os.unlink(temp_file.name)
                except OSError:
                    # Best-effort cleanup; the original error is what matters.
                    pass
                raise

            return attachment.Attachment(
                data=temp_file.name, file_name=file_name, content_type=mime_type
            )

        except (ValueError, binascii.Error) as e:
            LOGGER.debug(
                "Failed to decode attachment data, reason: invalid base64. Reason: %s",
                e,
                exc_info=True,
            )
            # Not valid base64, return None
            return None
        except Exception as ex:
            LOGGER.warning(
                "Failed to decode attachment data, reason: %s", ex, exc_info=True
            )
            # Unexpected error, return None to avoid crashing the pipeline
            return None
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
import mimetypes
|
|
2
|
+
import random
|
|
3
|
+
import time
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
# The attachment file name regex
|
|
8
|
+
ATTACHMENT_FILE_NAME_REGEX = r"(?:input|output|metadata)-attachment-\d+-\d+-sdk\.\w+"
|
|
9
|
+
ATTACHMENT_FILE_NAME_PLACEHOLDER_REGEX = (
|
|
10
|
+
r"\[((?:input|output|metadata)-attachment-\d+-\d+-sdk\.\w+)\]"
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def get_file_extension(mime_type: str) -> str:
    """Convert MIME type to file extension.

    The ``mimetypes`` registry is consulted first. When it has no entry, the
    extension is derived from the MIME subtype, dropping any ``+suffix`` and
    any ``;parameters``.

    Args:
        mime_type: The MIME type (e.g., "image/png", "application/pdf")

    Returns:
        File extension without a leading dot (e.g., "png", "pdf"). "bin" is
        returned when the MIME type is empty or no non-empty extension can be
        derived from it.
    """
    if not mime_type:
        return "bin"

    guessed = mimetypes.guess_extension(mime_type, strict=False)
    if guessed:
        extension = guessed.lstrip(".")
        # Some registries report the less common ".jpe" for JPEG.
        if mime_type == "image/jpeg" and extension == "jpe":
            return "jpg"
        return extension

    # Fallback: extract from the MIME type (e.g., "image/png" -> "png")
    if "/" in mime_type:
        subtype = mime_type.split("/")[1]
        # "svg+xml" -> "svg"
        subtype = subtype.split("+")[0]
        # "jpeg; charset=utf-8" -> "jpeg"
        subtype = subtype.split(";")[0].strip()
        # Inputs such as "image/" or "image/+xml" leave nothing usable; an
        # empty extension would produce file names ending in a bare dot.
        if subtype:
            return subtype

    return "bin"
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def detect_mime_type(data: bytes) -> Optional[str]:
    """Detect MIME type from byte content using magic bytes.

    Checks are applied in this order: PNG, JPEG (start and end markers must
    both be present), GIF, PDF, WebP, SVG (an "<svg" tag within the first
    1024 bytes), MP4 (bytes 4-8 equal "ftyp"), JSON (valid UTF-8 text whose
    first non-whitespace character is "{" or "[").

    Args:
        data: The byte data to analyze

    Returns:
        Detected MIME type string, or "application/octet-stream" if unknown
        or if fewer than 4 bytes are given.
    """
    import codecs  # local import: only needed for the JSON probe below

    if len(data) < 4:
        return "application/octet-stream"

    # PNG
    if data[:8] == b"\x89PNG\r\n\x1a\n":
        return "image/png"

    # JPEG
    if data[:2] == b"\xff\xd8" and data[-2:] == b"\xff\xd9":
        return "image/jpeg"

    # GIF
    if data[:6] in (b"GIF87a", b"GIF89a"):
        return "image/gif"

    # PDF
    if data[:4] == b"%PDF":
        return "application/pdf"

    # WebP
    if data[:4] == b"RIFF" and data[8:12] == b"WEBP":
        return "image/webp"

    # SVG (XML-based, check for SVG tag). Undecodable bytes are dropped, so
    # this decode cannot fail.
    if "<svg" in data[:1024].decode("utf-8", errors="ignore").lower():
        return "image/svg+xml"

    # MP4
    if len(data) >= 12 and data[4:8] == b"ftyp":
        return "video/mp4"

    # JSON: the first 100 bytes must be valid UTF-8. The cut at 100 bytes may
    # land inside a multi-byte character; an incremental decoder tolerates
    # that incomplete tail (final=False) while still rejecting invalid bytes.
    # When the whole input fits in the window, an incomplete tail is a real
    # error, so the decode is finalized.
    head = data[:100]
    try:
        utf8_decoder = codecs.getincrementaldecoder("utf-8")(errors="strict")
        text = utf8_decoder.decode(head, final=len(data) <= len(head)).strip()
    except UnicodeDecodeError:
        text = ""
    if text.startswith("{") or text.startswith("["):
        return "application/json"

    # Default to octet-stream for unknown types
    return "application/octet-stream"
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def create_attachment_filename(context: str, extension: str) -> str:
    """
    Build an attachment file name from a context label and a file extension.

    The name combines the context, a random number between 1 and 99999999,
    the current time in milliseconds since the epoch, and a fixed "sdk"
    marker placed before the extension.

    Args:
        context: The context to use as the base for the filename (e.g., "input",
            "output", or "metadata").
        extension: The file extension to use, without a leading dot (e.g.,
            "png", "jpg", "txt").

    Returns:
        A file name in the format
        "{context}-attachment-{random_prefix}-{timestamp}-sdk.{extension}".
    """
    # Names of this shape are what ATTACHMENT_FILE_NAME_REGEX in this module
    # matches. Example: input-attachment-1-1704067200000-sdk.png
    # NOTE(review): the previous comment quoted a backend convention without
    # the "-sdk" marker; confirm against the backend which form it expects.

    # A wide random range keeps collisions unlikely for names created within
    # the same millisecond.
    random_part = random.randint(1, 99999999)
    millis = int(round(time.time() * 1000))
    return f"{context}-attachment-{random_part}-{millis}-sdk.{extension}"
|