nv-ingest-api 2025.5.12.dev20250512__py3-none-any.whl → 2025.5.14.dev20250514__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of nv-ingest-api might be problematic.
- nv_ingest_api/interface/transform.py +1 -1
- nv_ingest_api/internal/extract/docx/docx_extractor.py +3 -3
- nv_ingest_api/internal/extract/image/chart_extractor.py +3 -3
- nv_ingest_api/internal/extract/image/image_extractor.py +5 -5
- nv_ingest_api/internal/extract/image/image_helpers/common.py +1 -1
- nv_ingest_api/internal/extract/image/infographic_extractor.py +1 -1
- nv_ingest_api/internal/extract/image/table_extractor.py +2 -2
- nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +2 -2
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +1 -1
- nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +44 -17
- nv_ingest_api/internal/extract/pptx/pptx_extractor.py +1 -1
- nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +0 -1
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +7 -1
- nv_ingest_api/internal/primitives/nim/nim_client.py +1 -1
- nv_ingest_api/internal/primitives/tracing/tagging.py +20 -16
- nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +1 -1
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +2 -2
- nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +1 -1
- nv_ingest_api/internal/transform/caption_image.py +1 -1
- nv_ingest_api/internal/transform/embed_text.py +75 -56
- nv_ingest_api/util/exception_handlers/converters.py +1 -1
- nv_ingest_api/util/exception_handlers/decorators.py +309 -51
- nv_ingest_api/util/image_processing/processing.py +1 -1
- nv_ingest_api/util/logging/configuration.py +15 -8
- nv_ingest_api/util/pdf/pdfium.py +2 -2
- nv_ingest_api/util/service_clients/redis/redis_client.py +1 -1
- nv_ingest_api/util/service_clients/rest/rest_client.py +1 -1
- nv_ingest_api/util/system/__init__.py +0 -0
- nv_ingest_api/util/system/hardware_info.py +426 -0
- {nv_ingest_api-2025.5.12.dev20250512.dist-info → nv_ingest_api-2025.5.14.dev20250514.dist-info}/METADATA +1 -1
- {nv_ingest_api-2025.5.12.dev20250512.dist-info → nv_ingest_api-2025.5.14.dev20250514.dist-info}/RECORD +34 -32
- {nv_ingest_api-2025.5.12.dev20250512.dist-info → nv_ingest_api-2025.5.14.dev20250514.dist-info}/WHEEL +1 -1
- {nv_ingest_api-2025.5.12.dev20250512.dist-info → nv_ingest_api-2025.5.14.dev20250514.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-2025.5.12.dev20250512.dist-info → nv_ingest_api-2025.5.14.dev20250514.dist-info}/top_level.txt +0 -0
nv_ingest_api/interface/transform.py

@@ -207,7 +207,7 @@ def transform_image_create_vlm_caption(
         "api_key": api_key,
         "prompt": prompt,
         "endpoint_url": endpoint_url,
-        "
+        "image_caption_model_name": model_name,
     }
     filtered_task_config: Dict[str, str] = {k: v for k, v in task_config.items() if v is not None}
nv_ingest_api/internal/extract/docx/docx_extractor.py

@@ -7,7 +7,7 @@ import base64
 import functools
 import io
 import logging
-from typing import Optional, Dict, Any, Union
+from typing import Optional, Dict, Any, Union, Tuple

 import pandas as pd
 from pydantic import BaseModel

@@ -146,7 +146,7 @@ def extract_primitives_from_docx_internal(
     task_config: Union[Dict[str, Any], BaseModel],
     extraction_config: DocxExtractorSchema,
     execution_trace_log: Optional[Dict[str, Any]] = None,
-) -> pd.DataFrame:
+) -> Tuple[pd.DataFrame, Union[Dict, None]]:
     """
     Processes a pandas DataFrame containing DOCX files encoded in base64, extracting text from
     each document and replacing the original content with the extracted text.

@@ -202,4 +202,4 @@ def extract_primitives_from_docx_internal(
     else:
         extracted_df = pd.DataFrame({"document_type": [], "metadata": [], "uuid": []})

-    return extracted_df
+    return extracted_df, {}
nv_ingest_api/internal/extract/image/chart_extractor.py

@@ -27,7 +27,7 @@ from nv_ingest_api.util.nim import create_inference_client
 PADDLE_MIN_WIDTH = 32
 PADDLE_MIN_HEIGHT = 32

-logger = logging.getLogger(f"
+logger = logging.getLogger(f"ray.{__name__}")


 def _filter_valid_chart_images(

@@ -80,7 +80,7 @@ def _run_chart_inference(
         yolox_client.infer,
         data=data_yolox,
         model_name="yolox",
-        stage_name="
+        stage_name="chart_extraction",
         max_batch_size=8,
         trace_info=trace_info,
     )

@@ -88,7 +88,7 @@ def _run_chart_inference(
         paddle_client.infer,
         data=data_paddle,
         model_name="paddle",
-        stage_name="
+        stage_name="chart_extraction",
         max_batch_size=1 if paddle_client.protocol == "grpc" else 2,
         trace_info=trace_info,
     )
nv_ingest_api/internal/extract/image/image_extractor.py

@@ -16,7 +16,7 @@ import pandas
 from pydantic import BaseModel

 from nv_ingest_api.internal.extract.image.image_helpers.common import unstructured_image_extractor
-from nv_ingest_api.internal.schemas.extract.extract_image_schema import
+from nv_ingest_api.internal.schemas.extract.extract_image_schema import ImageConfigSchema
 from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler

 logger = logging.getLogger(__name__)

@@ -26,7 +26,7 @@ logger = logging.getLogger(__name__)
 def _decode_and_extract_from_image(
     base64_row: pd.Series,
     task_config: Dict[str, Any],
-    validated_extraction_config:
+    validated_extraction_config: ImageConfigSchema,
     execution_trace_log: Optional[List[Any]] = None,
 ) -> Any:
     """

@@ -106,10 +106,10 @@ def _decode_and_extract_from_image(

     logger.debug(
         f"decode_and_extract: Extracting image content using image_extraction_config: "
-        f"{validated_extraction_config
+        f"{validated_extraction_config}"
     )
-    if validated_extraction_config
-        extract_params["image_extraction_config"] = validated_extraction_config
+    if validated_extraction_config is not None:
+        extract_params["image_extraction_config"] = validated_extraction_config

     if execution_trace_log is not None:
         extract_params["trace_info"] = execution_trace_log
nv_ingest_api/internal/extract/image/image_helpers/common.py

@@ -223,7 +223,7 @@ def extract_page_elements_from_images(
         model_name="yolox",
         max_batch_size=YOLOX_MAX_BATCH_SIZE,
         trace_info=trace_info,
-        stage_name="
+        stage_name="pdf_extraction",
     )

     # Process each result along with its corresponding image.
nv_ingest_api/internal/extract/image/infographic_extractor.py

@@ -100,7 +100,7 @@ def _update_infographic_metadata(
     paddle_results = paddle_client.infer(
         data=data_paddle,
         model_name="paddle",
-        stage_name="
+        stage_name="infographic_extraction",
         max_batch_size=1 if paddle_client.protocol == "grpc" else 2,
         trace_info=trace_info,
     )
nv_ingest_api/internal/extract/image/table_extractor.py

@@ -81,7 +81,7 @@ def _run_inference(
         yolox_client.infer,
         data=data_yolox,
         model_name="yolox",
-        stage_name="
+        stage_name="table_extraction",
         max_batch_size=8,
         trace_info=trace_info,
     )

@@ -89,7 +89,7 @@ def _run_inference(
         paddle_client.infer,
         data=data_paddle,
         model_name="paddle",
-        stage_name="
+        stage_name="table_extraction",
         max_batch_size=1 if paddle_client.protocol == "grpc" else 2,
         trace_info=trace_info,
     )
nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py

@@ -466,7 +466,7 @@ def _extract_text_and_bounding_boxes(
     inference_results = nemoretriever_parse_client.infer(
         data=data,
         model_name="nemoretriever_parse",
-        stage_name="
+        stage_name="pdf_extraction",
         max_batch_size=NEMORETRIEVER_PARSE_MAX_BATCH_SIZE,
         execution_trace_log=execution_trace_log,
     )

@@ -476,7 +476,7 @@ def _extract_text_and_bounding_boxes(

 def _create_clients(nemoretriever_parse_config):
     model_interface = nemoretriever_parse_utils.NemoRetrieverParseModelInterface(
-        model_name=nemoretriever_parse_config.
+        model_name=nemoretriever_parse_config.nemoretriever_parse_model_name,
     )
     nemoretriever_parse_client = create_inference_client(
         nemoretriever_parse_config.nemoretriever_parse_endpoints,
nv_ingest_api/internal/extract/pdf/engines/pdfium.py

@@ -105,7 +105,7 @@ def _extract_page_elements_using_image_ensemble(
         model_name="yolox",
         max_batch_size=YOLOX_MAX_BATCH_SIZE,
         trace_info=execution_trace_log,
-        stage_name="
+        stage_name="pdf_extraction",
     )

     # Process results: iterate over each image's inference output.
nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py

@@ -17,7 +17,6 @@

 import logging
 import io
-import operator
 import re
 import uuid
 from collections import defaultdict

@@ -155,6 +154,12 @@ def _finalize_images(
         extracted_data.append(image_entry)


+def _safe_position(shape):
+    top = shape.top if shape.top is not None else float("inf")
+    left = shape.left if shape.left is not None else float("inf")
+    return (top, left)
+
+
 # -----------------------------------------------------------------------------
 # Helper Function: Recursive Image Extraction
 # -----------------------------------------------------------------------------

@@ -283,7 +288,7 @@ def python_pptx(

     for slide_idx, slide in enumerate(presentation.slides):
         # Obtain a flat list of shapes (ungrouped) sorted by top then left.
-        shapes = sorted(ungroup_shapes(slide.shapes), key=
+        shapes = sorted(ungroup_shapes(slide.shapes), key=_safe_position)

         page_nearby_blocks = {
             "text": {"content": [], "bbox": []},
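The `_safe_position` helper and the `key=_safe_position` change above exist because python-pptx can report `shape.top` and `shape.left` as `None` for shapes without an explicit position, and a plain `(top, left)` tuple key then fails to compare against integer positions. A minimal sketch of the failure mode and the fix, using hypothetical stand-in shapes rather than real python-pptx objects:

    from types import SimpleNamespace

    def _safe_position(shape):
        # Mirrors the helper added above: unpositioned shapes sort last.
        top = shape.top if shape.top is not None else float("inf")
        left = shape.left if shape.left is not None else float("inf")
        return (top, left)

    shapes = [
        SimpleNamespace(top=914400, left=0),
        SimpleNamespace(top=None, left=None),  # shape with no explicit position
        SimpleNamespace(top=0, left=914400),
    ]

    # sorted(shapes, key=lambda s: (s.top, s.left))  # would raise TypeError on the None values
    ordered = sorted(shapes, key=_safe_position)  # positioned shapes first, unpositioned last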
nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py

@@ -656,21 +661,43 @@ def get_bbox(
     shape_object: Optional[Slide] = None,
     text_depth: Optional[TextTypeEnum] = None,
 ):
-    [15 lines of the previous implementation, not captured in this diff view]
+    """
+    Safely computes bounding box for a slide, shape, or document.
+    Ensures that missing or None values are gracefully handled.
+
+    Returns
+    -------
+    Tuple[int, int, int, int]
+        Bounding box as (top, left, bottom, right).
+        Defaults to (-1, -1, -1, -1) if invalid or unsupported.
+    """
+    try:
+        if text_depth == TextTypeEnum.DOCUMENT:
+            return (-1, -1, -1, -1)
+
+        elif text_depth == TextTypeEnum.PAGE and presentation_object:
+            top = left = 0
+            width = presentation_object.slide_width
+            height = presentation_object.slide_height
+            return (top, left, top + height, left + width)
+
+        elif shape_object:
+            top = shape_object.top if shape_object.top is not None else -1
+            left = shape_object.left if shape_object.left is not None else -1
+            width = shape_object.width if shape_object.width is not None else -1
+            height = shape_object.height if shape_object.height is not None else -1
+
+            # If all are valid, return normally, else return placeholder
+            if -1 in [top, left, width, height]:
+                return (-1, -1, -1, -1)
+
+            return (top, left, top + height, left + width)
+
+    except Exception as e:
+        logger.warning(f"get_bbox: Failed to compute bbox due to {e}")
+        return (-1, -1, -1, -1)
+
+    return (-1, -1, -1, -1)


 def ungroup_shapes(shapes):
nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py

@@ -8,7 +8,6 @@ from nv_ingest_api.internal.primitives.nim import ModelInterface
 import numpy as np


-# Assume ModelInterface is defined elsewhere in the project.
 class EmbeddingModelInterface(ModelInterface):
     """
     An interface for handling inference with an embedding model endpoint.
nv_ingest_api/internal/primitives/nim/model_interface/yolox.py

@@ -709,7 +709,13 @@ def postprocess_results(
             raise ValueError(f"Error in postprocessing {result.shape} and {original_image_shape}: {e}")

         for box, score, label in zip(bboxes, scores, labels):
-
+            # TODO(Devin): Sometimes we get back unexpected class labels?
+            if (label < 0) or (label >= len(class_labels)):
+                logger.warning(f"Invalid class label {label} found in postprocessing")
+                continue
+            else:
+                class_name = class_labels[int(label)]
+
             annotation_dict[class_name].append([round(float(x), 4) for x in np.concatenate((box, [score]))])

         out.append(annotation_dict)
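The added guard skips detections whose class index falls outside `class_labels` instead of letting the label lookup fail on an unexpected value. A standalone sketch of the same guard, with hypothetical labels and detections rather than real YOLOX output:

    import logging

    logger = logging.getLogger(__name__)
    class_labels = ["table", "chart", "title"]
    detections = [(0, 0.91), (7, 0.42), (-1, 0.10), (2, 0.88)]  # (label, score) pairs

    kept = []
    for label, score in detections:
        # Same check as in the diff: ignore labels outside the known class list.
        if label < 0 or label >= len(class_labels):
            logger.warning(f"Invalid class label {label} found in postprocessing")
            continue
        kept.append((class_labels[int(label)], score))

    print(kept)  # [('table', 0.91), ('title', 0.88)]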
nv_ingest_api/internal/primitives/nim/nim_client.py

@@ -251,7 +251,7 @@ class NimClient:
             model_name=model_name, parameters=parameters, inputs=[input_tensors], outputs=outputs
         )
         logger.debug(f"gRPC inference response: {response}")
-
+
         if len(outputs) == 1:
             return response.as_numpy(outputs[0].name())
         else:
nv_ingest_api/internal/primitives/tracing/tagging.py

@@ -31,13 +31,15 @@ def traceable(trace_name=None):

     Notes
     -----
-    The decorated function must accept a IngestControlMessage object as its
-
-
+    The decorated function must accept a IngestControlMessage object as one of its arguments.
+    For a regular function, this is expected to be the first argument; for a class method,
+    this is expected to be the second argument (after 'self'). The IngestControlMessage object
+    must implement `has_metadata`, `get_metadata`, and `set_metadata` methods used by the decorator
+    to check for the trace tagging flag and to add trace metadata.

     The trace metadata added by the decorator includes two entries:
-    - 'trace::entry::<trace_name>': The
-    - 'trace::exit::<trace_name>': The
+    - 'trace::entry::<trace_name>': The timestamp marking the function's entry.
+    - 'trace::exit::<trace_name>': The timestamp marking the function's exit.

     Example
     -------

@@ -47,23 +49,25 @@ def traceable(trace_name=None):
     ... def process_message(message):
     ...     pass

-    Applying the decorator with a custom trace name:
-
-    >>> @traceable(custom_trace_name="CustomTraceName")
-    ... def process_message(message):
-    ...     pass
-
-    In both examples, `process_message` will have entry and exit timestamps added to the
-    IngestControlMessage's metadata if 'config::add_trace_tagging' is True.
+    Applying the decorator with a custom trace name on a class method:

+    >>> class Processor:
+    ...     @traceable(trace_name="CustomTrace")
+    ...     def process(self, message):
+    ...         pass
     """

     def decorator_trace_tagging(func):
         @functools.wraps(func)
         def wrapper_trace_tagging(*args, **kwargs):
-            # Assuming the first argument is always the message
             ts_fetched = datetime.now()
-
+            # Determine which argument is the message.
+            if hasattr(args[0], "has_metadata"):
+                message = args[0]
+            elif len(args) > 1 and hasattr(args[1], "has_metadata"):
+                message = args[1]
+            else:
+                raise ValueError("traceable decorator could not find a message argument with 'has_metadata()'")

             do_trace_tagging = (message.has_metadata("config::add_trace_tagging") is True) and (
                 message.get_metadata("config::add_trace_tagging") is True

@@ -79,7 +83,7 @@ def traceable(trace_name=None):
             message.set_timestamp(f"trace::entry::{trace_prefix}_channel_in", ts_send)
             message.set_timestamp(f"trace::exit::{trace_prefix}_channel_in", ts_fetched)

-            # Call the decorated function
+            # Call the decorated function.
             result = func(*args, **kwargs)

             if do_trace_tagging:
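The rewritten wrapper no longer assumes the message is the first positional argument: it probes `args[0]` and then `args[1]` for a `has_metadata` attribute, so the decorator now works on plain functions and on bound methods (where `self` comes first). A reduced sketch of that detection logic with a hypothetical message class, not the real IngestControlMessage:

    def _find_message(args):
        # Same probe order as the decorator: plain function -> args[0], method -> args[1].
        if args and hasattr(args[0], "has_metadata"):
            return args[0]
        if len(args) > 1 and hasattr(args[1], "has_metadata"):
            return args[1]
        raise ValueError("no message argument with 'has_metadata()' found")

    class FakeMessage:
        def has_metadata(self, key):
            return False

    class Processor:
        def process(self, message):
            return _find_message((self, message))

    assert isinstance(_find_message((FakeMessage(),)), FakeMessage)     # free-function style
    assert isinstance(Processor().process(FakeMessage()), FakeMessage)  # method style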
nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py

@@ -131,7 +131,7 @@ class NemoRetrieverParseConfigSchema(BaseModel):
     nemoretriever_parse_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
     nemoretriever_parse_infer_protocol: str = ""

-
+    nemoretriever_parse_model_name: str = "nvidia/nemoretriever-parse"

     timeout: float = 300.0
nv_ingest_api/internal/schemas/meta/ingest_job_schema.py

@@ -76,7 +76,7 @@ class IngestTaskCaptionSchema(BaseModelNoExt):
     api_key: Optional[str] = None
     endpoint_url: Optional[str] = None
     prompt: Optional[str] = None
-
+    caption_model_name: Optional[str] = None


 class IngestTaskFilterParamsSchema(BaseModelNoExt):

@@ -104,7 +104,7 @@ class IngestTaskDedupSchema(BaseModelNoExt):

 class IngestTaskEmbedSchema(BaseModelNoExt):
     endpoint_url: Optional[str] = None
-
+    embedding_model_name: Optional[str] = None
     api_key: Optional[str] = None
     filter_errors: bool = False
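Both hunks add task-specific model-name fields (`caption_model_name` for caption tasks, `embedding_model_name` for embed tasks) in place of the removed lines. A hedged sketch of equivalent standalone Pydantic models and payloads, with field names taken from the diff and everything else illustrative:

    from typing import Optional
    from pydantic import BaseModel

    class CaptionTask(BaseModel):
        api_key: Optional[str] = None
        endpoint_url: Optional[str] = None
        prompt: Optional[str] = None
        caption_model_name: Optional[str] = None  # field name from the diff

    class EmbedTask(BaseModel):
        endpoint_url: Optional[str] = None
        embedding_model_name: Optional[str] = None  # field name from the diff
        api_key: Optional[str] = None
        filter_errors: bool = False

    caption = CaptionTask(caption_model_name="meta/llama-3.2-11b-vision-instruct")
    embed = EmbedTask(embedding_model_name="my-embedding-model")  # placeholder model id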
nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py

@@ -10,6 +10,6 @@ class ImageCaptionExtractionSchema(BaseModel):
     api_key: str = "api_key"
     endpoint_url: str = "https://ai.api.nvidia.com/v1/gr/meta/llama-3.2-11b-vision-instruct/chat/completions"
     prompt: str = "Caption the content of this image:"
-
+    image_caption_model_name: str = "meta/llama-3.2-11b-vision-instruct"
     raise_on_failure: bool = False
     model_config = ConfigDict(extra="forbid")
nv_ingest_api/internal/transform/caption_image.py

@@ -173,7 +173,7 @@ def transform_image_create_vlm_caption_internal(
     api_key: str = task_config.get("api_key") or transform_config.api_key
     prompt: str = task_config.get("prompt") or transform_config.prompt
     endpoint_url: str = task_config.get("endpoint_url") or transform_config.endpoint_url
-    model_name: str = task_config.get("
+    model_name: str = task_config.get("image_caption_model_name") or transform_config.image_caption_model_name

     # Create a mask for rows where the content type is "image".
     df_mask: pd.Series = df_transform_ledger["metadata"].apply(
nv_ingest_api/internal/transform/embed_text.py

@@ -230,28 +230,35 @@ def _async_runner(
 def _add_embeddings(row, embeddings, info_msgs):
     """
     Updates a DataFrame row with embedding data and associated error info.
+    Ensures the 'embedding' field is always present, even if None.

     Parameters
     ----------
     row : pandas.Series
         A row of the DataFrame.
-    embeddings :
-
-    info_msgs :
-
+    embeddings : dict
+        Dictionary mapping row indices to embeddings.
+    info_msgs : dict
+        Dictionary mapping row indices to info message dicts.

     Returns
     -------
     pandas.Series
-        The updated row with embedding and
+        The updated row with 'embedding', 'info_message_metadata', and
+        '_contains_embeddings' appropriately set.
     """
-    [3 lines not captured in this diff view]
+    embedding = embeddings.get(row.name, None)
+    info_msg = info_msgs.get(row.name, None)
+
+    # Always set embedding, even if None
+    row["metadata"]["embedding"] = embedding
+
+    if info_msg:
+        row["metadata"]["info_message_metadata"] = info_msg
         row["document_type"] = ContentTypeEnum.INFO_MSG
         row["_contains_embeddings"] = False
     else:
-        row["_contains_embeddings"] =
+        row["_contains_embeddings"] = embedding is not None

     return row
@@ -287,7 +294,7 @@ def _get_pandas_table_content(row):
     str
         The table/chart content from the row.
     """
-    return row
+    return row.get("table_metadata", {}).get("table_content")


 def _get_pandas_image_content(row):

@@ -304,7 +311,14 @@ def _get_pandas_image_content(row):
     str
         The image caption from the row.
     """
-    return row
+    return row.get("image_metadata", {}).get("caption")
+
+
+def _get_pandas_audio_content(row):
+    """
+    A pandas UDF used to select extracted audio transcription to be used to create embeddings.
+    """
+    return row.get("audio_metadata", {}).get("audio_transcript")


 # ------------------------------------------------------------------------------
@@ -352,13 +366,6 @@ def _generate_batches(prompts: List[str], batch_size: int = 100) -> List[str]:
     return [batch for batch in _batch_generator(prompts, batch_size)]


-def _get_pandas_audio_content(row):
-    """
-    A pandas UDF used to select extracted audio transcription to be used to create embeddings.
-    """
-    return row["audio_metadata"]["audio_transcript"]
-
-
 # ------------------------------------------------------------------------------
 # DataFrame Concatenation Utility
 # ------------------------------------------------------------------------------
@@ -408,17 +415,20 @@ def transform_create_text_embeddings_internal(
     execution_trace_log: Optional[Dict] = None,
 ) -> Tuple[pd.DataFrame, Dict]:
     """
-    Generates text embeddings for supported content types (TEXT, STRUCTURED, IMAGE)
+    Generates text embeddings for supported content types (TEXT, STRUCTURED, IMAGE, AUDIO)
     from a pandas DataFrame using asynchronous requests.

+    This function ensures that even if the extracted content is empty or None,
+    the embedding field is explicitly created and set to None.
+
     Parameters
     ----------
     df_transform_ledger : pd.DataFrame
         The DataFrame containing content for embedding extraction.
     task_config : Dict[str, Any]
         Dictionary containing task properties (e.g., filter error flag).
-    transform_config :
-        Validated configuration for text embedding extraction
+    transform_config : TextEmbeddingSchema, optional
+        Validated configuration for text embedding extraction.
     execution_trace_log : Optional[Dict], optional
         Optional trace information for debugging or logging (default is None).

@@ -429,24 +439,20 @@ def transform_create_text_embeddings_internal(
         - The updated DataFrame with embeddings applied.
         - A dictionary with trace information.
     """
-    [3 lines not captured in this diff view]
-    endpoint_url: str = task_config.get("endpoint_url") or transform_config.embedding_nim_endpoint
-    model_name: str = task_config.get("model_name") or transform_config.embedding_model
+    api_key = task_config.get("api_key") or transform_config.api_key
+    endpoint_url = task_config.get("endpoint_url") or transform_config.embedding_nim_endpoint
+    model_name = task_config.get("model_name") or transform_config.embedding_model

     if execution_trace_log is None:
         execution_trace_log = {}
         logger.debug("No trace_info provided. Initialized empty trace_info dictionary.")

-    # TODO(Devin)
     if df_transform_ledger.empty:
         return df_transform_ledger, {"trace_info": execution_trace_log}

     embedding_dataframes = []
-    content_masks = []
+    content_masks = []

-    # Define pandas content extractors for supported content types.
     pandas_content_extractor = {
         ContentTypeEnum.TEXT: _get_pandas_text_content,
         ContentTypeEnum.STRUCTURED: _get_pandas_table_content,
@@ -455,49 +461,62 @@ def transform_create_text_embeddings_internal(
         ContentTypeEnum.VIDEO: lambda x: None,  # Not supported yet.
     }

-    logger.debug("Generating text embeddings for supported content types: TEXT, STRUCTURED, IMAGE.")
-
     def _content_type_getter(row):
         return row["content_metadata"]["type"]

-    # Process each supported content type.
     for content_type, content_getter in pandas_content_extractor.items():
         if not content_getter:
             logger.debug(f"Skipping unsupported content type: {content_type}")
             continue

+        # Get rows matching the content type
         content_mask = df_transform_ledger["metadata"].apply(_content_type_getter) == content_type.value
         if not content_mask.any():
             continue

-        #
-
-        non_empty_mask = extracted_content.notna() & (extracted_content.str.strip() != "")
-        final_mask = content_mask & non_empty_mask
-        if not final_mask.any():
-            continue
+        # Always include all content_mask rows and prepare them
+        df_content = df_transform_ledger.loc[content_mask].copy().reset_index(drop=True)

-        [5 lines not captured in this diff view]
-            api_key,
-            endpoint_url,
-            model_name,
-            transform_config.encoding_format,
-            transform_config.input_type,
-            transform_config.truncate,
-            False,
+        # Extract content and normalize empty or non-str to None
+        extracted_content = (
+            df_content["metadata"]
+            .apply(content_getter)
+            .apply(lambda x: x.strip() if isinstance(x, str) and x.strip() else None)
         )
-        [5 lines not captured in this diff view]
+        df_content["_content"] = extracted_content
+
+        # Prepare batches for only valid (non-None) content
+        valid_content_mask = df_content["_content"].notna()
+        if valid_content_mask.any():
+            filtered_content_batches = _generate_batches(
+                df_content.loc[valid_content_mask, "_content"].tolist(), batch_size=transform_config.batch_size
+            )
+            content_embeddings = _async_runner(
+                filtered_content_batches,
+                api_key,
+                endpoint_url,
+                model_name,
+                transform_config.encoding_format,
+                transform_config.input_type,
+                transform_config.truncate,
+                False,
+            )
+            # Build a simple row index -> embedding map
+            embeddings_dict = dict(
+                zip(df_content.loc[valid_content_mask].index, content_embeddings.get("embeddings", []))
+            )
+            info_msgs_dict = dict(
+                zip(df_content.loc[valid_content_mask].index, content_embeddings.get("info_msgs", []))
+            )
+        else:
+            embeddings_dict = {}
+            info_msgs_dict = {}
+
+        # Apply embeddings or None to all rows
+        df_content = df_content.apply(_add_embeddings, embeddings=embeddings_dict, info_msgs=info_msgs_dict, axis=1)

         embedding_dataframes.append(df_content)
-        content_masks.append(
+        content_masks.append(content_mask)

     combined_df = _concatenate_extractions_pandas(df_transform_ledger, embedding_dataframes, content_masks)
     return combined_df, {"trace_info": execution_trace_log}
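The main behavioral change in this function is the normalization step: extracted content that is empty, whitespace-only, or not a string becomes None, and those rows still pass through `_add_embeddings`, so every row ends up with an explicit `embedding` key (possibly None). A small pandas sketch of just that normalization, on illustrative data:

    import pandas as pd

    extracted = pd.Series(["A real paragraph.", "   ", "", None, 42])

    # Same normalization as above: keep stripped non-empty strings, map everything else to None.
    normalized = extracted.apply(lambda x: x.strip() if isinstance(x, str) and x.strip() else None)
    valid_mask = normalized.notna()

    print(normalized.tolist())  # ['A real paragraph.', None, None, None, None]
    print(valid_mask.tolist())  # [True, False, False, False, False]

    # Only the True rows would be batched for the embedding service; the rest still
    # receive an explicit embedding of None downstream.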
nv_ingest_api/util/exception_handlers/converters.py

@@ -66,7 +66,7 @@ def datetools_exception_handler(func: Callable, **kwargs: Dict[str, Any]) -> Cal
         return func(*args, **kwargs)
     except Exception as e:
         log_error_message = f"Invalid date format: {e}"
-        logger.
+        logger.debug(log_error_message)
         return datetools.remove_tz(datetime.now(timezone.utc)).isoformat()

     return inner_function