nv-ingest-api 2025.7.16.dev20250716__py3-none-any.whl → 2025.7.18.dev20250718__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nv_ingest_api/interface/extract.py +18 -18
- nv_ingest_api/internal/extract/image/chart_extractor.py +80 -75
- nv_ingest_api/internal/extract/image/image_helpers/common.py +5 -6
- nv_ingest_api/internal/extract/image/infographic_extractor.py +59 -35
- nv_ingest_api/internal/extract/image/table_extractor.py +84 -64
- nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +10 -7
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +16 -29
- nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +59 -0
- nv_ingest_api/internal/primitives/nim/model_interface/{paddle.py → ocr.py} +132 -39
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +37 -224
- nv_ingest_api/internal/primitives/nim/nim_client.py +55 -14
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +6 -6
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +6 -6
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +5 -5
- nv_ingest_api/internal/transform/split_text.py +13 -8
- nv_ingest_api/util/image_processing/table_and_chart.py +97 -42
- nv_ingest_api/util/image_processing/transforms.py +16 -5
- nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +1 -1
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +51 -48
- {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.18.dev20250718.dist-info}/METADATA +1 -1
- {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.18.dev20250718.dist-info}/RECORD +24 -24
- {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.18.dev20250718.dist-info}/WHEEL +0 -0
- {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.18.dev20250718.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.18.dev20250718.dist-info}/top_level.txt +0 -0
nv_ingest_api/internal/transform/split_text.py

@@ -141,14 +141,19 @@ def transform_text_split_and_tokenize_internal(

     model_predownload_path = os.environ.get("MODEL_PREDOWNLOAD_PATH")

-    if …
-    …
-    …
-    …
-    …
-    …
-    …
-    …
+    if model_predownload_path is not None:
+        if os.path.exists(os.path.join(model_predownload_path, "llama-3.2-1b/tokenizer/tokenizer.json")) and (
+            tokenizer_identifier is None or tokenizer_identifier == "meta-llama/Llama-3.2-1B"
+        ):
+            tokenizer_identifier = os.path.join(model_predownload_path, "llama-3.2-1b/tokenizer/")
+        elif os.path.exists(
+            os.path.join(model_predownload_path, "e5-large-unsupervised/tokenizer/tokenizer.json")
+        ) and (tokenizer_identifier is None or tokenizer_identifier == "intfloat/e5-large-unsupervised"):
+            tokenizer_identifier = os.path.join(model_predownload_path, "e5-large-unsupervised/tokenizer/")
+
+    # Defaulto to intfloat/e5-large-unsupervised if no tokenizer predownloaded or specified
+    if tokenizer_identifier is None:
+        tokenizer_identifier = "intfloat/e5-large-unsupervised"

     tokenizer_model = AutoTokenizer.from_pretrained(tokenizer_identifier, token=hf_access_token)

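The new block resolves a locally predownloaded tokenizer before falling back to a Hugging Face hub identifier. A minimal sketch of the same resolution order, assuming the `MODEL_PREDOWNLOAD_PATH` directory layout shown in the hunk (`<path>/<model>/tokenizer/tokenizer.json`); `resolve_tokenizer` is a hypothetical helper, not part of the package:

```python
import os

# Hypothetical helper mirroring the hunk's resolution order: a predownloaded
# tokenizer directory is used when it exists and matches the requested model,
# and "intfloat/e5-large-unsupervised" is the final fallback.
def resolve_tokenizer(identifier=None):
    predownload = os.environ.get("MODEL_PREDOWNLOAD_PATH")
    candidates = {
        "meta-llama/Llama-3.2-1B": "llama-3.2-1b/tokenizer/",
        "intfloat/e5-large-unsupervised": "e5-large-unsupervised/tokenizer/",
    }
    if predownload is not None:
        for hub_id, subdir in candidates.items():
            local = os.path.join(predownload, subdir)
            if os.path.exists(os.path.join(local, "tokenizer.json")) and identifier in (None, hub_id):
                return local  # AutoTokenizer.from_pretrained accepts a local directory
    return identifier or "intfloat/e5-large-unsupervised"
```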
nv_ingest_api/util/image_processing/table_and_chart.py

@@ -46,14 +46,14 @@ def process_yolox_graphic_elements(yolox_text_dict)
     return chart_content.strip()


-def match_bboxes(yolox_box, paddle_ocr_boxes, already_matched=None, delta=2.0):
+def match_bboxes(yolox_box, ocr_boxes, already_matched=None, delta=2.0):
     """
     Associates a yolox-graphic-elements box to PaddleOCR bboxes, by taking overlapping boxes.
     Criterion is iou > max_iou / delta where max_iou is the biggest found overlap.
     Boxes are expeceted in format (x0, y0, x1, y1)
     Args:
         yolox_box (np array [4]): Cached Bbox.
-        paddle_ocr_boxes (np array [n x 4]): PaddleOCR boxes
+        ocr_boxes (np array [n x 4]): PaddleOCR boxes
         already_matched (list or None, Optional): Already matched ids to ignore.
         delta (float, Optional): IoU delta for considering several boxes. Defaults to 2..
     Returns:
@@ -61,10 +61,10 @@ def match_bboxes(yolox_box, paddle_ocr_boxes, already_matched=None, delta=2.0)
     """
     x0_1, y0_1, x1_1, y1_1 = yolox_box
     x0_2, y0_2, x1_2, y1_2 = (
-        paddle_ocr_boxes[:, 0],
-        paddle_ocr_boxes[:, 1],
-        paddle_ocr_boxes[:, 2],
-        paddle_ocr_boxes[:, 3],
+        ocr_boxes[:, 0],
+        ocr_boxes[:, 1],
+        ocr_boxes[:, 2],
+        ocr_boxes[:, 3],
     )

     # Intersection
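The criterion from the docstring, `iou > max_iou / delta`, keeps every OCR box whose overlap is within a factor `delta` of the best match, rather than only the single best box. A self-contained sketch of that criterion (an illustration of the technique, not the package's exact code):

```python
import numpy as np

def relative_iou_matches(ref_box, boxes, delta=2.0):
    """Return indices of boxes whose IoU with ref_box is > max_iou / delta."""
    boxes = np.asarray(boxes, dtype=float)
    x0 = np.maximum(ref_box[0], boxes[:, 0])
    y0 = np.maximum(ref_box[1], boxes[:, 1])
    x1 = np.minimum(ref_box[2], boxes[:, 2])
    y1 = np.minimum(ref_box[3], boxes[:, 3])
    inter = np.maximum(0, x1 - x0) * np.maximum(0, y1 - y0)
    area_ref = (ref_box[2] - ref_box[0]) * (ref_box[3] - ref_box[1])
    area_boxes = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
    iou = inter / (area_ref + area_boxes - inter + 1e-6)
    if iou.max() <= 0:
        return np.array([], dtype=int)
    return np.where(iou > iou.max() / delta)[0]

# The second and third boxes overlap the reference comparably, so both are kept.
print(relative_iou_matches([0, 0, 10, 10], [[20, 20, 30, 30], [0, 0, 9, 10], [1, 0, 10, 10]]))  # [1 2]
```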
@@ -92,10 +92,10 @@ def match_bboxes(yolox_box, paddle_ocr_boxes, already_matched=None, delta=2.0)
     return matches


-def join_yolox_graphic_elements_and_paddle_output(yolox_output, paddle_boxes, paddle_txts):
+def join_yolox_graphic_elements_and_ocr_output(yolox_output, ocr_boxes, ocr_txts):
     """
     Matching boxes
-    We need to associate a text to the paddle detections.
+    We need to associate a text to the ocr detections.
     For each class and for each CACHED detections, we look for overlapping text bboxes
     with IoU > max_iou / delta where max_iou is the biggest found overlap.
     Found texts are added to the class representation, and removed from the texts to match
@@ -113,18 +113,18 @@ def join_yolox_graphic_elements_and_paddle_output(yolox_output, paddle_boxes, pa
         "value_label",
     ]

-    paddle_txts = np.array(paddle_txts)
-    paddle_boxes = np.array(paddle_boxes)
+    ocr_txts = np.array(ocr_txts)
+    ocr_boxes = np.array(ocr_boxes)

-    if (paddle_txts.size == 0) or (paddle_boxes.size == 0):
+    if (ocr_txts.size == 0) or (ocr_boxes.size == 0):
         return {}

-    paddle_boxes = np.array(
+    ocr_boxes = np.array(
         [
-            paddle_boxes[:, :, 0].min(-1),
-            paddle_boxes[:, :, 1].min(-1),
-            paddle_boxes[:, :, 0].max(-1),
-            paddle_boxes[:, :, 1].max(-1),
+            ocr_boxes[:, :, 0].min(-1),
+            ocr_boxes[:, :, 1].min(-1),
+            ocr_boxes[:, :, 0].max(-1),
+            ocr_boxes[:, :, 1].max(-1),
         ]
     ).T

@@ -139,10 +139,10 @@ def join_yolox_graphic_elements_and_paddle_output(yolox_output, paddle_boxes, pa
     for yolox_box in yolox_output[k]:
         # if there's a score at the end, drop the score.
         yolox_box = yolox_box[:4]
-        paddle_ids = match_bboxes(yolox_box, paddle_boxes, already_matched=already_matched, delta=4)
+        ocr_ids = match_bboxes(yolox_box, ocr_boxes, already_matched=already_matched, delta=4)

-        if len(paddle_ids) > 0:
-            text = " ".join(paddle_txts[paddle_ids].tolist())
+        if len(ocr_ids) > 0:
+            text = " ".join(ocr_txts[ocr_ids].tolist())
             texts.append(text)

     processed_texts = []
@@ -161,7 +161,7 @@ def join_yolox_graphic_elements_and_paddle_output(yolox_output, paddle_boxes, pa
     return results


-def convert_paddle_response_to_psuedo_markdown(bboxes, texts):
+def convert_ocr_response_to_psuedo_markdown(bboxes, texts):
     if (not bboxes) or (not texts):
         return ""

@@ -186,22 +186,22 @@ def convert_paddle_response_to_psuedo_markdown(bboxes, texts):
     return results


-def join_yolox_table_structure_and_paddle_output(yolox_cell_preds, paddle_ocr_boxes, paddle_ocr_txts):
-    if (not paddle_ocr_boxes) or (not paddle_ocr_txts):
+def join_yolox_table_structure_and_ocr_output(yolox_cell_preds, ocr_boxes, ocr_txts):
+    if (not ocr_boxes) or (not ocr_txts):
         return ""

-    paddle_ocr_boxes = np.array(paddle_ocr_boxes)
-    paddle_ocr_boxes_ = np.array(
+    ocr_boxes = np.array(ocr_boxes)
+    ocr_boxes_ = np.array(
         [
-            paddle_ocr_boxes[:, :, 0].min(-1),
-            paddle_ocr_boxes[:, :, 1].min(-1),
-            paddle_ocr_boxes[:, :, 0].max(-1),
-            paddle_ocr_boxes[:, :, 1].max(-1),
+            ocr_boxes[:, :, 0].min(-1),
+            ocr_boxes[:, :, 1].min(-1),
+            ocr_boxes[:, :, 0].max(-1),
+            ocr_boxes[:, :, 1].max(-1),
         ]
     ).T

     assignments = []
-    for i, (b, t) in enumerate(zip(paddle_ocr_boxes_, paddle_ocr_txts)):
+    for i, (b, t) in enumerate(zip(ocr_boxes_, ocr_txts)):
         # Find a cell
         matches_cell = assign_boxes(b, yolox_cell_preds["cell"], delta=1)
         cell = yolox_cell_preds["cell"][matches_cell[0]] if len(matches_cell) else b
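The `np.array([...]).T` construction above collapses OCR polygons of shape `(n, 4, 2)` — four corner points per detection — into axis-aligned `(n, 4)` boxes by taking per-polygon min/max coordinates. The same conversion in isolation:

```python
import numpy as np

def quads_to_xyxy(quads):
    """Collapse (n, 4, 2) corner-point polygons into (n, 4) [x0, y0, x1, y1] boxes."""
    quads = np.asarray(quads, dtype=float)
    return np.array(
        [
            quads[:, :, 0].min(-1),  # x0: leftmost corner
            quads[:, :, 1].min(-1),  # y0: topmost corner
            quads[:, :, 0].max(-1),  # x1: rightmost corner
            quads[:, :, 1].max(-1),  # y1: bottommost corner
        ]
    ).T

# One slightly rotated quadrilateral becomes its axis-aligned bounding box.
print(quads_to_xyxy([[[0, 1], [10, 0], [11, 5], [1, 6]]]))  # [[ 0.  0. 11.  6.]]
```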
@@ -221,7 +221,7 @@ def join_yolox_table_structure_and_paddle_output(yolox_cell_preds, paddle_ocr_bo
         assignments.append(
             {
                 "index": i,
-                "paddle_box": b,
+                "ocr_box": b,
                 "is_table": isinstance(col_ids, np.ndarray) and isinstance(row_ids, np.ndarray),
                 "cell_id": matches_cell[0] if len(matches_cell) else -1,
                 "cell": cell,
@@ -249,13 +249,13 @@ def join_yolox_table_structure_and_paddle_output(yolox_cell_preds, paddle_ocr_bo
     mat = build_markdown(df_table)
     markdown_table = display_markdown(mat, use_header=False)

-    all_boxes = np.stack(df_table.paddle_box.values)
+    all_boxes = np.stack(df_table.ocr_box.values)
     table_box = np.concatenate([all_boxes[:, [0, 1]].min(0), all_boxes[:, [2, 3]].max(0)])

     df_table_to_text = pd.DataFrame(
         [
             {
-                "paddle_box": table_box,
+                "ocr_box": table_box,
                 "text": markdown_table,
                 "is_table": True,
             }
@@ -264,7 +264,7 @@ def join_yolox_table_structure_and_paddle_output(yolox_cell_preds, paddle_ocr_bo
     # Final text representations dataframe
     df_text = pd.concat([df_text, df_table_to_text], ignore_index=True)

-    df_text = df_text.rename(columns={"paddle_box": "box"})
+    df_text = df_text.rename(columns={"ocr_box": "box"})

     # Sort by y and x
     df_text["x"] = df_text["box"].apply(lambda x: (x[0] + x[2]) / 2)
@@ -297,12 +297,12 @@ def join_yolox_table_structure_and_paddle_output(yolox_cell_preds, paddle_ocr_bo
     return result


-def assign_boxes(paddle_box, boxes, delta=2.0, min_overlap=0.25):
+def assign_boxes(ocr_box, boxes, delta=2.0, min_overlap=0.25):
     """
-    Assigns the closest bounding boxes to a reference `paddle_box` based on overlap.
+    Assigns the closest bounding boxes to a reference `ocr_box` based on overlap.

     Args:
-        paddle_box (list or numpy.ndarray): Reference bounding box [x_min, y_min, x_max, y_max].
+        ocr_box (list or numpy.ndarray): Reference bounding box [x_min, y_min, x_max, y_max].
         boxes (numpy.ndarray): Array of candidate bounding boxes with shape (N, 4).
         delta (float, optional): Factor for matches relative to the best overlap. Defaults to 2.0.
         min_overlap (float, optional): Minimum required overlap for a match. Defaults to 0.25.
@@ -316,7 +316,7 @@ def assign_boxes(paddle_box, boxes, delta=2.0, min_overlap=0.25)

     boxes = np.array(boxes)

-    x0_1, y0_1, x1_1, y1_1 = paddle_box
+    x0_1, y0_1, x1_1, y1_1 = ocr_box
     x0_2, y0_2, x1_2, y1_2 = (
         boxes[:, 0],
         boxes[:, 1],
@@ -331,7 +331,7 @@ def assign_boxes(paddle_box, boxes, delta=2.0, min_overlap=0.25)
     inter_x1 = np.minimum(x1_1, x1_2)
     inter_area = np.maximum(0, inter_y1 - inter_y0) * np.maximum(0, inter_x1 - inter_x0)

-    # Normalize by paddle_box size
+    # Normalize by ocr_box size
     area_1 = (y1_1 - y0_1) * (x1_1 - x0_1)
     ious = inter_area / (area_1 + 1e-6)

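Note that `assign_boxes` normalizes the intersection by the reference box's own area rather than by the union, so the score is an overlap fraction, not a symmetric IoU; `min_overlap` then acts as an absolute floor. A sketch of that normalization (illustrative only):

```python
import numpy as np

def overlap_fraction(ref_box, boxes):
    """Intersection area divided by the reference box area (not a symmetric IoU)."""
    boxes = np.asarray(boxes, dtype=float)
    inter_x0 = np.maximum(ref_box[0], boxes[:, 0])
    inter_y0 = np.maximum(ref_box[1], boxes[:, 1])
    inter_x1 = np.minimum(ref_box[2], boxes[:, 2])
    inter_y1 = np.minimum(ref_box[3], boxes[:, 3])
    inter = np.maximum(0, inter_x1 - inter_x0) * np.maximum(0, inter_y1 - inter_y0)
    area_ref = (ref_box[2] - ref_box[0]) * (ref_box[3] - ref_box[1])
    return inter / (area_ref + 1e-6)

# A small text box fully inside a large table cell scores ~1.0 against it, which
# is why normalizing by the reference (text) area suits cell assignment.
print(overlap_fraction([2, 2, 4, 3], [[0, 0, 10, 10], [5, 5, 6, 6]]))  # ~[1. 0.]
```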
@@ -385,16 +385,16 @@ def merge_text_in_cell(df_cell):
     Returns:
         pandas.DataFrame: Updated DataFrame with merged text and a single bounding box.
     """
-    …
+    ocr_boxes = np.stack(df_cell["ocr_box"].values)

-    df_cell["x"] = (…
-    df_cell["y"] = (…
+    df_cell["x"] = (ocr_boxes[:, 0] - ocr_boxes[:, 0].min()) // 10
+    df_cell["y"] = (ocr_boxes[:, 1] - ocr_boxes[:, 1].min()) // 10
     df_cell = df_cell.sort_values(["y", "x"])

     text = " ".join(df_cell["text"].values.tolist())
     df_cell["text"] = text
     df_cell = df_cell.head(1)
-    df_cell["paddle_box"] = df_cell["cell"]
+    df_cell["ocr_box"] = df_cell["cell"]
     df_cell.drop(["x", "y"], axis=1, inplace=True)

     return df_cell
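`merge_text_in_cell` orders the text fragments inside a cell by integer-dividing coordinate offsets by 10, so fragments on roughly the same line share a `y` bucket and then sort left to right. The bucketing in isolation:

```python
import numpy as np

# Two fragments whose y coordinates differ by less than 10 px fall into the
# same bucket (y // 10) and are therefore ordered by x, i.e. reading order.
ys = np.array([103, 97, 130])
xs = np.array([250, 40, 10])
buckets = (ys - ys.min()) // 10    # -> [0, 0, 3]
order = np.lexsort((xs, buckets))  # sort by bucket first, then by x
print(order)                       # [1 0 2]
```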
@@ -447,3 +447,58 @@ def display_markdown(
     markdown_table = "\n".join("| " + " | ".join(row) + " |" for row in data)

     return markdown_table
+
+
+def reorder_boxes(boxes, texts, confs, mode="top_left", dbscan_eps=10):
+    """
+    Reorders the boxes in reading order.
+    If mode is "center", the boxes are reordered using bbox center.
+    If mode is "top_left", the boxes are reordered using the top left corner.
+    If dbscan_eps is not 0, the boxes are reordered using DBSCAN clustering.
+
+    Args:
+        boxes (np array [n x 4 x 2]): The bounding boxes of the OCR results.
+        texts (np array [n]): The text of the OCR results.
+        confs (np array [n]): The confidence scores of the OCR results.
+        mode (str, optional): The mode to reorder the boxes. Defaults to "center".
+        dbscan_eps (float, optional): The epsilon parameter for DBSCAN. Defaults to 10.
+
+    Returns:
+        List[List[int, ...]]: The reordered bounding boxes.
+        List[str]: The reordered texts.
+        List[float]: The reordered confidence scores.
+    """
+    df = pd.DataFrame(
+        [[b, t, c] for b, t, c in zip(boxes, texts, confs)],
+        columns=["bbox", "text", "conf"],
+    )
+
+    if mode == "center":
+        df["x"] = df["bbox"].apply(lambda box: (box[0][0] + box[2][0]) / 2)
+        df["y"] = df["bbox"].apply(lambda box: (box[0][1] + box[2][1]) / 2)
+    elif mode == "top_left":
+        df["x"] = df["bbox"].apply(lambda box: (box[0][0]))
+        df["y"] = df["bbox"].apply(lambda box: (box[0][1]))
+
+    if dbscan_eps:
+        do_naive_sorting = False
+        try:
+            dbscan = DBSCAN(eps=dbscan_eps, min_samples=1)
+            dbscan.fit(df["y"].values[:, None])
+            df["cluster"] = dbscan.labels_
+            df["cluster_centers"] = df.groupby("cluster")["y"].transform("mean").astype(int)
+            df = df.sort_values(["cluster_centers", "x"], ascending=[True, True], ignore_index=True)
+        except ValueError:
+            do_naive_sorting = True
+    else:
+        do_naive_sorting = True
+
+    if do_naive_sorting:
+        df["y"] = np.round((df["y"] - df["y"].min()) // 5, 0)
+        df = df.sort_values(["y", "x"], ascending=[True, True], ignore_index=True)
+
+    bboxes = df["bbox"].values.tolist()
+    texts = df["text"].values.tolist()
+    confs = df["conf"].values.tolist()
+
+    return bboxes, texts, confs
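A usage sketch for the new `reorder_boxes` helper (assuming it is imported from the `table_and_chart` module this hunk modifies): DBSCAN with `min_samples=1` clusters the y coordinates into text lines, so boxes come back line by line, left to right.

```python
from nv_ingest_api.util.image_processing.table_and_chart import reorder_boxes

# Three detections: two on one visual line (y = 103 and y = 100), one below (y = 160).
boxes = [
    [[300, 103], [380, 103], [380, 118], [300, 118]],  # right word, first line
    [[10, 100], [90, 100], [90, 115], [10, 115]],      # left word, first line
    [[10, 160], [120, 160], [120, 175], [10, 175]],    # second line
]
texts = ["world", "hello", "again"]
confs = [0.98, 0.99, 0.97]

_, ordered_texts, _ = reorder_boxes(boxes, texts, confs, mode="top_left", dbscan_eps=10)
print(ordered_texts)  # expected: ['hello', 'world', 'again']
```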
nv_ingest_api/util/image_processing/transforms.py

@@ -209,6 +209,7 @@ def pad_image(
     target_height: int = DEFAULT_MAX_HEIGHT,
     background_color: int = 255,
     dtype=np.uint8,
+    how: str = "center",
 ) -> Tuple[np.ndarray, Tuple[int, int]]:
     """
     Pads a NumPy array representing an image to the specified target dimensions.

@@ -217,6 +218,8 @@ def pad_image(
     in that dimension. If the target dimensions are larger, the image will be centered within the
     canvas of the specified target size, with the remaining space filled with white padding.

+    The padding can be done around the center (how="center"), or to the bottom right (how="bottom_right").
+
     Parameters
     ----------
     array : np.ndarray

@@ -225,6 +228,8 @@ def pad_image(
         The desired target width of the padded image. Defaults to DEFAULT_MAX_WIDTH.
     target_height : int, optional
         The desired target height of the padded image. Defaults to DEFAULT_MAX_HEIGHT.
+    how : str, optional
+        The method to pad the image. Defaults to "center".

     Returns
     -------

@@ -249,17 +254,23 @@ def pad_image(
     """
     height, width = array.shape[:2]

-    # Determine the padding needed, if any, while ensuring no padding is applied if the target is smaller
-    pad_height = max((target_height - height) // 2, 0)
-    pad_width = max((target_width - width) // 2, 0)
-
     # Determine final canvas size (may be equal to original if target is smaller)
     final_height = max(height, target_height)
     final_width = max(width, target_width)

     # Create the canvas and place the original image on it
     canvas = background_color * np.ones((final_height, final_width, array.shape[2]), dtype=dtype)
-    canvas[pad_height : pad_height + height, pad_width : pad_width + width] = array  # noqa: E203
+
+    # Determine the padding needed, if any, while ensuring no padding is applied if the target is smaller
+    if how == "center":
+        pad_height = max((target_height - height) // 2, 0)
+        pad_width = max((target_width - width) // 2, 0)
+
+        canvas[pad_height : pad_height + height, pad_width : pad_width + width] = array  # noqa: E203
+    elif how == "bottom_right":
+        pad_height, pad_width = 0, 0
+
+        canvas[:height, :width] = array  # noqa: E203

     return canvas, (pad_width, pad_height)

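A usage sketch for the new `how` parameter (assuming `pad_image` is imported from the `transforms` module this hunk modifies): with `how="bottom_right"` the image stays anchored at the top-left corner and the returned offsets are `(0, 0)`, which simplifies mapping coordinates back to the original image.

```python
import numpy as np
from nv_ingest_api.util.image_processing.transforms import pad_image

img = np.zeros((100, 200, 3), dtype=np.uint8)  # h=100, w=200

centered, (pad_w, pad_h) = pad_image(img, target_width=400, target_height=300, how="center")
print(centered.shape, (pad_w, pad_h))  # (300, 400, 3) (100, 100)

anchored, offsets = pad_image(img, target_width=400, target_height=300, how="bottom_right")
print(anchored.shape, offsets)         # (300, 400, 3) (0, 0)
```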
nv_ingest_api/util/message_brokers/simple_message_broker/broker.py

@@ -250,7 +250,7 @@ class SimpleMessageBrokerHandler(socketserver.BaseRequestHandler):
         with queue_lock:
             if queue.empty():
                 # Return failure response immediately
-                response = ResponseSchema(response_code=…
+                response = ResponseSchema(response_code=2, response_reason="Job not ready")
                 self._send_response(response)
                 return
         # Pop the message from the queue
nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py

@@ -14,7 +14,7 @@ import logging
 from typing import Optional, Tuple, Union

 from nv_ingest_api.internal.schemas.message_brokers.response_schema import ResponseSchema
-from nv_ingest_api.util.service_clients.client_base import MessageBrokerClientBase
+from nv_ingest_api.util.service_clients.client_base import MessageBrokerClientBase

 logger = logging.getLogger(__name__)

@@ -108,29 +108,23 @@ class SimpleClient(MessageBrokerClientBase):
         return self._handle_push(queue_name, message, timeout, for_nv_ingest)

     def fetch_message(
-        self,
-        queue_name: str,
-        timeout: Optional[Tuple[int, Union[float]]] = (100, None),
-        override_fetch_mode: FetchMode = None,
+        self, queue_name: str, timeout: Optional[Tuple[int, Union[float, None]]] = (1200, None)
     ) -> ResponseSchema:
         """
-        Fetch a message from …
+        Fetch a message from a specified queue.

         Parameters
         ----------
         queue_name : str
             The name of the queue.
-        timeout : …
-            …
+        timeout : tuple, optional
+            A tuple containing the timeout value and an unused second element.

         Returns
         -------
         ResponseSchema
-            The response …
+            The response from the broker.
         """
-        if isinstance(timeout, int):
-            timeout = (timeout, None)
-
         return self._handle_pop(queue_name, timeout)

     def ping(self) -> ResponseSchema:
@@ -208,6 +202,7 @@ class SimpleClient(MessageBrokerClientBase):

         try:
             with socket.create_connection((self._host, self._port), timeout=self._connection_timeout) as sock:
+                sock.settimeout(self._connection_timeout)
                 self._send(sock, json.dumps(command).encode("utf-8"))
                 # Receive initial response with transaction ID
                 response_data = self._recv(sock)
@@ -241,8 +236,9 @@ class SimpleClient(MessageBrokerClientBase):

                 return ResponseSchema(**final_response)

-        except (ConnectionError, socket.error, BrokenPipeError):
-            …
+        except (ConnectionError, socket.error, BrokenPipeError, socket.timeout) as e:
+            logger.debug(f"Connection error during PUSH: {e}")
+            pass  # Will be retried
         except json.JSONDecodeError:
             return ResponseSchema(response_code=1, response_reason="Invalid JSON response from server.")
         except Exception as e:
@@ -272,61 +268,67 @@ class SimpleClient(MessageBrokerClientBase):

         command = {"command": "POP", "queue_name": queue_name}

-        …
+        timeout_val = timeout[0] if isinstance(timeout, tuple) else timeout

-        if …
-            command["timeout"] = …
+        if timeout_val is not None:
+            command["timeout"] = timeout_val

         start_time = time.time()
+        backoff_delay = 1  # Start with a 1-second backoff
+
         while True:
             elapsed = time.time() - start_time
-            …
-            …
-                return ResponseSchema(response_code=1, response_reason="POP operation timed out.")
+            if timeout_val is not None and elapsed >= timeout_val:
+                return ResponseSchema(response_code=2, response_reason="Job not ready.")

             try:
                 with socket.create_connection((self._host, self._port), timeout=self._connection_timeout) as sock:
+                    sock.settimeout(self._connection_timeout)
                     self._send(sock, json.dumps(command).encode("utf-8"))
                     # Receive initial response with transaction ID and message
                     response_data = self._recv(sock)
                     response = json.loads(response_data)

-                    …
-                    …
-                    …
-                    …
-                    …
-                    …
-                    …
-                    …
-                    …
+                    # The broker now returns a response_code of 2 for a timeout, which the high-level
+                    # client should handle as a retryable event.
+                    if response.get("response_code") == 2:
+                        # Queue is empty or job not ready, continue to backoff and retry
+                        pass
+                    elif response.get("response_code") != 0:
+                        return ResponseSchema(**response)
+                    else:
+                        # Success case: we received a message.
+                        if "transaction_id" not in response:
+                            return ResponseSchema(response_code=1, response_reason="No transaction_id in response.")

-                    …
+                        transaction_id = response["transaction_id"]
+                        message = response.get("response")

-                    …
-                    …
+                        # Send ACK
+                        ack_data = json.dumps({"transaction_id": transaction_id, "ack": True}).encode("utf-8")
+                        self._send(sock, ack_data)

-                    …
-                    …
-                    …
+                        # Receive final response
+                        final_response_data = self._recv(sock)
+                        final_response = json.loads(final_response_data)

-                    …
-                    …
-                    …
-                    …
-                    if final_response.get("response_code") == 0:
-                        return ResponseSchema(response_code=0, response=message, transaction_id=transaction_id)
-                    else:
-                        return ResponseSchema(**final_response)
+                        if final_response.get("response_code") == 0:
+                            return ResponseSchema(response_code=0, response=message, transaction_id=transaction_id)
+                        else:
+                            return ResponseSchema(**final_response)

-            except (ConnectionError, socket.error, BrokenPipeError):
-                …
+            except (ConnectionError, socket.error, BrokenPipeError, socket.timeout) as e:
+                # Let the high-level client handle connection errors as retryable.
+                logger.debug(f"Connection error during POP: {e}, will retry after backoff.")
+                pass  # Fall through to backoff and retry
             except json.JSONDecodeError:
                 return ResponseSchema(response_code=1, response_reason="Invalid JSON response from server.")
             except Exception as e:
                 return ResponseSchema(response_code=1, response_reason=str(e))

-            …
+            # Exponential backoff
+            time.sleep(backoff_delay)
+            backoff_delay = min(backoff_delay * 2, self._max_backoff)

     def _execute_simple_command(self, command: dict) -> ResponseSchema:
         """
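The reworked POP loop replaces the old hard timeout failure with capped exponential backoff: a retryable outcome (response code 2 or a connection error) falls through to a sleep that doubles up to `self._max_backoff`. The generic pattern as a minimal sketch (`fetch_once` and `max_backoff` are placeholder names, not package API):

```python
import time

def poll_with_backoff(fetch_once, timeout=1200, max_backoff=32):
    """Retry a fetch until success or timeout, doubling the sleep each round."""
    start = time.time()
    delay = 1  # start with a 1-second backoff, as in the diff
    while True:
        if time.time() - start >= timeout:
            return None  # caller treats this as "job not ready"
        result = fetch_once()
        if result is not None:
            return result
        time.sleep(delay)
        delay = min(delay * 2, max_backoff)  # cap the backoff
```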
@@ -350,12 +352,13 @@ class SimpleClient(MessageBrokerClientBase):

         try:
             with socket.create_connection((self._host, self._port), timeout=self._connection_timeout) as sock:
+                sock.settimeout(self._connection_timeout)
                 self._send(sock, data)
                 response_data = self._recv(sock)
                 response = json.loads(response_data)
                 return ResponseSchema(**response)
-        except (ConnectionError, socket.error, BrokenPipeError) as e:
-            return ResponseSchema(response_code=…
+        except (ConnectionError, socket.error, BrokenPipeError, socket.timeout) as e:
+            return ResponseSchema(response_code=2, response_reason=f"Connection error: {e}")
         except json.JSONDecodeError:
             return ResponseSchema(response_code=1, response_reason="Invalid JSON response from server.")
         except Exception as e: