data-science-document-ai 1.43.6__tar.gz → 1.45.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/PKG-INFO +1 -1
  2. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/pyproject.toml +1 -1
  3. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/constants.py +4 -1
  4. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/excel_processing.py +1 -2
  5. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/io.py +23 -0
  6. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/pdf_processing.py +117 -40
  7. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/postprocessing/common.py +132 -25
  8. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/postprocessing/postprocess_partner_invoice.py +121 -55
  9. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/customsInvoice/other/prompt.txt +1 -1
  10. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/partnerInvoice/other/prompt.txt +4 -2
  11. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/utils.py +63 -41
  12. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/constants_sandbox.py +0 -0
  13. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/docai.py +0 -0
  14. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/docai_processor_config.yaml +0 -0
  15. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/llm.py +0 -0
  16. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/log_setup.py +0 -0
  17. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/postprocessing/postprocess_booking_confirmation.py +0 -0
  18. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/postprocessing/postprocess_commercial_invoice.py +0 -0
  19. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +0 -0
  20. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/bookingConfirmation/evergreen/prompt.txt +0 -0
  21. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +0 -0
  22. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +0 -0
  23. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/bookingConfirmation/maersk/placeholders.json +0 -0
  24. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/bookingConfirmation/maersk/prompt.txt +0 -0
  25. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/bookingConfirmation/msc/placeholders.json +0 -0
  26. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/bookingConfirmation/msc/prompt.txt +0 -0
  27. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/bookingConfirmation/oocl/placeholders.json +0 -0
  28. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/bookingConfirmation/oocl/prompt.txt +0 -0
  29. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/bookingConfirmation/other/placeholders.json +0 -0
  30. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/bookingConfirmation/other/prompt.txt +0 -0
  31. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/bookingConfirmation/yangming/placeholders.json +0 -0
  32. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/bookingConfirmation/yangming/prompt.txt +0 -0
  33. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/bundeskasse/other/placeholders.json +0 -0
  34. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/bundeskasse/other/prompt.txt +0 -0
  35. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/commercialInvoice/other/placeholders.json +0 -0
  36. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/commercialInvoice/other/prompt.txt +0 -0
  37. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/customsAssessment/other/prompt.txt +0 -0
  38. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/customsInvoice/other/placeholders.json +0 -0
  39. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/deliveryOrder/other/placeholders.json +0 -0
  40. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/deliveryOrder/other/prompt.txt +0 -0
  41. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -0
  42. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/draftMbl/maersk/prompt.txt +0 -0
  43. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/draftMbl/other/placeholders.json +0 -0
  44. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/draftMbl/other/prompt.txt +0 -0
  45. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -0
  46. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/finalMbL/maersk/prompt.txt +0 -0
  47. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/finalMbL/other/prompt.txt +0 -0
  48. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/packingList/other/placeholders.json +0 -0
  49. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/packingList/other/prompt.txt +0 -0
  50. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/partnerInvoice/other/placeholders.json +0 -0
  51. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/postprocessing/port_code/placeholders.json +0 -0
  52. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/postprocessing/port_code/prompt_port_code.txt +0 -0
  53. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/preprocessing/carrier/placeholders.json +0 -0
  54. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/preprocessing/carrier/prompt.txt +0 -0
  55. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/shippingInstruction/other/prompt.txt +0 -0
  56. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/prompt_library.py +0 -0
  57. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/setup.py +0 -0
  58. {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/tms.py +0 -0
--- data_science_document_ai-1.43.6/PKG-INFO
+++ data_science_document_ai-1.45.2/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: data-science-document-ai
- Version: 1.43.6
+ Version: 1.45.2
  Summary: "Document AI repo for data science"
  Author: Naomi Nguyen
  Author-email: naomi.nguyen@forto.com
--- data_science_document_ai-1.43.6/pyproject.toml
+++ data_science_document_ai-1.45.2/pyproject.toml
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "data-science-document-ai"
- version = "1.43.6"
+ version = "1.45.2"
  description = "\"Document AI repo for data science\""
  authors = ["Naomi Nguyen <naomi.nguyen@forto.com>", "Kumar Rajendrababu <kumar.rajendrababu@forto.com>", "Igor Tonko <igor.tonko@forto.com>", "Osman Demirel <osman.demirel@forto.com>"]
  packages = [
--- data_science_document_ai-1.43.6/src/constants.py
+++ data_science_document_ai-1.45.2/src/constants.py
@@ -23,9 +23,12 @@ project_parameters = {
  "invoice_classification_lookup": "invoice_classification.json",
  "reverse_charge_sentence_lookup": "reverse_charge_sentences.json",
  # Fuzzy logic params
- "fuzzy_threshold_item_code": 70,
+ "fuzzy_threshold_item_code": 90,
  "fuzzy_threshold_reverse_charge": 80,
  "fuzzy_threshold_invoice_classification": 70,
+ # Chunking params
+ "chunk_size": 1, # page (do not change this without changing the page number logic)
+ "chunk_after": 10, # pages
  # Big Query
  "g_ai_gbq_db_schema": "document_ai",
  "g_ai_gbq_db_table_out": "document_ai_api_calls_v1",
--- data_science_document_ai-1.43.6/src/excel_processing.py
+++ data_science_document_ai-1.45.2/src/excel_processing.py
@@ -4,8 +4,6 @@ import logging

  from ddtrace import tracer

- from src.postprocessing.common import llm_prediction_to_tuples
-
  logger = logging.getLogger(__name__)

  import asyncio
@@ -78,6 +76,7 @@ async def extract_data_from_excel(
  "bundeskasse",
  "commercialInvoice",
  "packingList",
+ "bookingConfirmation",
  ]
  else generate_schema_structure(params, input_doc_type)
  )
--- data_science_document_ai-1.43.6/src/io.py
+++ data_science_document_ai-1.45.2/src/io.py
@@ -156,4 +156,27 @@ def download_dir_from_bucket(bucket, directory_cloud, directory_local) -> bool:
  return result


+ def bq_logs(data_to_insert, params):
+ """Insert logs into Google BigQuery.
+
+ Args:
+ data_to_insert (list): The data to insert into BigQuery.
+ params (dict): The parameters dictionary.
+ """
+ # Use the pre-initialized BigQuery client
+ bq_client = params["bq_client"]
+ # Get the table string
+ table_string = f"{params['g_ai_project_name']}.{params['g_ai_gbq_db_schema']}.{params['g_ai_gbq_db_table_out']}"
+
+ logger.info(f"Log table: {table_string}")
+ # Insert the rows into the table
+ insert_logs = bq_client.insert_rows_json(table_string, data_to_insert)
+
+ # Check if there were any errors inserting the rows
+ if not insert_logs:
+ logger.info("New rows have been added.")
+ else:
+ logger.info("Errors occurred while inserting rows: ", insert_logs)
+
+
  # type: ignore
--- data_science_document_ai-1.43.6/src/pdf_processing.py
+++ data_science_document_ai-1.45.2/src/pdf_processing.py
@@ -36,6 +36,7 @@ from src.utils import (
  get_pdf_page_count,
  get_processor_name,
  run_background_tasks,
+ split_pdf_into_chunks,
  transform_schema_strings,
  validate_based_on_schema,
  )
@@ -195,15 +196,11 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
  result (dict): The structured data extracted from the document, formatted as JSON.
  """
  # Bundeskasse invoices contains all the required information in the first 3 pages.
- file_content = (
- extract_top_pages(file_content, num_pages=5)
- if input_doc_type == "bundeskasse"
- else file_content
- )
- number_of_pages = get_pdf_page_count(file_content)
+ if input_doc_type == "bundeskasse":
+ file_content = extract_top_pages(file_content, num_pages=5)

- # convert file_content to required document
- document = llm_client.prepare_document_for_gemini(file_content)
+ number_of_pages = get_pdf_page_count(file_content)
+ logger.info(f"processing {input_doc_type} with {number_of_pages} pages...")

  # get the schema placeholder from the Doc AI and generate the response structure
  response_schema = (
@@ -215,26 +212,28 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
  "bundeskasse",
  "commercialInvoice",
  "packingList",
+ "bookingConfirmation",
  ]
  else generate_schema_structure(params, input_doc_type)
  )

  carrier = "other"
- if (
- "preprocessing" in prompt_library.library.keys()
- and "carrier" in prompt_library.library["preprocessing"].keys()
- and input_doc_type
- in prompt_library.library["preprocessing"]["carrier"]["placeholders"].keys()
- ):
- carrier_schema = prompt_library.library["preprocessing"]["carrier"][
- "placeholders"
- ][input_doc_type]
+ carrier_schema = (
+ prompt_library.library.get("preprocessing", {})
+ .get("carrier", {})
+ .get("placeholders", {})
+ .get(input_doc_type)
+ )

+ if carrier_schema:
  carrier_prompt = prompt_library.library["preprocessing"]["carrier"]["prompt"]
  carrier_prompt = carrier_prompt.replace(
  "DOCUMENT_TYPE_PLACEHOLDER", input_doc_type
  )

+ # convert file_content to required document
+ document = llm_client.prepare_document_for_gemini(file_content)
+
  # identify carrier for customized prompting
  carrier = await identify_carrier(
  document,
@@ -244,37 +243,115 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
  doc_type=input_doc_type,
  )

- if input_doc_type == "bookingConfirmation":
- response_schema = prompt_library.library[input_doc_type][carrier][
- "placeholders"
- ]
-
+ # Select prompt
  if (
- input_doc_type in prompt_library.library.keys()
- and carrier in prompt_library.library[input_doc_type].keys()
+ input_doc_type not in prompt_library.library
+ or carrier not in prompt_library.library[input_doc_type]
  ):
- # get the related prompt from predefined prompt library
- prompt = prompt_library.library[input_doc_type][carrier]["prompt"]
+ return {}

- # Update schema to extract value-page_number pairs
- if number_of_pages > 1:
- response_schema = transform_schema_strings(response_schema)
+ # get the related prompt from predefined prompt library
+ prompt = prompt_library.library[input_doc_type][carrier]["prompt"]

- # Update the prompt to instruct LLM to include page numbers
- prompt += "\nFor each field, provide the page number where the information was found. The page numbering starts from 0."
+ # Add page-number extraction for moderately large docs
+ use_chunking = number_of_pages >= params["chunk_after"]

- # generate the result with LLM (gemini)
- result = await llm_client.get_unified_json_genai(
- prompt=prompt,
- document=document,
- response_schema=response_schema,
- doc_type=input_doc_type,
+ # Update schema and prompt to extract value-page_number pairs
+ if not use_chunking and number_of_pages > 1:
+ response_schema = transform_schema_strings(response_schema)
+ prompt += "\nFor each field, provide the page number where the information was found. The page numbering starts from 0."
+
+ tasks = []
+ # Process in chunks if number of pages exceeds threshold and Process all chunks concurrently
+ for chunk in (
+ split_pdf_into_chunks(file_content, chunk_size=params["chunk_size"])
+ if use_chunking
+ else [file_content]
+ ):
+ tasks.append(
+ process_chunk_with_retry(
+ chunk, prompt, response_schema, llm_client, input_doc_type
+ )
  )

- result = llm_prediction_to_tuples(result, number_of_pages)
+ results = await asyncio.gather(*tasks, return_exceptions=True)

- return result
- return {}
+ if use_chunking:
+ return merge_llm_results(results, response_schema)
+ else:
+ return llm_prediction_to_tuples(results[0], number_of_pages=number_of_pages)
+
+
+ async def process_chunk_with_retry(
+ chunk_content, prompt, response_schema, llm_client, input_doc_type, retries=2
+ ):
+ """Process a chunk with retries in case of failure."""
+ for attempt in range(1, retries + 1):
+ try:
+ return await process_chunk(
+ chunk_content=chunk_content,
+ prompt=prompt,
+ response_schema=response_schema,
+ llm_client=llm_client,
+ input_doc_type=input_doc_type,
+ )
+ except Exception as e:
+ logger.error(f"Chunk failed on attempt {attempt}: {e}")
+ if attempt == retries:
+ raise
+ await asyncio.sleep(1) # small backoff
+
+
+ async def process_chunk(
+ chunk_content, prompt, response_schema, llm_client, input_doc_type
+ ):
+ """Process a chunk with Gemini."""
+ document = llm_client.prepare_document_for_gemini(chunk_content)
+ return await llm_client.get_unified_json_genai(
+ prompt=prompt,
+ document=document,
+ response_schema=response_schema,
+ doc_type=input_doc_type,
+ )
+
+
+ def merge_llm_results(results, response_schema):
+ """Merge LLM results from multiple chunks."""
+ merged = {}
+ for i, result in enumerate(results):
+ if not isinstance(result, dict):
+ continue
+ # Add page number to all values coming from this chunk
+ result = llm_prediction_to_tuples(result, number_of_pages=1, page_number=i)
+
+ # Merge the result into the final merged dictionary
+ for key, value in result.items():
+ field_type = (
+ response_schema["properties"].get(key, {}).get("type", "").upper()
+ )
+
+ if key not in merged:
+ if field_type == "ARRAY":
+ # append the values as a list
+ merged[key] = (
+ value if isinstance(value, list) else ([value] if value else [])
+ )
+ else:
+ merged[key] = value
+ continue
+
+ if field_type == "ARRAY":
+ # append list contents across chunks
+ if isinstance(value, list):
+ merged[key].extend(value)
+ else:
+ merged[key].append(value)
+
+ # take first non-null value only
+ if merged[key] in (None, "", [], {}):
+ merged[key] = value
+
+ return merged


  async def extract_data_from_pdf_w_llm(params, input_doc_type, file_content, llm_client):
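An illustration (not from the package) of how merge_llm_results combines two single-page chunk outputs; the field names and values are hypothetical, and the schema is assumed to mark lineItem as ARRAY and invoiceNumber as a scalar:

    chunk_outputs = [
        {"invoiceNumber": "INV-123", "lineItem": [{"totalAmount": "100"}]},   # chunk / page 0
        {"invoiceNumber": None, "lineItem": [{"totalAmount": "250"}]},        # chunk / page 1
    ]
    # After llm_prediction_to_tuples tags each chunk with its page index, the merge keeps
    # the first non-empty scalar and concatenates ARRAY fields across chunks:
    # {"invoiceNumber": ("INV-123", 0),
    #  "lineItem": [{"totalAmount": ("100", 0)}, {"totalAmount": ("250", 1)}]}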
--- data_science_document_ai-1.43.6/src/postprocessing/common.py
+++ data_science_document_ai-1.45.2/src/postprocessing/common.py
@@ -12,7 +12,7 @@ from src.constants import formatting_rules
  from src.io import logger
  from src.postprocessing.postprocess_partner_invoice import process_partner_invoice
  from src.prompts.prompt_library import prompt_library
- from src.utils import get_tms_mappings
+ from src.utils import batch_fetch_all_mappings, get_tms_mappings

  tms_domain = os.environ["TMS_DOMAIN"]

@@ -134,8 +134,11 @@ def extract_number(data_field_value):
  formatted_value: string

  """
+ # Remove container size pattern like 20FT, 40HC, etc from 1 x 40HC
+ value = remove_unwanted_patterns(data_field_value)
+
  formatted_value = ""
- for c in data_field_value:
+ for c in value:
  if c.isnumeric() or c in [",", ".", "-"]:
  formatted_value += c

@@ -320,9 +323,12 @@ def remove_unwanted_patterns(lineitem: str):
  lineitem = lineitem.replace("HIGH CUBE", "")

  # Remove container size e.g., 20FT, 40HC, etc.
- lineitem = re.sub(
- r"\b(20|22|40|45)(FT|HC|DC|HD|GP|OT|RF|FR|TK|DV)?\b", "", lineitem
- ).strip()
+ pattern = [
+ f"{s}{t}"
+ for s in ("20|22|40|45".split("|"))
+ for t in ("FT|HC|DC|HD|GP|OT|RF|FR|TK|DV".split("|"))
+ ]
+ lineitem = re.sub(r"|".join(pattern), "", lineitem, flags=re.IGNORECASE).strip()

  return lineitem

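A standalone sketch (not package code) of what the rewritten substitution above does:

    import re

    sizes = "20|22|40|45".split("|")
    types = "FT|HC|DC|HD|GP|OT|RF|FR|TK|DV".split("|")
    pattern = "|".join(f"{s}{t}" for s in sizes for t in types)   # "20FT|20HC|...|45DV"
    print(re.sub(pattern, "", "1 x 40hc Trucking", flags=re.IGNORECASE).strip())
    # -> "1 x  Trucking": the size/type token is now matched case-insensitively,
    # while a bare "40" with no type suffix is no longer removed (the old
    # \b(20|22|40|45)(FT|HC|DC|HD|GP|OT|RF|FR|TK|DV)?\b pattern also stripped it).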
@@ -372,18 +378,45 @@ def clean_item_description(lineitem: str, remove_numbers: bool = True):
  return re.sub(r"\s{2,}", " ", lineitem).strip()


- async def format_label(entity_k, entity_value, document_type_code, params, mime_type):
+ async def format_label(
+ entity_k,
+ entity_value,
+ document_type_code,
+ params,
+ mime_type,
+ container_map,
+ terminal_map,
+ depot_map,
+ ):
  llm_client = params["LlmClient"]
  if isinstance(entity_value, dict): # if it's a nested entity
  format_tasks = [
- format_label(sub_k, sub_v, document_type_code, params, mime_type)
+ format_label(
+ sub_k,
+ sub_v,
+ document_type_code,
+ params,
+ mime_type,
+ container_map,
+ terminal_map,
+ depot_map,
+ )
  for sub_k, sub_v in entity_value.items()
  ]
  return entity_k, {k: v for k, v in await asyncio.gather(*format_tasks)}
  if isinstance(entity_value, list):
  format_tasks = await asyncio.gather(
  *[
- format_label(entity_k, sub_v, document_type_code, params, mime_type)
+ format_label(
+ entity_k,
+ sub_v,
+ document_type_code,
+ params,
+ mime_type,
+ container_map,
+ terminal_map,
+ depot_map,
+ )
  for sub_v in entity_value
  ]
  )
@@ -405,13 +438,13 @@ async def format_label(entity_k, entity_value, document_type_code, params, mime_
  )

  elif (entity_key == "containertype") or (entity_key == "containersize"):
- formatted_value = get_tms_mappings(entity_value, "container_types")
+ formatted_value = container_map.get(entity_value)

  elif check_formatting_rule(entity_k, document_type_code, "terminal"):
- formatted_value = get_tms_mappings(entity_value, "terminals")
+ formatted_value = terminal_map.get(entity_value)

  elif check_formatting_rule(entity_k, document_type_code, "depot"):
- formatted_value = get_tms_mappings(entity_value, "depots")
+ formatted_value = depot_map.get(entity_value)

  elif entity_key.startswith(("eta", "etd", "duedate", "issuedate", "servicedate")):
  try:
@@ -507,7 +540,8 @@ async def get_port_code_ai(port: str, llm_client, doc_type=None):
  """Get port code using AI model."""
  port_llm = await get_port_code_llm(port, llm_client, doc_type=doc_type)

- return get_tms_mappings(port, "ports", port_llm)
+ result = await get_tms_mappings(port, "ports", port_llm)
+ return result.get(port, None)


  async def get_port_code_llm(port: str, llm_client, doc_type=None):
@@ -598,6 +632,74 @@ def decimal_convertor(value, quantity=False):
  return value


+ async def collect_mapping_requests(entity_value, document_type_code):
+ """Collect all unique container types, terminals, and depots from the entity value."""
+ # Sets to store unique values
+ container_types = set()
+ terminals = set()
+ depots = set()
+
+ def walk(key, value):
+ key_lower = key.lower()
+
+ # nested dict
+ if isinstance(value, dict):
+ for k, v in value.items():
+ walk(k, v)
+
+ # list of values
+ elif isinstance(value, list):
+ for item in value:
+ walk(key, item)
+
+ # leaf node
+ else:
+ if key_lower in ("containertype", "containersize"):
+ # Take only "20DV" from ('20DV', 0) if it's a tuple
+ container_types.add(value[0]) if isinstance(
+ value, tuple
+ ) else container_types.add(value)
+
+ elif check_formatting_rule(key, document_type_code, "terminal"):
+ terminals.add(value[0]) if isinstance(value, tuple) else terminals.add(
+ value
+ )
+
+ elif check_formatting_rule(key, document_type_code, "depot"):
+ depots.add(value[0]) if isinstance(value, tuple) else depots.add(value)
+
+ walk("root", entity_value)
+
+ return container_types, terminals, depots
+
+
+ async def format_all_labels(entity_data, document_type_code, params, mime_type):
+ """Format all labels in the entity data using cached mappings."""
+ # Collect all mapping values needed
+ container_req, terminal_req, depot_req = await collect_mapping_requests(
+ entity_data, document_type_code
+ )
+
+ # Batch fetch mappings
+ container_map, terminal_map, depot_map = await batch_fetch_all_mappings(
+ container_req, terminal_req, depot_req
+ )
+
+ # Format labels using cached mappings
+ _, result = await format_label(
+ "root",
+ entity_data,
+ document_type_code,
+ params,
+ mime_type,
+ container_map,
+ terminal_map,
+ depot_map,
+ )
+
+ return _, result
+
+
  async def format_all_entities(result, document_type_code, params, mime_type):
  """Format the entity values in the result dictionary."""
  # Since we treat `customsInvoice` same as `partnerInvoice`
@@ -613,13 +715,13 @@ async def format_all_entities(result, document_type_code, params, mime_type):
  return {}

  # Format all entities recursively
- _, aggregated_data = await format_label(
- None, result, document_type_code, params, mime_type
+ _, aggregated_data = await format_all_labels(
+ result, document_type_code, params, mime_type
  )

  # Process partner invoice on lineitem mapping and reverse charge sentence
  if document_type_code in ["partnerInvoice", "bundeskasse"]:
- process_partner_invoice(params, aggregated_data, document_type_code)
+ await process_partner_invoice(params, aggregated_data, document_type_code)

  logger.info("Data Extraction completed successfully")
  return aggregated_data
@@ -651,41 +753,46 @@ def remove_stop_words(lineitem: str):
  )


- def llm_prediction_to_tuples(llm_prediction, number_of_pages=-1):
+ def llm_prediction_to_tuples(llm_prediction, number_of_pages=-1, page_number=None):
  """Convert LLM prediction dictionary to tuples of (value, page_number)."""
-
  # If only 1 page, simply pair each value with page number 0
  if number_of_pages == 1:
+ effective_page = 0 if page_number is None else page_number
  if isinstance(llm_prediction, dict):
  return {
- k: llm_prediction_to_tuples(v, number_of_pages)
+ k: llm_prediction_to_tuples(
+ v, number_of_pages, page_number=effective_page
+ )
  for k, v in llm_prediction.items()
  }
  elif isinstance(llm_prediction, list):
  return [
- llm_prediction_to_tuples(v, number_of_pages) for v in llm_prediction
+ llm_prediction_to_tuples(v, number_of_pages, page_number=effective_page)
+ for v in llm_prediction
  ]
  else:
- return (llm_prediction, 0) if llm_prediction else None
+ return (llm_prediction, effective_page) if llm_prediction else None

  # logic for multi-page predictions
  if isinstance(llm_prediction, dict):
  if "page_number" in llm_prediction.keys() and "value" in llm_prediction.keys():
  if llm_prediction["value"]:
  try:
- page_number = int(llm_prediction["page_number"])
+ _page_number = int(llm_prediction["page_number"])
  except: # noqa: E722
- page_number = -1
- return (llm_prediction["value"], page_number)
+ _page_number = -1
+ return (llm_prediction["value"], _page_number)
  return None

  for key, value in llm_prediction.items():
  llm_prediction[key] = llm_prediction_to_tuples(
- llm_prediction.get(key, value), number_of_pages
+ llm_prediction.get(key, value), number_of_pages, page_number
  )

  elif isinstance(llm_prediction, list):
  for i, item in enumerate(llm_prediction):
- llm_prediction[i] = llm_prediction_to_tuples(item, number_of_pages)
+ llm_prediction[i] = llm_prediction_to_tuples(
+ item, number_of_pages, page_number
+ )

  return llm_prediction
--- data_science_document_ai-1.43.6/src/postprocessing/postprocess_partner_invoice.py
+++ data_science_document_ai-1.45.2/src/postprocessing/postprocess_partner_invoice.py
@@ -1,7 +1,5 @@
  """This module contains the postprocessing functions for the partner invoice."""
- from concurrent.futures import ThreadPoolExecutor
-
- from fuzzywuzzy import fuzz
+ from rapidfuzz import fuzz, process

  from src.io import logger
  from src.utils import get_tms_mappings
@@ -136,7 +134,7 @@ def update_recipient_and_vendor(aggregated_data, is_recipient_forto):
  ] = "Dasbachstraße 15, 54292 Trier, Germany"


- def process_partner_invoice(params, aggregated_data, document_type_code):
+ async def process_partner_invoice(params, aggregated_data, document_type_code):
  """Process the partner invoice data."""
  # Post process bundeskasse invoices
  if document_type_code == "bundeskasse":
@@ -160,27 +158,80 @@ def process_partner_invoice(params, aggregated_data, document_type_code):
  reverse_charge_info["formattedValue"] = reverse_charge_value
  reverse_charge = aggregated_data.pop("reverseChargeSentence", None)

- # Process each line item
- for line_item in line_items:
- if line_item.get("lineItemDescription", None) is not None:
- line_item["itemCode"] = associate_forto_item_code(
- line_item["lineItemDescription"]["formattedValue"],
- params,
- )
+ # Process everything in one go
+ processed_items = await process_line_items_batch(params, line_items, reverse_charge)

- # Add page number for the consistency
- line_item["itemCode"]["page"] = line_item["lineItemDescription"]["page"]
+ # Update your main data structure
+ aggregated_data["lineItem"] = processed_items

- if reverse_charge:
- # Distribute reverseChargeSentence to all line items
- line_item["reverseChargeSentence"] = reverse_charge
- line_item["reverseChargeSentence"]["page"] = reverse_charge["page"]

+ async def process_line_items_batch(
+ params: dict, line_items: list[dict], reverse_charge=None
+ ):
+ """
+ Processes all line items efficiently using a "Split-Apply-Combine" strategy.
+ """
+ # To store items that need external API lookup
+ pending_line_items = {}
+
+ # Check Fuzzy Matching
+ logger.info(f"Mapping line item codes with Fuzzy matching....")
+ for i, item in enumerate(line_items):
+ description_obj = item.get("lineItemDescription")
+
+ if not description_obj or not description_obj.get("formattedValue"):
+ continue
+ # Get the formatted description text
+ desc = description_obj["formattedValue"]
+
+ # Find Fuzzy Match
+ matched_code = find_matching_lineitem(
+ desc,
+ params["lookup_data"]["item_code"],
+ params["fuzzy_threshold_item_code"],
+ )
+
+ if matched_code:
+ # Set the code to the line item
+ item["itemCode"] = {
+ "documentValue": desc,
+ "formattedValue": matched_code,
+ "page": description_obj.get("page"),
+ }
+ else:
+ # Store for batch API call
+ pending_line_items[i] = desc
+
+ # Batch API Call for Embedding lookups
+ if pending_line_items:
+ values_to_fetch = list(set(pending_line_items.values()))
+ logger.info(f"Mapping {len(values_to_fetch)} line items from Embedding API...")
+
+ # Await the batch response {"desc1": "code1", "desc2": "code2"}
+ api_results = await get_tms_mappings(
+ input_list=values_to_fetch, embedding_type="line_items"
+ )
+
+ # Merge API results back into original list
+ for index, desc in pending_line_items.items():
+ # Get result from API response, or None if API failed for that item
+ forto_code = api_results.get(desc)
+
+ # Update the original item
+ line_items[index]["itemCode"] = {
+ "documentValue": desc,
+ "formattedValue": forto_code, # Might be None if API failed
+ "page": line_items[index]["lineItemDescription"].get("page"),
+ }

- def compute_score(args):
- """Compute the fuzzy matching score between a new line item and a key."""
- new_lineitem, key = args
- return key, fuzz.ratio(new_lineitem, key)
+ # Add reverse charge here if exists
+ if reverse_charge:
+ [
+ item.update({"reverseChargeSentence": reverse_charge})
+ for item in line_items
+ if item["itemCode"]["formattedValue"] != "CDU"
+ ]
+ return line_items


  def get_fuzzy_match_score(target: str, sentences: list, threshold: int):
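For reference, the rapidfuzz call adopted in the next hunk returns either None (best score below the cutoff) or a (match, score, index) tuple, which is why the rewritten get_fuzzy_match_score checks for None before unpacking; a standalone sketch with hypothetical choices:

    from rapidfuzz import fuzz, process

    choices = ["TERMINAL HANDLING CHARGE", "DOCUMENTATION FEE"]
    result = process.extractOne("TERMINAL HANDLING", choices, scorer=fuzz.WRatio, score_cutoff=80)
    if result is not None:
        match, score, index = result   # e.g. ("TERMINAL HANDLING CHARGE", <score >= 80>, 0)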
@@ -195,16 +246,18 @@ def get_fuzzy_match_score(target: str, sentences: list, threshold: int):
  tuple: (best_match, score) if above threshold, else (None, 0)
  """
  # Use multiprocessing to find the best match
- with ThreadPoolExecutor() as executor:
- results = executor.map(compute_score, [(target, s) for s in sentences])
+ result = process.extractOne(
+ target, sentences, scorer=fuzz.WRatio, score_cutoff=threshold
+ )

- # Find the best match and score
- best_match, best_score = max(results, key=lambda x: x[1], default=(None, 0))
+ if result is None:
+ return None, False

- # return best_match, best_score
- # If the best match score is above a threshold (e.g., 80), return it
- if best_score >= threshold:
- return best_match, True
+ match, score, index = result
+
+ # return best_match if the best match score is above a threshold (e.g., 80)
+ if match:
+ return match, True

  return None, False

@@ -236,46 +289,59 @@ def find_matching_lineitem(new_lineitem: str, kvp_dict: dict, threshold=90):
  Returns:
  str: The best matching 'Forto SLI' value from the dictionary.
  """
- new_lineitem = new_lineitem.upper()
-
  # Check if the new line item is already in the dictionary
  if new_lineitem in kvp_dict:
  return kvp_dict[new_lineitem]

  # Get the best fuzzy match score for the extracted line item
- best_match, _ = get_fuzzy_match_score(
- new_lineitem, list(kvp_dict.keys()), threshold
+ match, _ = get_fuzzy_match_score(
+ new_lineitem,
+ list(kvp_dict.keys()),
+ threshold,
  )

- return kvp_dict.get(best_match, None)
+ if match:
+ # find the code from the kvp_dict
+ return kvp_dict[match]

+ return None

- def associate_forto_item_code(input_string, params):
- """
- Finds a match for the input string using fuzzy matching first, then embedding fallback.
-
- 1. Tries to find a fuzzy match for input_string against the keys in
- mapping_data using RapidFuzz, requiring a score >= fuzzy_threshold.
- 2. If found, returns the corresponding value from mapping_data.
- 3. If not found above threshold, calls the embedding_fallback function.

+ async def associate_forto_item_code(line_item_data, params):
+ """
+ Associates Forto item codes to a list of line item descriptions.
  Args:
- input_string: The string to find a match for.
- params: Parameters containing the lookup data and fuzzy threshold.
+ line_item_data (dict): A dictionary where keys are original descriptions and values are cleaned descriptions.
+ params (dict): Parameters containing lookup data and thresholds.

  Returns:
- The matched value (from fuzzy match or embedding), or None if no match found.
+ list: A list of dictionaries with 'description' and 'itemCode' keys.
  """
- # Get the Forto item code using fuzzy matching
- forto_item_code = find_matching_lineitem(
- new_lineitem=input_string,
- kvp_dict=params["lookup_data"]["item_code"], # TODO: Parse the KVP dictionary
- threshold=params["fuzzy_threshold_item_code"],
- )

- if forto_item_code is None:
- # 2. Fallback to embedding function if no good fuzzy match
- forto_item_code = get_tms_mappings(input_string, "line_items")
+ result = []
+ pending_line_items = {}
+ for desc, f_desc in line_item_data.items():
+ # Get the Forto item code using fuzzy matching
+ code = find_matching_lineitem(
+ new_lineitem=f_desc,
+ kvp_dict=params["lookup_data"]["item_code"],
+ threshold=params["fuzzy_threshold_item_code"],
+ )
+ if code:
+ result.append({"description": desc, "itemCode": code})
+ else:
+ pending_line_items[desc] = f_desc
+
+ # Batch API Call for Embedding lookups
+ if pending_line_items:
+ api_results = await get_tms_mappings(
+ input_list=list(pending_line_items.values()),
+ embedding_type="line_items",
+ )
+
+ # Merge API results back into original list
+ for desc, f_desc in pending_line_items.items():
+ code = api_results.get(f_desc)
+ result.append({"description": desc, "itemCode": code})

- result = {"documentValue": input_string, "formattedValue": forto_item_code}
  return result
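Shape sketch (not package code) for the new async contract; the descriptions are hypothetical and the returned codes depend on the lookup data:

    line_item_data = {
        "Terminal Handling Charge 1 x 40HC": "TERMINAL HANDLING CHARGE",   # original -> cleaned
        "Seefracht Export": "SEEFRACHT EXPORT",
    }
    # result = await associate_forto_item_code(line_item_data, params)
    # -> [{"description": "Terminal Handling Charge 1 x 40HC", "itemCode": <code or None>},
    #     {"description": "Seefracht Export", "itemCode": <code or None>}]
    # Descriptions that clear params["fuzzy_threshold_item_code"] are resolved locally and
    # appended first; the rest go to the line_items embedding endpoint in one batched call.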
--- data_science_document_ai-1.43.6/src/prompts/library/customsInvoice/other/prompt.txt
+++ data_science_document_ai-1.45.2/src/prompts/library/customsInvoice/other/prompt.txt
@@ -54,7 +54,7 @@ Your role is to accurately extract specific entities from these invoices to supp
  - unitPrice: Even if the quantity is not mentioned, you can still extract the unit price. Check the naming of the columns in a different languages, it can be "Unit Price", "Prezzo unitario", "Prix Unitaire", "Unitario", etc. Refer to "Prezzo unitario" field in the italian invoice example.
  - totalAmount: The total amount for the item. It can be in different currencies, so ensure to capture the currency as well for the totalAmountCurrency.
  - totalAmountEuro: Few line items contains a total amount in Euro. You can find it by looking for the term "Total EUR" or "Amount in Euro" in the line item but it's always in the EURO / € currency. Sometimes, it can be same as totalAmount if the line item is already in Euro.
- - quantity: The quantity of the item or service provided in the line item.
+ - quantity: The quantity of the item or service provided in the line item. Pay attention to 2 x 40HC or 2x40HC. It means, quantity is 2 and 40HC is containerSize but not 240.
  - containerNumber: Container Number always starts with 4 letters and is followed by 7 digits (e.g., ABCD1234567).

  - hblNumber and mblNumber:
--- data_science_document_ai-1.43.6/src/prompts/library/partnerInvoice/other/prompt.txt
+++ data_science_document_ai-1.45.2/src/prompts/library/partnerInvoice/other/prompt.txt
@@ -52,8 +52,8 @@ Your role is to accurately extract specific entities from these invoices to supp
  - unitPrice: Even if the quantity is not mentioned, you can still extract the unit price. Check the naming of the columns in a different languages, it can be "Unit Price", "Prezzo unitario", "Prix Unitaire", "Unitario", etc. Refer to "Prezzo unitario" field in the italian invoice example.
  - totalAmount: The total amount for the item. It can be in different currencies, so ensure to capture the currency as well for the totalAmountCurrency.
  - totalAmountEuro: Few line items contains a total amount in Euro. You can find it by looking for the term "Total EUR" or "Amount in Euro" in the line item but it's always in the EURO / € currency. Sometimes, it can be same as totalAmount if the line item is already in Euro.
- - quantity: The quantity of the item or service provided in the line item. Pay attention to 2x40HC. It means, quantity is 2 and containerSize is 40HC but not 240.
- - containerNumber: Container Number always starts with 4 letters and is followed by 7 digits (e.g., ABCD1234567).
+ - quantity: The quantity of the item or service provided in the line item. Pay attention to 2 x 40HC or 2x40HC. It means, quantity is 2 and 40HC is containerSize but not 240.
+ - containerNumber: Container Number always starts with 4 letters and is followed by 7 digits (e.g., ABCD1234567, XALU 8593678).

  - hblNumber and mblNumber:
  - The Master Bill of Lading number. Commonly known as "Bill of Lading Number", "BILL OF LADING NO.", "BL Number", "BL No.", "B/L No.", "BL-Nr.", "B/L", or "HBL No.".
@@ -68,6 +68,7 @@ Your role is to accurately extract specific entities from these invoices to supp
  - Example:
  - "COSCO SHIPPING Lines Italy, Poland, or France S.R.L. – Genova Office – As Agent For COSCO SHIPPING Lines Co.,Ltd."
  - vendorName: COSCO SHIPPING Lines Co.,Ltd.
+ - From Hapag-Lloyd invoices, look for "Ballindamm 25" address to extract the vendorAddress.

  - agentName: Name of the agent. Agencies are offices authorized to act on behalf of a company. This details usually available including the branch name of the parent company name in the invoice.
  - agentKeyWord:
@@ -81,6 +82,7 @@ Your role is to accurately extract specific entities from these invoices to supp

  IMPORTANT NOTE:
  - Ensure all extracted values are directly from the document. Do not make assumptions, modifications or calculations.
+ - Do not split the quantity into different line items. e.g., if quantity is 2 or 2 CTR or 2 BIL, do not create 2 separate line items with quantity 1 each.
  - Do not normalize or modify any entity values.
  - Pay attention to the line item details and paymentInformation, as they may vary significantly across different invoices.

--- data_science_document_ai-1.43.6/src/utils.py
+++ data_science_document_ai-1.45.2/src/utils.py
@@ -6,16 +6,16 @@ import json
  import os
  import pickle
  from datetime import datetime
- from typing import Literal
+ from typing import Any, Dict, List, Literal, Optional

+ import httpx
  import numpy as np
  import openpyxl
  import pandas as pd
- import requests
  from google.cloud import documentai_v1beta3 as docu_ai_beta
  from pypdf import PdfReader, PdfWriter

- from src.io import get_storage_client, logger
+ from src.io import bq_logs, get_storage_client, logger


  def get_pdf_page_count(pdf_bytes):
@@ -31,29 +31,6 @@ def get_pdf_page_count(pdf_bytes):
  return len(reader.pages)


- def bq_logs(data_to_insert, params):
- """Insert logs into Google BigQuery.
-
- Args:
- data_to_insert (list): The data to insert into BigQuery.
- params (dict): The parameters dictionary.
- """
- # Use the pre-initialized BigQuery client
- bq_client = params["bq_client"]
- # Get the table string
- table_string = f"{params['g_ai_project_name']}.{params['g_ai_gbq_db_schema']}.{params['g_ai_gbq_db_table_out']}"
-
- logger.info(f"Log table: {table_string}")
- # Insert the rows into the table
- insert_logs = bq_client.insert_rows_json(table_string, data_to_insert)
-
- # Check if there were any errors inserting the rows
- if not insert_logs:
- logger.info("New rows have been added.")
- else:
- logger.info("Errors occurred while inserting rows: ", insert_logs)
-
-
  async def get_data_set_schema_from_docai(
  schema_client, project_id=None, location=None, processor_id=None, name=None
  ):
@@ -383,9 +360,9 @@ def extract_top_pages(pdf_bytes, num_pages=4):
  return output.getvalue()


- def get_tms_mappings(
- input_list: list[str], embedding_type: str, llm_ports: list[str] = None
- ):
+ async def get_tms_mappings(
+ input_list: List[str], embedding_type: str, llm_ports: Optional[List[str]] = None
+ ) -> Dict[str, Any]:
  """Get TMS mappings for the given values.

  Args:
@@ -395,39 +372,66 @@ def get_tms_mappings(
  llm_ports (list[str], optional): List of LLM ports to use. Defaults to None.

  Returns:
- dict: A dictionary with the mapping results.
+ dict or string: A dictionary or a string with the mapping results.
  """
- # To test the API locally, port-forward the embedding service in the sandbox to 8080:80
- # If you want to launch uvicorn from the tms-embedding repo, then use --port 8080 in the config file
  base_url = (
  "http://0.0.0.0:8080/"
  if os.getenv("CLUSTER") is None
  else "http://tms-mappings.api.svc.cluster.local./"
  )

+ # Ensure clean inputs
+ if not input_list:
+ return {}
+
  # Ensure input_list is a list
  if not isinstance(input_list, list):
  input_list = [input_list]

  # Always send a dict with named keys
  payload = {embedding_type: input_list}
+
  if llm_ports:
  payload["llm_ports"] = llm_ports if isinstance(llm_ports, list) else [llm_ports]

  # Make the POST request to the TMS mappings API
- url = f"{base_url}/{embedding_type}"
- response = requests.post(url=url, json=payload)
+ url = f"{base_url}{embedding_type}"

- if response.status_code != 200:
- logger.error(
- f"Error from TMS mappings API: {response.status_code} - {response.text}"
- )
+ # Use a timeout so the code doesn't hang forever
+ timeout = httpx.Timeout(60.0, connect=10.0)
+
+ async with httpx.AsyncClient(timeout=timeout) as client:
+ try:
+ response = await client.post(url, json=payload)
+ response.raise_for_status()

- formatted_values = (
- response.json().get("response", {}).get("data", {}).get(input_list[0], None)
+ # Structure expected: {"response": {"data": {"desc1": "code1", "desc2": "code2"}}}
+ return response.json().get("response", {}).get("data", {})
+
+ except httpx.HTTPStatusError as exc:
+ logger.error(
+ f"Error from TMS mappings API: {exc.response.status_code} - {exc.response.text}"
+ )
+ return {}
+
+
+ async def batch_fetch_all_mappings(container_types, terminals, depots):
+ """Batch fetch all mappings for container types, terminals, and depots."""
+ # run batch calls concurrently
+ results = await asyncio.gather(
+ get_tms_mappings(list(container_types), "container_types"),
+ get_tms_mappings(list(terminals), "terminals"),
+ get_tms_mappings(list(depots), "depots"),
  )

- return formatted_values
+ batch_container_map, batch_terminal_map, batch_depot_map = results
+
+ # Convert lists of tuples to dicts if necessary
+ return (
+ dict(batch_container_map or {}),
+ dict(batch_terminal_map or {}),
+ dict(batch_depot_map or {}),
+ )


  def transform_schema_strings(schema):
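Request/response shape sketch (not package code) for the rewritten get_tms_mappings; descriptions and codes here are hypothetical, and the endpoint is POST {base_url}{embedding_type} as in the code above:

    # payload sent for embedding_type="line_items"
    payload = {"line_items": ["TERMINAL HANDLING CHARGE", "SEEFRACHT EXPORT"]}
    # expected response body (per the comment in the code above):
    # {"response": {"data": {"TERMINAL HANDLING CHARGE": "<code>", "SEEFRACHT EXPORT": "<code>"}}}
    # The function now returns the whole {description: code} dict, rather than the single
    # value for input_list[0] that the old synchronous requests-based version returned.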
@@ -502,3 +506,21 @@ def estimate_page_count(sheet):
  else:
  return None
  return np.ceil(pg_cnt / 500)
+
+
+ def split_pdf_into_chunks(file_content: bytes, chunk_size: int = 1):
+ """Split PDF into smaller page chunks."""
+ pdf = PdfReader(io.BytesIO(file_content))
+ total_pages = len(pdf.pages)
+
+ # TODO: update the chunk_size based on doc length. However, it breaks the page number extraction logic.
+ for i in range(0, total_pages, chunk_size):
+ writer = PdfWriter()
+ for j in range(i, min(i + chunk_size, total_pages)):
+ writer.add_page(pdf.pages[j])
+
+ buffer = io.BytesIO()
+ writer.write(buffer)
+ buffer.seek(0)
+
+ yield buffer.getvalue()
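Usage sketch (not package code) for the new generator, assuming pypdf is installed:

    import io
    from pypdf import PdfWriter

    writer = PdfWriter()
    for _ in range(3):
        writer.add_blank_page(width=200, height=200)
    buf = io.BytesIO()
    writer.write(buf)

    chunks = list(split_pdf_into_chunks(buf.getvalue(), chunk_size=1))
    # len(chunks) == 3; each element is the bytes of a one-page PDF, which is what
    # process_file_w_llm passes to process_chunk_with_retry when chunking kicks in.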