data-science-document-ai 1.43.6__tar.gz → 1.51.0__tar.gz

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions exactly as they appear in their public registry.
Files changed (72)
  1. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/PKG-INFO +1 -1
  2. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/pyproject.toml +1 -1
  3. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/constants.py +6 -8
  4. data_science_document_ai-1.51.0/src/docai_processor_config.yaml +22 -0
  5. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/excel_processing.py +7 -18
  6. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/io.py +23 -0
  7. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/llm.py +0 -29
  8. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/pdf_processing.py +118 -53
  9. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/postprocessing/common.py +132 -25
  10. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/postprocessing/postprocess_partner_invoice.py +137 -58
  11. data_science_document_ai-1.51.0/src/prompts/library/arrivalNotice/other/placeholders.json +70 -0
  12. data_science_document_ai-1.51.0/src/prompts/library/arrivalNotice/other/prompt.txt +40 -0
  13. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bundeskasse/other/placeholders.json +5 -5
  14. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bundeskasse/other/prompt.txt +6 -4
  15. data_science_document_ai-1.51.0/src/prompts/library/customsAssessment/other/placeholders.json +70 -0
  16. data_science_document_ai-1.51.0/src/prompts/library/customsAssessment/other/prompt.txt +29 -0
  17. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/customsInvoice/other/prompt.txt +1 -1
  18. data_science_document_ai-1.51.0/src/prompts/library/deliveryOrder/other/placeholders.json +82 -0
  19. data_science_document_ai-1.51.0/src/prompts/library/deliveryOrder/other/prompt.txt +36 -0
  20. data_science_document_ai-1.51.0/src/prompts/library/draftMbl/other/placeholders.json +80 -0
  21. data_science_document_ai-1.51.0/src/prompts/library/draftMbl/other/prompt.txt +34 -0
  22. data_science_document_ai-1.51.0/src/prompts/library/finalMbL/other/placeholders.json +80 -0
  23. data_science_document_ai-1.51.0/src/prompts/library/finalMbL/other/prompt.txt +34 -0
  24. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/partnerInvoice/other/prompt.txt +4 -2
  25. data_science_document_ai-1.51.0/src/prompts/library/preprocessing/carrier/placeholders.json +14 -0
  26. data_science_document_ai-1.51.0/src/prompts/library/shippingInstruction/other/placeholders.json +115 -0
  27. data_science_document_ai-1.51.0/src/prompts/library/shippingInstruction/other/prompt.txt +28 -0
  28. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/setup.py +9 -16
  29. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/utils.py +63 -41
  30. data_science_document_ai-1.43.6/src/docai_processor_config.yaml +0 -64
  31. data_science_document_ai-1.43.6/src/prompts/library/customsAssessment/other/prompt.txt +0 -42
  32. data_science_document_ai-1.43.6/src/prompts/library/deliveryOrder/other/placeholders.json +0 -29
  33. data_science_document_ai-1.43.6/src/prompts/library/deliveryOrder/other/prompt.txt +0 -50
  34. data_science_document_ai-1.43.6/src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -45
  35. data_science_document_ai-1.43.6/src/prompts/library/draftMbl/maersk/prompt.txt +0 -19
  36. data_science_document_ai-1.43.6/src/prompts/library/draftMbl/other/placeholders.json +0 -80
  37. data_science_document_ai-1.43.6/src/prompts/library/draftMbl/other/prompt.txt +0 -44
  38. data_science_document_ai-1.43.6/src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -44
  39. data_science_document_ai-1.43.6/src/prompts/library/finalMbL/maersk/prompt.txt +0 -19
  40. data_science_document_ai-1.43.6/src/prompts/library/finalMbL/other/prompt.txt +0 -44
  41. data_science_document_ai-1.43.6/src/prompts/library/preprocessing/carrier/placeholders.json +0 -30
  42. data_science_document_ai-1.43.6/src/prompts/library/shippingInstruction/other/prompt.txt +0 -16
  43. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/constants_sandbox.py +0 -0
  44. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/docai.py +0 -0
  45. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/log_setup.py +0 -0
  46. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/postprocessing/postprocess_booking_confirmation.py +0 -0
  47. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/postprocessing/postprocess_commercial_invoice.py +0 -0
  48. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +0 -0
  49. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bookingConfirmation/evergreen/prompt.txt +0 -0
  50. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +0 -0
  51. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +0 -0
  52. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bookingConfirmation/maersk/placeholders.json +0 -0
  53. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bookingConfirmation/maersk/prompt.txt +0 -0
  54. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bookingConfirmation/msc/placeholders.json +0 -0
  55. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bookingConfirmation/msc/prompt.txt +0 -0
  56. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bookingConfirmation/oocl/placeholders.json +0 -0
  57. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bookingConfirmation/oocl/prompt.txt +0 -0
  58. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bookingConfirmation/other/placeholders.json +0 -0
  59. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bookingConfirmation/other/prompt.txt +0 -0
  60. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bookingConfirmation/yangming/placeholders.json +0 -0
  61. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bookingConfirmation/yangming/prompt.txt +0 -0
  62. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/commercialInvoice/other/placeholders.json +0 -0
  63. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/commercialInvoice/other/prompt.txt +0 -0
  64. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/customsInvoice/other/placeholders.json +0 -0
  65. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/packingList/other/placeholders.json +0 -0
  66. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/packingList/other/prompt.txt +0 -0
  67. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/partnerInvoice/other/placeholders.json +0 -0
  68. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/postprocessing/port_code/placeholders.json +0 -0
  69. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/postprocessing/port_code/prompt_port_code.txt +0 -0
  70. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/preprocessing/carrier/prompt.txt +0 -0
  71. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/prompt_library.py +0 -0
  72. {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/tms.py +0 -0
--- data_science_document_ai-1.43.6/PKG-INFO
+++ data_science_document_ai-1.51.0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: data-science-document-ai
-Version: 1.43.6
+Version: 1.51.0
 Summary: "Document AI repo for data science"
 Author: Naomi Nguyen
 Author-email: naomi.nguyen@forto.com
--- data_science_document_ai-1.43.6/pyproject.toml
+++ data_science_document_ai-1.51.0/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "data-science-document-ai"
-version = "1.43.6"
+version = "1.51.0"
 description = "\"Document AI repo for data science\""
 authors = ["Naomi Nguyen <naomi.nguyen@forto.com>", "Kumar Rajendrababu <kumar.rajendrababu@forto.com>", "Igor Tonko <igor.tonko@forto.com>", "Osman Demirel <osman.demirel@forto.com>"]
 packages = [
--- data_science_document_ai-1.43.6/src/constants.py
+++ data_science_document_ai-1.51.0/src/constants.py
@@ -23,9 +23,12 @@ project_parameters = {
     "invoice_classification_lookup": "invoice_classification.json",
     "reverse_charge_sentence_lookup": "reverse_charge_sentences.json",
     # Fuzzy logic params
-    "fuzzy_threshold_item_code": 70,
+    "fuzzy_threshold_item_code": 90,
     "fuzzy_threshold_reverse_charge": 80,
     "fuzzy_threshold_invoice_classification": 70,
+    # Chunking params
+    "chunk_size": 1,  # page (do not change this without changing the page number logic)
+    "chunk_after": 10,  # pages
     # Big Query
     "g_ai_gbq_db_schema": "document_ai",
     "g_ai_gbq_db_table_out": "document_ai_api_calls_v1",
@@ -50,13 +53,6 @@ project_parameters = {
     "model_selector": {
         "stable": {
             "bookingConfirmation": 1,
-            "finalMbL": 0,
-            "draftMbl": 0,
-            "arrivalNotice": 0,
-            "shippingInstruction": 0,
-            "customsAssessment": 0,
-            "deliveryOrder": 0,
-            "partnerInvoice": 0,
         },
         "beta": {
             "bookingConfirmation": 0,
@@ -84,8 +80,10 @@ project_parameters = {
     # Key to combine the LLM results with the Doc Ai results
     "key_to_combine": {
         "bookingConfirmation": ["transportLegs"],
+        "arrivalNotice": ["containers"],
         "finalMbL": ["containers"],
         "draftMbl": ["containers"],
+        "deliveryOrder": ["Equipment", "TransportLeg"],
         "customsAssessment": ["containers"],
         "packingList": ["skuData"],
         "commercialInvoice": ["skus"],
--- /dev/null
+++ data_science_document_ai-1.51.0/src/docai_processor_config.yaml
@@ -0,0 +1,22 @@
+models_project_id: "738250249861"
+model_config:
+  stable:
+    bookingConfirmation:
+      - id: "dc3e714cd168aeaa"
+        details:
+          display_name: "doc_cap_bookingConfirmation"
+          author: "reet.kanjilal@forto.com"
+          created_date: ""
+      - id: "3c280b11bdb3ed89"
+        details:
+          display_name: "doc_cap_BC_mlg"
+          author: "igor.tonko@forto.com"
+          created_date: ""
+
+  beta:
+    bookingConfirmation:
+      - id: "3c280b11bdb3ed89"
+        details:
+          display_name: "doc_cap_BC_mlg"
+          author: "igor.tonko@forto.com"
+          created_date: ""
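This 22-line file replaces the 64-line 1.43.6 version (listed above as deleted), trimming the processor registry down to the two bookingConfirmation models. A hedged sketch of reading it, assuming PyYAML; how the package actually consumes the file is not visible in this diff:

```python
# Hypothetical loader; file path and access pattern inferred from the YAML above.
import yaml

with open("src/docai_processor_config.yaml") as f:
    config = yaml.safe_load(f)

stable_ids = [m["id"] for m in config["model_config"]["stable"]["bookingConfirmation"]]
# -> ["dc3e714cd168aeaa", "3c280b11bdb3ed89"]
```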
--- data_science_document_ai-1.43.6/src/excel_processing.py
+++ data_science_document_ai-1.51.0/src/excel_processing.py
@@ -4,8 +4,6 @@ import logging
 
 from ddtrace import tracer
 
-from src.postprocessing.common import llm_prediction_to_tuples
-
 logger = logging.getLogger(__name__)
 
 import asyncio
@@ -13,9 +11,8 @@ import asyncio
 import numpy as np
 import pandas as pd
 
-from src.llm import prompt_excel_extraction
 from src.prompts.prompt_library import prompt_library
-from src.utils import estimate_page_count, generate_schema_structure, get_excel_sheets
+from src.utils import estimate_page_count, get_excel_sheets
 
 
 async def extract_data_from_sheet(
@@ -31,11 +28,14 @@ async def extract_data_from_sheet(
     )
 
     # Prompt for the LLM JSON
-    prompt_docai = prompt_excel_extraction(worksheet)
+    prompt = prompt_library.library[doc_type]["other"]["prompt"]
+
+    # Join the worksheet content with the prompt
+    prompt = worksheet + "\n" + prompt
 
     try:
         result = await llm_client.get_unified_json_genai(
-            prompt_docai,
+            prompt,
             response_schema=response_schema,
             doc_type=doc_type,
         )
@@ -69,18 +69,7 @@ async def extract_data_from_excel(
 
     """
    # Generate the response structure
-    response_schema = (
-        prompt_library.library[input_doc_type]["other"]["placeholders"]
-        if input_doc_type
-        in [
-            "partnerInvoice",
-            "customsInvoice",
-            "bundeskasse",
-            "commercialInvoice",
-            "packingList",
-        ]
-        else generate_schema_structure(params, input_doc_type)
-    )
+    response_schema = prompt_library.library[input_doc_type]["other"]["placeholders"]
 
     # Load the Excel file and get ONLY the "visible" sheet names
     sheets, workbook = get_excel_sheets(file_content, mime_type)
--- data_science_document_ai-1.43.6/src/io.py
+++ data_science_document_ai-1.51.0/src/io.py
@@ -156,4 +156,27 @@ def download_dir_from_bucket(bucket, directory_cloud, directory_local) -> bool:
     return result
 
 
+def bq_logs(data_to_insert, params):
+    """Insert logs into Google BigQuery.
+
+    Args:
+        data_to_insert (list): The data to insert into BigQuery.
+        params (dict): The parameters dictionary.
+    """
+    # Use the pre-initialized BigQuery client
+    bq_client = params["bq_client"]
+    # Get the table string
+    table_string = f"{params['g_ai_project_name']}.{params['g_ai_gbq_db_schema']}.{params['g_ai_gbq_db_table_out']}"
+
+    logger.info(f"Log table: {table_string}")
+    # Insert the rows into the table
+    insert_logs = bq_client.insert_rows_json(table_string, data_to_insert)
+
+    # Check if there were any errors inserting the rows
+    if not insert_logs:
+        logger.info("New rows have been added.")
+    else:
+        logger.info("Errors occurred while inserting rows: ", insert_logs)
+
+
 # type: ignore
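A hedged usage sketch for the new `bq_logs` helper. The row fields below are invented for illustration; real rows must match the `document_ai_api_calls_v1` table schema, and `params` must already carry an initialized `google.cloud.bigquery.Client` under `"bq_client"` plus the project, schema, and table names used to build the table string:

```python
# Illustrative only; the row keys are hypothetical.
rows = [{"doc_type": "bookingConfirmation", "pages": 3, "status": "ok"}]
bq_logs(rows, params)
```

Note that `insert_rows_json` returns an empty list when all rows are inserted, which is why the falsy check above signals success.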
--- data_science_document_ai-1.43.6/src/llm.py
+++ data_science_document_ai-1.51.0/src/llm.py
@@ -201,33 +201,4 @@ class LlmClient:
         return response
 
 
-def prompt_excel_extraction(excel_structured_text):
-    """Write a prompt to extract data from Excel files.
-
-    Args:
-        excel_structured_text (str): The structured text of the Excel file.
-
-    Returns:
-        prompt str: The prompt for common json.
-    """
-    prompt = f"""{excel_structured_text}
-
-    Task: Fill in the following dictionary from the information in the given in the above excel data.
-
-    Instructions:
-    - Do not change the keys of the following dictionary.
-    - The values should be filled in as per the schema provided below.
-    - If an entity contains a 'display_name', consider its properties as child data points in the below format.
-    {{'data-field': {{
-        'child-data-field': 'type -occurrence_type- description',
-        }}
-    }}
-    - The entity with 'display_name' can be extracted multiple times. Please pay attention to the occurrence_type.
-    - Ensure the schema reflects the hierarchical relationship.
-    - Use the data field description to understand the context of the data.
-
-    """
-    return prompt
-
-
 # pylint: enable=all
--- data_science_document_ai-1.43.6/src/pdf_processing.py
+++ data_science_document_ai-1.51.0/src/pdf_processing.py
@@ -36,6 +36,7 @@ from src.utils import (
     get_pdf_page_count,
     get_processor_name,
     run_background_tasks,
+    split_pdf_into_chunks,
     transform_schema_strings,
     validate_based_on_schema,
 )
@@ -195,46 +196,32 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
         result (dict): The structured data extracted from the document, formatted as JSON.
     """
     # Bundeskasse invoices contains all the required information in the first 3 pages.
-    file_content = (
-        extract_top_pages(file_content, num_pages=5)
-        if input_doc_type == "bundeskasse"
-        else file_content
-    )
+    if input_doc_type == "bundeskasse":
+        file_content = extract_top_pages(file_content, num_pages=5)
+
     number_of_pages = get_pdf_page_count(file_content)
+    logger.info(f"processing {input_doc_type} with {number_of_pages} pages...")
 
-    # convert file_content to required document
-    document = llm_client.prepare_document_for_gemini(file_content)
-
-    # get the schema placeholder from the Doc AI and generate the response structure
-    response_schema = (
-        prompt_library.library[input_doc_type]["other"]["placeholders"]
-        if input_doc_type
-        in [
-            "partnerInvoice",
-            "customsInvoice",
-            "bundeskasse",
-            "commercialInvoice",
-            "packingList",
-        ]
-        else generate_schema_structure(params, input_doc_type)
-    )
+    # get the schema placeholder
+    response_schema = prompt_library.library[input_doc_type]["other"]["placeholders"]
 
     carrier = "other"
-    if (
-        "preprocessing" in prompt_library.library.keys()
-        and "carrier" in prompt_library.library["preprocessing"].keys()
-        and input_doc_type
-        in prompt_library.library["preprocessing"]["carrier"]["placeholders"].keys()
-    ):
-        carrier_schema = prompt_library.library["preprocessing"]["carrier"][
-            "placeholders"
-        ][input_doc_type]
+    carrier_schema = (
+        prompt_library.library.get("preprocessing", {})
+        .get("carrier", {})
+        .get("placeholders", {})
+        .get(input_doc_type)
+    )
 
+    if carrier_schema:
         carrier_prompt = prompt_library.library["preprocessing"]["carrier"]["prompt"]
         carrier_prompt = carrier_prompt.replace(
             "DOCUMENT_TYPE_PLACEHOLDER", input_doc_type
         )
 
+        # convert file_content to required document
+        document = llm_client.prepare_document_for_gemini(file_content)
+
         # identify carrier for customized prompting
         carrier = await identify_carrier(
             document,
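The lookup refactor above swaps nested `in ... .keys()` checks for a chained `dict.get` walk: each `.get(..., {})` degrades a missing level to an empty dict, so the whole expression yields None instead of raising, and the old guard collapses into `if carrier_schema:`. A minimal illustration with an invented library layout:

```python
# Invented layout; real schemas live in src/prompts/library/preprocessing/carrier/placeholders.json.
library = {"preprocessing": {"carrier": {"placeholders": {"draftMbl": "schema"}}}}

carrier_schema = (
    library.get("preprocessing", {})
    .get("carrier", {})
    .get("placeholders", {})
    .get("bookingConfirmation")
)
assert carrier_schema is None  # a missing doc type degrades to None instead of a KeyError
```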
@@ -244,37 +231,115 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
         doc_type=input_doc_type,
     )
 
-    if input_doc_type == "bookingConfirmation":
-        response_schema = prompt_library.library[input_doc_type][carrier][
-            "placeholders"
-        ]
-
+    # Select prompt
     if (
-        input_doc_type in prompt_library.library.keys()
-        and carrier in prompt_library.library[input_doc_type].keys()
+        input_doc_type not in prompt_library.library
+        or carrier not in prompt_library.library[input_doc_type]
     ):
-        # get the related prompt from predefined prompt library
-        prompt = prompt_library.library[input_doc_type][carrier]["prompt"]
+        return {}
 
-        # Update schema to extract value-page_number pairs
-        if number_of_pages > 1:
-            response_schema = transform_schema_strings(response_schema)
+    # get the related prompt from predefined prompt library
+    prompt = prompt_library.library[input_doc_type][carrier]["prompt"]
 
-            # Update the prompt to instruct LLM to include page numbers
-            prompt += "\nFor each field, provide the page number where the information was found. The page numbering starts from 0."
+    # Add page-number extraction for moderately large docs
+    use_chunking = number_of_pages >= params["chunk_after"]
 
-        # generate the result with LLM (gemini)
-        result = await llm_client.get_unified_json_genai(
-            prompt=prompt,
-            document=document,
-            response_schema=response_schema,
-            doc_type=input_doc_type,
+    # Update schema and prompt to extract value-page_number pairs
+    if not use_chunking and number_of_pages > 1:
+        response_schema = transform_schema_strings(response_schema)
+        prompt += "\nFor each field, provide the page number where the information was found. The page numbering starts from 0."
+
+    tasks = []
+    # Process in chunks if number of pages exceeds threshold and Process all chunks concurrently
+    for chunk in (
+        split_pdf_into_chunks(file_content, chunk_size=params["chunk_size"])
+        if use_chunking
+        else [file_content]
+    ):
+        tasks.append(
+            process_chunk_with_retry(
+                chunk, prompt, response_schema, llm_client, input_doc_type
+            )
         )
 
-        result = llm_prediction_to_tuples(result, number_of_pages)
+    results = await asyncio.gather(*tasks, return_exceptions=True)
 
-        return result
-    return {}
+    if use_chunking:
+        return merge_llm_results(results, response_schema)
+    else:
+        return llm_prediction_to_tuples(results[0], number_of_pages=number_of_pages)
+
+
+async def process_chunk_with_retry(
+    chunk_content, prompt, response_schema, llm_client, input_doc_type, retries=2
+):
+    """Process a chunk with retries in case of failure."""
+    for attempt in range(1, retries + 1):
+        try:
+            return await process_chunk(
+                chunk_content=chunk_content,
+                prompt=prompt,
+                response_schema=response_schema,
+                llm_client=llm_client,
+                input_doc_type=input_doc_type,
+            )
+        except Exception as e:
+            logger.error(f"Chunk failed on attempt {attempt}: {e}")
+            if attempt == retries:
+                raise
+            await asyncio.sleep(1)  # small backoff
+
+
+async def process_chunk(
+    chunk_content, prompt, response_schema, llm_client, input_doc_type
+):
+    """Process a chunk with Gemini."""
+    document = llm_client.prepare_document_for_gemini(chunk_content)
+    return await llm_client.get_unified_json_genai(
+        prompt=prompt,
+        document=document,
+        response_schema=response_schema,
+        doc_type=input_doc_type,
+    )
+
+
+def merge_llm_results(results, response_schema):
+    """Merge LLM results from multiple chunks."""
+    merged = {}
+    for i, result in enumerate(results):
+        if not isinstance(result, dict):
+            continue
+        # Add page number to all values coming from this chunk
+        result = llm_prediction_to_tuples(result, number_of_pages=1, page_number=i)
+
+        # Merge the result into the final merged dictionary
+        for key, value in result.items():
+            field_type = (
+                response_schema["properties"].get(key, {}).get("type", "").upper()
+            )
+
+            if key not in merged:
+                if field_type == "ARRAY":
+                    # append the values as a list
+                    merged[key] = (
+                        value if isinstance(value, list) else ([value] if value else [])
+                    )
+                else:
+                    merged[key] = value
+                continue
+
+            if field_type == "ARRAY":
+                # append list contents across chunks
+                if isinstance(value, list):
+                    merged[key].extend(value)
+                else:
+                    merged[key].append(value)
+
+            # take first non-null value only
+            if merged[key] in (None, "", [], {}):
+                merged[key] = value
+
+    return merged
 
 
 async def extract_data_from_pdf_w_llm(params, input_doc_type, file_content, llm_client):
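To make the merge semantics concrete, a worked example with invented values (chunks are one page each, so a chunk's index doubles as its page number): array-typed fields are concatenated across chunks, scalar fields keep their first non-empty value, and `llm_prediction_to_tuples` stamps every leaf with a (value, page_number) pair:

```python
response_schema = {
    "properties": {
        "containers": {"type": "ARRAY"},
        "blNumber": {"type": "STRING"},
    }
}
chunk_results = [
    {"containers": [{"containerNumber": "MSCU1234567"}], "blNumber": "MAEU12345"},
    {"containers": [{"containerNumber": "MSCU7654321"}], "blNumber": None},
]
merged = merge_llm_results(chunk_results, response_schema)
# merged == {
#     "containers": [
#         {"containerNumber": ("MSCU1234567", 0)},
#         {"containerNumber": ("MSCU7654321", 1)},
#     ],
#     "blNumber": ("MAEU12345", 0),
# }
```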
--- data_science_document_ai-1.43.6/src/postprocessing/common.py
+++ data_science_document_ai-1.51.0/src/postprocessing/common.py
@@ -12,7 +12,7 @@ from src.constants import formatting_rules
 from src.io import logger
 from src.postprocessing.postprocess_partner_invoice import process_partner_invoice
 from src.prompts.prompt_library import prompt_library
-from src.utils import get_tms_mappings
+from src.utils import batch_fetch_all_mappings, get_tms_mappings
 
 tms_domain = os.environ["TMS_DOMAIN"]
 
@@ -134,8 +134,11 @@ def extract_number(data_field_value):
         formatted_value: string
 
     """
+    # Remove container size pattern like 20FT, 40HC, etc from 1 x 40HC
+    value = remove_unwanted_patterns(data_field_value)
+
     formatted_value = ""
-    for c in data_field_value:
+    for c in value:
         if c.isnumeric() or c in [",", ".", "-"]:
             formatted_value += c
 
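The effect of the new pre-cleaning step, with illustrative inputs (the exact output also depends on the other rules inside `remove_unwanted_patterns`):

```python
extract_number("1 x 40HC")   # container token stripped first -> "1" (previously "140")
extract_number("1.234,56")   # digits plus ",", ".", "-" still pass through -> "1.234,56"
```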
@@ -320,9 +323,12 @@ def remove_unwanted_patterns(lineitem: str):
     lineitem = lineitem.replace("HIGH CUBE", "")
 
     # Remove container size e.g., 20FT, 40HC, etc.
-    lineitem = re.sub(
-        r"\b(20|22|40|45)(FT|HC|DC|HD|GP|OT|RF|FR|TK|DV)?\b", "", lineitem
-    ).strip()
+    pattern = [
+        f"{s}{t}"
+        for s in ("20|22|40|45".split("|"))
+        for t in ("FT|HC|DC|HD|GP|OT|RF|FR|TK|DV".split("|"))
+    ]
+    lineitem = re.sub(r"|".join(pattern), "", lineitem, flags=re.IGNORECASE).strip()
 
     return lineitem
 
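The rewritten rule enumerates all 40 size-and-type combinations ("20FT|20HC|...|45DV") and matches them case-insensitively. Unlike the old `\b`-delimited pattern it no longer strips bare sizes that lack a type suffix, and, having dropped the word boundaries, it will also match a size embedded in a longer token (e.g. the "40FT" inside "140FT"). Illustrative calls, ignoring the function's earlier replacements:

```python
remove_unwanted_patterns("2 x 40gp Trucking")   # "40gp" is now removed (case-insensitive)
remove_unwanted_patterns("Detention 40 EUR")    # bare "40" is kept (the old pattern deleted it)
```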
@@ -372,18 +378,45 @@ def clean_item_description(lineitem: str, remove_numbers: bool = True):
     return re.sub(r"\s{2,}", " ", lineitem).strip()
 
 
-async def format_label(entity_k, entity_value, document_type_code, params, mime_type):
+async def format_label(
+    entity_k,
+    entity_value,
+    document_type_code,
+    params,
+    mime_type,
+    container_map,
+    terminal_map,
+    depot_map,
+):
     llm_client = params["LlmClient"]
     if isinstance(entity_value, dict):  # if it's a nested entity
         format_tasks = [
-            format_label(sub_k, sub_v, document_type_code, params, mime_type)
+            format_label(
+                sub_k,
+                sub_v,
+                document_type_code,
+                params,
+                mime_type,
+                container_map,
+                terminal_map,
+                depot_map,
+            )
             for sub_k, sub_v in entity_value.items()
         ]
         return entity_k, {k: v for k, v in await asyncio.gather(*format_tasks)}
     if isinstance(entity_value, list):
         format_tasks = await asyncio.gather(
             *[
-                format_label(entity_k, sub_v, document_type_code, params, mime_type)
+                format_label(
+                    entity_k,
+                    sub_v,
+                    document_type_code,
+                    params,
+                    mime_type,
+                    container_map,
+                    terminal_map,
+                    depot_map,
+                )
                 for sub_v in entity_value
             ]
         )
@@ -405,13 +438,13 @@ async def format_label(entity_k, entity_value, document_type_code, params, mime_type):
         )
 
     elif (entity_key == "containertype") or (entity_key == "containersize"):
-        formatted_value = get_tms_mappings(entity_value, "container_types")
+        formatted_value = container_map.get(entity_value)
 
     elif check_formatting_rule(entity_k, document_type_code, "terminal"):
-        formatted_value = get_tms_mappings(entity_value, "terminals")
+        formatted_value = terminal_map.get(entity_value)
 
     elif check_formatting_rule(entity_k, document_type_code, "depot"):
-        formatted_value = get_tms_mappings(entity_value, "depots")
+        formatted_value = depot_map.get(entity_value)
 
     elif entity_key.startswith(("eta", "etd", "duedate", "issuedate", "servicedate")):
         try:
@@ -507,7 +540,8 @@ async def get_port_code_ai(port: str, llm_client, doc_type=None):
     """Get port code using AI model."""
     port_llm = await get_port_code_llm(port, llm_client, doc_type=doc_type)
 
-    return get_tms_mappings(port, "ports", port_llm)
+    result = await get_tms_mappings(port, "ports", port_llm)
+    return result.get(port, None)
 
 
 async def get_port_code_llm(port: str, llm_client, doc_type=None):
@@ -598,6 +632,74 @@ def decimal_convertor(value, quantity=False):
     return value
 
 
+async def collect_mapping_requests(entity_value, document_type_code):
+    """Collect all unique container types, terminals, and depots from the entity value."""
+    # Sets to store unique values
+    container_types = set()
+    terminals = set()
+    depots = set()
+
+    def walk(key, value):
+        key_lower = key.lower()
+
+        # nested dict
+        if isinstance(value, dict):
+            for k, v in value.items():
+                walk(k, v)
+
+        # list of values
+        elif isinstance(value, list):
+            for item in value:
+                walk(key, item)
+
+        # leaf node
+        else:
+            if key_lower in ("containertype", "containersize"):
+                # Take only "20DV" from ('20DV', 0) if it's a tuple
+                container_types.add(value[0]) if isinstance(
+                    value, tuple
+                ) else container_types.add(value)
+
+            elif check_formatting_rule(key, document_type_code, "terminal"):
+                terminals.add(value[0]) if isinstance(value, tuple) else terminals.add(
+                    value
+                )
+
+            elif check_formatting_rule(key, document_type_code, "depot"):
+                depots.add(value[0]) if isinstance(value, tuple) else depots.add(value)
+
+    walk("root", entity_value)
+
+    return container_types, terminals, depots
+
+
+async def format_all_labels(entity_data, document_type_code, params, mime_type):
+    """Format all labels in the entity data using cached mappings."""
+    # Collect all mapping values needed
+    container_req, terminal_req, depot_req = await collect_mapping_requests(
+        entity_data, document_type_code
+    )
+
+    # Batch fetch mappings
+    container_map, terminal_map, depot_map = await batch_fetch_all_mappings(
+        container_req, terminal_req, depot_req
+    )
+
+    # Format labels using cached mappings
+    _, result = await format_label(
+        "root",
+        entity_data,
+        document_type_code,
+        params,
+        mime_type,
+        container_map,
+        terminal_map,
+        depot_map,
+    )
+
+    return _, result
+
+
 async def format_all_entities(result, document_type_code, params, mime_type):
     """Format the entity values in the result dictionary."""
     # Since we treat `customsInvoice` same as `partnerInvoice`
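Together, `collect_mapping_requests` and `format_all_labels` turn the per-leaf TMS lookups that `format_label` used to make into a collect-then-batch pattern: walk the tree once, gather the distinct container types, terminals, and depots, resolve all three lookups in one batch, and pass the resulting dicts down to `format_label`. `batch_fetch_all_mappings` is imported from src/utils.py but not shown in this diff; a hypothetical sketch of its contract, assuming `get_tms_mappings` accepts a batch of values and returns a value-to-code dict:

```python
# Hypothetical sketch; the real batch_fetch_all_mappings in src/utils.py is not in this diff.
import asyncio


async def batch_fetch_all_mappings(container_types, terminals, depots):
    """Resolve the three mapping sets concurrently; each result maps raw value -> TMS code."""
    return await asyncio.gather(
        get_tms_mappings(container_types, "container_types"),
        get_tms_mappings(terminals, "terminals"),
        get_tms_mappings(depots, "depots"),
    )
```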
@@ -613,13 +715,13 @@ async def format_all_entities(result, document_type_code, params, mime_type):
         return {}
 
     # Format all entities recursively
-    _, aggregated_data = await format_label(
-        None, result, document_type_code, params, mime_type
+    _, aggregated_data = await format_all_labels(
+        result, document_type_code, params, mime_type
     )
 
     # Process partner invoice on lineitem mapping and reverse charge sentence
     if document_type_code in ["partnerInvoice", "bundeskasse"]:
-        process_partner_invoice(params, aggregated_data, document_type_code)
+        await process_partner_invoice(params, aggregated_data, document_type_code)
 
     logger.info("Data Extraction completed successfully")
     return aggregated_data
@@ -651,41 +753,46 @@ def remove_stop_words(lineitem: str):
     )
 
 
-def llm_prediction_to_tuples(llm_prediction, number_of_pages=-1):
+def llm_prediction_to_tuples(llm_prediction, number_of_pages=-1, page_number=None):
     """Convert LLM prediction dictionary to tuples of (value, page_number)."""
-
     # If only 1 page, simply pair each value with page number 0
     if number_of_pages == 1:
+        effective_page = 0 if page_number is None else page_number
         if isinstance(llm_prediction, dict):
             return {
-                k: llm_prediction_to_tuples(v, number_of_pages)
+                k: llm_prediction_to_tuples(
+                    v, number_of_pages, page_number=effective_page
+                )
                 for k, v in llm_prediction.items()
             }
         elif isinstance(llm_prediction, list):
             return [
-                llm_prediction_to_tuples(v, number_of_pages) for v in llm_prediction
+                llm_prediction_to_tuples(v, number_of_pages, page_number=effective_page)
+                for v in llm_prediction
            ]
         else:
-            return (llm_prediction, 0) if llm_prediction else None
+            return (llm_prediction, effective_page) if llm_prediction else None
 
     # logic for multi-page predictions
     if isinstance(llm_prediction, dict):
         if "page_number" in llm_prediction.keys() and "value" in llm_prediction.keys():
             if llm_prediction["value"]:
                 try:
-                    page_number = int(llm_prediction["page_number"])
+                    _page_number = int(llm_prediction["page_number"])
                 except:  # noqa: E722
-                    page_number = -1
-                return (llm_prediction["value"], page_number)
+                    _page_number = -1
+                return (llm_prediction["value"], _page_number)
             return None
 
         for key, value in llm_prediction.items():
             llm_prediction[key] = llm_prediction_to_tuples(
-                llm_prediction.get(key, value), number_of_pages
+                llm_prediction.get(key, value), number_of_pages, page_number
             )
 
     elif isinstance(llm_prediction, list):
         for i, item in enumerate(llm_prediction):
-            llm_prediction[i] = llm_prediction_to_tuples(item, number_of_pages)
+            llm_prediction[i] = llm_prediction_to_tuples(
+                item, number_of_pages, page_number
+            )
 
     return llm_prediction
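For reference, the two paths of the updated converter on invented predictions: the chunked single-page path stamps each leaf with the page passed via the new `page_number` argument, while the multi-page path unpacks the {"value", "page_number"} wrappers that `transform_schema_strings` adds to the schema:

```python
# Single-page / chunked path: the chunk's absolute page index tags every leaf.
llm_prediction_to_tuples({"blNumber": "MAEU12345"}, number_of_pages=1, page_number=4)
# -> {"blNumber": ("MAEU12345", 4)}

# Multi-page path: values arrive wrapped with the page the model saw them on.
llm_prediction_to_tuples(
    {"blNumber": {"value": "MAEU12345", "page_number": "2"}}, number_of_pages=3
)
# -> {"blNumber": ("MAEU12345", 2)}
```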