data-science-document-ai 1.51.0__tar.gz → 1.59.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_science_document_ai-1.51.0 → data_science_document_ai-1.59.0}/PKG-INFO +1 -1
- {data_science_document_ai-1.51.0 → data_science_document_ai-1.59.0}/pyproject.toml +1 -1
- {data_science_document_ai-1.51.0 → data_science_document_ai-1.59.0}/src/constants.py +10 -27
- data_science_document_ai-1.59.0/src/docai_processor_config.yaml +9 -0
- {data_science_document_ai-1.51.0 → data_science_document_ai-1.59.0}/src/pdf_processing.py +34 -29
- {data_science_document_ai-1.51.0 → data_science_document_ai-1.59.0}/src/postprocessing/postprocess_partner_invoice.py +82 -26
- data_science_document_ai-1.59.0/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +146 -0
- data_science_document_ai-1.59.0/src/prompts/library/bookingConfirmation/evergreen/prompt.txt +56 -0
- data_science_document_ai-1.59.0/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +146 -0
- data_science_document_ai-1.59.0/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +61 -0
- data_science_document_ai-1.59.0/src/prompts/library/bookingConfirmation/maersk/placeholders.json +146 -0
- data_science_document_ai-1.59.0/src/prompts/library/bookingConfirmation/maersk/prompt.txt +59 -0
- data_science_document_ai-1.59.0/src/prompts/library/bookingConfirmation/msc/placeholders.json +146 -0
- data_science_document_ai-1.59.0/src/prompts/library/bookingConfirmation/msc/prompt.txt +76 -0
- data_science_document_ai-1.59.0/src/prompts/library/bookingConfirmation/oocl/placeholders.json +160 -0
- data_science_document_ai-1.59.0/src/prompts/library/bookingConfirmation/oocl/prompt.txt +49 -0
- data_science_document_ai-1.59.0/src/prompts/library/bookingConfirmation/other/placeholders.json +160 -0
- data_science_document_ai-1.59.0/src/prompts/library/bookingConfirmation/other/prompt.txt +81 -0
- data_science_document_ai-1.59.0/src/prompts/library/bookingConfirmation/yangming/placeholders.json +160 -0
- data_science_document_ai-1.59.0/src/prompts/library/bookingConfirmation/yangming/prompt.txt +60 -0
- {data_science_document_ai-1.51.0 → data_science_document_ai-1.59.0}/src/prompts/library/customsInvoice/other/prompt.txt +2 -1
- {data_science_document_ai-1.51.0 → data_science_document_ai-1.59.0}/src/prompts/library/partnerInvoice/other/prompt.txt +3 -4
- {data_science_document_ai-1.51.0 → data_science_document_ai-1.59.0}/src/setup.py +17 -9
- data_science_document_ai-1.51.0/src/docai_processor_config.yaml +0 -22
- data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +0 -32
- data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/evergreen/prompt.txt +0 -36
- data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +0 -32
- data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +0 -65
- data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/maersk/placeholders.json +0 -32
- data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/maersk/prompt.txt +0 -58
- data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/msc/placeholders.json +0 -32
- data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/msc/prompt.txt +0 -70
- data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/oocl/placeholders.json +0 -32
- data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/oocl/prompt.txt +0 -16
- data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/other/placeholders.json +0 -32
- data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/other/prompt.txt +0 -58
- data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/yangming/placeholders.json +0 -32
- data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/yangming/prompt.txt +0 -62
- {data_science_document_ai-1.51.0 → data_science_document_ai-1.59.0}/src/constants_sandbox.py +0 -0
- {data_science_document_ai-1.51.0 → data_science_document_ai-1.59.0}/src/docai.py +0 -0
- {data_science_document_ai-1.51.0 → data_science_document_ai-1.59.0}/src/excel_processing.py +0 -0
- {data_science_document_ai-1.51.0 → data_science_document_ai-1.59.0}/src/io.py +0 -0
- {data_science_document_ai-1.51.0 → data_science_document_ai-1.59.0}/src/llm.py +0 -0
- {data_science_document_ai-1.51.0 → data_science_document_ai-1.59.0}/src/log_setup.py +0 -0
- {data_science_document_ai-1.51.0 → data_science_document_ai-1.59.0}/src/postprocessing/common.py +0 -0
- {data_science_document_ai-1.51.0 → data_science_document_ai-1.59.0}/src/postprocessing/postprocess_booking_confirmation.py +0 -0
- {data_science_document_ai-1.51.0 → data_science_document_ai-1.59.0}/src/postprocessing/postprocess_commercial_invoice.py +0 -0
- {data_science_document_ai-1.51.0 → data_science_document_ai-1.59.0}/src/prompts/library/arrivalNotice/other/placeholders.json +0 -0
- {data_science_document_ai-1.51.0 → data_science_document_ai-1.59.0}/src/prompts/library/arrivalNotice/other/prompt.txt +0 -0
- {data_science_document_ai-1.51.0 → data_science_document_ai-1.59.0}/src/prompts/library/bundeskasse/other/placeholders.json +0 -0
- {data_science_document_ai-1.51.0 → data_science_document_ai-1.59.0}/src/prompts/library/bundeskasse/other/prompt.txt +0 -0
- {data_science_document_ai-1.51.0 → data_science_document_ai-1.59.0}/src/prompts/library/commercialInvoice/other/placeholders.json +0 -0
- {data_science_document_ai-1.51.0 → data_science_document_ai-1.59.0}/src/prompts/library/commercialInvoice/other/prompt.txt +0 -0
- {data_science_document_ai-1.51.0 → data_science_document_ai-1.59.0}/src/prompts/library/customsAssessment/other/placeholders.json +0 -0
- {data_science_document_ai-1.51.0 → data_science_document_ai-1.59.0}/src/prompts/library/customsAssessment/other/prompt.txt +0 -0
- {data_science_document_ai-1.51.0 → data_science_document_ai-1.59.0}/src/prompts/library/customsInvoice/other/placeholders.json +0 -0
- {data_science_document_ai-1.51.0 → data_science_document_ai-1.59.0}/src/prompts/library/deliveryOrder/other/placeholders.json +0 -0
- {data_science_document_ai-1.51.0 → data_science_document_ai-1.59.0}/src/prompts/library/deliveryOrder/other/prompt.txt +0 -0
- {data_science_document_ai-1.51.0 → data_science_document_ai-1.59.0}/src/prompts/library/draftMbl/other/placeholders.json +0 -0
- {data_science_document_ai-1.51.0 → data_science_document_ai-1.59.0}/src/prompts/library/draftMbl/other/prompt.txt +0 -0
- {data_science_document_ai-1.51.0 → data_science_document_ai-1.59.0}/src/prompts/library/finalMbL/other/placeholders.json +0 -0
- {data_science_document_ai-1.51.0 → data_science_document_ai-1.59.0}/src/prompts/library/finalMbL/other/prompt.txt +0 -0
- {data_science_document_ai-1.51.0 → data_science_document_ai-1.59.0}/src/prompts/library/packingList/other/placeholders.json +0 -0
- {data_science_document_ai-1.51.0 → data_science_document_ai-1.59.0}/src/prompts/library/packingList/other/prompt.txt +0 -0
- {data_science_document_ai-1.51.0 → data_science_document_ai-1.59.0}/src/prompts/library/partnerInvoice/other/placeholders.json +0 -0
- {data_science_document_ai-1.51.0 → data_science_document_ai-1.59.0}/src/prompts/library/postprocessing/port_code/placeholders.json +0 -0
- {data_science_document_ai-1.51.0 → data_science_document_ai-1.59.0}/src/prompts/library/postprocessing/port_code/prompt_port_code.txt +0 -0
- {data_science_document_ai-1.51.0 → data_science_document_ai-1.59.0}/src/prompts/library/preprocessing/carrier/placeholders.json +0 -0
- {data_science_document_ai-1.51.0 → data_science_document_ai-1.59.0}/src/prompts/library/preprocessing/carrier/prompt.txt +0 -0
- {data_science_document_ai-1.51.0 → data_science_document_ai-1.59.0}/src/prompts/library/shippingInstruction/other/placeholders.json +0 -0
- {data_science_document_ai-1.51.0 → data_science_document_ai-1.59.0}/src/prompts/library/shippingInstruction/other/prompt.txt +0 -0
- {data_science_document_ai-1.51.0 → data_science_document_ai-1.59.0}/src/prompts/prompt_library.py +0 -0
- {data_science_document_ai-1.51.0 → data_science_document_ai-1.59.0}/src/tms.py +0 -0
- {data_science_document_ai-1.51.0 → data_science_document_ai-1.59.0}/src/utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "data-science-document-ai"
|
|
3
|
-
version = "1.
|
|
3
|
+
version = "1.59.0"
|
|
4
4
|
description = "\"Document AI repo for data science\""
|
|
5
5
|
authors = ["Naomi Nguyen <naomi.nguyen@forto.com>", "Kumar Rajendrababu <kumar.rajendrababu@forto.com>", "Igor Tonko <igor.tonko@forto.com>", "Osman Demirel <osman.demirel@forto.com>"]
|
|
6
6
|
packages = [
|
|
@@ -20,10 +20,11 @@ project_parameters = {
|
|
|
20
20
|
# Fuzzy lookup
|
|
21
21
|
"g_model_fuzzy_lookup_folder": "fuzzy_lookup",
|
|
22
22
|
"item_code_lookup": "line_item_kvp_table.json",
|
|
23
|
+
"intermodal_partners": "intermodal_partners.json",
|
|
23
24
|
"invoice_classification_lookup": "invoice_classification.json",
|
|
24
25
|
"reverse_charge_sentence_lookup": "reverse_charge_sentences.json",
|
|
25
26
|
# Fuzzy logic params
|
|
26
|
-
"fuzzy_threshold_item_code":
|
|
27
|
+
"fuzzy_threshold_item_code": 92,
|
|
27
28
|
"fuzzy_threshold_reverse_charge": 80,
|
|
28
29
|
"fuzzy_threshold_invoice_classification": 70,
|
|
29
30
|
# Chunking params
|
|
@@ -36,6 +37,8 @@ project_parameters = {
|
|
|
36
37
|
# models metadata (confidence),
|
|
37
38
|
"g_model_data_folder": "models",
|
|
38
39
|
"local_model_data_folder": "data",
|
|
40
|
+
"if_use_docai": False,
|
|
41
|
+
"if_use_llm": True, # Keep it always True
|
|
39
42
|
"released_doc_types": {
|
|
40
43
|
"bookingConfirmation",
|
|
41
44
|
"packingList",
|
|
@@ -50,16 +53,6 @@ project_parameters = {
|
|
|
50
53
|
"customsInvoice",
|
|
51
54
|
"bundeskasse",
|
|
52
55
|
},
|
|
53
|
-
"model_selector": {
|
|
54
|
-
"stable": {
|
|
55
|
-
"bookingConfirmation": 1,
|
|
56
|
-
},
|
|
57
|
-
"beta": {
|
|
58
|
-
"bookingConfirmation": 0,
|
|
59
|
-
},
|
|
60
|
-
},
|
|
61
|
-
# this is the model selector for the model to be used from the model_config.yaml
|
|
62
|
-
# file based on the environment, 0 mean the first model in the list
|
|
63
56
|
# LLM model parameters
|
|
64
57
|
"gemini_params": {
|
|
65
58
|
"temperature": 0,
|
|
@@ -77,25 +70,15 @@ project_parameters = {
|
|
|
77
70
|
"seed": 42,
|
|
78
71
|
"model_id": "gemini-2.5-flash",
|
|
79
72
|
},
|
|
80
|
-
# Key to combine the LLM results with the Doc Ai results
|
|
81
|
-
"key_to_combine": {
|
|
82
|
-
"bookingConfirmation": ["transportLegs"],
|
|
83
|
-
"arrivalNotice": ["containers"],
|
|
84
|
-
"finalMbL": ["containers"],
|
|
85
|
-
"draftMbl": ["containers"],
|
|
86
|
-
"deliveryOrder": ["Equipment", "TransportLeg"],
|
|
87
|
-
"customsAssessment": ["containers"],
|
|
88
|
-
"packingList": ["skuData"],
|
|
89
|
-
"commercialInvoice": ["skus"],
|
|
90
|
-
"shippingInstruction": ["containers"],
|
|
91
|
-
"partnerInvoice": ["lineItem"],
|
|
92
|
-
"customsInvoice": ["lineItem"],
|
|
93
|
-
"bundeskasse": ["lineItem"],
|
|
94
|
-
},
|
|
95
73
|
}
|
|
96
74
|
|
|
97
75
|
# Hardcoded rules for data points formatting that can't be based on label name alone
|
|
98
76
|
formatting_rules = {
|
|
99
|
-
"bookingConfirmation": {
|
|
77
|
+
"bookingConfirmation": {
|
|
78
|
+
"pickUpDepotCode": "depot",
|
|
79
|
+
"dropOffDepotCode": "depot",
|
|
80
|
+
"gateInTerminalCode": "terminal",
|
|
81
|
+
"pickUpTerminalCode": "terminal",
|
|
82
|
+
},
|
|
100
83
|
"deliveryOrder": {"pickUpTerminal": "terminal", "EmptyContainerDepot": "depot"},
|
|
101
84
|
}
|
|
@@ -32,7 +32,6 @@ from src.postprocessing.postprocess_partner_invoice import (
|
|
|
32
32
|
from src.prompts.prompt_library import prompt_library
|
|
33
33
|
from src.utils import (
|
|
34
34
|
extract_top_pages,
|
|
35
|
-
generate_schema_structure,
|
|
36
35
|
get_pdf_page_count,
|
|
37
36
|
get_processor_name,
|
|
38
37
|
run_background_tasks,
|
|
@@ -202,9 +201,6 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
|
|
|
202
201
|
number_of_pages = get_pdf_page_count(file_content)
|
|
203
202
|
logger.info(f"processing {input_doc_type} with {number_of_pages} pages...")
|
|
204
203
|
|
|
205
|
-
# get the schema placeholder
|
|
206
|
-
response_schema = prompt_library.library[input_doc_type]["other"]["placeholders"]
|
|
207
|
-
|
|
208
204
|
carrier = "other"
|
|
209
205
|
carrier_schema = (
|
|
210
206
|
prompt_library.library.get("preprocessing", {})
|
|
@@ -241,6 +237,9 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
|
|
|
241
237
|
# get the related prompt from predefined prompt library
|
|
242
238
|
prompt = prompt_library.library[input_doc_type][carrier]["prompt"]
|
|
243
239
|
|
|
240
|
+
# get the schema placeholder
|
|
241
|
+
response_schema = prompt_library.library[input_doc_type][carrier]["placeholders"]
|
|
242
|
+
|
|
244
243
|
# Add page-number extraction for moderately large docs
|
|
245
244
|
use_chunking = number_of_pages >= params["chunk_after"]
|
|
246
245
|
|
|
@@ -258,7 +257,11 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
|
|
|
258
257
|
):
|
|
259
258
|
tasks.append(
|
|
260
259
|
process_chunk_with_retry(
|
|
261
|
-
chunk,
|
|
260
|
+
chunk,
|
|
261
|
+
prompt,
|
|
262
|
+
response_schema,
|
|
263
|
+
llm_client,
|
|
264
|
+
input_doc_type,
|
|
262
265
|
)
|
|
263
266
|
)
|
|
264
267
|
|
|
@@ -350,8 +353,7 @@ async def extract_data_from_pdf_w_llm(params, input_doc_type, file_content, llm_
|
|
|
350
353
|
# Add currency from the amount field
|
|
351
354
|
if input_doc_type in ["commercialInvoice"]:
|
|
352
355
|
result = postprocessing_commercial_invoice(result, params, input_doc_type)
|
|
353
|
-
|
|
354
|
-
result = postprocess_booking_confirmation(result)
|
|
356
|
+
|
|
355
357
|
return result, llm_client.model_id
|
|
356
358
|
|
|
357
359
|
|
|
@@ -370,13 +372,14 @@ def combine_llm_results_w_doc_ai(
|
|
|
370
372
|
Returns:
|
|
371
373
|
combined result
|
|
372
374
|
"""
|
|
373
|
-
result =
|
|
374
|
-
|
|
375
|
-
|
|
375
|
+
result = remove_none_values(llm)
|
|
376
|
+
|
|
377
|
+
docAi = doc_ai.copy()
|
|
378
|
+
if not docAi:
|
|
376
379
|
return result
|
|
377
380
|
|
|
378
381
|
# Merge top-level keys
|
|
379
|
-
result.update({k: v for k, v in
|
|
382
|
+
result.update({k: v for k, v in docAi.items() if k not in result})
|
|
380
383
|
|
|
381
384
|
if (
|
|
382
385
|
input_doc_type
|
|
@@ -384,28 +387,28 @@ def combine_llm_results_w_doc_ai(
|
|
|
384
387
|
and keys_to_combine
|
|
385
388
|
):
|
|
386
389
|
result.update(
|
|
387
|
-
{key:
|
|
390
|
+
{key: docAi.get(key) for key in keys_to_combine if key in docAi.keys()}
|
|
388
391
|
)
|
|
389
392
|
return result
|
|
390
393
|
|
|
391
394
|
# Handle specific key-based merging logic for multiple keys
|
|
392
395
|
if keys_to_combine:
|
|
393
396
|
for key in keys_to_combine:
|
|
394
|
-
if key in
|
|
397
|
+
if key in docAi.keys():
|
|
395
398
|
# Merge the list of dictionaries
|
|
396
|
-
# If the length of the
|
|
397
|
-
if len(
|
|
398
|
-
result[key] =
|
|
399
|
+
# If the length of the docAi list is less than the LLM result, replace with the docAi list
|
|
400
|
+
if len(docAi[key]) < len(result[key]):
|
|
401
|
+
result[key] = docAi[key]
|
|
399
402
|
else:
|
|
400
|
-
# If the length of the
|
|
403
|
+
# If the length of the docAi list is greater than or equal to the LLM result,
|
|
401
404
|
# add & merge the dictionaries
|
|
402
|
-
if isinstance(
|
|
403
|
-
for i in range(len(
|
|
405
|
+
if isinstance(docAi[key], list):
|
|
406
|
+
for i in range(len(docAi[key])):
|
|
404
407
|
if i == len(result[key]):
|
|
405
|
-
result[key].append(
|
|
408
|
+
result[key].append(docAi[key][i])
|
|
406
409
|
else:
|
|
407
|
-
for sub_key in
|
|
408
|
-
result[key][i][sub_key] =
|
|
410
|
+
for sub_key in docAi[key][i].keys():
|
|
411
|
+
result[key][i][sub_key] = docAi[key][i][sub_key]
|
|
409
412
|
return result
|
|
410
413
|
|
|
411
414
|
|
|
@@ -499,13 +502,15 @@ async def data_extraction_manual_flow(
|
|
|
499
502
|
page_count = None
|
|
500
503
|
# Validate the file type
|
|
501
504
|
if mime_type == "application/pdf":
|
|
505
|
+
if_use_docai = params["if_use_docai"]
|
|
506
|
+
|
|
502
507
|
# Enable Doc Ai only for certain document types.
|
|
503
|
-
if_use_docai
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
508
|
+
if params["if_use_docai"]:
|
|
509
|
+
if_use_docai = (
|
|
510
|
+
True
|
|
511
|
+
if meta.documentTypeCode in params["model_config"]["stable"]
|
|
512
|
+
else False
|
|
513
|
+
)
|
|
509
514
|
|
|
510
515
|
(
|
|
511
516
|
extracted_data,
|
|
@@ -517,7 +522,7 @@ async def data_extraction_manual_flow(
|
|
|
517
522
|
meta.documentTypeCode,
|
|
518
523
|
processor_client,
|
|
519
524
|
if_use_docai=if_use_docai,
|
|
520
|
-
if_use_llm=if_use_llm,
|
|
525
|
+
if_use_llm=params["if_use_llm"],
|
|
521
526
|
llm_client=llm_client,
|
|
522
527
|
isBetaTest=False,
|
|
523
528
|
)
|
|
@@ -1,4 +1,6 @@
|
|
|
1
1
|
"""This module contains the postprocessing functions for the partner invoice."""
|
|
2
|
+
from collections import defaultdict
|
|
3
|
+
|
|
2
4
|
from rapidfuzz import fuzz, process
|
|
3
5
|
|
|
4
6
|
from src.io import logger
|
|
@@ -143,6 +145,20 @@ def update_recipient_and_vendor(aggregated_data, is_recipient_forto):
|
|
|
143
145
|
] = "Dasbachstraße 15, 54292 Trier, Germany"
|
|
144
146
|
|
|
145
147
|
|
|
148
|
+
def select_unique_bank_account(bank_account):
|
|
149
|
+
# Select the unique bank account if multiple are present
|
|
150
|
+
if isinstance(bank_account, list) and bank_account:
|
|
151
|
+
best = defaultdict(lambda: None)
|
|
152
|
+
|
|
153
|
+
for item in bank_account:
|
|
154
|
+
dv = item["documentValue"]
|
|
155
|
+
if best[dv] is None or item["page"] < best[dv]["page"]:
|
|
156
|
+
best[dv] = item
|
|
157
|
+
|
|
158
|
+
unique = list(best.values())
|
|
159
|
+
return unique
|
|
160
|
+
|
|
161
|
+
|
|
146
162
|
async def process_partner_invoice(params, aggregated_data, document_type_code):
|
|
147
163
|
"""Process the partner invoice data."""
|
|
148
164
|
# Post process bundeskasse invoices
|
|
@@ -150,6 +166,11 @@ async def process_partner_invoice(params, aggregated_data, document_type_code):
|
|
|
150
166
|
post_process_bundeskasse(aggregated_data)
|
|
151
167
|
return
|
|
152
168
|
|
|
169
|
+
if "bankAccount" in aggregated_data:
|
|
170
|
+
aggregated_data["bankAccount"] = select_unique_bank_account(
|
|
171
|
+
aggregated_data["bankAccount"]
|
|
172
|
+
)
|
|
173
|
+
|
|
153
174
|
line_items = aggregated_data.get("lineItem", [])
|
|
154
175
|
# Add debug logging
|
|
155
176
|
logger.info(f"Processing partnerInvoice with {len(line_items)} line items")
|
|
@@ -167,15 +188,20 @@ async def process_partner_invoice(params, aggregated_data, document_type_code):
|
|
|
167
188
|
reverse_charge_info["formattedValue"] = reverse_charge_value
|
|
168
189
|
reverse_charge = aggregated_data.pop("reverseChargeSentence", None)
|
|
169
190
|
|
|
191
|
+
# Partner Name
|
|
192
|
+
partner_name = aggregated_data.get("vendorName", {}).get("documentValue", None)
|
|
193
|
+
|
|
170
194
|
# Process everything in one go
|
|
171
|
-
processed_items = await process_line_items_batch(
|
|
195
|
+
processed_items = await process_line_items_batch(
|
|
196
|
+
params, line_items, reverse_charge, partner_name
|
|
197
|
+
)
|
|
172
198
|
|
|
173
199
|
# Update your main data structure
|
|
174
200
|
aggregated_data["lineItem"] = processed_items
|
|
175
201
|
|
|
176
202
|
|
|
177
203
|
async def process_line_items_batch(
|
|
178
|
-
params: dict, line_items: list[dict], reverse_charge=None
|
|
204
|
+
params: dict, line_items: list[dict], reverse_charge=None, partner_name=None
|
|
179
205
|
):
|
|
180
206
|
"""
|
|
181
207
|
Processes all line items efficiently using a "Split-Apply-Combine" strategy.
|
|
@@ -213,23 +239,12 @@ async def process_line_items_batch(
|
|
|
213
239
|
|
|
214
240
|
# Batch API Call for Embedding lookups
|
|
215
241
|
if pending_line_items:
|
|
216
|
-
|
|
217
|
-
logger.info(f"Mapping {len(values_to_fetch)} line items from Embedding API...")
|
|
218
|
-
|
|
219
|
-
# Await the batch response {"desc1": "code1", "desc2": "code2"}
|
|
220
|
-
api_results = await get_tms_mappings(
|
|
221
|
-
input_list=values_to_fetch, embedding_type="line_items"
|
|
222
|
-
)
|
|
242
|
+
code_map = await fetch_line_item_codes(pending_line_items, partner_name, params)
|
|
223
243
|
|
|
224
|
-
# Merge API results back into original list
|
|
225
244
|
for index, desc in pending_line_items.items():
|
|
226
|
-
# Get result from API response, or None if API failed for that item
|
|
227
|
-
forto_code = api_results.get(desc)
|
|
228
|
-
|
|
229
|
-
# Update the original item
|
|
230
245
|
line_items[index]["itemCode"] = {
|
|
231
246
|
"documentValue": desc,
|
|
232
|
-
"formattedValue":
|
|
247
|
+
"formattedValue": code_map.get(desc),
|
|
233
248
|
"page": line_items[index]["lineItemDescription"].get("page"),
|
|
234
249
|
}
|
|
235
250
|
|
|
@@ -285,11 +300,14 @@ def if_reverse_charge_sentence(sentence: str, params):
|
|
|
285
300
|
return False
|
|
286
301
|
|
|
287
302
|
# Check if the sentence is similar to any of the reverse charge sentences
|
|
288
|
-
|
|
289
|
-
sentence, reverse_charge_sentences, threshold
|
|
303
|
+
match, _ = get_fuzzy_match_score(
|
|
304
|
+
sentence, list(reverse_charge_sentences.keys()), threshold
|
|
290
305
|
)
|
|
291
306
|
|
|
292
|
-
|
|
307
|
+
if match:
|
|
308
|
+
return reverse_charge_sentences[match]
|
|
309
|
+
|
|
310
|
+
return False
|
|
293
311
|
|
|
294
312
|
|
|
295
313
|
def find_matching_lineitem(new_lineitem: str, kvp_dict: dict, threshold=90):
|
|
@@ -320,12 +338,13 @@ def find_matching_lineitem(new_lineitem: str, kvp_dict: dict, threshold=90):
|
|
|
320
338
|
return None
|
|
321
339
|
|
|
322
340
|
|
|
323
|
-
async def associate_forto_item_code(line_item_data, params):
|
|
341
|
+
async def associate_forto_item_code(line_item_data, params, partner_name=None):
|
|
324
342
|
"""
|
|
325
343
|
Associates Forto item codes to a list of line item descriptions.
|
|
326
344
|
Args:
|
|
327
345
|
line_item_data (dict): A dictionary where keys are original descriptions and values are cleaned descriptions.
|
|
328
346
|
params (dict): Parameters containing lookup data and thresholds.
|
|
347
|
+
partner_name (str, optional): The name of the partner for context in matching. Defaults to None.
|
|
329
348
|
|
|
330
349
|
Returns:
|
|
331
350
|
list: A list of dictionaries with 'description' and 'itemCode' keys.
|
|
@@ -347,14 +366,51 @@ async def associate_forto_item_code(line_item_data, params):
|
|
|
347
366
|
|
|
348
367
|
# Batch API Call for Embedding lookups
|
|
349
368
|
if pending_line_items:
|
|
350
|
-
|
|
351
|
-
input_list=list(pending_line_items.values()),
|
|
352
|
-
embedding_type="line_items",
|
|
353
|
-
)
|
|
369
|
+
code_map = await fetch_line_item_codes(pending_line_items, partner_name, params)
|
|
354
370
|
|
|
355
|
-
# Merge API results back into original list
|
|
356
371
|
for desc, f_desc in pending_line_items.items():
|
|
357
|
-
|
|
358
|
-
|
|
372
|
+
result.append(
|
|
373
|
+
{
|
|
374
|
+
"description": desc,
|
|
375
|
+
"itemCode": code_map.get(f_desc),
|
|
376
|
+
}
|
|
377
|
+
)
|
|
378
|
+
|
|
379
|
+
return result
|
|
359
380
|
|
|
381
|
+
|
|
382
|
+
async def fetch_line_item_codes(
|
|
383
|
+
pending_line_items: dict,
|
|
384
|
+
partner_name: str | None,
|
|
385
|
+
params: dict,
|
|
386
|
+
):
|
|
387
|
+
"""Returns: {original_description: mapped_code_or_None}"""
|
|
388
|
+
t_mode = (
|
|
389
|
+
find_matching_lineitem(
|
|
390
|
+
partner_name.upper(),
|
|
391
|
+
params["lookup_data"]["intermodal_partners"],
|
|
392
|
+
threshold=87,
|
|
393
|
+
)
|
|
394
|
+
if partner_name
|
|
395
|
+
else None
|
|
396
|
+
)
|
|
397
|
+
|
|
398
|
+
unique_descs = list(set(pending_line_items.values()))
|
|
399
|
+
logger.info(f"Mapping {len(unique_descs)} line items from Embedding API...")
|
|
400
|
+
|
|
401
|
+
# Build API input map
|
|
402
|
+
api_input_map = {
|
|
403
|
+
desc: f"{t_mode} - {desc}" if t_mode else desc for desc in unique_descs
|
|
404
|
+
}
|
|
405
|
+
|
|
406
|
+
api_results = await get_tms_mappings(
|
|
407
|
+
input_list=list(api_input_map.values()),
|
|
408
|
+
embedding_type="line_items",
|
|
409
|
+
)
|
|
410
|
+
|
|
411
|
+
# Normalize response back to original descriptions
|
|
412
|
+
result = {
|
|
413
|
+
original_desc: api_results.get(api_desc)
|
|
414
|
+
for original_desc, api_desc in api_input_map.items()
|
|
415
|
+
}
|
|
360
416
|
return result
|
data_science_document_ai-1.59.0/src/prompts/library/bookingConfirmation/evergreen/placeholders.json
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
{
|
|
2
|
+
"type": "OBJECT",
|
|
3
|
+
"properties": {
|
|
4
|
+
"bookingNumber": {
|
|
5
|
+
"type": "STRING",
|
|
6
|
+
"nullable": true,
|
|
7
|
+
"description": "A unique identifier assigned to the shipment booking, used for tracking and reference. They are often referred to as 'Booking No.', 'Booking Reference', 'Our Reference', or 'Order Ref'."
|
|
8
|
+
},
|
|
9
|
+
"contractNumber": {
|
|
10
|
+
"type": "STRING",
|
|
11
|
+
"nullable": true,
|
|
12
|
+
"description": "It's a contract number between the carrier and Forto Logistics SE & Co KG."
|
|
13
|
+
},
|
|
14
|
+
"pickUpTerminalCode": {
|
|
15
|
+
"type": "STRING",
|
|
16
|
+
"nullable": true,
|
|
17
|
+
"description": "The specific terminal for cargo pickup during the import shipment."
|
|
18
|
+
},
|
|
19
|
+
"gateInTerminalCode": {
|
|
20
|
+
"type": "STRING",
|
|
21
|
+
"nullable": true,
|
|
22
|
+
"description": "The specific terminal where cargo is gated in especially Export terminal delivery address. E.g., FULL RETURN TO or Export terminal name."
|
|
23
|
+
},
|
|
24
|
+
"performaDate": {
|
|
25
|
+
"type": "STRING",
|
|
26
|
+
"nullable": true,
|
|
27
|
+
"description": "The date considered to apply the rates and charges specified in the booking confirmation"
|
|
28
|
+
},
|
|
29
|
+
"cyCutOff": {
|
|
30
|
+
"type": "STRING",
|
|
31
|
+
"nullable": true,
|
|
32
|
+
"description": "The datetime by which the cargo to be delivered to the Container Yard. It can be found with keys CARGO CUT OFF DATE/TIME"
|
|
33
|
+
},
|
|
34
|
+
"gateInReference": {
|
|
35
|
+
"type": "STRING",
|
|
36
|
+
"nullable": true,
|
|
37
|
+
"description": "A reference code for cargo entering the terminal to drop the loaded cargo for Export. Sometimes it can be 'Our Reference'."
|
|
38
|
+
},
|
|
39
|
+
"mblNumber": {
|
|
40
|
+
"type": "STRING",
|
|
41
|
+
"nullable": true,
|
|
42
|
+
"description": "Bill of Lading number (B/L NO.), a document issued by the carrier."
|
|
43
|
+
},
|
|
44
|
+
"pickUpReference": {
|
|
45
|
+
"type": "STRING",
|
|
46
|
+
"nullable": true,
|
|
47
|
+
"description": "A reference code for cargo pickup during the import shipment. Sometimes it can be 'Our Reference'."
|
|
48
|
+
},
|
|
49
|
+
"siCutOff": {
|
|
50
|
+
"type": "STRING",
|
|
51
|
+
"nullable": true,
|
|
52
|
+
"description": "The deadline datetime for submitting the Shipping Instructions (SI) to the carrier. It can be found with keys DOC CUT OFF DATE/TIME"
|
|
53
|
+
},
|
|
54
|
+
"vgmCutOff": {
|
|
55
|
+
"type": "STRING",
|
|
56
|
+
"nullable": true,
|
|
57
|
+
"description": "The deadline datetime for submitting the Verified Gross Mass (VGM) to the carrier. It can be found with keys VGM DEADLINE, VGM DUE, VGM CUT OFF."
|
|
58
|
+
},
|
|
59
|
+
"containers": {
|
|
60
|
+
"type": "ARRAY",
|
|
61
|
+
"items": {
|
|
62
|
+
"type": "OBJECT",
|
|
63
|
+
"properties": {
|
|
64
|
+
"containerType": {
|
|
65
|
+
"type": "STRING",
|
|
66
|
+
"nullable": true,
|
|
67
|
+
"description": "The size / type of the container, such as 20ft, 40ft, 40HC, 20DC etc under Type/Size column."
|
|
68
|
+
},
|
|
69
|
+
"pickUpDepotCode": {
|
|
70
|
+
"type": "STRING",
|
|
71
|
+
"nullable": true,
|
|
72
|
+
"description": "The depot code where the empty container will be picked up. It is identified as Empty Pick Up AT Depot or Export Empty Pick Up Depot(s)."
|
|
73
|
+
},
|
|
74
|
+
"dropOffDepotCode": {
|
|
75
|
+
"type": "STRING",
|
|
76
|
+
"nullable": true,
|
|
77
|
+
"description": "The depot code where the empty container will be dropped off."
|
|
78
|
+
}
|
|
79
|
+
}
|
|
80
|
+
},
|
|
81
|
+
"required": ["containerType", "pickupDepotCode", "dropoffDepotCode"]
|
|
82
|
+
},
|
|
83
|
+
"transportLegs": {
|
|
84
|
+
"type": "ARRAY",
|
|
85
|
+
"items": {
|
|
86
|
+
"type": "OBJECT",
|
|
87
|
+
"properties": {
|
|
88
|
+
"eta": {
|
|
89
|
+
"type": "STRING",
|
|
90
|
+
"nullable": true,
|
|
91
|
+
"description": "Estimated Time of Arrival (ETA) is the expected date when the shipment will arrive at its destination."
|
|
92
|
+
},
|
|
93
|
+
"etd": {
|
|
94
|
+
"type": "STRING",
|
|
95
|
+
"nullable": true,
|
|
96
|
+
"description": "Estimated Time of Departure (ETD) is the expected date when the shipment will leave the origin port."
|
|
97
|
+
},
|
|
98
|
+
"imoNumber": {
|
|
99
|
+
"type": "STRING",
|
|
100
|
+
"nullable": true,
|
|
101
|
+
"description": "The International Maritime Organization number for a specific leg. It can be found as IMO No, IMO number."
|
|
102
|
+
},
|
|
103
|
+
"portOfDischarge": {
|
|
104
|
+
"type": "STRING",
|
|
105
|
+
"nullable": true,
|
|
106
|
+
"description": "The port where the goods are discharged from the vessel. This is the destination port for the shipment. It can be found at POD, Port of Discharge, To, Discharge Port"
|
|
107
|
+
},
|
|
108
|
+
"portOfLoading": {
|
|
109
|
+
"type": "STRING",
|
|
110
|
+
"nullable": true,
|
|
111
|
+
"description": "The port where the goods are loaded onto the vessel. This is the origin port for the shipment. It can be found at POL, Port of Loading, From, Load Port"
|
|
112
|
+
},
|
|
113
|
+
"vesselName": {
|
|
114
|
+
"type": "STRING",
|
|
115
|
+
"nullable": true,
|
|
116
|
+
"description": "The name of the vessel carrying the shipment. It can be found at VESSEL/VOYAGE e.g., MOL EMERALD"
|
|
117
|
+
},
|
|
118
|
+
"voyage": {
|
|
119
|
+
"type": "STRING",
|
|
120
|
+
"nullable": true,
|
|
121
|
+
"description": "The journey or route taken by the vessel for a specific leg. It can be found at VESSEL/VOYAGE e.g., 087E"
|
|
122
|
+
}
|
|
123
|
+
}
|
|
124
|
+
},
|
|
125
|
+
"required": [
|
|
126
|
+
"eta",
|
|
127
|
+
"etd",
|
|
128
|
+
"portOfDischarge",
|
|
129
|
+
"portOfLoading",
|
|
130
|
+
"vesselName",
|
|
131
|
+
"voyage"
|
|
132
|
+
]
|
|
133
|
+
},
|
|
134
|
+
"carrierAddress": {
|
|
135
|
+
"type": "STRING",
|
|
136
|
+
"nullable": true,
|
|
137
|
+
"description": "The address of the carrier who provides service and issued the document."
|
|
138
|
+
},
|
|
139
|
+
"carrierName": {
|
|
140
|
+
"type": "STRING",
|
|
141
|
+
"nullable": true,
|
|
142
|
+
"description": "The name of the carrier who issued the document e,g, Evergreen Line."
|
|
143
|
+
}
|
|
144
|
+
},
|
|
145
|
+
"required": ["bookingNumber", "transportLegs", "containers", "cyCutOff", "vgmCutOff", "siCutOff"]
|
|
146
|
+
}
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
<PERSONA> You are an efficient document entity data extraction specialist working for a Freight Forwarding company. </PERSONA>
|
|
2
|
+
|
|
3
|
+
<TASK> Your task is to extract data from Booking Confirmation documents as per the given response schema structure. </TASK>
|
|
4
|
+
|
|
5
|
+
<CONTEXT>
|
|
6
|
+
The Freight Forwarding company receives Booking Confirmation from EverGreen Carrier (Shipping Lines) partner.
|
|
7
|
+
These Booking Confirmations contain various details related to booking, container pick up and drop off depot details, vessel details, as well as other transport Legs data.
|
|
8
|
+
They may be written in different languages such as English, German, Vietnamese, Chinese, and other European languages, and can appear in a variety of formats and layouts.
|
|
9
|
+
Your role is to accurately extract specific entities from these Booking Confirmations to support efficient processing and accurate record-keeping.
|
|
10
|
+
|
|
11
|
+
To provide context on the journey of a container for both Export and Import shipments,
|
|
12
|
+
For Export shipment: An empty container is picked up from a depot (pickupDepotCode) using a pickUpReference and goods loaded into it at a warehouse. Then the loaded container / cargo is transported back to a Container Yard or gateInTerminal before the cyCutOff date for further shipping processes. Then the POL of the First TransportLeg may start from the gateInTerminal or a different POL too.
|
|
13
|
+
For Import Shipment: The loaded container / cargo arrives at a port of discharge and is then picked up at pickUpTerminal using pickUpReference. After delivery, an empty container is returned to a depot (dropOffDepotCode).
|
|
14
|
+
</CONTEXT>
|
|
15
|
+
|
|
16
|
+
<INSTRUCTIONS>
|
|
17
|
+
- Populate fields as defined in the response schema.
|
|
18
|
+
- Use the data field description to understand the context of the data.
|
|
19
|
+
|
|
20
|
+
- transportLegs: Multiple Transport Legs entries may exist, capture all instances under "transportLegs". Note that the order of the legs is important.
|
|
21
|
+
- eta: The estimated time of arrival for a specific leg.
|
|
22
|
+
- etd: The estimated time of departure for a specific leg. ETD DATE above the PORT OF DISCHARGING information.
|
|
23
|
+
- imoNumber: The International Maritime Organization number for a specific leg.
|
|
24
|
+
- portOfDischarge: The port where cargo is unloaded for a specific leg.
|
|
25
|
+
- portOfLoading: The port where cargo is loaded for a specific leg.
|
|
26
|
+
- vesselName: The name of the vessel for a specific leg. Can be found at VESSEL/VOYAGE (e.g., EVER LAUREL).
|
|
27
|
+
- voyage: The journey or route taken by the vessel for a specific leg. It can be found at VESSEL/VOYAGE e.g., 087E.
|
|
28
|
+
|
|
29
|
+
IMPORTANT explanation for the transportLegs part as follows:
|
|
30
|
+
- There is at least one leg in each document.
|
|
31
|
+
- 'eta' must be equal to or later than 'etd'!
|
|
32
|
+
- Multiple legs are possible. When there are multiple legs,
|
|
33
|
+
- Sequential Sorting: You must manually re-order legs based on etd then eta, regardless of their order in the source text.
|
|
34
|
+
- "T/S PORT OF LOADING" indicates the presence of a multi-leg journey.
|
|
35
|
+
- Transhipment Handling: Treat any mentioned "T/S PORT OF LOADING" as the bridge between two legs (Discharge for Leg A and Loading for Leg B).
|
|
36
|
+
- The Connectivity Rule: For any sequence of legs, the Port of Discharge of the previous leg must match the Port of Loading of the following leg.
|
|
37
|
+
- First T/S PORT OF LOADING is the Port of Discharge for the first transportLegs and Port of Loading for the second transportLegs.
|
|
38
|
+
- Second T/S PORT OF LOADING is the Port of Discharge for the second transportLegs and Port of Loading for the third transportLegs.
|
|
39
|
+
- Timeline Integrity: Ensure a "No Time Travel" policy: The eta of a previous leg must be earlier than or equal to the etd of the following leg.
|
|
40
|
+
|
|
41
|
+
Structure of Multiple Leg Sequence & Mapping
|
|
42
|
+
Leg 1 (Initial):
|
|
43
|
+
- `portOfLoading`: PORT OF LOADING.
|
|
44
|
+
- `portOfDischarge`: T/S PORT OF LOADING.
|
|
45
|
+
- `vesselName`: VESSEL/VOYAGE (ignore parentheses).
|
|
46
|
+
- `etd`: ETD DATE above the PORT OF DISCHARGING information.
|
|
47
|
+
- `eta`: ETA DATE below first T/S PORT OF LOADING
|
|
48
|
+
|
|
49
|
+
Leg 2 (Intermediate): Trigger: Only if T/S PORT OF LOADING exists.
|
|
50
|
+
- `portOfLoading`: First T/S PORT OF LOADING. POD of Leg 1.
|
|
51
|
+
- `portOfDischarge`: Second T/S PORT OF LOADING (if exists), otherwise PORT OF DISCHARGE before the FINAL DESTINATION.
|
|
52
|
+
- `vesselName`: EST. CONNECTING VESSEL / VOY.
|
|
53
|
+
- `etd`: ETD DATE after first T/S PORT OF LOADING
|
|
54
|
+
- `eta`: ETA DATE after second T/S PORT OF LOADING (if exists), otherwise ETA next to the FINAL DESTINATION section.
|
|
55
|
+
|
|
56
|
+
</INSTRUCTIONS>
|