data-science-document-ai 1.40.3__py3-none-any.whl → 1.41.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_science_document_ai-1.40.3.dist-info → data_science_document_ai-1.41.0.dist-info}/METADATA +1 -1
- data_science_document_ai-1.41.0.dist-info/RECORD +57 -0
- src/excel_processing.py +4 -0
- src/pdf_processing.py +14 -3
- src/postprocessing/common.py +66 -17
- src/prompts/library/bookingConfirmation/evergreen/placeholders.json +17 -17
- src/prompts/library/bookingConfirmation/evergreen/prompt.txt +1 -0
- src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +18 -18
- src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +1 -1
- src/prompts/library/bookingConfirmation/maersk/placeholders.json +17 -17
- src/prompts/library/bookingConfirmation/maersk/prompt.txt +1 -1
- src/prompts/library/bookingConfirmation/msc/placeholders.json +17 -17
- src/prompts/library/bookingConfirmation/msc/prompt.txt +1 -1
- src/prompts/library/bookingConfirmation/oocl/placeholders.json +17 -17
- src/prompts/library/bookingConfirmation/oocl/prompt.txt +3 -1
- src/prompts/library/bookingConfirmation/other/placeholders.json +17 -17
- src/prompts/library/bookingConfirmation/other/prompt.txt +1 -1
- src/prompts/library/bookingConfirmation/yangming/placeholders.json +17 -17
- src/prompts/library/bookingConfirmation/yangming/prompt.txt +1 -1
- src/prompts/library/bundeskasse/other/placeholders.json +19 -19
- src/prompts/library/bundeskasse/other/prompt.txt +1 -1
- src/prompts/library/commercialInvoice/other/prompt.txt +2 -1
- src/prompts/library/customsAssessment/other/prompt.txt +1 -1
- src/prompts/library/customsInvoice/other/placeholders.json +19 -19
- src/prompts/library/customsInvoice/other/prompt.txt +1 -1
- src/prompts/library/deliveryOrder/other/placeholders.json +15 -17
- src/prompts/library/deliveryOrder/other/prompt.txt +1 -1
- src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +2 -1
- src/prompts/library/draftMbl/maersk/prompt.txt +2 -0
- src/prompts/library/draftMbl/other/prompt.txt +1 -1
- src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +1 -1
- src/prompts/library/finalMbL/maersk/prompt.txt +2 -0
- src/prompts/library/finalMbL/other/prompt.txt +1 -1
- src/prompts/library/packingList/other/prompt.txt +1 -1
- src/prompts/library/partnerInvoice/other/placeholders.json +12 -60
- src/prompts/library/partnerInvoice/other/prompt.txt +1 -1
- src/prompts/library/shippingInstruction/other/prompt.txt +1 -0
- src/prompts/prompt_library.py +4 -0
- src/utils.py +57 -0
- data_science_document_ai-1.40.3.dist-info/RECORD +0 -59
- src/prompts/library/customsAssessment/other/placeholders.json +0 -19
- src/prompts/library/finalMbL/other/placeholders.json +0 -80
- {data_science_document_ai-1.40.3.dist-info → data_science_document_ai-1.41.0.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
src/constants.py,sha256=TF_UblovdXZnKIb1lnyJwUqQncJCbzBVihoelI6foSU,3579
|
|
2
|
+
src/constants_sandbox.py,sha256=Iu6HdjCoNSmOX0AwoL9qUQkhq_ZnIN5U9e-Q2UfNuGc,547
|
|
3
|
+
src/docai.py,sha256=AepGdF3ZuSGkujLpewX393FgOBMy-e4sEudiGKho5EA,5280
|
|
4
|
+
src/docai_processor_config.yaml,sha256=qOMmCIORpLQ_D-ytvejXxFvER0e0uGYuzPVdZBGv4Pc,2105
|
|
5
|
+
src/excel_processing.py,sha256=wArdSxwxdgyj5WVgVTaWsVSmF7z5zK6rq-bUKGENmo4,2660
|
|
6
|
+
src/io.py,sha256=IXz4wWqiHa9mnHNgtrC6X9M2lItYp9eu6rHCThUIh5c,3585
|
|
7
|
+
src/llm.py,sha256=aEK3rL8XvY7CakvkOJQmcHpEKwZRd8PPrLrzHiO-GFk,7827
|
|
8
|
+
src/log_setup.py,sha256=RhHnpXqcl-ii4EJzRt47CF2R-Q3YPF68tepg_Kg7tkw,2895
|
|
9
|
+
src/pdf_processing.py,sha256=GNHQl_ryyVOHu3FK39XzPJzOCrn01NNW3E2HO43Ot_c,15836
|
|
10
|
+
src/postprocessing/common.py,sha256=Vj_NohcgWZRCzipnPGeM-rg11wdDJ-wwCR12QeE6qOY,21451
|
|
11
|
+
src/postprocessing/postprocess_booking_confirmation.py,sha256=nK32eDiBNbauyQz0oCa9eraysku8aqzrcoRFoWVumDU,4827
|
|
12
|
+
src/postprocessing/postprocess_commercial_invoice.py,sha256=3I8ijluTZcOs_sMnFZxfkAPle0UFQ239EMuvZfDZVPg,1028
|
|
13
|
+
src/postprocessing/postprocess_partner_invoice.py,sha256=cM4te4qjOI_bXyrF8Zhb6X7eNf5aMKoRaPCFfqFv-98,11538
|
|
14
|
+
src/prompts/library/bookingConfirmation/evergreen/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
|
|
15
|
+
src/prompts/library/bookingConfirmation/evergreen/prompt.txt,sha256=5ivskCG831M2scW3oqQaoltXIyHV-n6DYUygWycXxjw,2755
|
|
16
|
+
src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json,sha256=hMPNt9s3LuxR85AxYy7bPcCDleug6gSwVjefm3ismWY,1405
|
|
17
|
+
src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt,sha256=XgfhrFTXLJ467L4Cer77K0KTPtWTg_-QJXCsltvLlpI,3430
|
|
18
|
+
src/prompts/library/bookingConfirmation/maersk/placeholders.json,sha256=6p_IQMA1PUgGZqjf_by4ja9jK27ba4loYhEpIa7Oxx4,1406
|
|
19
|
+
src/prompts/library/bookingConfirmation/maersk/prompt.txt,sha256=t-yh1dOrcRa0fm0VPFC1xCRBf0R0Zjp9j_Hb31aZS1w,3223
|
|
20
|
+
src/prompts/library/bookingConfirmation/msc/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
|
|
21
|
+
src/prompts/library/bookingConfirmation/msc/prompt.txt,sha256=_Jfioislp7SNs2BEXoklvnTPVXe6Z0M6myD1IWnBFYQ,4705
|
|
22
|
+
src/prompts/library/bookingConfirmation/oocl/placeholders.json,sha256=JTtWvLSsoxN7huXY8ZNqqPkODM-DOs5wu3YvNHOna3k,1404
|
|
23
|
+
src/prompts/library/bookingConfirmation/oocl/prompt.txt,sha256=xNTrJdUtDalcP3AKkfRiOnHjAdRCbcTvehcBQKurRj0,2201
|
|
24
|
+
src/prompts/library/bookingConfirmation/other/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
|
|
25
|
+
src/prompts/library/bookingConfirmation/other/prompt.txt,sha256=kUK7NgVNDYFMnqOcIblCwWSw2SC0YQEtHsYrspiVUMo,3379
|
|
26
|
+
src/prompts/library/bookingConfirmation/yangming/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
|
|
27
|
+
src/prompts/library/bookingConfirmation/yangming/prompt.txt,sha256=fYKfusDajDFw0v54-nv2iAqUSp2yCeOzc6G7AFe-h2w,3226
|
|
28
|
+
src/prompts/library/bundeskasse/other/placeholders.json,sha256=J57CNPWcz87PV4k3ctb_gVuOu-zGObolqst-y-mESQY,4054
|
|
29
|
+
src/prompts/library/bundeskasse/other/prompt.txt,sha256=t5nsLK-6rpOqcVnfHtU04RxN8wXTi9WpX1f4ASLoZ3E,2923
|
|
30
|
+
src/prompts/library/commercialInvoice/other/prompt.txt,sha256=6sowYMzrKvgmTDpDnAzkeG4OqA44e6-8aUKWRKNziBY,2699
|
|
31
|
+
src/prompts/library/customsAssessment/other/prompt.txt,sha256=XSqWa3k9LM7dTiJtX8AKTp_0x5Z0pCNRKNUWaywwBlY,2191
|
|
32
|
+
src/prompts/library/customsInvoice/other/placeholders.json,sha256=g82I3LbLZr2mwfpFIFoCTK1Y_MEkGaEK4ew7jovr6nw,12172
|
|
33
|
+
src/prompts/library/customsInvoice/other/prompt.txt,sha256=ZMdIysq7B1CvOG93YSHorDJpqNUDxXEnomS4cVNaJ90,9632
|
|
34
|
+
src/prompts/library/deliveryOrder/other/placeholders.json,sha256=7fjqag3kCVMV4mJ52dTjAcLtaBX0paXrDrW48vQVZSk,1250
|
|
35
|
+
src/prompts/library/deliveryOrder/other/prompt.txt,sha256=y3QjN54e8PplEJngNlxoykbdrToBefS3r8gWixCbjfE,2468
|
|
36
|
+
src/prompts/library/draftMbl/hapag-lloyd/prompt.txt,sha256=4FxiO1eHkimZVQZXU6gGNikuDVAWNniYvY8FUdVhpvk,2327
|
|
37
|
+
src/prompts/library/draftMbl/maersk/prompt.txt,sha256=4neW6buJirgoS84iDsy9ZcfQTaMeOFt92Emba01mzJA,2192
|
|
38
|
+
src/prompts/library/draftMbl/other/placeholders.json,sha256=wIN06_NWsESDyNEDfOLPi3F2Vq-XPa4O3U32A32s-_Q,1736
|
|
39
|
+
src/prompts/library/draftMbl/other/prompt.txt,sha256=pj-kgPV51upLhDppSKfhc2s5ylgr06l4IxrkFYjE9uM,2241
|
|
40
|
+
src/prompts/library/finalMbL/hapag-lloyd/prompt.txt,sha256=RhxEJ4eWikAQiE40cuPsssnzizge6AJYFTSJLGUmz_U,2326
|
|
41
|
+
src/prompts/library/finalMbL/maersk/prompt.txt,sha256=4neW6buJirgoS84iDsy9ZcfQTaMeOFt92Emba01mzJA,2192
|
|
42
|
+
src/prompts/library/finalMbL/other/prompt.txt,sha256=pj-kgPV51upLhDppSKfhc2s5ylgr06l4IxrkFYjE9uM,2241
|
|
43
|
+
src/prompts/library/packingList/other/prompt.txt,sha256=6Q9d0KBG6YWmNtzFivvmtQmitaUE2jytfwwc5YwsUgQ,2872
|
|
44
|
+
src/prompts/library/partnerInvoice/other/placeholders.json,sha256=AJNBVKwDGebyNAuyWEwEuaUTL9hbLK0Rjr2H0lNfOBY,8686
|
|
45
|
+
src/prompts/library/partnerInvoice/other/prompt.txt,sha256=HuTUlCpUgDQKUKF5QYYoUoHZ0pkBIqX0g5NzciF_fps,9393
|
|
46
|
+
src/prompts/library/postprocessing/port_code/placeholders.json,sha256=2TiXf3zSzrglOMPtDOlCntIa5RSvyZQAKG2-IgrCY5A,22
|
|
47
|
+
src/prompts/library/postprocessing/port_code/prompt_port_code.txt,sha256=--1wunSqEr2ox958lEhjO-0JFBfOLzA3qfKYIzG_Iok,884
|
|
48
|
+
src/prompts/library/preprocessing/carrier/placeholders.json,sha256=1UmrQNqBEsjLIpOO-a39Az6bQ_g1lxDGlwqZFU3IEt0,408
|
|
49
|
+
src/prompts/library/preprocessing/carrier/prompt.txt,sha256=NLvRZQCZ6aWC1yTr7Q93jK5z7Vi_b4HBaiFYYnIsO-w,134
|
|
50
|
+
src/prompts/library/shippingInstruction/other/prompt.txt,sha256=dT2e-dPuvuz0rVYpwmok_1dWQ2Oa8Qy9NGZ6CCLOUI4,1468
|
|
51
|
+
src/prompts/prompt_library.py,sha256=jPxybNPPGH7mzonqtAOqmw5WcT-RtbGP0pvMqqP22hg,2760
|
|
52
|
+
src/setup.py,sha256=kPSZosrICfaGZeDaajr40Ha7Ok4XK4fo_uq35Omiwr0,7128
|
|
53
|
+
src/tms.py,sha256=UXbIo1QE--hIX6NZi5Qyp2R_CP338syrY9pCTPrfgnE,1741
|
|
54
|
+
src/utils.py,sha256=SIEThJlaXWGoWV7236iNoAlabCPNge5oTBpDywTxJw0,15968
|
|
55
|
+
data_science_document_ai-1.41.0.dist-info/METADATA,sha256=MmKbqDbe9voabVucTrE-GoM192GMqFgD09_KNvp6Wsg,2153
|
|
56
|
+
data_science_document_ai-1.41.0.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
|
|
57
|
+
data_science_document_ai-1.41.0.dist-info/RECORD,,
|
src/excel_processing.py
CHANGED
|
@@ -2,6 +2,8 @@
|
|
|
2
2
|
# flake8: noqa: E402
|
|
3
3
|
import logging
|
|
4
4
|
|
|
5
|
+
from src.postprocessing.common import llm_prediction_to_tuples
|
|
6
|
+
|
|
5
7
|
logger = logging.getLogger(__name__)
|
|
6
8
|
|
|
7
9
|
import asyncio
|
|
@@ -73,6 +75,8 @@ async def extract_data_from_excel(
|
|
|
73
75
|
]
|
|
74
76
|
extracted_data = {k: v for k, v in await asyncio.gather(*sheet_extract_tasks)}
|
|
75
77
|
|
|
78
|
+
# Convert LLM prediction dictionary to tuples of (value, page_number).
|
|
79
|
+
extracted_data = llm_prediction_to_tuples(extracted_data)
|
|
76
80
|
stored_data = json.dumps(extracted_data)
|
|
77
81
|
|
|
78
82
|
return extracted_data, stored_data, params["gemini_params"]["model_id"]
|
src/pdf_processing.py
CHANGED
|
@@ -14,7 +14,7 @@ from google.cloud.documentai_v1 import Document as docaiv1_document
|
|
|
14
14
|
|
|
15
15
|
from src.docai import _batch_process_pdf_w_docai, _process_pdf_w_docai
|
|
16
16
|
from src.excel_processing import extract_data_from_excel
|
|
17
|
-
from src.postprocessing.common import format_all_entities, remove_none_values
|
|
17
|
+
from src.postprocessing.common import format_all_entities, remove_none_values, llm_prediction_to_tuples
|
|
18
18
|
from src.postprocessing.postprocess_booking_confirmation import (
|
|
19
19
|
postprocess_booking_confirmation,
|
|
20
20
|
)
|
|
@@ -31,6 +31,7 @@ from src.utils import (
|
|
|
31
31
|
get_processor_name,
|
|
32
32
|
run_background_tasks,
|
|
33
33
|
validate_based_on_schema,
|
|
34
|
+
transform_schema_strings
|
|
34
35
|
)
|
|
35
36
|
|
|
36
37
|
|
|
@@ -104,9 +105,16 @@ async def extract_data_from_pdf_w_docai(
|
|
|
104
105
|
# Extract entities from the result
|
|
105
106
|
for entity in result.entities:
|
|
106
107
|
value = (
|
|
107
|
-
{child.type_: child.mention_text
|
|
108
|
+
{child.type_: (child.mention_text,
|
|
109
|
+
child.page_anchor.page_refs[0].page
|
|
110
|
+
if hasattr(child.page_anchor.page_refs[0], "page")
|
|
111
|
+
else 0)
|
|
112
|
+
for child in entity.properties}
|
|
108
113
|
if entity.properties
|
|
109
|
-
else entity.mention_text
|
|
114
|
+
else (entity.mention_text,
|
|
115
|
+
entity.page_anchor.page_refs[0].page
|
|
116
|
+
if hasattr(entity.page_anchor.page_refs[0], "page")
|
|
117
|
+
else 0)
|
|
110
118
|
)
|
|
111
119
|
aggregated_data[entity.type_].append(value)
|
|
112
120
|
|
|
@@ -220,6 +228,9 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
|
|
|
220
228
|
result = await llm_client.get_unified_json_genai(
|
|
221
229
|
prompt=prompt, document=document, response_schema=response_schema
|
|
222
230
|
)
|
|
231
|
+
|
|
232
|
+
result = llm_prediction_to_tuples(result)
|
|
233
|
+
|
|
223
234
|
return result
|
|
224
235
|
return {}
|
|
225
236
|
|
src/postprocessing/common.py
CHANGED
|
@@ -380,6 +380,11 @@ async def format_label(entity_k, entity_value, document_type_code, params):
|
|
|
380
380
|
]
|
|
381
381
|
)
|
|
382
382
|
return entity_k, [v for _, v in format_tasks]
|
|
383
|
+
if isinstance(entity_value, tuple):
|
|
384
|
+
page = entity_value[1]
|
|
385
|
+
entity_value = entity_value[0]
|
|
386
|
+
else:
|
|
387
|
+
page = -1
|
|
383
388
|
entity_key = entity_k.lower()
|
|
384
389
|
formatted_value = None
|
|
385
390
|
|
|
@@ -446,10 +451,18 @@ async def format_label(entity_k, entity_value, document_type_code, params):
|
|
|
446
451
|
elif "reversechargesentence" in entity_key:
|
|
447
452
|
formatted_value = clean_item_description(entity_value, remove_numbers=False)
|
|
448
453
|
|
|
454
|
+
elif "quantity" in entity_key:
|
|
455
|
+
if document_type_code in ["partnerInvoice", "customsInvoice", "bundeskasse"]:
|
|
456
|
+
# For partner invoice, quantity can be mentioned as whole number
|
|
457
|
+
formatted_value = decimal_convertor(
|
|
458
|
+
extract_number(entity_value), quantity=True
|
|
459
|
+
)
|
|
460
|
+
else:
|
|
461
|
+
formatted_value = extract_number(entity_value)
|
|
462
|
+
|
|
449
463
|
elif any(
|
|
450
464
|
numeric_indicator in entity_key
|
|
451
465
|
for numeric_indicator in [
|
|
452
|
-
"quantity",
|
|
453
466
|
"value",
|
|
454
467
|
"amount",
|
|
455
468
|
"price",
|
|
@@ -466,6 +479,7 @@ async def format_label(entity_k, entity_value, document_type_code, params):
|
|
|
466
479
|
result = {
|
|
467
480
|
"documentValue": entity_value,
|
|
468
481
|
"formattedValue": formatted_value,
|
|
482
|
+
"page": page,
|
|
469
483
|
}
|
|
470
484
|
return entity_k, result
|
|
471
485
|
|
|
@@ -514,7 +528,7 @@ async def get_port_code_llm(port: str, llm_client):
|
|
|
514
528
|
return None
|
|
515
529
|
|
|
516
530
|
|
|
517
|
-
def decimal_convertor(value):
|
|
531
|
+
def decimal_convertor(value, quantity=False):
|
|
518
532
|
"""Convert EU values to English values."""
|
|
519
533
|
if value is None:
|
|
520
534
|
return None
|
|
@@ -522,25 +536,39 @@ def decimal_convertor(value):
|
|
|
522
536
|
# Remove spaces
|
|
523
537
|
value = value.strip().replace(" ", "")
|
|
524
538
|
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
539
|
+
if not quantity:
|
|
540
|
+
# Convert comma to dot for decimal point (e.g., 4.123,45 -> 4123.45)
|
|
541
|
+
if re.match(r"^\d{1,3}(\.\d{3})*,\d{1,2}$", value):
|
|
542
|
+
value = value.replace(".", "").replace(",", ".")
|
|
543
|
+
|
|
544
|
+
# European style integer with thousand separator: 2.500
|
|
545
|
+
elif re.match(r"^\d{1,3}(\.\d{3})+$", value):
|
|
546
|
+
value = value.replace(".", "")
|
|
547
|
+
|
|
548
|
+
# Format english values as well for consistency (e.g., 4,123.45 -> 4123.45)
|
|
549
|
+
elif re.match(r"^\d{1,3}(,\d{3})*\.\d{1,2}$", value):
|
|
550
|
+
value = value.replace(",", "")
|
|
528
551
|
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
552
|
+
# English style integer with thousand separator: 2,500
|
|
553
|
+
elif re.match(r"^\d{1,3}(,\d{3})+$", value):
|
|
554
|
+
value = value.replace(",", "")
|
|
532
555
|
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
556
|
+
# Just replace comma decimals with dot (e.g., 65,45 -> 65.45)
|
|
557
|
+
if re.match(r"^\d+,\d{1,2}$", value):
|
|
558
|
+
value = value.replace(",", ".")
|
|
536
559
|
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
560
|
+
# If there are three or more digits after the decimal point, keep only the first two (truncated, not rounded; e.g., 8.500000 -> 8.50)
|
|
561
|
+
elif re.match(r"^\d+\.\d{3,}$", value):
|
|
562
|
+
value = value[: value.index(".") + 3]
|
|
540
563
|
|
|
541
|
-
#
|
|
542
|
-
|
|
543
|
-
|
|
564
|
+
else: # quantity=True → only last two
|
|
565
|
+
# Just replace comma decimals with dot (e.g., 65,45 -> 65.45)
|
|
566
|
+
if re.match(r"^\d+,\d{1,2}$", value):
|
|
567
|
+
value = value.replace(",", ".")
|
|
568
|
+
|
|
569
|
+
# If there are three or more digits after the decimal point, keep only the first two (truncated, not rounded; e.g., 8.500000 -> 8.50)
|
|
570
|
+
elif re.match(r"^\d+\.\d{3,}$", value):
|
|
571
|
+
value = value[: value.index(".") + 3]
|
|
544
572
|
|
|
545
573
|
return value
|
|
546
574
|
|
|
@@ -594,3 +622,24 @@ def remove_stop_words(lineitem: str):
|
|
|
594
622
|
.upper()
|
|
595
623
|
.strip()
|
|
596
624
|
)
|
|
625
|
+
|
|
626
|
+
|
|
627
|
+
def llm_prediction_to_tuples(llm_prediction):
    """Convert LLM prediction leaves into ``(value, page_number)`` tuples.

    A dict that contains both a ``"value"`` and a ``"page_number"`` key is
    treated as a leaf and replaced by a tuple. Any other dict or list is
    walked recursively and updated **in place**; the same object is returned.
    Anything else (strings, numbers, ``None``, ...) is returned unchanged.

    Args:
        llm_prediction: Arbitrarily nested structure of dicts and lists.

    Returns:
        * ``(value, page_number)`` for a leaf dict with a truthy ``"value"``;
          ``page_number`` is ``-1`` when it cannot be converted to ``int``.
        * ``None`` for a leaf dict whose ``"value"`` is falsy (``None``,
          ``""``, ``0``, empty containers).
        * The (mutated) input object in every other case.
    """
    if isinstance(llm_prediction, dict):
        if "page_number" in llm_prediction and "value" in llm_prediction:
            value = llm_prediction["value"]
            if not value:
                # Falsy values are treated as "nothing extracted".
                return None
            try:
                page_number = int(llm_prediction["page_number"])
            except (TypeError, ValueError, OverflowError):
                # Only conversion failures fall back to the "unknown page"
                # marker; KeyboardInterrupt/SystemExit are no longer swallowed.
                page_number = -1
            return (value, page_number)

        # Re-assigning existing keys does not change the dict's size, so
        # updating entries while iterating over items() is safe.
        for key, value in llm_prediction.items():
            llm_prediction[key] = llm_prediction_to_tuples(value)
    elif isinstance(llm_prediction, list):
        for i, item in enumerate(llm_prediction):
            llm_prediction[i] = llm_prediction_to_tuples(item)
    return llm_prediction
|
|
@@ -1,28 +1,28 @@
|
|
|
1
1
|
{
|
|
2
2
|
"type": "OBJECT",
|
|
3
3
|
"properties": {
|
|
4
|
-
"cfsCutOff": {"type": "
|
|
5
|
-
"bookingNumber": {"type": "
|
|
6
|
-
"cyCutOff": {"type": "
|
|
7
|
-
"gateInReference": {"type": "
|
|
8
|
-
"gateInTerminal": {"type": "
|
|
9
|
-
"mblNumber": {"type": "
|
|
10
|
-
"pickUpReference": {"type": "
|
|
11
|
-
"pickUpTerminal": {"type": "
|
|
12
|
-
"siCutOff": {"type": "
|
|
13
|
-
"vgmCutOff": {"type": "
|
|
4
|
+
"cfsCutOff": {"type": "STRING", "nullable": true, "description": "the date by which an LCL (Less than Container Load) shipment needs to be checked in to a CFS (Container Freight Station) to meet its scheduled sailing"},
|
|
5
|
+
"bookingNumber": {"type": "STRING", "nullable": true},
|
|
6
|
+
"cyCutOff": {"type": "STRING", "nullable": true},
|
|
7
|
+
"gateInReference": {"type": "STRING", "nullable": true},
|
|
8
|
+
"gateInTerminal": {"type": "STRING", "nullable": true},
|
|
9
|
+
"mblNumber": {"type": "STRING", "nullable": true},
|
|
10
|
+
"pickUpReference": {"type": "STRING", "nullable": true},
|
|
11
|
+
"pickUpTerminal": {"type": "STRING", "nullable": true},
|
|
12
|
+
"siCutOff": {"type": "STRING", "nullable": true},
|
|
13
|
+
"vgmCutOff": {"type": "STRING", "nullable": true},
|
|
14
14
|
"transportLegs": {
|
|
15
15
|
"type": "ARRAY",
|
|
16
16
|
"items": {
|
|
17
17
|
"type": "OBJECT",
|
|
18
18
|
"properties": {
|
|
19
|
-
"eta": {"type": "
|
|
20
|
-
"etd": {"type": "
|
|
21
|
-
"imoNumber": {"type": "
|
|
22
|
-
"portOfDischarge": {"type": "
|
|
23
|
-
"portOfLoading": {"type": "
|
|
24
|
-
"vesselName": {"type": "
|
|
25
|
-
"voyage": {"type": "
|
|
19
|
+
"eta": {"type": "STRING", "nullable": true},
|
|
20
|
+
"etd": {"type": "STRING", "nullable": true},
|
|
21
|
+
"imoNumber": {"type": "STRING", "nullable": true},
|
|
22
|
+
"portOfDischarge": {"type": "STRING", "nullable": true},
|
|
23
|
+
"portOfLoading": {"type": "STRING", "nullable": true},
|
|
24
|
+
"vesselName": {"type": "STRING", "nullable": true},
|
|
25
|
+
"voyage": {"type": "STRING", "nullable": true}
|
|
26
26
|
},
|
|
27
27
|
"required": []
|
|
28
28
|
}
|
|
@@ -1,32 +1,32 @@
|
|
|
1
1
|
{
|
|
2
2
|
"type": "OBJECT",
|
|
3
3
|
"properties": {
|
|
4
|
-
"cfsCutOff": {"type": "
|
|
5
|
-
"bookingNumber": {"type": "
|
|
6
|
-
"cyCutOff": {"type": "
|
|
7
|
-
"gateInReference": {"type": "
|
|
8
|
-
"gateInTerminal": {"type": "
|
|
9
|
-
"mblNumber": {"type": "
|
|
10
|
-
"pickUpReference": {"type": "
|
|
11
|
-
"pickUpTerminal": {"type": "
|
|
12
|
-
"siCutOff": {"type": "
|
|
13
|
-
"vgmCutOff": {"type": "
|
|
4
|
+
"cfsCutOff": {"type": "STRING", "nullable": true, "description": "the date by which an LCL (Less than Container Load) shipment needs to be checked in to a CFS (Container Freight Station) to meet its scheduled sailing"},
|
|
5
|
+
"bookingNumber": {"type": "STRING", "nullable": true},
|
|
6
|
+
"cyCutOff": {"type": "STRING", "nullable": true},
|
|
7
|
+
"gateInReference": {"type": "STRING", "nullable": true},
|
|
8
|
+
"gateInTerminal": {"type": "STRING", "nullable": true},
|
|
9
|
+
"mblNumber": {"type": "STRING", "nullable": true},
|
|
10
|
+
"pickUpReference": {"type": "STRING", "nullable": true},
|
|
11
|
+
"pickUpTerminal": {"type": "STRING", "nullable": true},
|
|
12
|
+
"siCutOff": {"type": "STRING", "nullable": true},
|
|
13
|
+
"vgmCutOff": {"type": "STRING", "nullable": true},
|
|
14
14
|
"transportLegs": {
|
|
15
15
|
"type": "ARRAY",
|
|
16
16
|
"items": {
|
|
17
17
|
"type": "OBJECT",
|
|
18
18
|
"properties": {
|
|
19
|
-
"eta": {"type": "
|
|
20
|
-
"etd": {"type": "
|
|
21
|
-
"imoNumber": {"type": "
|
|
22
|
-
"portOfDischarge": {"type": "
|
|
23
|
-
"portOfLoading": {"type": "
|
|
24
|
-
"vesselName": {"type": "
|
|
25
|
-
"voyage": {"type": "
|
|
19
|
+
"eta": {"type": "STRING", "nullable": true},
|
|
20
|
+
"etd": {"type": "STRING", "nullable": true},
|
|
21
|
+
"imoNumber": {"type": "STRING", "nullable": true},
|
|
22
|
+
"portOfDischarge": {"type": "STRING", "nullable": true},
|
|
23
|
+
"portOfLoading": {"type": "STRING", "nullable": true},
|
|
24
|
+
"vesselName": {"type": "STRING", "nullable": true},
|
|
25
|
+
"voyage": {"type": "STRING", "nullable": true}
|
|
26
26
|
},
|
|
27
27
|
"required": []
|
|
28
28
|
}
|
|
29
29
|
}
|
|
30
30
|
},
|
|
31
31
|
"required": []
|
|
32
|
-
}
|
|
32
|
+
}
|
|
@@ -18,7 +18,7 @@ transportLegs:
|
|
|
18
18
|
vesselName: The name of the vessel for a specific leg.
|
|
19
19
|
voyage: The journey or route taken by the vessel for a specific leg.
|
|
20
20
|
|
|
21
|
-
your task is to extract the text value of the following entities:
|
|
21
|
+
your task is to extract the text value of the following entities and page numbers starting from 0 where the value was found in the document:
|
|
22
22
|
SCHEMA_PLACEHOLDER
|
|
23
23
|
|
|
24
24
|
Keywords for datapoints:
|
|
@@ -1,28 +1,28 @@
|
|
|
1
1
|
{
|
|
2
2
|
"type": "OBJECT",
|
|
3
3
|
"properties": {
|
|
4
|
-
"bookingNumber": {"type": "
|
|
5
|
-
"cfsCutOff": {"type": "
|
|
6
|
-
"cyCutOff": {"type": "
|
|
7
|
-
"gateInReference": {"type": "
|
|
8
|
-
"gateInTerminal": {"type": "
|
|
9
|
-
"mblNumber": {"type": "
|
|
10
|
-
"pickUpReference": {"type": "
|
|
11
|
-
"pickUpTerminal": {"type": "
|
|
12
|
-
"siCutOff": {"type": "
|
|
13
|
-
"vgmCutOff": {"type": "
|
|
4
|
+
"bookingNumber": {"type": "STRING", "nullable": true},
|
|
5
|
+
"cfsCutOff": {"type": "STRING", "nullable": true, "description": "the date by which an LCL (Less than Container Load) shipment needs to be checked in to a CFS (Container Freight Station) to meet its scheduled sailing"},
|
|
6
|
+
"cyCutOff": {"type": "STRING", "nullable": true},
|
|
7
|
+
"gateInReference": {"type": "STRING", "nullable": true},
|
|
8
|
+
"gateInTerminal": {"type": "STRING", "nullable": true},
|
|
9
|
+
"mblNumber": {"type": "STRING", "nullable": true},
|
|
10
|
+
"pickUpReference": {"type": "STRING", "nullable": true},
|
|
11
|
+
"pickUpTerminal": {"type": "STRING", "nullable": true},
|
|
12
|
+
"siCutOff": {"type": "STRING", "nullable": true},
|
|
13
|
+
"vgmCutOff": {"type": "STRING", "nullable": true},
|
|
14
14
|
"transportLegs": {
|
|
15
15
|
"type": "ARRAY",
|
|
16
16
|
"items": {
|
|
17
17
|
"type": "OBJECT",
|
|
18
18
|
"properties": {
|
|
19
|
-
"eta": {"type": "
|
|
20
|
-
"etd": {"type": "
|
|
21
|
-
"imoNumber": {"type": "
|
|
22
|
-
"portOfDischarge": {"type": "
|
|
23
|
-
"portOfLoading": {"type": "
|
|
24
|
-
"vesselName": {"type": "
|
|
25
|
-
"voyage": {"type": "
|
|
19
|
+
"eta": {"type": "STRING", "nullable": true},
|
|
20
|
+
"etd": {"type": "STRING", "nullable": true},
|
|
21
|
+
"imoNumber": {"type": "STRING", "nullable": true},
|
|
22
|
+
"portOfDischarge": {"type": "STRING", "nullable": true},
|
|
23
|
+
"portOfLoading": {"type": "STRING", "nullable": true},
|
|
24
|
+
"vesselName": {"type": "STRING", "nullable": true},
|
|
25
|
+
"voyage": {"type": "STRING", "nullable": true}
|
|
26
26
|
},
|
|
27
27
|
"required": []
|
|
28
28
|
}
|
|
@@ -18,7 +18,7 @@ transportLegs:
|
|
|
18
18
|
vesselName: The name of the vessel for a specific leg.
|
|
19
19
|
voyage: The journey or route taken by the vessel for a specific leg.
|
|
20
20
|
|
|
21
|
-
your task is to extract the text value of the following entities:
|
|
21
|
+
your task is to extract the text value of the following entities and page numbers starting from 0 where the value was found in the document:
|
|
22
22
|
SCHEMA_PLACEHOLDER
|
|
23
23
|
|
|
24
24
|
Keywords for datapoints:
|
|
@@ -1,28 +1,28 @@
|
|
|
1
1
|
{
|
|
2
2
|
"type": "OBJECT",
|
|
3
3
|
"properties": {
|
|
4
|
-
"cfsCutOff": {"type": "
|
|
5
|
-
"bookingNumber": {"type": "
|
|
6
|
-
"cyCutOff": {"type": "
|
|
7
|
-
"gateInReference": {"type": "
|
|
8
|
-
"gateInTerminal": {"type": "
|
|
9
|
-
"mblNumber": {"type": "
|
|
10
|
-
"pickUpReference": {"type": "
|
|
11
|
-
"pickUpTerminal": {"type": "
|
|
12
|
-
"siCutOff": {"type": "
|
|
13
|
-
"vgmCutOff": {"type": "
|
|
4
|
+
"cfsCutOff": {"type": "STRING", "nullable": true, "description": "the date by which an LCL (Less than Container Load) shipment needs to be checked in to a CFS (Container Freight Station) to meet its scheduled sailing"},
|
|
5
|
+
"bookingNumber": {"type": "STRING", "nullable": true},
|
|
6
|
+
"cyCutOff": {"type": "STRING", "nullable": true},
|
|
7
|
+
"gateInReference": {"type": "STRING", "nullable": true},
|
|
8
|
+
"gateInTerminal": {"type": "STRING", "nullable": true},
|
|
9
|
+
"mblNumber": {"type": "STRING", "nullable": true},
|
|
10
|
+
"pickUpReference": {"type": "STRING", "nullable": true},
|
|
11
|
+
"pickUpTerminal": {"type": "STRING", "nullable": true},
|
|
12
|
+
"siCutOff": {"type": "STRING", "nullable": true},
|
|
13
|
+
"vgmCutOff": {"type": "STRING", "nullable": true},
|
|
14
14
|
"transportLegs": {
|
|
15
15
|
"type": "ARRAY",
|
|
16
16
|
"items": {
|
|
17
17
|
"type": "OBJECT",
|
|
18
18
|
"properties": {
|
|
19
|
-
"eta": {"type": "
|
|
20
|
-
"etd": {"type": "
|
|
21
|
-
"imoNumber": {"type": "
|
|
22
|
-
"portOfDischarge": {"type": "
|
|
23
|
-
"portOfLoading": {"type": "
|
|
24
|
-
"vesselName": {"type": "
|
|
25
|
-
"voyage": {"type": "
|
|
19
|
+
"eta": {"type": "STRING", "nullable": true},
|
|
20
|
+
"etd": {"type": "STRING", "nullable": true},
|
|
21
|
+
"imoNumber": {"type": "STRING", "nullable": true},
|
|
22
|
+
"portOfDischarge": {"type": "STRING", "nullable": true},
|
|
23
|
+
"portOfLoading": {"type": "STRING", "nullable": true},
|
|
24
|
+
"vesselName": {"type": "STRING", "nullable": true},
|
|
25
|
+
"voyage": {"type": "STRING", "nullable": true}
|
|
26
26
|
},
|
|
27
27
|
"required": []
|
|
28
28
|
}
|
|
@@ -18,7 +18,7 @@ transportLegs:
|
|
|
18
18
|
vesselName: The name of the vessel for a specific leg.
|
|
19
19
|
voyage: The journey or route taken by the vessel for a specific leg.
|
|
20
20
|
|
|
21
|
-
your task is to extract the text value of the following entities:
|
|
21
|
+
your task is to extract the text value of the following entities and page numbers starting from 0 where the value was found in the document:
|
|
22
22
|
SCHEMA_PLACEHOLDER
|
|
23
23
|
|
|
24
24
|
Further explanation and Keywords for the transportLegs part as follows. The below 2 conditions is crucial. Take attention here:
|
|
@@ -1,28 +1,28 @@
|
|
|
1
1
|
{
|
|
2
2
|
"type": "OBJECT",
|
|
3
3
|
"properties": {
|
|
4
|
-
"cfsCutOff": {"type": "
|
|
5
|
-
"bookingNumber": {"type": "
|
|
6
|
-
"cyCutOff": {"type": "
|
|
7
|
-
"gateInReference": {"type": "
|
|
8
|
-
"gateInTerminal": {"type": "
|
|
9
|
-
"mblNumber": {"type": "
|
|
10
|
-
"pickUpReference": {"type": "
|
|
11
|
-
"pickUpTerminal": {"type": "
|
|
12
|
-
"siCutOff": {"type": "
|
|
13
|
-
"vgmCutOff": {"type": "
|
|
4
|
+
"cfsCutOff": {"type": "STRING", "nullable": true, "description": "the date by which an LCL (Less than Container Load) shipment needs to be checked in to a CFS (Container Freight Station) to meet its scheduled sailing"},
|
|
5
|
+
"bookingNumber": {"type": "STRING", "nullable": true},
|
|
6
|
+
"cyCutOff": {"type": "STRING", "nullable": true},
|
|
7
|
+
"gateInReference": {"type": "STRING", "nullable": true},
|
|
8
|
+
"gateInTerminal": {"type": "STRING", "nullable": true},
|
|
9
|
+
"mblNumber": {"type": "STRING", "nullable": true},
|
|
10
|
+
"pickUpReference": {"type": "STRING", "nullable": true},
|
|
11
|
+
"pickUpTerminal": {"type": "STRING", "nullable": true},
|
|
12
|
+
"siCutOff": {"type": "STRING", "nullable": true},
|
|
13
|
+
"vgmCutOff": {"type": "STRING", "nullable": true},
|
|
14
14
|
"transportLegs": {
|
|
15
15
|
"type": "ARRAY",
|
|
16
16
|
"items": {
|
|
17
17
|
"type": "OBJECT",
|
|
18
18
|
"properties": {
|
|
19
|
-
"eta": {"type": "
|
|
20
|
-
"etd": {"type": "
|
|
21
|
-
"portOfDischarge": {"type": "
|
|
22
|
-
"portOfLoading": {"type": "
|
|
23
|
-
"vesselName": {"type": "
|
|
24
|
-
"voyage": {"type": "
|
|
25
|
-
"imoNumber": {"type": "
|
|
19
|
+
"eta": {"type": "STRING", "nullable": true},
|
|
20
|
+
"etd": {"type": "STRING", "nullable": true},
|
|
21
|
+
"portOfDischarge": {"type": "STRING", "nullable": true},
|
|
22
|
+
"portOfLoading": {"type": "STRING", "nullable": true},
|
|
23
|
+
"vesselName": {"type": "STRING", "nullable": true},
|
|
24
|
+
"voyage": {"type": "STRING", "nullable": true},
|
|
25
|
+
"imoNumber": {"type": "STRING", "nullable": true}
|
|
26
26
|
},
|
|
27
27
|
"required": []
|
|
28
28
|
}
|
|
@@ -1,4 +1,6 @@
|
|
|
1
|
-
|
|
1
|
+
your task is to extract the text value of the following entities and page numbers starting from 0 where the value was found in the document:
|
|
2
|
+
|
|
3
|
+
bookingNumber: Extract the booking number. This information can be found near the labels "BOOKING ACKNOWLEDGEMENT" or "BOOKING NUMBER".
|
|
2
4
|
gateInReference: This field should have the same value as the bookingNumber.
|
|
3
5
|
cyCutOff: Look for the "INTENDED FCL CY CUT-OFF" label and extract the date and time value.
|
|
4
6
|
vgmCutOff: Look for the "INTENDED VGM CUT-OFF" label and extract the date and time value.
|
|
@@ -1,28 +1,28 @@
|
|
|
1
1
|
{
|
|
2
2
|
"type": "OBJECT",
|
|
3
3
|
"properties": {
|
|
4
|
-
"cfsCutOff": {"type": "
|
|
5
|
-
"bookingNumber": {"type": "
|
|
6
|
-
"cyCutOff": {"type": "
|
|
7
|
-
"gateInReference": {"type": "
|
|
8
|
-
"gateInTerminal": {"type": "
|
|
9
|
-
"mblNumber": {"type": "
|
|
10
|
-
"pickUpReference": {"type": "
|
|
11
|
-
"pickUpTerminal": {"type": "
|
|
12
|
-
"siCutOff": {"type": "
|
|
13
|
-
"vgmCutOff": {"type": "
|
|
4
|
+
"cfsCutOff": {"type": "STRING", "nullable": true, "description": "the date by which an LCL (Less than Container Load) shipment needs to be checked in to a CFS (Container Freight Station) to meet its scheduled sailing"},
|
|
5
|
+
"bookingNumber": {"type": "STRING", "nullable": true},
|
|
6
|
+
"cyCutOff": {"type": "STRING", "nullable": true},
|
|
7
|
+
"gateInReference": {"type": "STRING", "nullable": true},
|
|
8
|
+
"gateInTerminal": {"type": "STRING", "nullable": true},
|
|
9
|
+
"mblNumber": {"type": "STRING", "nullable": true},
|
|
10
|
+
"pickUpReference": {"type": "STRING", "nullable": true},
|
|
11
|
+
"pickUpTerminal": {"type": "STRING", "nullable": true},
|
|
12
|
+
"siCutOff": {"type": "STRING", "nullable": true},
|
|
13
|
+
"vgmCutOff": {"type": "STRING", "nullable": true},
|
|
14
14
|
"transportLegs": {
|
|
15
15
|
"type": "ARRAY",
|
|
16
16
|
"items": {
|
|
17
17
|
"type": "OBJECT",
|
|
18
18
|
"properties": {
|
|
19
|
-
"eta": {"type": "
|
|
20
|
-
"etd": {"type": "
|
|
21
|
-
"imoNumber": {"type": "
|
|
22
|
-
"portOfDischarge": {"type": "
|
|
23
|
-
"portOfLoading": {"type": "
|
|
24
|
-
"vesselName": {"type": "
|
|
25
|
-
"voyage": {"type": "
|
|
19
|
+
"eta": {"type": "STRING", "nullable": true},
|
|
20
|
+
"etd": {"type": "STRING", "nullable": true},
|
|
21
|
+
"imoNumber": {"type": "STRING", "nullable": true},
|
|
22
|
+
"portOfDischarge": {"type": "STRING", "nullable": true},
|
|
23
|
+
"portOfLoading": {"type": "STRING", "nullable": true},
|
|
24
|
+
"vesselName": {"type": "STRING", "nullable": true},
|
|
25
|
+
"voyage": {"type": "STRING", "nullable": true}
|
|
26
26
|
},
|
|
27
27
|
"required": []
|
|
28
28
|
}
|
|
@@ -18,7 +18,7 @@ transportLegs:
|
|
|
18
18
|
vesselName: The name of the vessel for a specific leg.
|
|
19
19
|
voyage: The journey or route taken by the vessel for a specific leg.
|
|
20
20
|
|
|
21
|
-
your task is to extract the text value of the following entities:
|
|
21
|
+
your task is to extract the text value of the following entities and page numbers starting from 0 where the value was found in the document:
|
|
22
22
|
SCHEMA_PLACEHOLDER
|
|
23
23
|
|
|
24
24
|
Further explanation for the transportLegs part as follows:
|