data-science-document-ai 1.44.0__py3-none-any.whl → 1.45.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_science_document_ai-1.44.0.dist-info → data_science_document_ai-1.45.1.dist-info}/METADATA +1 -1
- {data_science_document_ai-1.44.0.dist-info → data_science_document_ai-1.45.1.dist-info}/RECORD +9 -9
- src/pdf_processing.py +1 -0
- src/postprocessing/common.py +116 -14
- src/postprocessing/postprocess_partner_invoice.py +21 -22
- src/prompts/library/customsInvoice/other/prompt.txt +1 -1
- src/prompts/library/partnerInvoice/other/prompt.txt +2 -1
- src/utils.py +20 -10
- {data_science_document_ai-1.44.0.dist-info → data_science_document_ai-1.45.1.dist-info}/WHEEL +0 -0
{data_science_document_ai-1.44.0.dist-info → data_science_document_ai-1.45.1.dist-info}/RECORD
RENAMED
|
@@ -6,11 +6,11 @@ src/excel_processing.py,sha256=_vP2q1xEIeyjO8TvZlSTeEM-M1PMceyDSuYGfyZeceY,3361
|
|
|
6
6
|
src/io.py,sha256=rYjXVLlriEacw1uNuPIYhg12bXNu48Qs9GYMY2YcVTE,5563
|
|
7
7
|
src/llm.py,sha256=OE4IEIqcM-hYK9U7e0x1rAfcqdpeo4iXPHBp64L5Qz0,8199
|
|
8
8
|
src/log_setup.py,sha256=RhHnpXqcl-ii4EJzRt47CF2R-Q3YPF68tepg_Kg7tkw,2895
|
|
9
|
-
src/pdf_processing.py,sha256=
|
|
10
|
-
src/postprocessing/common.py,sha256=
|
|
9
|
+
src/pdf_processing.py,sha256=lzvoza9itpEyl-rcBQbIcWuFxUAvF_Qyc-OpuPQWWMk,20354
|
|
10
|
+
src/postprocessing/common.py,sha256=dagAg0hZGuZc03bXdfOolxekewMEVUfz917IGCiAtWI,26118
|
|
11
11
|
src/postprocessing/postprocess_booking_confirmation.py,sha256=nK32eDiBNbauyQz0oCa9eraysku8aqzrcoRFoWVumDU,4827
|
|
12
12
|
src/postprocessing/postprocess_commercial_invoice.py,sha256=3I8ijluTZcOs_sMnFZxfkAPle0UFQ239EMuvZfDZVPg,1028
|
|
13
|
-
src/postprocessing/postprocess_partner_invoice.py,sha256=
|
|
13
|
+
src/postprocessing/postprocess_partner_invoice.py,sha256=LZcMZfJeLdcbYqPemO8gn9SmJxv-NPmb4uVCT3lKg18,12341
|
|
14
14
|
src/prompts/library/bookingConfirmation/evergreen/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
|
|
15
15
|
src/prompts/library/bookingConfirmation/evergreen/prompt.txt,sha256=5ivskCG831M2scW3oqQaoltXIyHV-n6DYUygWycXxjw,2755
|
|
16
16
|
src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json,sha256=hMPNt9s3LuxR85AxYy7bPcCDleug6gSwVjefm3ismWY,1405
|
|
@@ -31,7 +31,7 @@ src/prompts/library/commercialInvoice/other/placeholders.json,sha256=zUK2mg9MnHi
|
|
|
31
31
|
src/prompts/library/commercialInvoice/other/prompt.txt,sha256=CJapcVrmcvynJUanETDklkzU-0N9hHdhq5wL4MK7OIY,2683
|
|
32
32
|
src/prompts/library/customsAssessment/other/prompt.txt,sha256=XSqWa3k9LM7dTiJtX8AKTp_0x5Z0pCNRKNUWaywwBlY,2191
|
|
33
33
|
src/prompts/library/customsInvoice/other/placeholders.json,sha256=BnWYtl4sPooTHb_EHRIlrPawBrfHI8_QVas8zytbqyY,12172
|
|
34
|
-
src/prompts/library/customsInvoice/other/prompt.txt,sha256=
|
|
34
|
+
src/prompts/library/customsInvoice/other/prompt.txt,sha256=1dR73TQZJAfO9dKl-h7VhiJkdli498IV4e5JgBlOoYw,9695
|
|
35
35
|
src/prompts/library/deliveryOrder/other/placeholders.json,sha256=7fjqag3kCVMV4mJ52dTjAcLtaBX0paXrDrW48vQVZSk,1250
|
|
36
36
|
src/prompts/library/deliveryOrder/other/prompt.txt,sha256=y3QjN54e8PplEJngNlxoykbdrToBefS3r8gWixCbjfE,2468
|
|
37
37
|
src/prompts/library/draftMbl/hapag-lloyd/prompt.txt,sha256=4FxiO1eHkimZVQZXU6gGNikuDVAWNniYvY8FUdVhpvk,2327
|
|
@@ -44,7 +44,7 @@ src/prompts/library/finalMbL/other/prompt.txt,sha256=pj-kgPV51upLhDppSKfhc2s5ylg
|
|
|
44
44
|
src/prompts/library/packingList/other/placeholders.json,sha256=cGUUvEFoi4Lm0BAiyD29KbNFbUgzO1s7eit_qK3F0ig,4478
|
|
45
45
|
src/prompts/library/packingList/other/prompt.txt,sha256=6Q9d0KBG6YWmNtzFivvmtQmitaUE2jytfwwc5YwsUgQ,2872
|
|
46
46
|
src/prompts/library/partnerInvoice/other/placeholders.json,sha256=NX6ADT4gxLpP90uoNCYDbmfBvROxxVWRKK0lRFy1n9s,10897
|
|
47
|
-
src/prompts/library/partnerInvoice/other/prompt.txt,sha256=
|
|
47
|
+
src/prompts/library/partnerInvoice/other/prompt.txt,sha256=vMk-FBq9XkWiFiCf36t43DcIKNYh7IcGAsnfXq8vqio,8052
|
|
48
48
|
src/prompts/library/postprocessing/port_code/placeholders.json,sha256=2TiXf3zSzrglOMPtDOlCntIa5RSvyZQAKG2-IgrCY5A,22
|
|
49
49
|
src/prompts/library/postprocessing/port_code/prompt_port_code.txt,sha256=--1wunSqEr2ox958lEhjO-0JFBfOLzA3qfKYIzG_Iok,884
|
|
50
50
|
src/prompts/library/preprocessing/carrier/placeholders.json,sha256=1UmrQNqBEsjLIpOO-a39Az6bQ_g1lxDGlwqZFU3IEt0,408
|
|
@@ -53,7 +53,7 @@ src/prompts/library/shippingInstruction/other/prompt.txt,sha256=dT2e-dPuvuz0rVYp
|
|
|
53
53
|
src/prompts/prompt_library.py,sha256=VJWHeXN-s501C2GiidIIvQQuZdU6T1R27hE2dKBiI40,2555
|
|
54
54
|
src/setup.py,sha256=M-p5c8M9ejKcSZ9N86VtmtPc4TYLxe1_4_dxf6jpfVc,7262
|
|
55
55
|
src/tms.py,sha256=UXbIo1QE--hIX6NZi5Qyp2R_CP338syrY9pCTPrfgnE,1741
|
|
56
|
-
src/utils.py,sha256=
|
|
57
|
-
data_science_document_ai-1.
|
|
58
|
-
data_science_document_ai-1.
|
|
59
|
-
data_science_document_ai-1.
|
|
56
|
+
src/utils.py,sha256=Ow5_Jals88o8mbZ1BoHfZpHZoCfig_UQb5aalH-mpWE,17278
|
|
57
|
+
data_science_document_ai-1.45.1.dist-info/METADATA,sha256=U2ASt9xmLqXeWIDx7cr0LBJFV9yJC4yh398R25jkWvs,2152
|
|
58
|
+
data_science_document_ai-1.45.1.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
|
|
59
|
+
data_science_document_ai-1.45.1.dist-info/RECORD,,
|
src/pdf_processing.py
CHANGED
|
@@ -200,6 +200,7 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
|
|
|
200
200
|
file_content = extract_top_pages(file_content, num_pages=5)
|
|
201
201
|
|
|
202
202
|
number_of_pages = get_pdf_page_count(file_content)
|
|
203
|
+
logger.info(f"processing {input_doc_type} with {number_of_pages} pages...")
|
|
203
204
|
|
|
204
205
|
# get the schema placeholder from the Doc AI and generate the response structure
|
|
205
206
|
response_schema = (
|
src/postprocessing/common.py
CHANGED
|
@@ -12,7 +12,7 @@ from src.constants import formatting_rules
|
|
|
12
12
|
from src.io import logger
|
|
13
13
|
from src.postprocessing.postprocess_partner_invoice import process_partner_invoice
|
|
14
14
|
from src.prompts.prompt_library import prompt_library
|
|
15
|
-
from src.utils import get_tms_mappings
|
|
15
|
+
from src.utils import batch_fetch_all_mappings, get_tms_mappings
|
|
16
16
|
|
|
17
17
|
tms_domain = os.environ["TMS_DOMAIN"]
|
|
18
18
|
|
|
@@ -134,8 +134,11 @@ def extract_number(data_field_value):
|
|
|
134
134
|
formatted_value: string
|
|
135
135
|
|
|
136
136
|
"""
|
|
137
|
+
# Remove container size pattern like 20FT, 40HC, etc from 1 x 40HC
|
|
138
|
+
value = remove_unwanted_patterns(data_field_value)
|
|
139
|
+
|
|
137
140
|
formatted_value = ""
|
|
138
|
-
for c in
|
|
141
|
+
for c in value:
|
|
139
142
|
if c.isnumeric() or c in [",", ".", "-"]:
|
|
140
143
|
formatted_value += c
|
|
141
144
|
|
|
@@ -320,9 +323,12 @@ def remove_unwanted_patterns(lineitem: str):
|
|
|
320
323
|
lineitem = lineitem.replace("HIGH CUBE", "")
|
|
321
324
|
|
|
322
325
|
# Remove container size e.g., 20FT, 40HC, etc.
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
+
pattern = [
|
|
327
|
+
f"{s}{t}"
|
|
328
|
+
for s in ("20|22|40|45".split("|"))
|
|
329
|
+
for t in ("FT|HC|DC|HD|GP|OT|RF|FR|TK|DV".split("|"))
|
|
330
|
+
]
|
|
331
|
+
lineitem = re.sub(r"|".join(pattern), "", lineitem, flags=re.IGNORECASE).strip()
|
|
326
332
|
|
|
327
333
|
return lineitem
|
|
328
334
|
|
|
@@ -372,18 +378,45 @@ def clean_item_description(lineitem: str, remove_numbers: bool = True):
|
|
|
372
378
|
return re.sub(r"\s{2,}", " ", lineitem).strip()
|
|
373
379
|
|
|
374
380
|
|
|
375
|
-
async def format_label(
|
|
381
|
+
async def format_label(
|
|
382
|
+
entity_k,
|
|
383
|
+
entity_value,
|
|
384
|
+
document_type_code,
|
|
385
|
+
params,
|
|
386
|
+
mime_type,
|
|
387
|
+
container_map,
|
|
388
|
+
terminal_map,
|
|
389
|
+
depot_map,
|
|
390
|
+
):
|
|
376
391
|
llm_client = params["LlmClient"]
|
|
377
392
|
if isinstance(entity_value, dict): # if it's a nested entity
|
|
378
393
|
format_tasks = [
|
|
379
|
-
format_label(
|
|
394
|
+
format_label(
|
|
395
|
+
sub_k,
|
|
396
|
+
sub_v,
|
|
397
|
+
document_type_code,
|
|
398
|
+
params,
|
|
399
|
+
mime_type,
|
|
400
|
+
container_map,
|
|
401
|
+
terminal_map,
|
|
402
|
+
depot_map,
|
|
403
|
+
)
|
|
380
404
|
for sub_k, sub_v in entity_value.items()
|
|
381
405
|
]
|
|
382
406
|
return entity_k, {k: v for k, v in await asyncio.gather(*format_tasks)}
|
|
383
407
|
if isinstance(entity_value, list):
|
|
384
408
|
format_tasks = await asyncio.gather(
|
|
385
409
|
*[
|
|
386
|
-
format_label(
|
|
410
|
+
format_label(
|
|
411
|
+
entity_k,
|
|
412
|
+
sub_v,
|
|
413
|
+
document_type_code,
|
|
414
|
+
params,
|
|
415
|
+
mime_type,
|
|
416
|
+
container_map,
|
|
417
|
+
terminal_map,
|
|
418
|
+
depot_map,
|
|
419
|
+
)
|
|
387
420
|
for sub_v in entity_value
|
|
388
421
|
]
|
|
389
422
|
)
|
|
@@ -405,13 +438,13 @@ async def format_label(entity_k, entity_value, document_type_code, params, mime_
|
|
|
405
438
|
)
|
|
406
439
|
|
|
407
440
|
elif (entity_key == "containertype") or (entity_key == "containersize"):
|
|
408
|
-
formatted_value =
|
|
441
|
+
formatted_value = container_map.get(entity_value)
|
|
409
442
|
|
|
410
443
|
elif check_formatting_rule(entity_k, document_type_code, "terminal"):
|
|
411
|
-
formatted_value =
|
|
444
|
+
formatted_value = terminal_map.get(entity_value)
|
|
412
445
|
|
|
413
446
|
elif check_formatting_rule(entity_k, document_type_code, "depot"):
|
|
414
|
-
formatted_value =
|
|
447
|
+
formatted_value = depot_map.get(entity_value)
|
|
415
448
|
|
|
416
449
|
elif entity_key.startswith(("eta", "etd", "duedate", "issuedate", "servicedate")):
|
|
417
450
|
try:
|
|
@@ -507,7 +540,8 @@ async def get_port_code_ai(port: str, llm_client, doc_type=None):
|
|
|
507
540
|
"""Get port code using AI model."""
|
|
508
541
|
port_llm = await get_port_code_llm(port, llm_client, doc_type=doc_type)
|
|
509
542
|
|
|
510
|
-
|
|
543
|
+
result = await get_tms_mappings(port, "ports", port_llm)
|
|
544
|
+
return result.get(port, None)
|
|
511
545
|
|
|
512
546
|
|
|
513
547
|
async def get_port_code_llm(port: str, llm_client, doc_type=None):
|
|
@@ -598,6 +632,74 @@ def decimal_convertor(value, quantity=False):
|
|
|
598
632
|
return value
|
|
599
633
|
|
|
600
634
|
|
|
635
|
+
async def collect_mapping_requests(entity_value, document_type_code):
|
|
636
|
+
"""Collect all unique container types, terminals, and depots from the entity value."""
|
|
637
|
+
# Sets to store unique values
|
|
638
|
+
container_types = set()
|
|
639
|
+
terminals = set()
|
|
640
|
+
depots = set()
|
|
641
|
+
|
|
642
|
+
def walk(key, value):
|
|
643
|
+
key_lower = key.lower()
|
|
644
|
+
|
|
645
|
+
# nested dict
|
|
646
|
+
if isinstance(value, dict):
|
|
647
|
+
for k, v in value.items():
|
|
648
|
+
walk(k, v)
|
|
649
|
+
|
|
650
|
+
# list of values
|
|
651
|
+
elif isinstance(value, list):
|
|
652
|
+
for item in value:
|
|
653
|
+
walk(key, item)
|
|
654
|
+
|
|
655
|
+
# leaf node
|
|
656
|
+
else:
|
|
657
|
+
if key_lower in ("containertype", "containersize"):
|
|
658
|
+
# Take only "20DV" from ('20DV', 0) if it's a tuple
|
|
659
|
+
container_types.add(value[0]) if isinstance(
|
|
660
|
+
value, tuple
|
|
661
|
+
) else container_types.add(value)
|
|
662
|
+
|
|
663
|
+
elif check_formatting_rule(key, document_type_code, "terminal"):
|
|
664
|
+
terminals.add(value[0]) if isinstance(value, tuple) else terminals.add(
|
|
665
|
+
value
|
|
666
|
+
)
|
|
667
|
+
|
|
668
|
+
elif check_formatting_rule(key, document_type_code, "depot"):
|
|
669
|
+
depots.add(value[0]) if isinstance(value, tuple) else depots.add(value)
|
|
670
|
+
|
|
671
|
+
walk("root", entity_value)
|
|
672
|
+
|
|
673
|
+
return container_types, terminals, depots
|
|
674
|
+
|
|
675
|
+
|
|
676
|
+
async def format_all_labels(entity_data, document_type_code, params, mime_type):
|
|
677
|
+
"""Format all labels in the entity data using cached mappings."""
|
|
678
|
+
# Collect all mapping values needed
|
|
679
|
+
container_req, terminal_req, depot_req = await collect_mapping_requests(
|
|
680
|
+
entity_data, document_type_code
|
|
681
|
+
)
|
|
682
|
+
|
|
683
|
+
# Batch fetch mappings
|
|
684
|
+
container_map, terminal_map, depot_map = await batch_fetch_all_mappings(
|
|
685
|
+
container_req, terminal_req, depot_req
|
|
686
|
+
)
|
|
687
|
+
|
|
688
|
+
# Format labels using cached mappings
|
|
689
|
+
_, result = await format_label(
|
|
690
|
+
"root",
|
|
691
|
+
entity_data,
|
|
692
|
+
document_type_code,
|
|
693
|
+
params,
|
|
694
|
+
mime_type,
|
|
695
|
+
container_map,
|
|
696
|
+
terminal_map,
|
|
697
|
+
depot_map,
|
|
698
|
+
)
|
|
699
|
+
|
|
700
|
+
return _, result
|
|
701
|
+
|
|
702
|
+
|
|
601
703
|
async def format_all_entities(result, document_type_code, params, mime_type):
|
|
602
704
|
"""Format the entity values in the result dictionary."""
|
|
603
705
|
# Since we treat `customsInvoice` same as `partnerInvoice`
|
|
@@ -613,8 +715,8 @@ async def format_all_entities(result, document_type_code, params, mime_type):
|
|
|
613
715
|
return {}
|
|
614
716
|
|
|
615
717
|
# Format all entities recursively
|
|
616
|
-
_, aggregated_data = await
|
|
617
|
-
|
|
718
|
+
_, aggregated_data = await format_all_labels(
|
|
719
|
+
result, document_type_code, params, mime_type
|
|
618
720
|
)
|
|
619
721
|
|
|
620
722
|
# Process partner invoice on lineitem mapping and reverse charge sentence
|
|
@@ -1,7 +1,5 @@
|
|
|
1
1
|
"""This module contains the postprocessing functions for the partner invoice."""
|
|
2
|
-
from
|
|
3
|
-
|
|
4
|
-
from fuzzywuzzy import fuzz
|
|
2
|
+
from rapidfuzz import fuzz, process
|
|
5
3
|
|
|
6
4
|
from src.io import logger
|
|
7
5
|
from src.utils import get_tms_mappings
|
|
@@ -177,6 +175,7 @@ async def process_line_items_batch(
|
|
|
177
175
|
pending_line_items = {}
|
|
178
176
|
|
|
179
177
|
# Check Fuzzy Matching
|
|
178
|
+
logger.info(f"Mapping line item codes with Fuzzy matching....")
|
|
180
179
|
for i, item in enumerate(line_items):
|
|
181
180
|
description_obj = item.get("lineItemDescription")
|
|
182
181
|
|
|
@@ -231,12 +230,6 @@ async def process_line_items_batch(
|
|
|
231
230
|
return line_items
|
|
232
231
|
|
|
233
232
|
|
|
234
|
-
def compute_score(args):
|
|
235
|
-
"""Compute the fuzzy matching score between a new line item and a key."""
|
|
236
|
-
new_lineitem, key = args
|
|
237
|
-
return key, fuzz.ratio(new_lineitem, key)
|
|
238
|
-
|
|
239
|
-
|
|
240
233
|
def get_fuzzy_match_score(target: str, sentences: list, threshold: int):
|
|
241
234
|
"""Get the best fuzzy match for a target string from a list of candidates.
|
|
242
235
|
|
|
@@ -249,16 +242,18 @@ def get_fuzzy_match_score(target: str, sentences: list, threshold: int):
|
|
|
249
242
|
tuple: (best_match, score) if above threshold, else (None, 0)
|
|
250
243
|
"""
|
|
251
244
|
# Use multiprocessing to find the best match
|
|
252
|
-
|
|
253
|
-
|
|
245
|
+
result = process.extractOne(
|
|
246
|
+
target, sentences, scorer=fuzz.WRatio, score_cutoff=threshold
|
|
247
|
+
)
|
|
254
248
|
|
|
255
|
-
|
|
256
|
-
|
|
249
|
+
if result is None:
|
|
250
|
+
return None, False
|
|
257
251
|
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
if
|
|
261
|
-
|
|
252
|
+
match, score, index = result
|
|
253
|
+
|
|
254
|
+
# return best_match if the best match score is above a threshold (e.g., 80)
|
|
255
|
+
if match:
|
|
256
|
+
return match, True
|
|
262
257
|
|
|
263
258
|
return None, False
|
|
264
259
|
|
|
@@ -290,18 +285,22 @@ def find_matching_lineitem(new_lineitem: str, kvp_dict: dict, threshold=90):
|
|
|
290
285
|
Returns:
|
|
291
286
|
str: The best matching 'Forto SLI' value from the dictionary.
|
|
292
287
|
"""
|
|
293
|
-
new_lineitem = new_lineitem.upper()
|
|
294
|
-
|
|
295
288
|
# Check if the new line item is already in the dictionary
|
|
296
289
|
if new_lineitem in kvp_dict:
|
|
297
290
|
return kvp_dict[new_lineitem]
|
|
298
291
|
|
|
299
292
|
# Get the best fuzzy match score for the extracted line item
|
|
300
|
-
|
|
301
|
-
new_lineitem,
|
|
293
|
+
match, _ = get_fuzzy_match_score(
|
|
294
|
+
new_lineitem,
|
|
295
|
+
list(kvp_dict.keys()),
|
|
296
|
+
threshold,
|
|
302
297
|
)
|
|
303
298
|
|
|
304
|
-
|
|
299
|
+
if match:
|
|
300
|
+
# find the code from the kvp_dict
|
|
301
|
+
return kvp_dict[match]
|
|
302
|
+
|
|
303
|
+
return None
|
|
305
304
|
|
|
306
305
|
|
|
307
306
|
async def associate_forto_item_code(line_item_data, params):
|
|
@@ -54,7 +54,7 @@ Your role is to accurately extract specific entities from these invoices to supp
|
|
|
54
54
|
- unitPrice: Even if the quantity is not mentioned, you can still extract the unit price. Check the naming of the columns in a different languages, it can be "Unit Price", "Prezzo unitario", "Prix Unitaire", "Unitario", etc. Refer to "Prezzo unitario" field in the italian invoice example.
|
|
55
55
|
- totalAmount: The total amount for the item. It can be in different currencies, so ensure to capture the currency as well for the totalAmountCurrency.
|
|
56
56
|
- totalAmountEuro: Few line items contains a total amount in Euro. You can find it by looking for the term "Total EUR" or "Amount in Euro" in the line item but it's always in the EURO / € currency. Sometimes, it can be same as totalAmount if the line item is already in Euro.
|
|
57
|
-
- quantity: The quantity of the item or service provided in the line item.
|
|
57
|
+
- quantity: The quantity of the item or service provided in the line item. Pay attention to 2 x 40HC or 2x40HC. It means, quantity is 2 and 40HC is containerSize but not 240.
|
|
58
58
|
- containerNumber: Container Number always starts with 4 letters and is followed by 7 digits (e.g., ABCD1234567).
|
|
59
59
|
|
|
60
60
|
- hblNumber and mblNumber:
|
|
@@ -52,7 +52,7 @@ Your role is to accurately extract specific entities from these invoices to supp
|
|
|
52
52
|
- unitPrice: Even if the quantity is not mentioned, you can still extract the unit price. Check the naming of the columns in a different languages, it can be "Unit Price", "Prezzo unitario", "Prix Unitaire", "Unitario", etc. Refer to "Prezzo unitario" field in the italian invoice example.
|
|
53
53
|
- totalAmount: The total amount for the item. It can be in different currencies, so ensure to capture the currency as well for the totalAmountCurrency.
|
|
54
54
|
- totalAmountEuro: Few line items contains a total amount in Euro. You can find it by looking for the term "Total EUR" or "Amount in Euro" in the line item but it's always in the EURO / € currency. Sometimes, it can be same as totalAmount if the line item is already in Euro.
|
|
55
|
-
- quantity: The quantity of the item or service provided in the line item. Pay attention to 2x40HC. It means, quantity is 2 and
|
|
55
|
+
- quantity: The quantity of the item or service provided in the line item. Pay attention to 2 x 40HC or 2x40HC. It means, quantity is 2 and 40HC is containerSize but not 240.
|
|
56
56
|
- containerNumber: Container Number always starts with 4 letters and is followed by 7 digits (e.g., ABCD1234567, XALU 8593678).
|
|
57
57
|
|
|
58
58
|
- hblNumber and mblNumber:
|
|
@@ -68,6 +68,7 @@ Your role is to accurately extract specific entities from these invoices to supp
|
|
|
68
68
|
- Example:
|
|
69
69
|
- "COSCO SHIPPING Lines Italy, Poland, or France S.R.L. – Genova Office – As Agent For COSCO SHIPPING Lines Co.,Ltd."
|
|
70
70
|
- vendorName: COSCO SHIPPING Lines Co.,Ltd.
|
|
71
|
+
- From Hapag-Lloyd invoices, look for "Ballindamm 25" address to extract the vendorAddress.
|
|
71
72
|
|
|
72
73
|
- agentName: Name of the agent. Agencies are offices authorized to act on behalf of a company. This details usually available including the branch name of the parent company name in the invoice.
|
|
73
74
|
- agentKeyWord:
|
src/utils.py
CHANGED
|
@@ -406,16 +406,7 @@ async def get_tms_mappings(
|
|
|
406
406
|
response.raise_for_status()
|
|
407
407
|
|
|
408
408
|
# Structure expected: {"response": {"data": {"desc1": "code1", "desc2": "code2"}}}
|
|
409
|
-
|
|
410
|
-
# For line_items, return the full data mapping
|
|
411
|
-
return response.json().get("response", {}).get("data", {})
|
|
412
|
-
else:
|
|
413
|
-
return (
|
|
414
|
-
response.json()
|
|
415
|
-
.get("response", {})
|
|
416
|
-
.get("data", {})
|
|
417
|
-
.get(input_list[0], None)
|
|
418
|
-
)
|
|
409
|
+
return response.json().get("response", {}).get("data", {})
|
|
419
410
|
|
|
420
411
|
except httpx.HTTPStatusError as exc:
|
|
421
412
|
logger.error(
|
|
@@ -424,6 +415,25 @@ async def get_tms_mappings(
|
|
|
424
415
|
return {}
|
|
425
416
|
|
|
426
417
|
|
|
418
|
+
async def batch_fetch_all_mappings(container_types, terminals, depots):
|
|
419
|
+
"""Batch fetch all mappings for container types, terminals, and depots."""
|
|
420
|
+
# run batch calls concurrently
|
|
421
|
+
results = await asyncio.gather(
|
|
422
|
+
get_tms_mappings(list(container_types), "container_types"),
|
|
423
|
+
get_tms_mappings(list(terminals), "terminals"),
|
|
424
|
+
get_tms_mappings(list(depots), "depots"),
|
|
425
|
+
)
|
|
426
|
+
|
|
427
|
+
batch_container_map, batch_terminal_map, batch_depot_map = results
|
|
428
|
+
|
|
429
|
+
# Convert lists of tuples to dicts if necessary
|
|
430
|
+
return (
|
|
431
|
+
dict(batch_container_map or {}),
|
|
432
|
+
dict(batch_terminal_map or {}),
|
|
433
|
+
dict(batch_depot_map or {}),
|
|
434
|
+
)
|
|
435
|
+
|
|
436
|
+
|
|
427
437
|
def transform_schema_strings(schema):
|
|
428
438
|
"""
|
|
429
439
|
Recursively transforms a schema dictionary, replacing all "type": "STRING"
|
{data_science_document_ai-1.44.0.dist-info → data_science_document_ai-1.45.1.dist-info}/WHEEL
RENAMED
|
File without changes
|