data-science-document-ai 1.40.3__py3-none-any.whl → 1.51.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_science_document_ai-1.40.3.dist-info → data_science_document_ai-1.51.0.dist-info}/METADATA +2 -2
- data_science_document_ai-1.51.0.dist-info/RECORD +60 -0
- src/constants.py +6 -10
- src/docai.py +14 -5
- src/docai_processor_config.yaml +0 -56
- src/excel_processing.py +34 -13
- src/io.py +69 -1
- src/llm.py +10 -32
- src/pdf_processing.py +192 -54
- src/postprocessing/common.py +246 -44
- src/postprocessing/postprocess_partner_invoice.py +139 -85
- src/prompts/library/arrivalNotice/other/placeholders.json +70 -0
- src/prompts/library/arrivalNotice/other/prompt.txt +40 -0
- src/prompts/library/bookingConfirmation/evergreen/placeholders.json +17 -17
- src/prompts/library/bookingConfirmation/evergreen/prompt.txt +1 -0
- src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +18 -18
- src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +1 -1
- src/prompts/library/bookingConfirmation/maersk/placeholders.json +17 -17
- src/prompts/library/bookingConfirmation/maersk/prompt.txt +1 -1
- src/prompts/library/bookingConfirmation/msc/placeholders.json +17 -17
- src/prompts/library/bookingConfirmation/msc/prompt.txt +1 -1
- src/prompts/library/bookingConfirmation/oocl/placeholders.json +17 -17
- src/prompts/library/bookingConfirmation/oocl/prompt.txt +3 -1
- src/prompts/library/bookingConfirmation/other/placeholders.json +17 -17
- src/prompts/library/bookingConfirmation/other/prompt.txt +1 -1
- src/prompts/library/bookingConfirmation/yangming/placeholders.json +17 -17
- src/prompts/library/bookingConfirmation/yangming/prompt.txt +1 -1
- src/prompts/library/bundeskasse/other/placeholders.json +25 -25
- src/prompts/library/bundeskasse/other/prompt.txt +8 -6
- src/prompts/library/commercialInvoice/other/placeholders.json +125 -0
- src/prompts/library/commercialInvoice/other/prompt.txt +2 -1
- src/prompts/library/customsAssessment/other/placeholders.json +67 -16
- src/prompts/library/customsAssessment/other/prompt.txt +24 -37
- src/prompts/library/customsInvoice/other/placeholders.json +20 -20
- src/prompts/library/customsInvoice/other/prompt.txt +4 -4
- src/prompts/library/deliveryOrder/other/placeholders.json +79 -28
- src/prompts/library/deliveryOrder/other/prompt.txt +26 -40
- src/prompts/library/draftMbl/other/placeholders.json +33 -33
- src/prompts/library/draftMbl/other/prompt.txt +34 -44
- src/prompts/library/finalMbL/other/placeholders.json +34 -34
- src/prompts/library/finalMbL/other/prompt.txt +34 -44
- src/prompts/library/packingList/other/placeholders.json +98 -0
- src/prompts/library/packingList/other/prompt.txt +1 -1
- src/prompts/library/partnerInvoice/other/placeholders.json +2 -23
- src/prompts/library/partnerInvoice/other/prompt.txt +7 -18
- src/prompts/library/preprocessing/carrier/placeholders.json +0 -16
- src/prompts/library/shippingInstruction/other/placeholders.json +115 -0
- src/prompts/library/shippingInstruction/other/prompt.txt +28 -15
- src/setup.py +13 -16
- src/utils.py +157 -45
- data_science_document_ai-1.40.3.dist-info/RECORD +0 -59
- src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -44
- src/prompts/library/draftMbl/maersk/prompt.txt +0 -17
- src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -44
- src/prompts/library/finalMbL/maersk/prompt.txt +0 -17
- {data_science_document_ai-1.40.3.dist-info → data_science_document_ai-1.51.0.dist-info}/WHEEL +0 -0
src/postprocessing/common.py
CHANGED
|
@@ -12,7 +12,7 @@ from src.constants import formatting_rules
|
|
|
12
12
|
from src.io import logger
|
|
13
13
|
from src.postprocessing.postprocess_partner_invoice import process_partner_invoice
|
|
14
14
|
from src.prompts.prompt_library import prompt_library
|
|
15
|
-
from src.utils import get_tms_mappings
|
|
15
|
+
from src.utils import batch_fetch_all_mappings, get_tms_mappings
|
|
16
16
|
|
|
17
17
|
tms_domain = os.environ["TMS_DOMAIN"]
|
|
18
18
|
|
|
@@ -84,16 +84,16 @@ def clean_shipment_id(shipment_id):
|
|
|
84
84
|
"""
|
|
85
85
|
if not shipment_id:
|
|
86
86
|
return
|
|
87
|
-
# '#
|
|
88
|
-
# Find the pattern of a shipment ID that starts with 'S' followed by
|
|
89
|
-
match = re.findall(r"S\d{
|
|
87
|
+
# '#S1234565@-1' -> 'S1234565'
|
|
88
|
+
# Find the pattern of a shipment ID that starts with 'S' followed by 6 to 8 digits
|
|
89
|
+
match = re.findall(r"S\d{6,8}", shipment_id)
|
|
90
90
|
stripped_value = match[0] if match else None
|
|
91
91
|
|
|
92
92
|
if not stripped_value:
|
|
93
93
|
return None
|
|
94
94
|
|
|
95
95
|
# Check if length is valid ('S' plus 6 to 8 digits, i.e. 7, 8 or 9 characters)
|
|
96
|
-
if len(stripped_value) not in (
|
|
96
|
+
if len(stripped_value) not in (7, 8, 9):
|
|
97
97
|
return None
|
|
98
98
|
|
|
99
99
|
return stripped_value
|
|
@@ -134,9 +134,12 @@ def extract_number(data_field_value):
|
|
|
134
134
|
formatted_value: string
|
|
135
135
|
|
|
136
136
|
"""
|
|
137
|
+
# Remove container size pattern like 20FT, 40HC, etc from 1 x 40HC
|
|
138
|
+
value = remove_unwanted_patterns(data_field_value)
|
|
139
|
+
|
|
137
140
|
formatted_value = ""
|
|
138
|
-
for c in
|
|
139
|
-
if c.isnumeric() or c in [",", "."]:
|
|
141
|
+
for c in value:
|
|
142
|
+
if c.isnumeric() or c in [",", ".", "-"]:
|
|
140
143
|
formatted_value += c
|
|
141
144
|
|
|
142
145
|
# First and last characters should not be [",", "."]
|
|
@@ -319,6 +322,14 @@ def remove_unwanted_patterns(lineitem: str):
|
|
|
319
322
|
# Remove "HIGH CUBE"
|
|
320
323
|
lineitem = lineitem.replace("HIGH CUBE", "")
|
|
321
324
|
|
|
325
|
+
# Remove container size e.g., 20FT, 40HC, etc.
|
|
326
|
+
pattern = [
|
|
327
|
+
f"{s}{t}"
|
|
328
|
+
for s in ("20|22|40|45".split("|"))
|
|
329
|
+
for t in ("FT|HC|DC|HD|GP|OT|RF|FR|TK|DV".split("|"))
|
|
330
|
+
]
|
|
331
|
+
lineitem = re.sub(r"|".join(pattern), "", lineitem, flags=re.IGNORECASE).strip()
|
|
332
|
+
|
|
322
333
|
return lineitem
|
|
323
334
|
|
|
324
335
|
|
|
@@ -349,51 +360,91 @@ def clean_item_description(lineitem: str, remove_numbers: bool = True):
|
|
|
349
360
|
# Remove the currency codes
|
|
350
361
|
lineitem = re.sub(currency_codes_pattern, "", lineitem, flags=re.IGNORECASE)
|
|
351
362
|
|
|
363
|
+
# remove other patterns
|
|
364
|
+
lineitem = remove_unwanted_patterns(lineitem)
|
|
365
|
+
|
|
352
366
|
# Remove numbers from the line item
|
|
353
367
|
if (
|
|
354
368
|
remove_numbers
|
|
355
369
|
): # Do not remove numbers for the reverse charge sentence as it contains Article number
|
|
356
370
|
lineitem = re.sub(r"\d+", "", lineitem)
|
|
357
371
|
|
|
358
|
-
# remove other patterns
|
|
359
|
-
lineitem = remove_unwanted_patterns(lineitem)
|
|
360
|
-
|
|
361
372
|
# remove special chars
|
|
362
373
|
lineitem = re.sub(r"[^A-Za-z0-9\s]", " ", lineitem).strip()
|
|
363
374
|
|
|
375
|
+
# Remove x from lineitem like 10 x
|
|
376
|
+
lineitem = re.sub(r"\b[xX]\b", " ", lineitem).strip()
|
|
377
|
+
|
|
364
378
|
return re.sub(r"\s{2,}", " ", lineitem).strip()
|
|
365
379
|
|
|
366
380
|
|
|
367
|
-
async def format_label(
|
|
381
|
+
async def format_label(
|
|
382
|
+
entity_k,
|
|
383
|
+
entity_value,
|
|
384
|
+
document_type_code,
|
|
385
|
+
params,
|
|
386
|
+
mime_type,
|
|
387
|
+
container_map,
|
|
388
|
+
terminal_map,
|
|
389
|
+
depot_map,
|
|
390
|
+
):
|
|
368
391
|
llm_client = params["LlmClient"]
|
|
369
392
|
if isinstance(entity_value, dict): # if it's a nested entity
|
|
370
393
|
format_tasks = [
|
|
371
|
-
format_label(
|
|
394
|
+
format_label(
|
|
395
|
+
sub_k,
|
|
396
|
+
sub_v,
|
|
397
|
+
document_type_code,
|
|
398
|
+
params,
|
|
399
|
+
mime_type,
|
|
400
|
+
container_map,
|
|
401
|
+
terminal_map,
|
|
402
|
+
depot_map,
|
|
403
|
+
)
|
|
372
404
|
for sub_k, sub_v in entity_value.items()
|
|
373
405
|
]
|
|
374
406
|
return entity_k, {k: v for k, v in await asyncio.gather(*format_tasks)}
|
|
375
407
|
if isinstance(entity_value, list):
|
|
376
408
|
format_tasks = await asyncio.gather(
|
|
377
409
|
*[
|
|
378
|
-
format_label(
|
|
410
|
+
format_label(
|
|
411
|
+
entity_k,
|
|
412
|
+
sub_v,
|
|
413
|
+
document_type_code,
|
|
414
|
+
params,
|
|
415
|
+
mime_type,
|
|
416
|
+
container_map,
|
|
417
|
+
terminal_map,
|
|
418
|
+
depot_map,
|
|
419
|
+
)
|
|
379
420
|
for sub_v in entity_value
|
|
380
421
|
]
|
|
381
422
|
)
|
|
382
423
|
return entity_k, [v for _, v in format_tasks]
|
|
424
|
+
|
|
425
|
+
if mime_type == "application/pdf":
|
|
426
|
+
if isinstance(entity_value, tuple):
|
|
427
|
+
page = entity_value[1]
|
|
428
|
+
entity_value = entity_value[0]
|
|
429
|
+
else:
|
|
430
|
+
page = -1
|
|
431
|
+
|
|
383
432
|
entity_key = entity_k.lower()
|
|
384
433
|
formatted_value = None
|
|
385
434
|
|
|
386
435
|
if entity_key.startswith("port"):
|
|
387
|
-
formatted_value = await get_port_code_ai(
|
|
436
|
+
formatted_value = await get_port_code_ai(
|
|
437
|
+
entity_value, llm_client, doc_type=document_type_code
|
|
438
|
+
)
|
|
388
439
|
|
|
389
440
|
elif (entity_key == "containertype") or (entity_key == "containersize"):
|
|
390
|
-
formatted_value =
|
|
441
|
+
formatted_value = container_map.get(entity_value)
|
|
391
442
|
|
|
392
443
|
elif check_formatting_rule(entity_k, document_type_code, "terminal"):
|
|
393
|
-
formatted_value =
|
|
444
|
+
formatted_value = terminal_map.get(entity_value)
|
|
394
445
|
|
|
395
446
|
elif check_formatting_rule(entity_k, document_type_code, "depot"):
|
|
396
|
-
formatted_value =
|
|
447
|
+
formatted_value = depot_map.get(entity_value)
|
|
397
448
|
|
|
398
449
|
elif entity_key.startswith(("eta", "etd", "duedate", "issuedate", "servicedate")):
|
|
399
450
|
try:
|
|
@@ -414,11 +465,14 @@ async def format_label(entity_k, entity_value, document_type_code, params):
|
|
|
414
465
|
except ValueError as e:
|
|
415
466
|
logger.info(f"ParserError: {e}")
|
|
416
467
|
|
|
417
|
-
elif
|
|
468
|
+
elif (
|
|
469
|
+
entity_key in ["invoicenumber", "creditnoteinvoicenumber"]
|
|
470
|
+
and document_type_code == "bundeskasse"
|
|
471
|
+
):
|
|
418
472
|
formatted_value = clean_invoice_number(entity_value)
|
|
419
473
|
|
|
420
474
|
elif entity_key in ("shipmentid", "partnerreference"):
|
|
421
|
-
# Clean the shipment ID to match Forto's standard (starts with 'S' followed by
|
|
475
|
+
# Clean the shipment ID to match Forto's standard (starts with 'S' followed by 6 to 8 digits)
|
|
422
476
|
formatted_value = clean_shipment_id(entity_value)
|
|
423
477
|
|
|
424
478
|
elif entity_key == "containernumber":
|
|
@@ -446,10 +500,19 @@ async def format_label(entity_k, entity_value, document_type_code, params):
|
|
|
446
500
|
elif "reversechargesentence" in entity_key:
|
|
447
501
|
formatted_value = clean_item_description(entity_value, remove_numbers=False)
|
|
448
502
|
|
|
503
|
+
elif "quantity" in entity_key:
|
|
504
|
+
if document_type_code in ["partnerInvoice", "customsInvoice", "bundeskasse"]:
|
|
505
|
+
# For partner invoice, quantity can be mentioned as whole number
|
|
506
|
+
# Apply decimal convertor for 46,45 --> 46.45 but not for 1.000 --> 1000
|
|
507
|
+
formatted_value = decimal_convertor(
|
|
508
|
+
extract_number(entity_value), quantity=True
|
|
509
|
+
)
|
|
510
|
+
else:
|
|
511
|
+
formatted_value = extract_number(entity_value)
|
|
512
|
+
|
|
449
513
|
elif any(
|
|
450
514
|
numeric_indicator in entity_key
|
|
451
515
|
for numeric_indicator in [
|
|
452
|
-
"quantity",
|
|
453
516
|
"value",
|
|
454
517
|
"amount",
|
|
455
518
|
"price",
|
|
@@ -467,17 +530,21 @@ async def format_label(entity_k, entity_value, document_type_code, params):
|
|
|
467
530
|
"documentValue": entity_value,
|
|
468
531
|
"formattedValue": formatted_value,
|
|
469
532
|
}
|
|
533
|
+
if mime_type == "application/pdf":
|
|
534
|
+
result["page"] = page
|
|
535
|
+
|
|
470
536
|
return entity_k, result
|
|
471
537
|
|
|
472
538
|
|
|
473
|
-
async def get_port_code_ai(port: str, llm_client):
|
|
539
|
+
async def get_port_code_ai(port: str, llm_client, doc_type=None):
|
|
474
540
|
"""Get port code using AI model."""
|
|
475
|
-
port_llm = await get_port_code_llm(port, llm_client)
|
|
541
|
+
port_llm = await get_port_code_llm(port, llm_client, doc_type=doc_type)
|
|
476
542
|
|
|
477
|
-
|
|
543
|
+
result = await get_tms_mappings(port, "ports", port_llm)
|
|
544
|
+
return result.get(port, None)
|
|
478
545
|
|
|
479
546
|
|
|
480
|
-
async def get_port_code_llm(port: str, llm_client):
|
|
547
|
+
async def get_port_code_llm(port: str, llm_client, doc_type=None):
|
|
481
548
|
if (
|
|
482
549
|
"postprocessing" in prompt_library.library.keys()
|
|
483
550
|
and "port_code" in prompt_library.library["postprocessing"].keys()
|
|
@@ -504,7 +571,7 @@ async def get_port_code_llm(port: str, llm_client):
|
|
|
504
571
|
}
|
|
505
572
|
|
|
506
573
|
response = await llm_client.get_unified_json_genai(
|
|
507
|
-
prompt, response_schema=response_schema, model="chatgpt"
|
|
574
|
+
prompt, response_schema=response_schema, model="chatgpt", doc_type=doc_type
|
|
508
575
|
)
|
|
509
576
|
try:
|
|
510
577
|
mapped_port = response["port"]
|
|
@@ -514,7 +581,7 @@ async def get_port_code_llm(port: str, llm_client):
|
|
|
514
581
|
return None
|
|
515
582
|
|
|
516
583
|
|
|
517
|
-
def decimal_convertor(value):
|
|
584
|
+
def decimal_convertor(value, quantity=False):
|
|
518
585
|
"""Convert EU values to English values."""
|
|
519
586
|
if value is None:
|
|
520
587
|
return None
|
|
@@ -522,30 +589,118 @@ def decimal_convertor(value):
|
|
|
522
589
|
# Remove spaces
|
|
523
590
|
value = value.strip().replace(" ", "")
|
|
524
591
|
|
|
525
|
-
#
|
|
526
|
-
if
|
|
527
|
-
|
|
592
|
+
# Check "-" and remove it for processing
|
|
593
|
+
is_negative, value = (True, value[1:]) if value.startswith("-") else (False, value)
|
|
594
|
+
|
|
595
|
+
if not quantity:
|
|
596
|
+
# Convert comma to dot for decimal point (e.g., 4.123,45 -> 4123.45)
|
|
597
|
+
if re.match(r"^\d{1,3}(\.\d{3})*,\d{1,2}$", value):
|
|
598
|
+
value = value.replace(".", "").replace(",", ".")
|
|
599
|
+
|
|
600
|
+
# European style integer with thousand separator: 2.500
|
|
601
|
+
elif re.match(r"^\d{1,3}(\.\d{3})+$", value):
|
|
602
|
+
value = value.replace(".", "")
|
|
528
603
|
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
604
|
+
# Format english values as well for consistency (e.g., 4,123.45 -> 4123.45)
|
|
605
|
+
elif re.match(r"^\d{1,3}(,\d{3})*\.\d{1,2}$", value):
|
|
606
|
+
value = value.replace(",", "")
|
|
532
607
|
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
608
|
+
# English style integer with thousand separator: 2,500
|
|
609
|
+
elif re.match(r"^\d{1,3}(,\d{3})+$", value):
|
|
610
|
+
value = value.replace(",", "")
|
|
536
611
|
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
612
|
+
# Just replace comma decimals with dot (e.g., 65,45 -> 65.45)
|
|
613
|
+
if re.match(r"^\d+,\d{1,2}$", value):
|
|
614
|
+
value = value.replace(",", ".")
|
|
540
615
|
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
616
|
+
# If there are 3 or more digits after the decimal point, keep only the first 2 (e.g., 8.500000 -> 8.50)
|
|
617
|
+
elif re.match(r"^\d+\.\d{3,}$", value):
|
|
618
|
+
value = value[: value.index(".") + 3]
|
|
619
|
+
|
|
620
|
+
else: # quantity=True → only last two
|
|
621
|
+
# Just replace comma decimals with dot (e.g., 65,45 -> 65.45)
|
|
622
|
+
if re.match(r"^\d+,\d{1,2}$", value):
|
|
623
|
+
value = value.replace(",", ".")
|
|
624
|
+
|
|
625
|
+
# If there are 3 or more digits after the decimal point, keep only the first 2 (e.g., 8.500000 -> 8.50)
|
|
626
|
+
elif re.match(r"^\d+\.\d{3,}$", value):
|
|
627
|
+
value = value[: value.index(".") + 3]
|
|
628
|
+
|
|
629
|
+
# Re-add negative sign if applicable
|
|
630
|
+
value = "-" + value if is_negative else value
|
|
544
631
|
|
|
545
632
|
return value
|
|
546
633
|
|
|
547
634
|
|
|
548
|
-
async def
|
|
635
|
+
async def collect_mapping_requests(entity_value, document_type_code):
|
|
636
|
+
"""Collect all unique container types, terminals, and depots from the entity value."""
|
|
637
|
+
# Sets to store unique values
|
|
638
|
+
container_types = set()
|
|
639
|
+
terminals = set()
|
|
640
|
+
depots = set()
|
|
641
|
+
|
|
642
|
+
def walk(key, value):
|
|
643
|
+
key_lower = key.lower()
|
|
644
|
+
|
|
645
|
+
# nested dict
|
|
646
|
+
if isinstance(value, dict):
|
|
647
|
+
for k, v in value.items():
|
|
648
|
+
walk(k, v)
|
|
649
|
+
|
|
650
|
+
# list of values
|
|
651
|
+
elif isinstance(value, list):
|
|
652
|
+
for item in value:
|
|
653
|
+
walk(key, item)
|
|
654
|
+
|
|
655
|
+
# leaf node
|
|
656
|
+
else:
|
|
657
|
+
if key_lower in ("containertype", "containersize"):
|
|
658
|
+
# Take only "20DV" from ('20DV', 0) if it's a tuple
|
|
659
|
+
container_types.add(value[0]) if isinstance(
|
|
660
|
+
value, tuple
|
|
661
|
+
) else container_types.add(value)
|
|
662
|
+
|
|
663
|
+
elif check_formatting_rule(key, document_type_code, "terminal"):
|
|
664
|
+
terminals.add(value[0]) if isinstance(value, tuple) else terminals.add(
|
|
665
|
+
value
|
|
666
|
+
)
|
|
667
|
+
|
|
668
|
+
elif check_formatting_rule(key, document_type_code, "depot"):
|
|
669
|
+
depots.add(value[0]) if isinstance(value, tuple) else depots.add(value)
|
|
670
|
+
|
|
671
|
+
walk("root", entity_value)
|
|
672
|
+
|
|
673
|
+
return container_types, terminals, depots
|
|
674
|
+
|
|
675
|
+
|
|
676
|
+
async def format_all_labels(entity_data, document_type_code, params, mime_type):
|
|
677
|
+
"""Format all labels in the entity data using cached mappings."""
|
|
678
|
+
# Collect all mapping values needed
|
|
679
|
+
container_req, terminal_req, depot_req = await collect_mapping_requests(
|
|
680
|
+
entity_data, document_type_code
|
|
681
|
+
)
|
|
682
|
+
|
|
683
|
+
# Batch fetch mappings
|
|
684
|
+
container_map, terminal_map, depot_map = await batch_fetch_all_mappings(
|
|
685
|
+
container_req, terminal_req, depot_req
|
|
686
|
+
)
|
|
687
|
+
|
|
688
|
+
# Format labels using cached mappings
|
|
689
|
+
_, result = await format_label(
|
|
690
|
+
"root",
|
|
691
|
+
entity_data,
|
|
692
|
+
document_type_code,
|
|
693
|
+
params,
|
|
694
|
+
mime_type,
|
|
695
|
+
container_map,
|
|
696
|
+
terminal_map,
|
|
697
|
+
depot_map,
|
|
698
|
+
)
|
|
699
|
+
|
|
700
|
+
return _, result
|
|
701
|
+
|
|
702
|
+
|
|
703
|
+
async def format_all_entities(result, document_type_code, params, mime_type):
|
|
549
704
|
"""Format the entity values in the result dictionary."""
|
|
550
705
|
# Since we treat `customsInvoice` same as `partnerInvoice`
|
|
551
706
|
document_type_code = (
|
|
@@ -560,11 +715,13 @@ async def format_all_entities(result, document_type_code, params):
|
|
|
560
715
|
return {}
|
|
561
716
|
|
|
562
717
|
# Format all entities recursively
|
|
563
|
-
_, aggregated_data = await
|
|
718
|
+
_, aggregated_data = await format_all_labels(
|
|
719
|
+
result, document_type_code, params, mime_type
|
|
720
|
+
)
|
|
564
721
|
|
|
565
722
|
# Process partner invoice on lineitem mapping and reverse charge sentence
|
|
566
723
|
if document_type_code in ["partnerInvoice", "bundeskasse"]:
|
|
567
|
-
process_partner_invoice(params, aggregated_data, document_type_code)
|
|
724
|
+
await process_partner_invoice(params, aggregated_data, document_type_code)
|
|
568
725
|
|
|
569
726
|
logger.info("Data Extraction completed successfully")
|
|
570
727
|
return aggregated_data
|
|
@@ -594,3 +751,48 @@ def remove_stop_words(lineitem: str):
|
|
|
594
751
|
.upper()
|
|
595
752
|
.strip()
|
|
596
753
|
)
|
|
754
|
+
|
|
755
|
+
|
|
756
|
+
def llm_prediction_to_tuples(llm_prediction, number_of_pages=-1, page_number=None):
|
|
757
|
+
"""Convert LLM prediction dictionary to tuples of (value, page_number)."""
|
|
758
|
+
# If only 1 page, simply pair each value with page number 0
|
|
759
|
+
if number_of_pages == 1:
|
|
760
|
+
effective_page = 0 if page_number is None else page_number
|
|
761
|
+
if isinstance(llm_prediction, dict):
|
|
762
|
+
return {
|
|
763
|
+
k: llm_prediction_to_tuples(
|
|
764
|
+
v, number_of_pages, page_number=effective_page
|
|
765
|
+
)
|
|
766
|
+
for k, v in llm_prediction.items()
|
|
767
|
+
}
|
|
768
|
+
elif isinstance(llm_prediction, list):
|
|
769
|
+
return [
|
|
770
|
+
llm_prediction_to_tuples(v, number_of_pages, page_number=effective_page)
|
|
771
|
+
for v in llm_prediction
|
|
772
|
+
]
|
|
773
|
+
else:
|
|
774
|
+
return (llm_prediction, effective_page) if llm_prediction else None
|
|
775
|
+
|
|
776
|
+
# logic for multi-page predictions
|
|
777
|
+
if isinstance(llm_prediction, dict):
|
|
778
|
+
if "page_number" in llm_prediction.keys() and "value" in llm_prediction.keys():
|
|
779
|
+
if llm_prediction["value"]:
|
|
780
|
+
try:
|
|
781
|
+
_page_number = int(llm_prediction["page_number"])
|
|
782
|
+
except: # noqa: E722
|
|
783
|
+
_page_number = -1
|
|
784
|
+
return (llm_prediction["value"], _page_number)
|
|
785
|
+
return None
|
|
786
|
+
|
|
787
|
+
for key, value in llm_prediction.items():
|
|
788
|
+
llm_prediction[key] = llm_prediction_to_tuples(
|
|
789
|
+
llm_prediction.get(key, value), number_of_pages, page_number
|
|
790
|
+
)
|
|
791
|
+
|
|
792
|
+
elif isinstance(llm_prediction, list):
|
|
793
|
+
for i, item in enumerate(llm_prediction):
|
|
794
|
+
llm_prediction[i] = llm_prediction_to_tuples(
|
|
795
|
+
item, number_of_pages, page_number
|
|
796
|
+
)
|
|
797
|
+
|
|
798
|
+
return llm_prediction
|