data-science-document-ai 1.40.2__py3-none-any.whl → 1.40.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_science_document_ai-1.40.2.dist-info → data_science_document_ai-1.40.4.dist-info}/METADATA +1 -1
- {data_science_document_ai-1.40.2.dist-info → data_science_document_ai-1.40.4.dist-info}/RECORD +4 -4
- {data_science_document_ai-1.40.2.dist-info → data_science_document_ai-1.40.4.dist-info}/WHEEL +1 -1
- src/postprocessing/common.py +44 -21
{data_science_document_ai-1.40.2.dist-info → data_science_document_ai-1.40.4.dist-info}/RECORD
RENAMED
|
@@ -7,7 +7,7 @@ src/io.py,sha256=IXz4wWqiHa9mnHNgtrC6X9M2lItYp9eu6rHCThUIh5c,3585
|
|
|
7
7
|
src/llm.py,sha256=aEK3rL8XvY7CakvkOJQmcHpEKwZRd8PPrLrzHiO-GFk,7827
|
|
8
8
|
src/log_setup.py,sha256=RhHnpXqcl-ii4EJzRt47CF2R-Q3YPF68tepg_Kg7tkw,2895
|
|
9
9
|
src/pdf_processing.py,sha256=S_eTsgaDIIr3SCrEmaQZyc7TDJlRI0GCuP0P9EGF1Xc,15385
|
|
10
|
-
src/postprocessing/common.py,sha256=
|
|
10
|
+
src/postprocessing/common.py,sha256=W4L455j7IvTRZDWiBizoj9KC_UGUflkL_hEkk5P0h0k,20391
|
|
11
11
|
src/postprocessing/postprocess_booking_confirmation.py,sha256=nK32eDiBNbauyQz0oCa9eraysku8aqzrcoRFoWVumDU,4827
|
|
12
12
|
src/postprocessing/postprocess_commercial_invoice.py,sha256=3I8ijluTZcOs_sMnFZxfkAPle0UFQ239EMuvZfDZVPg,1028
|
|
13
13
|
src/postprocessing/postprocess_partner_invoice.py,sha256=cM4te4qjOI_bXyrF8Zhb6X7eNf5aMKoRaPCFfqFv-98,11538
|
|
@@ -54,6 +54,6 @@ src/prompts/prompt_library.py,sha256=VJWHeXN-s501C2GiidIIvQQuZdU6T1R27hE2dKBiI40
|
|
|
54
54
|
src/setup.py,sha256=kPSZosrICfaGZeDaajr40Ha7Ok4XK4fo_uq35Omiwr0,7128
|
|
55
55
|
src/tms.py,sha256=UXbIo1QE--hIX6NZi5Qyp2R_CP338syrY9pCTPrfgnE,1741
|
|
56
56
|
src/utils.py,sha256=-1Yq_5ExZlFQRUPRsQHiBD3TthNSiPVPp46Dvdb9Kf0,13830
|
|
57
|
-
data_science_document_ai-1.40.
|
|
58
|
-
data_science_document_ai-1.40.
|
|
59
|
-
data_science_document_ai-1.40.
|
|
57
|
+
data_science_document_ai-1.40.4.dist-info/METADATA,sha256=zM542Z9wdq9B2SaEmjoAEzj20BYlbL9LxwjZvMXb22w,2153
|
|
58
|
+
data_science_document_ai-1.40.4.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
|
|
59
|
+
data_science_document_ai-1.40.4.dist-info/RECORD,,
|
src/postprocessing/common.py
CHANGED
|
@@ -425,10 +425,12 @@ async def format_label(entity_k, entity_value, document_type_code, params):
|
|
|
425
425
|
# Remove all non-alphanumeric characters like ' ', '-', etc.
|
|
426
426
|
formatted_value = convert_container_number(entity_value)
|
|
427
427
|
|
|
428
|
-
elif (
|
|
429
|
-
|
|
428
|
+
elif any(
|
|
429
|
+
numeric_indicator in entity_key
|
|
430
|
+
for numeric_indicator in ["measurements", "weight"]
|
|
430
431
|
):
|
|
431
|
-
formatted_value =
|
|
432
|
+
formatted_value = extract_number(entity_value)
|
|
433
|
+
|
|
432
434
|
elif any(
|
|
433
435
|
packaging_type in entity_key
|
|
434
436
|
for packaging_type in ["packagingtype", "packagetype", "currency"]
|
|
@@ -444,11 +446,18 @@ async def format_label(entity_k, entity_value, document_type_code, params):
|
|
|
444
446
|
elif "reversechargesentence" in entity_key:
|
|
445
447
|
formatted_value = clean_item_description(entity_value, remove_numbers=False)
|
|
446
448
|
|
|
449
|
+
elif "quantity" in entity_key:
|
|
450
|
+
if document_type_code in ["partnerInvoice", "customsInvoice", "bundeskasse"]:
|
|
451
|
+
# For partner invoice, quantity can be mentioned as whole number
|
|
452
|
+
formatted_value = decimal_convertor(
|
|
453
|
+
extract_number(entity_value), quantity=True
|
|
454
|
+
)
|
|
455
|
+
else:
|
|
456
|
+
formatted_value = extract_number(entity_value)
|
|
457
|
+
|
|
447
458
|
elif any(
|
|
448
459
|
numeric_indicator in entity_key
|
|
449
460
|
for numeric_indicator in [
|
|
450
|
-
"weight",
|
|
451
|
-
"quantity",
|
|
452
461
|
"value",
|
|
453
462
|
"amount",
|
|
454
463
|
"price",
|
|
@@ -513,7 +522,7 @@ async def get_port_code_llm(port: str, llm_client):
|
|
|
513
522
|
return None
|
|
514
523
|
|
|
515
524
|
|
|
516
|
-
def decimal_convertor(value):
|
|
525
|
+
def decimal_convertor(value, quantity=False):
|
|
517
526
|
"""Convert EU values to English values."""
|
|
518
527
|
if value is None:
|
|
519
528
|
return None
|
|
@@ -521,25 +530,39 @@ def decimal_convertor(value):
|
|
|
521
530
|
# Remove spaces
|
|
522
531
|
value = value.strip().replace(" ", "")
|
|
523
532
|
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
533
|
+
if not quantity:
|
|
534
|
+
# Convert comma to dot for decimal point (e.g., 4.123,45 -> 4123.45)
|
|
535
|
+
if re.match(r"^\d{1,3}(\.\d{3})*,\d{1,2}$", value):
|
|
536
|
+
value = value.replace(".", "").replace(",", ".")
|
|
537
|
+
|
|
538
|
+
# European style integer with thousand separator: 2.500
|
|
539
|
+
elif re.match(r"^\d{1,3}(\.\d{3})+$", value):
|
|
540
|
+
value = value.replace(".", "")
|
|
541
|
+
|
|
542
|
+
# Format english values as well for consistency (e.g., 4,123.45 -> 4123.45)
|
|
543
|
+
elif re.match(r"^\d{1,3}(,\d{3})*\.\d{1,2}$", value):
|
|
544
|
+
value = value.replace(",", "")
|
|
545
|
+
|
|
546
|
+
# English style integer with thousand separator: 2,500
|
|
547
|
+
elif re.match(r"^\d{1,3}(,\d{3})+$", value):
|
|
548
|
+
value = value.replace(",", "")
|
|
527
549
|
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
550
|
+
# Just replace comma decimals with dot (e.g., 65,45 -> 65.45)
|
|
551
|
+
if re.match(r"^\d+,\d{1,2}$", value):
|
|
552
|
+
value = value.replace(",", ".")
|
|
531
553
|
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
554
|
+
# If there are 3 or more digits after the decimal point, keep only the first 2 (e.g., 8.500000 -> 8.50)
|
|
555
|
+
elif re.match(r"^\d+\.\d{3,}$", value):
|
|
556
|
+
value = value[: value.index(".") + 3]
|
|
535
557
|
|
|
536
|
-
#
|
|
537
|
-
|
|
538
|
-
|
|
558
|
+
else: # quantity=True → apply only the last two rules (comma decimal, long-decimal truncation)
|
|
559
|
+
# Just replace comma decimals with dot (e.g., 65,45 -> 65.45)
|
|
560
|
+
if re.match(r"^\d+,\d{1,2}$", value):
|
|
561
|
+
value = value.replace(",", ".")
|
|
539
562
|
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
563
|
+
# If there are 3 or more digits after the decimal point, keep only the first 2 (e.g., 8.500000 -> 8.50)
|
|
564
|
+
elif re.match(r"^\d+\.\d{3,}$", value):
|
|
565
|
+
value = value[: value.index(".") + 3]
|
|
543
566
|
|
|
544
567
|
return value
|
|
545
568
|
|