data-science-document-ai 1.40.2__py3-none-any.whl → 1.40.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-science-document-ai
3
- Version: 1.40.2
3
+ Version: 1.40.4
4
4
  Summary: "Document AI repo for data science"
5
5
  Author: Naomi Nguyen
6
6
  Author-email: naomi.nguyen@forto.com
@@ -7,7 +7,7 @@ src/io.py,sha256=IXz4wWqiHa9mnHNgtrC6X9M2lItYp9eu6rHCThUIh5c,3585
7
7
  src/llm.py,sha256=aEK3rL8XvY7CakvkOJQmcHpEKwZRd8PPrLrzHiO-GFk,7827
8
8
  src/log_setup.py,sha256=RhHnpXqcl-ii4EJzRt47CF2R-Q3YPF68tepg_Kg7tkw,2895
9
9
  src/pdf_processing.py,sha256=S_eTsgaDIIr3SCrEmaQZyc7TDJlRI0GCuP0P9EGF1Xc,15385
10
- src/postprocessing/common.py,sha256=OR9O73gUP4tevIZMnorbiUgzviEJlVr46ArTWMXrYVA,19316
10
+ src/postprocessing/common.py,sha256=W4L455j7IvTRZDWiBizoj9KC_UGUflkL_hEkk5P0h0k,20391
11
11
  src/postprocessing/postprocess_booking_confirmation.py,sha256=nK32eDiBNbauyQz0oCa9eraysku8aqzrcoRFoWVumDU,4827
12
12
  src/postprocessing/postprocess_commercial_invoice.py,sha256=3I8ijluTZcOs_sMnFZxfkAPle0UFQ239EMuvZfDZVPg,1028
13
13
  src/postprocessing/postprocess_partner_invoice.py,sha256=cM4te4qjOI_bXyrF8Zhb6X7eNf5aMKoRaPCFfqFv-98,11538
@@ -54,6 +54,6 @@ src/prompts/prompt_library.py,sha256=VJWHeXN-s501C2GiidIIvQQuZdU6T1R27hE2dKBiI40
54
54
  src/setup.py,sha256=kPSZosrICfaGZeDaajr40Ha7Ok4XK4fo_uq35Omiwr0,7128
55
55
  src/tms.py,sha256=UXbIo1QE--hIX6NZi5Qyp2R_CP338syrY9pCTPrfgnE,1741
56
56
  src/utils.py,sha256=-1Yq_5ExZlFQRUPRsQHiBD3TthNSiPVPp46Dvdb9Kf0,13830
57
- data_science_document_ai-1.40.2.dist-info/METADATA,sha256=RYwuTFlx4I5lADhMdoZ5RBr-qPMD2eKMydljNOjPFK0,2153
58
- data_science_document_ai-1.40.2.dist-info/WHEEL,sha256=M5asmiAlL6HEcOq52Yi5mmk9KmTVjY2RDPtO4p9DMrc,88
59
- data_science_document_ai-1.40.2.dist-info/RECORD,,
57
+ data_science_document_ai-1.40.4.dist-info/METADATA,sha256=zM542Z9wdq9B2SaEmjoAEzj20BYlbL9LxwjZvMXb22w,2153
58
+ data_science_document_ai-1.40.4.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
59
+ data_science_document_ai-1.40.4.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: poetry-core 2.2.0
2
+ Generator: poetry-core 2.2.1
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
@@ -425,10 +425,12 @@ async def format_label(entity_k, entity_value, document_type_code, params):
425
425
  # Remove all non-alphanumeric characters like ' ', '-', etc.
426
426
  formatted_value = convert_container_number(entity_value)
427
427
 
428
- elif (
429
- document_type_code in ["finalMbL", "draftMbl"] and entity_key == "measurements"
428
+ elif any(
429
+ numeric_indicator in entity_key
430
+ for numeric_indicator in ["measurements", "weight"]
430
431
  ):
431
- formatted_value = decimal_convertor(extract_number(entity_value))
432
+ formatted_value = extract_number(entity_value)
433
+
432
434
  elif any(
433
435
  packaging_type in entity_key
434
436
  for packaging_type in ["packagingtype", "packagetype", "currency"]
@@ -444,11 +446,18 @@ async def format_label(entity_k, entity_value, document_type_code, params):
444
446
  elif "reversechargesentence" in entity_key:
445
447
  formatted_value = clean_item_description(entity_value, remove_numbers=False)
446
448
 
449
+ elif "quantity" in entity_key:
450
+ if document_type_code in ["partnerInvoice", "customsInvoice", "bundeskasse"]:
451
+ # For partner invoice, quantity can be mentioned as whole number
452
+ formatted_value = decimal_convertor(
453
+ extract_number(entity_value), quantity=True
454
+ )
455
+ else:
456
+ formatted_value = extract_number(entity_value)
457
+
447
458
  elif any(
448
459
  numeric_indicator in entity_key
449
460
  for numeric_indicator in [
450
- "weight",
451
- "quantity",
452
461
  "value",
453
462
  "amount",
454
463
  "price",
@@ -513,7 +522,7 @@ async def get_port_code_llm(port: str, llm_client):
513
522
  return None
514
523
 
515
524
 
516
- def decimal_convertor(value):
525
+ def decimal_convertor(value, quantity=False):
517
526
  """Convert EU values to English values."""
518
527
  if value is None:
519
528
  return None
@@ -521,25 +530,39 @@ def decimal_convertor(value):
521
530
  # Remove spaces
522
531
  value = value.strip().replace(" ", "")
523
532
 
524
- # Convert comma to dot for decimal point (e.g., 4.123,45 -> 4123.45)
525
- if re.match(r"^\d{1,3}(\.\d{3})*,\d{1,2}$", value):
526
- value = value.replace(".", "").replace(",", ".")
533
+ if not quantity:
534
+ # Convert comma to dot for decimal point (e.g., 4.123,45 -> 4123.45)
535
+ if re.match(r"^\d{1,3}(\.\d{3})*,\d{1,2}$", value):
536
+ value = value.replace(".", "").replace(",", ".")
537
+
538
+ # European style integer with thousand separator: 2.500
539
+ elif re.match(r"^\d{1,3}(\.\d{3})+$", value):
540
+ value = value.replace(".", "")
541
+
542
+ # Format english values as well for consistency (e.g., 4,123.45 -> 4123.45)
543
+ elif re.match(r"^\d{1,3}(,\d{3})*\.\d{1,2}$", value):
544
+ value = value.replace(",", "")
545
+
546
+ # English style integer with thousand separator: 2,500
547
+ elif re.match(r"^\d{1,3}(,\d{3})+$", value):
548
+ value = value.replace(",", "")
527
549
 
528
- # European style integer with thousand separator: 2.500
529
- elif re.match(r"^\d{1,3}(\.\d{3})+$", value):
530
- value = value.replace(".", "")
550
+ # Just replace comma decimals with dot (e.g., 65,45 -> 65.45)
551
+ if re.match(r"^\d+,\d{1,2}$", value):
552
+ value = value.replace(",", ".")
531
553
 
532
- # Format english values as well for consistency (e.g., 4,123.45 -> 4123.45)
533
- elif re.match(r"^\d{1,3}(,\d{3})*\.\d{1,2}$", value):
534
- value = value.replace(",", "")
554
+ # If there are 3 or more digits after the decimal point, keep only the first 2 (truncated, not rounded; e.g., 8.500000 -> 8.50)
555
+ elif re.match(r"^\d+\.\d{3,}$", value):
556
+ value = value[: value.index(".") + 3]
535
557
 
536
- # English style integer with thousand separator: 2,500
537
- elif re.match(r"^\d{1,3}(,\d{3})+$", value):
538
- value = value.replace(",", "")
558
+ else: # quantity=True only last two
559
+ # Just replace comma decimals with dot (e.g., 65,45 -> 65.45)
560
+ if re.match(r"^\d+,\d{1,2}$", value):
561
+ value = value.replace(",", ".")
539
562
 
540
- # Just replace comma decimals with dot (e.g., 65,45 -> 65.45)
541
- elif re.match(r"^\d+,\d{1,2}$", value):
542
- value = value.replace(",", ".")
563
+ # If there are 3 or more digits after the decimal point, keep only the first 2 (truncated, not rounded; e.g., 8.500000 -> 8.50)
564
+ elif re.match(r"^\d+\.\d{3,}$", value):
565
+ value = value[: value.index(".") + 3]
543
566
 
544
567
  return value
545
568