data-science-document-ai 1.40.3__py3-none-any.whl → 1.40.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-science-document-ai
3
- Version: 1.40.3
3
+ Version: 1.40.4
4
4
  Summary: "Document AI repo for data science"
5
5
  Author: Naomi Nguyen
6
6
  Author-email: naomi.nguyen@forto.com
@@ -7,7 +7,7 @@ src/io.py,sha256=IXz4wWqiHa9mnHNgtrC6X9M2lItYp9eu6rHCThUIh5c,3585
7
7
  src/llm.py,sha256=aEK3rL8XvY7CakvkOJQmcHpEKwZRd8PPrLrzHiO-GFk,7827
8
8
  src/log_setup.py,sha256=RhHnpXqcl-ii4EJzRt47CF2R-Q3YPF68tepg_Kg7tkw,2895
9
9
  src/pdf_processing.py,sha256=S_eTsgaDIIr3SCrEmaQZyc7TDJlRI0GCuP0P9EGF1Xc,15385
10
- src/postprocessing/common.py,sha256=ll7VMEJ_51OeczcV8Uw-aVrufV3kd3kNLCmss3kt0Do,19291
10
+ src/postprocessing/common.py,sha256=W4L455j7IvTRZDWiBizoj9KC_UGUflkL_hEkk5P0h0k,20391
11
11
  src/postprocessing/postprocess_booking_confirmation.py,sha256=nK32eDiBNbauyQz0oCa9eraysku8aqzrcoRFoWVumDU,4827
12
12
  src/postprocessing/postprocess_commercial_invoice.py,sha256=3I8ijluTZcOs_sMnFZxfkAPle0UFQ239EMuvZfDZVPg,1028
13
13
  src/postprocessing/postprocess_partner_invoice.py,sha256=cM4te4qjOI_bXyrF8Zhb6X7eNf5aMKoRaPCFfqFv-98,11538
@@ -54,6 +54,6 @@ src/prompts/prompt_library.py,sha256=VJWHeXN-s501C2GiidIIvQQuZdU6T1R27hE2dKBiI40
54
54
  src/setup.py,sha256=kPSZosrICfaGZeDaajr40Ha7Ok4XK4fo_uq35Omiwr0,7128
55
55
  src/tms.py,sha256=UXbIo1QE--hIX6NZi5Qyp2R_CP338syrY9pCTPrfgnE,1741
56
56
  src/utils.py,sha256=-1Yq_5ExZlFQRUPRsQHiBD3TthNSiPVPp46Dvdb9Kf0,13830
57
- data_science_document_ai-1.40.3.dist-info/METADATA,sha256=ym7EzwlZAar6Qvx0GgwVQM44p30sw74-nrPl7Liyg_8,2153
58
- data_science_document_ai-1.40.3.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
59
- data_science_document_ai-1.40.3.dist-info/RECORD,,
57
+ data_science_document_ai-1.40.4.dist-info/METADATA,sha256=zM542Z9wdq9B2SaEmjoAEzj20BYlbL9LxwjZvMXb22w,2153
58
+ data_science_document_ai-1.40.4.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
59
+ data_science_document_ai-1.40.4.dist-info/RECORD,,
@@ -446,10 +446,18 @@ async def format_label(entity_k, entity_value, document_type_code, params):
446
446
  elif "reversechargesentence" in entity_key:
447
447
  formatted_value = clean_item_description(entity_value, remove_numbers=False)
448
448
 
449
+ elif "quantity" in entity_key:
450
+ if document_type_code in ["partnerInvoice", "customsInvoice", "bundeskasse"]:
451
+ # For partner invoice, customs invoice and bundeskasse documents, the quantity can be given as a whole number
452
+ formatted_value = decimal_convertor(
453
+ extract_number(entity_value), quantity=True
454
+ )
455
+ else:
456
+ formatted_value = extract_number(entity_value)
457
+
449
458
  elif any(
450
459
  numeric_indicator in entity_key
451
460
  for numeric_indicator in [
452
- "quantity",
453
461
  "value",
454
462
  "amount",
455
463
  "price",
@@ -514,7 +522,7 @@ async def get_port_code_llm(port: str, llm_client):
514
522
  return None
515
523
 
516
524
 
517
- def decimal_convertor(value):
525
+ def decimal_convertor(value, quantity=False):
518
526
  """Convert EU values to English values."""
519
527
  if value is None:
520
528
  return None
@@ -522,25 +530,39 @@ def decimal_convertor(value):
522
530
  # Remove spaces
523
531
  value = value.strip().replace(" ", "")
524
532
 
525
- # Convert comma to dot for decimal point (e.g., 4.123,45 -> 4123.45)
526
- if re.match(r"^\d{1,3}(\.\d{3})*,\d{1,2}$", value):
527
- value = value.replace(".", "").replace(",", ".")
533
+ if not quantity:
534
+ # Convert comma to dot for decimal point (e.g., 4.123,45 -> 4123.45)
535
+ if re.match(r"^\d{1,3}(\.\d{3})*,\d{1,2}$", value):
536
+ value = value.replace(".", "").replace(",", ".")
537
+
538
+ # European style integer with thousand separator: 2.500
539
+ elif re.match(r"^\d{1,3}(\.\d{3})+$", value):
540
+ value = value.replace(".", "")
541
+
542
+ # Format english values as well for consistency (e.g., 4,123.45 -> 4123.45)
543
+ elif re.match(r"^\d{1,3}(,\d{3})*\.\d{1,2}$", value):
544
+ value = value.replace(",", "")
545
+
546
+ # English style integer with thousand separator: 2,500
547
+ elif re.match(r"^\d{1,3}(,\d{3})+$", value):
548
+ value = value.replace(",", "")
528
549
 
529
- # European style integer with thousand separator: 2.500
530
- elif re.match(r"^\d{1,3}(\.\d{3})+$", value):
531
- value = value.replace(".", "")
550
+ # Just replace comma decimals with dot (e.g., 65,45 -> 65.45)
551
+ if re.match(r"^\d+,\d{1,2}$", value):
552
+ value = value.replace(",", ".")
532
553
 
533
- # Format english values as well for consistency (e.g., 4,123.45 -> 4123.45)
534
- elif re.match(r"^\d{1,3}(,\d{3})*\.\d{1,2}$", value):
535
- value = value.replace(",", "")
554
+ # If there are 3 or more digits after the decimal point, keep only the first 2 without rounding (e.g., 8.500000 -> 8.50)
555
+ elif re.match(r"^\d+\.\d{3,}$", value):
556
+ value = value[: value.index(".") + 3]
536
557
 
537
- # English style integer with thousand separator: 2,500
538
- elif re.match(r"^\d{1,3}(,\d{3})+$", value):
539
- value = value.replace(",", "")
558
+ else: # quantity=True only last two
559
+ # Just replace comma decimals with dot (e.g., 65,45 -> 65.45)
560
+ if re.match(r"^\d+,\d{1,2}$", value):
561
+ value = value.replace(",", ".")
540
562
 
541
- # Just replace comma decimals with dot (e.g., 65,45 -> 65.45)
542
- elif re.match(r"^\d+,\d{1,2}$", value):
543
- value = value.replace(",", ".")
563
+ # If there are 3 or more digits after the decimal point, keep only the first 2 without rounding (e.g., 8.500000 -> 8.50)
564
+ elif re.match(r"^\d+\.\d{3,}$", value):
565
+ value = value[: value.index(".") + 3]
544
566
 
545
567
  return value
546
568