data-science-document-ai 1.40.3__py3-none-any.whl → 1.51.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. {data_science_document_ai-1.40.3.dist-info → data_science_document_ai-1.51.0.dist-info}/METADATA +2 -2
  2. data_science_document_ai-1.51.0.dist-info/RECORD +60 -0
  3. src/constants.py +6 -10
  4. src/docai.py +14 -5
  5. src/docai_processor_config.yaml +0 -56
  6. src/excel_processing.py +34 -13
  7. src/io.py +69 -1
  8. src/llm.py +10 -32
  9. src/pdf_processing.py +192 -54
  10. src/postprocessing/common.py +246 -44
  11. src/postprocessing/postprocess_partner_invoice.py +139 -85
  12. src/prompts/library/arrivalNotice/other/placeholders.json +70 -0
  13. src/prompts/library/arrivalNotice/other/prompt.txt +40 -0
  14. src/prompts/library/bookingConfirmation/evergreen/placeholders.json +17 -17
  15. src/prompts/library/bookingConfirmation/evergreen/prompt.txt +1 -0
  16. src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +18 -18
  17. src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +1 -1
  18. src/prompts/library/bookingConfirmation/maersk/placeholders.json +17 -17
  19. src/prompts/library/bookingConfirmation/maersk/prompt.txt +1 -1
  20. src/prompts/library/bookingConfirmation/msc/placeholders.json +17 -17
  21. src/prompts/library/bookingConfirmation/msc/prompt.txt +1 -1
  22. src/prompts/library/bookingConfirmation/oocl/placeholders.json +17 -17
  23. src/prompts/library/bookingConfirmation/oocl/prompt.txt +3 -1
  24. src/prompts/library/bookingConfirmation/other/placeholders.json +17 -17
  25. src/prompts/library/bookingConfirmation/other/prompt.txt +1 -1
  26. src/prompts/library/bookingConfirmation/yangming/placeholders.json +17 -17
  27. src/prompts/library/bookingConfirmation/yangming/prompt.txt +1 -1
  28. src/prompts/library/bundeskasse/other/placeholders.json +25 -25
  29. src/prompts/library/bundeskasse/other/prompt.txt +8 -6
  30. src/prompts/library/commercialInvoice/other/placeholders.json +125 -0
  31. src/prompts/library/commercialInvoice/other/prompt.txt +2 -1
  32. src/prompts/library/customsAssessment/other/placeholders.json +67 -16
  33. src/prompts/library/customsAssessment/other/prompt.txt +24 -37
  34. src/prompts/library/customsInvoice/other/placeholders.json +20 -20
  35. src/prompts/library/customsInvoice/other/prompt.txt +4 -4
  36. src/prompts/library/deliveryOrder/other/placeholders.json +79 -28
  37. src/prompts/library/deliveryOrder/other/prompt.txt +26 -40
  38. src/prompts/library/draftMbl/other/placeholders.json +33 -33
  39. src/prompts/library/draftMbl/other/prompt.txt +34 -44
  40. src/prompts/library/finalMbL/other/placeholders.json +34 -34
  41. src/prompts/library/finalMbL/other/prompt.txt +34 -44
  42. src/prompts/library/packingList/other/placeholders.json +98 -0
  43. src/prompts/library/packingList/other/prompt.txt +1 -1
  44. src/prompts/library/partnerInvoice/other/placeholders.json +2 -23
  45. src/prompts/library/partnerInvoice/other/prompt.txt +7 -18
  46. src/prompts/library/preprocessing/carrier/placeholders.json +0 -16
  47. src/prompts/library/shippingInstruction/other/placeholders.json +115 -0
  48. src/prompts/library/shippingInstruction/other/prompt.txt +28 -15
  49. src/setup.py +13 -16
  50. src/utils.py +157 -45
  51. data_science_document_ai-1.40.3.dist-info/RECORD +0 -59
  52. src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -44
  53. src/prompts/library/draftMbl/maersk/prompt.txt +0 -17
  54. src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -44
  55. src/prompts/library/finalMbL/maersk/prompt.txt +0 -17
  56. {data_science_document_ai-1.40.3.dist-info → data_science_document_ai-1.51.0.dist-info}/WHEEL +0 -0
@@ -12,7 +12,7 @@ from src.constants import formatting_rules
12
12
  from src.io import logger
13
13
  from src.postprocessing.postprocess_partner_invoice import process_partner_invoice
14
14
  from src.prompts.prompt_library import prompt_library
15
- from src.utils import get_tms_mappings
15
+ from src.utils import batch_fetch_all_mappings, get_tms_mappings
16
16
 
17
17
  tms_domain = os.environ["TMS_DOMAIN"]
18
18
 
@@ -84,16 +84,16 @@ def clean_shipment_id(shipment_id):
84
84
  """
85
85
  if not shipment_id:
86
86
  return
87
- # '#S123456@-1' -> 'S123456'
88
- # Find the pattern of a shipment ID that starts with 'S' followed by 5 to 7 digits
89
- match = re.findall(r"S\d{5,7}", shipment_id)
87
+ # '#S1234565@-1' -> 'S1234565'
88
+ # Find the pattern of a shipment ID that starts with 'S' followed by 7 to 8 digits
89
+ match = re.findall(r"S\d{6,8}", shipment_id)
90
90
  stripped_value = match[0] if match else None
91
91
 
92
92
  if not stripped_value:
93
93
  return None
94
94
 
95
95
  # Check if length is valid (should be either 7 or 8)
96
- if len(stripped_value) not in (6, 7, 8):
96
+ if len(stripped_value) not in (7, 8, 9):
97
97
  return None
98
98
 
99
99
  return stripped_value
@@ -134,9 +134,12 @@ def extract_number(data_field_value):
134
134
  formatted_value: string
135
135
 
136
136
  """
137
+ # Remove container size pattern like 20FT, 40HC, etc from 1 x 40HC
138
+ value = remove_unwanted_patterns(data_field_value)
139
+
137
140
  formatted_value = ""
138
- for c in data_field_value:
139
- if c.isnumeric() or c in [",", "."]:
141
+ for c in value:
142
+ if c.isnumeric() or c in [",", ".", "-"]:
140
143
  formatted_value += c
141
144
 
142
145
  # First and last characters should not be [",", "."]
@@ -319,6 +322,14 @@ def remove_unwanted_patterns(lineitem: str):
319
322
  # Remove "HIGH CUBE"
320
323
  lineitem = lineitem.replace("HIGH CUBE", "")
321
324
 
325
+ # Remove container size e.g., 20FT, 40HC, etc.
326
+ pattern = [
327
+ f"{s}{t}"
328
+ for s in ("20|22|40|45".split("|"))
329
+ for t in ("FT|HC|DC|HD|GP|OT|RF|FR|TK|DV".split("|"))
330
+ ]
331
+ lineitem = re.sub(r"|".join(pattern), "", lineitem, flags=re.IGNORECASE).strip()
332
+
322
333
  return lineitem
323
334
 
324
335
 
@@ -349,51 +360,91 @@ def clean_item_description(lineitem: str, remove_numbers: bool = True):
349
360
  # Remove the currency codes
350
361
  lineitem = re.sub(currency_codes_pattern, "", lineitem, flags=re.IGNORECASE)
351
362
 
363
+ # remove other patterns
364
+ lineitem = remove_unwanted_patterns(lineitem)
365
+
352
366
  # Remove numbers from the line item
353
367
  if (
354
368
  remove_numbers
355
369
  ): # Do not remove numbers for the reverse charge sentence as it contains Article number
356
370
  lineitem = re.sub(r"\d+", "", lineitem)
357
371
 
358
- # remove other patterns
359
- lineitem = remove_unwanted_patterns(lineitem)
360
-
361
372
  # remove special chars
362
373
  lineitem = re.sub(r"[^A-Za-z0-9\s]", " ", lineitem).strip()
363
374
 
375
+ # Remove x from lineitem like 10 x
376
+ lineitem = re.sub(r"\b[xX]\b", " ", lineitem).strip()
377
+
364
378
  return re.sub(r"\s{2,}", " ", lineitem).strip()
365
379
 
366
380
 
367
- async def format_label(entity_k, entity_value, document_type_code, params):
381
+ async def format_label(
382
+ entity_k,
383
+ entity_value,
384
+ document_type_code,
385
+ params,
386
+ mime_type,
387
+ container_map,
388
+ terminal_map,
389
+ depot_map,
390
+ ):
368
391
  llm_client = params["LlmClient"]
369
392
  if isinstance(entity_value, dict): # if it's a nested entity
370
393
  format_tasks = [
371
- format_label(sub_k, sub_v, document_type_code, params)
394
+ format_label(
395
+ sub_k,
396
+ sub_v,
397
+ document_type_code,
398
+ params,
399
+ mime_type,
400
+ container_map,
401
+ terminal_map,
402
+ depot_map,
403
+ )
372
404
  for sub_k, sub_v in entity_value.items()
373
405
  ]
374
406
  return entity_k, {k: v for k, v in await asyncio.gather(*format_tasks)}
375
407
  if isinstance(entity_value, list):
376
408
  format_tasks = await asyncio.gather(
377
409
  *[
378
- format_label(entity_k, sub_v, document_type_code, params)
410
+ format_label(
411
+ entity_k,
412
+ sub_v,
413
+ document_type_code,
414
+ params,
415
+ mime_type,
416
+ container_map,
417
+ terminal_map,
418
+ depot_map,
419
+ )
379
420
  for sub_v in entity_value
380
421
  ]
381
422
  )
382
423
  return entity_k, [v for _, v in format_tasks]
424
+
425
+ if mime_type == "application/pdf":
426
+ if isinstance(entity_value, tuple):
427
+ page = entity_value[1]
428
+ entity_value = entity_value[0]
429
+ else:
430
+ page = -1
431
+
383
432
  entity_key = entity_k.lower()
384
433
  formatted_value = None
385
434
 
386
435
  if entity_key.startswith("port"):
387
- formatted_value = await get_port_code_ai(entity_value, llm_client)
436
+ formatted_value = await get_port_code_ai(
437
+ entity_value, llm_client, doc_type=document_type_code
438
+ )
388
439
 
389
440
  elif (entity_key == "containertype") or (entity_key == "containersize"):
390
- formatted_value = get_tms_mappings(entity_value, "container_types")
441
+ formatted_value = container_map.get(entity_value)
391
442
 
392
443
  elif check_formatting_rule(entity_k, document_type_code, "terminal"):
393
- formatted_value = get_tms_mappings(entity_value, "terminals")
444
+ formatted_value = terminal_map.get(entity_value)
394
445
 
395
446
  elif check_formatting_rule(entity_k, document_type_code, "depot"):
396
- formatted_value = get_tms_mappings(entity_value, "depots")
447
+ formatted_value = depot_map.get(entity_value)
397
448
 
398
449
  elif entity_key.startswith(("eta", "etd", "duedate", "issuedate", "servicedate")):
399
450
  try:
@@ -414,11 +465,14 @@ async def format_label(entity_k, entity_value, document_type_code, params):
414
465
  except ValueError as e:
415
466
  logger.info(f"ParserError: {e}")
416
467
 
417
- elif entity_key in ["invoicenumber", "creditnoteinvoicenumber"]:
468
+ elif (
469
+ entity_key in ["invoicenumber", "creditnoteinvoicenumber"]
470
+ and document_type_code == "bundeskasse"
471
+ ):
418
472
  formatted_value = clean_invoice_number(entity_value)
419
473
 
420
474
  elif entity_key in ("shipmentid", "partnerreference"):
421
- # Clean the shipment ID to match Forto's standard (starts with 'S' followed by 5 to 7 digits)
475
+ # Clean the shipment ID to match Forto's standard (starts with 'S' followed by 7 or 8 digits)
422
476
  formatted_value = clean_shipment_id(entity_value)
423
477
 
424
478
  elif entity_key == "containernumber":
@@ -446,10 +500,19 @@ async def format_label(entity_k, entity_value, document_type_code, params):
446
500
  elif "reversechargesentence" in entity_key:
447
501
  formatted_value = clean_item_description(entity_value, remove_numbers=False)
448
502
 
503
+ elif "quantity" in entity_key:
504
+ if document_type_code in ["partnerInvoice", "customsInvoice", "bundeskasse"]:
505
+ # For partner invoice, quantity can be mentioned as whole number
506
+ # Apply decimal convertor for 46,45 --> 46.45 but not for 1.000 --> 1000
507
+ formatted_value = decimal_convertor(
508
+ extract_number(entity_value), quantity=True
509
+ )
510
+ else:
511
+ formatted_value = extract_number(entity_value)
512
+
449
513
  elif any(
450
514
  numeric_indicator in entity_key
451
515
  for numeric_indicator in [
452
- "quantity",
453
516
  "value",
454
517
  "amount",
455
518
  "price",
@@ -467,17 +530,21 @@ async def format_label(entity_k, entity_value, document_type_code, params):
467
530
  "documentValue": entity_value,
468
531
  "formattedValue": formatted_value,
469
532
  }
533
+ if mime_type == "application/pdf":
534
+ result["page"] = page
535
+
470
536
  return entity_k, result
471
537
 
472
538
 
473
- async def get_port_code_ai(port: str, llm_client):
539
+ async def get_port_code_ai(port: str, llm_client, doc_type=None):
474
540
  """Get port code using AI model."""
475
- port_llm = await get_port_code_llm(port, llm_client)
541
+ port_llm = await get_port_code_llm(port, llm_client, doc_type=doc_type)
476
542
 
477
- return get_tms_mappings(port, "ports", port_llm)
543
+ result = await get_tms_mappings(port, "ports", port_llm)
544
+ return result.get(port, None)
478
545
 
479
546
 
480
- async def get_port_code_llm(port: str, llm_client):
547
+ async def get_port_code_llm(port: str, llm_client, doc_type=None):
481
548
  if (
482
549
  "postprocessing" in prompt_library.library.keys()
483
550
  and "port_code" in prompt_library.library["postprocessing"].keys()
@@ -504,7 +571,7 @@ async def get_port_code_llm(port: str, llm_client):
504
571
  }
505
572
 
506
573
  response = await llm_client.get_unified_json_genai(
507
- prompt, response_schema=response_schema, model="chatgpt"
574
+ prompt, response_schema=response_schema, model="chatgpt", doc_type=doc_type
508
575
  )
509
576
  try:
510
577
  mapped_port = response["port"]
@@ -514,7 +581,7 @@ async def get_port_code_llm(port: str, llm_client):
514
581
  return None
515
582
 
516
583
 
517
- def decimal_convertor(value):
584
+ def decimal_convertor(value, quantity=False):
518
585
  """Convert EU values to English values."""
519
586
  if value is None:
520
587
  return None
@@ -522,30 +589,118 @@ def decimal_convertor(value):
522
589
  # Remove spaces
523
590
  value = value.strip().replace(" ", "")
524
591
 
525
- # Convert comma to dot for decimal point (e.g., 4.123,45 -> 4123.45)
526
- if re.match(r"^\d{1,3}(\.\d{3})*,\d{1,2}$", value):
527
- value = value.replace(".", "").replace(",", ".")
592
+ # Check "-" and remove it for processing
593
+ is_negative, value = (True, value[1:]) if value.startswith("-") else (False, value)
594
+
595
+ if not quantity:
596
+ # Convert comma to dot for decimal point (e.g., 4.123,45 -> 4123.45)
597
+ if re.match(r"^\d{1,3}(\.\d{3})*,\d{1,2}$", value):
598
+ value = value.replace(".", "").replace(",", ".")
599
+
600
+ # European style integer with thousand separator: 2.500
601
+ elif re.match(r"^\d{1,3}(\.\d{3})+$", value):
602
+ value = value.replace(".", "")
528
603
 
529
- # European style integer with thousand separator: 2.500
530
- elif re.match(r"^\d{1,3}(\.\d{3})+$", value):
531
- value = value.replace(".", "")
604
+ # Format english values as well for consistency (e.g., 4,123.45 -> 4123.45)
605
+ elif re.match(r"^\d{1,3}(,\d{3})*\.\d{1,2}$", value):
606
+ value = value.replace(",", "")
532
607
 
533
- # Format english values as well for consistency (e.g., 4,123.45 -> 4123.45)
534
- elif re.match(r"^\d{1,3}(,\d{3})*\.\d{1,2}$", value):
535
- value = value.replace(",", "")
608
+ # English style integer with thousand separator: 2,500
609
+ elif re.match(r"^\d{1,3}(,\d{3})+$", value):
610
+ value = value.replace(",", "")
536
611
 
537
- # English style integer with thousand separator: 2,500
538
- elif re.match(r"^\d{1,3}(,\d{3})+$", value):
539
- value = value.replace(",", "")
612
+ # Just replace comma decimals with dot (e.g., 65,45 -> 65.45)
613
+ if re.match(r"^\d+,\d{1,2}$", value):
614
+ value = value.replace(",", ".")
540
615
 
541
- # Just replace comma decimals with dot (e.g., 65,45 -> 65.45)
542
- elif re.match(r"^\d+,\d{1,2}$", value):
543
- value = value.replace(",", ".")
616
+ # If there are more than 3 0s after decimal point, consider only 2 decimal points (e.g., 8.500000 -> 8.50)
617
+ elif re.match(r"^\d+\.\d{3,}$", value):
618
+ value = value[: value.index(".") + 3]
619
+
620
+ else: # quantity=True → only last two
621
+ # Just replace comma decimals with dot (e.g., 65,45 -> 65.45)
622
+ if re.match(r"^\d+,\d{1,2}$", value):
623
+ value = value.replace(",", ".")
624
+
625
+ # If there are more than 3 0s after decimal point, consider only 2 decimal points (e.g., 8.500000 -> 8.50)
626
+ elif re.match(r"^\d+\.\d{3,}$", value):
627
+ value = value[: value.index(".") + 3]
628
+
629
+ # Re-add negative sign if applicable
630
+ value = "-" + value if is_negative else value
544
631
 
545
632
  return value
546
633
 
547
634
 
548
- async def format_all_entities(result, document_type_code, params):
635
+ async def collect_mapping_requests(entity_value, document_type_code):
636
+ """Collect all unique container types, terminals, and depots from the entity value."""
637
+ # Sets to store unique values
638
+ container_types = set()
639
+ terminals = set()
640
+ depots = set()
641
+
642
+ def walk(key, value):
643
+ key_lower = key.lower()
644
+
645
+ # nested dict
646
+ if isinstance(value, dict):
647
+ for k, v in value.items():
648
+ walk(k, v)
649
+
650
+ # list of values
651
+ elif isinstance(value, list):
652
+ for item in value:
653
+ walk(key, item)
654
+
655
+ # leaf node
656
+ else:
657
+ if key_lower in ("containertype", "containersize"):
658
+ # Take only "20DV" from ('20DV', 0) if it's a tuple
659
+ container_types.add(value[0]) if isinstance(
660
+ value, tuple
661
+ ) else container_types.add(value)
662
+
663
+ elif check_formatting_rule(key, document_type_code, "terminal"):
664
+ terminals.add(value[0]) if isinstance(value, tuple) else terminals.add(
665
+ value
666
+ )
667
+
668
+ elif check_formatting_rule(key, document_type_code, "depot"):
669
+ depots.add(value[0]) if isinstance(value, tuple) else depots.add(value)
670
+
671
+ walk("root", entity_value)
672
+
673
+ return container_types, terminals, depots
674
+
675
+
676
+ async def format_all_labels(entity_data, document_type_code, params, mime_type):
677
+ """Format all labels in the entity data using cached mappings."""
678
+ # Collect all mapping values needed
679
+ container_req, terminal_req, depot_req = await collect_mapping_requests(
680
+ entity_data, document_type_code
681
+ )
682
+
683
+ # Batch fetch mappings
684
+ container_map, terminal_map, depot_map = await batch_fetch_all_mappings(
685
+ container_req, terminal_req, depot_req
686
+ )
687
+
688
+ # Format labels using cached mappings
689
+ _, result = await format_label(
690
+ "root",
691
+ entity_data,
692
+ document_type_code,
693
+ params,
694
+ mime_type,
695
+ container_map,
696
+ terminal_map,
697
+ depot_map,
698
+ )
699
+
700
+ return _, result
701
+
702
+
703
+ async def format_all_entities(result, document_type_code, params, mime_type):
549
704
  """Format the entity values in the result dictionary."""
550
705
  # Since we treat `customsInvoice` same as `partnerInvoice`
551
706
  document_type_code = (
@@ -560,11 +715,13 @@ async def format_all_entities(result, document_type_code, params):
560
715
  return {}
561
716
 
562
717
  # Format all entities recursively
563
- _, aggregated_data = await format_label(None, result, document_type_code, params)
718
+ _, aggregated_data = await format_all_labels(
719
+ result, document_type_code, params, mime_type
720
+ )
564
721
 
565
722
  # Process partner invoice on lineitem mapping and reverse charge sentence
566
723
  if document_type_code in ["partnerInvoice", "bundeskasse"]:
567
- process_partner_invoice(params, aggregated_data, document_type_code)
724
+ await process_partner_invoice(params, aggregated_data, document_type_code)
568
725
 
569
726
  logger.info("Data Extraction completed successfully")
570
727
  return aggregated_data
@@ -594,3 +751,48 @@ def remove_stop_words(lineitem: str):
594
751
  .upper()
595
752
  .strip()
596
753
  )
754
+
755
+
756
+ def llm_prediction_to_tuples(llm_prediction, number_of_pages=-1, page_number=None):
757
+ """Convert LLM prediction dictionary to tuples of (value, page_number)."""
758
+ # If only 1 page, simply pair each value with page number 0
759
+ if number_of_pages == 1:
760
+ effective_page = 0 if page_number is None else page_number
761
+ if isinstance(llm_prediction, dict):
762
+ return {
763
+ k: llm_prediction_to_tuples(
764
+ v, number_of_pages, page_number=effective_page
765
+ )
766
+ for k, v in llm_prediction.items()
767
+ }
768
+ elif isinstance(llm_prediction, list):
769
+ return [
770
+ llm_prediction_to_tuples(v, number_of_pages, page_number=effective_page)
771
+ for v in llm_prediction
772
+ ]
773
+ else:
774
+ return (llm_prediction, effective_page) if llm_prediction else None
775
+
776
+ # logic for multi-page predictions
777
+ if isinstance(llm_prediction, dict):
778
+ if "page_number" in llm_prediction.keys() and "value" in llm_prediction.keys():
779
+ if llm_prediction["value"]:
780
+ try:
781
+ _page_number = int(llm_prediction["page_number"])
782
+ except: # noqa: E722
783
+ _page_number = -1
784
+ return (llm_prediction["value"], _page_number)
785
+ return None
786
+
787
+ for key, value in llm_prediction.items():
788
+ llm_prediction[key] = llm_prediction_to_tuples(
789
+ llm_prediction.get(key, value), number_of_pages, page_number
790
+ )
791
+
792
+ elif isinstance(llm_prediction, list):
793
+ for i, item in enumerate(llm_prediction):
794
+ llm_prediction[i] = llm_prediction_to_tuples(
795
+ item, number_of_pages, page_number
796
+ )
797
+
798
+ return llm_prediction