data-science-document-ai 1.57.0__py3-none-any.whl → 1.58.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-science-document-ai
3
- Version: 1.57.0
3
+ Version: 1.58.0
4
4
  Summary: "Document AI repo for data science"
5
5
  Author: Naomi Nguyen
6
6
  Author-email: naomi.nguyen@forto.com
@@ -7,7 +7,7 @@ src/io.py,sha256=rYjXVLlriEacw1uNuPIYhg12bXNu48Qs9GYMY2YcVTE,5563
7
7
  src/llm.py,sha256=a7UYA4ITUNjzct_2fHgM-bma_XWc28VC0FV71g9tnUI,7137
8
8
  src/log_setup.py,sha256=RhHnpXqcl-ii4EJzRt47CF2R-Q3YPF68tepg_Kg7tkw,2895
9
9
  src/pdf_processing.py,sha256=Fx-Glb9niEUU3WUCrBZ02ZYV-E2vWoUM0ifN7-0A1Q4,19961
10
- src/postprocessing/common.py,sha256=dagAg0hZGuZc03bXdfOolxekewMEVUfz917IGCiAtWI,26118
10
+ src/postprocessing/common.py,sha256=tyy97UBfcnSs8Oh5vVDp4D1qDRit32ri9IGqRlNZcaY,27254
11
11
  src/postprocessing/postprocess_booking_confirmation.py,sha256=nK32eDiBNbauyQz0oCa9eraysku8aqzrcoRFoWVumDU,4827
12
12
  src/postprocessing/postprocess_commercial_invoice.py,sha256=3I8ijluTZcOs_sMnFZxfkAPle0UFQ239EMuvZfDZVPg,1028
13
13
  src/postprocessing/postprocess_partner_invoice.py,sha256=WuaTQK5D09dV_QNrh29ZoKX9IvQn2Ub-WnAMyRjCsvI,14240
@@ -15,8 +15,8 @@ src/prompts/library/arrivalNotice/other/placeholders.json,sha256=1vzly1amgyKt3jr
15
15
  src/prompts/library/arrivalNotice/other/prompt.txt,sha256=QNuU-BvMA8VbdupVNapad4O3WmCotH5cKNxImRMbKDk,2906
16
16
  src/prompts/library/bookingConfirmation/evergreen/placeholders.json,sha256=5efq6b--KGWeqGbvASZFTqXJgUEAvsC-0ljo-q0Lhew,5855
17
17
  src/prompts/library/bookingConfirmation/evergreen/prompt.txt,sha256=OxNfXZaWppwsFMprthzJpOOr8ApQL4KYEmlu9fSUvxk,3485
18
- src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json,sha256=5efq6b--KGWeqGbvASZFTqXJgUEAvsC-0ljo-q0Lhew,5855
19
- src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt,sha256=I9x52U9Kt3vwiQK96KnaAhD48rEjL47jS0JDm3QT27k,5025
18
+ src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json,sha256=en83Em25e5PF2OAgFJC8w-MONVnketPZ3J_3zCjIVfE,5915
19
+ src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt,sha256=bLHQgGR9e8X4UvFpiyd1OasD00XGvUMG6HSLQy4IgQ4,5157
20
20
  src/prompts/library/bookingConfirmation/maersk/placeholders.json,sha256=5efq6b--KGWeqGbvASZFTqXJgUEAvsC-0ljo-q0Lhew,5855
21
21
  src/prompts/library/bookingConfirmation/maersk/prompt.txt,sha256=S-C5cq8AkEoGKilCO0XiXLZXgZPwz9udQOTm557GG64,3984
22
22
  src/prompts/library/bookingConfirmation/msc/placeholders.json,sha256=5efq6b--KGWeqGbvASZFTqXJgUEAvsC-0ljo-q0Lhew,5855
@@ -54,7 +54,7 @@ src/prompts/library/shippingInstruction/other/prompt.txt,sha256=CbrqlKMtB-sVY-8E
54
54
  src/prompts/prompt_library.py,sha256=VJWHeXN-s501C2GiidIIvQQuZdU6T1R27hE2dKBiI40,2555
55
55
  src/setup.py,sha256=8-vZWjC8Iwa3xxdk3iR4412VCjtNtgzVqkXcFon7UBE,7309
56
56
  src/tms.py,sha256=UXbIo1QE--hIX6NZi5Qyp2R_CP338syrY9pCTPrfgnE,1741
57
- src/utils.py,sha256=Ow5_Jals88o8mbZ1BoHfZpHZoCfig_UQb5aalH-mpWE,17278
58
- data_science_document_ai-1.57.0.dist-info/METADATA,sha256=CnYlyCERc0j4WniG4qrEYOKLhcTtEXWZw6nQ5XJEQeE,2152
59
- data_science_document_ai-1.57.0.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
60
- data_science_document_ai-1.57.0.dist-info/RECORD,,
57
+ src/utils.py,sha256=8BpuJJLiJZntZAI86cQMNa-FGjl9jbOjlCWIG27mjJo,17418
58
+ data_science_document_ai-1.58.0.dist-info/METADATA,sha256=8MWt4KlixrpV8lQhKmFo5i1UZn02o16vMaR3uEe94Js,2152
59
+ data_science_document_ai-1.58.0.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
60
+ data_science_document_ai-1.58.0.dist-info/RECORD,,
@@ -723,10 +723,45 @@ async def format_all_entities(result, document_type_code, params, mime_type):
723
723
  if document_type_code in ["partnerInvoice", "bundeskasse"]:
724
724
  await process_partner_invoice(params, aggregated_data, document_type_code)
725
725
 
726
+ if document_type_code in ["bookingConfirmation"]:
727
+ aggregated_data["legalEntity"] = await get_legal_entity(
728
+ aggregated_data.get("carrierName", {}).get("documentValue", None),
729
+ aggregated_data.get("carrierAddress", {}).get("documentValue", None),
730
+ )
731
+
726
732
  logger.info("Data Extraction completed successfully")
727
733
  return aggregated_data
728
734
 
729
735
 
736
async def get_legal_entity(name, address):
    """Look up the legal entity for a name (and optional address) via TMS mappings.

    Args:
        name (str): The name of the legal entity. Mandatory; when it is falsy
            no lookup is performed.
        address (str): The address of the legal entity. Optional; when given it
            is appended to the name as "<name> | <address>" for better matching.

    Returns:
        dict: Always a dict with the keys "documentValue" (always None) and
            "formattedValue" (the value returned by the TMS mapping for the
            built input text, or None when there is no name or no match).
    """
    # Name is mandatory for legal entity mapping. Use the same keys as the
    # successful path so callers always see one result shape.
    if not name:
        return {"documentValue": None, "formattedValue": None}

    # Only add the separator when an address is actually present.
    input_text = f"{name} | {address}" if address else name

    api_results = await get_tms_mappings(
        input_list=[input_text],
        embedding_type="legal_entities",
        input_key="partnerNameAddress",
    )

    # NOTE(review): assumes get_tms_mappings returns a dict keyed by the input
    # text when given a list; its docstring mentions "dict or string" - confirm.
    return {
        "documentValue": None,
        "formattedValue": api_results.get(input_text),
    }
763
+
764
+
730
765
  def add_text_without_space(text):
731
766
  """If the cleaned text is different from the original text, append it.
732
767
  Useful for port names like QUINHON - Quinhon"""
@@ -134,7 +134,7 @@
134
134
  "carrierAddress": {
135
135
  "type": "STRING",
136
136
  "nullable": true,
137
- "description": "The address of the carrier who provides service and issued the document."
137
+ "description": "The address of the carrier who provides service and issued the document. It can be from Germany, Poland, Italy, Vietnam, China, etc."
138
138
  },
139
139
  "carrierName": {
140
140
  "type": "STRING",
@@ -29,8 +29,10 @@ For Import Shipment: The loaded container / cargo arrives at a port of discharge
29
29
 
30
30
  - carrierName and carrierAddress:
31
31
  - Extract the name and address of the carrier who is the main parent company in the document.
32
+ - It can be found in the top section of the document, often near the logo or header.
32
33
  - Example:
33
- - "Hapag-Lloyd AG" or "Hapag-Lloyd Aktiengesellschaft" for vendorName.
34
+ - "Hapag-Lloyd" for carrierName
35
+ - Hamburg, Germany, Poland, Italy, Vietnam, China, etc. for carrierAddress
34
36
 
35
37
  - transportLegs: Multiple Transport Legs entries may exist; capture all instances under "transportLegs". The order of the legs is important, so preserve it.
36
38
  - eta: The estimated time of arrival for a specific leg.
src/utils.py CHANGED
@@ -361,7 +361,10 @@ def extract_top_pages(pdf_bytes, num_pages=4):
361
361
 
362
362
 
363
363
  async def get_tms_mappings(
364
- input_list: List[str], embedding_type: str, llm_ports: Optional[List[str]] = None
364
+ input_list: List[str],
365
+ embedding_type: str,
366
+ llm_ports: Optional[List[str]] = None,
367
+ input_key: str = None,
365
368
  ) -> Dict[str, Any]:
366
369
  """Get TMS mappings for the given values.
367
370
 
@@ -370,6 +373,7 @@ async def get_tms_mappings(
370
373
  embedding_type (str): Type of embedding to use
371
374
  (e.g., "container_types", "ports", "depots", "lineitems", "terminals").
372
375
  llm_ports (list[str], optional): List of LLM ports to use. Defaults to None.
376
+ input_key (str, optional): Key to use for input list in payload. Defaults to None.
373
377
 
374
378
  Returns:
375
379
  dict or string: A dictionary or a string with the mapping results.
@@ -389,7 +393,7 @@ async def get_tms_mappings(
389
393
  input_list = [input_list]
390
394
 
391
395
  # Always send a dict with named keys
392
- payload = {embedding_type: input_list}
396
+ payload = {input_key or embedding_type: input_list}
393
397
 
394
398
  if llm_ports:
395
399
  payload["llm_ports"] = llm_ports if isinstance(llm_ports, list) else [llm_ports]