data-science-document-ai 1.45.2__tar.gz → 1.59.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.59.0}/PKG-INFO +1 -1
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.59.0}/pyproject.toml +1 -1
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.59.0}/src/constants.py +10 -32
- data_science_document_ai-1.59.0/src/docai_processor_config.yaml +9 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.59.0}/src/excel_processing.py +7 -17
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.59.0}/src/llm.py +0 -29
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.59.0}/src/pdf_processing.py +34 -41
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.59.0}/src/postprocessing/postprocess_partner_invoice.py +99 -30
- data_science_document_ai-1.59.0/src/prompts/library/arrivalNotice/other/placeholders.json +70 -0
- data_science_document_ai-1.59.0/src/prompts/library/arrivalNotice/other/prompt.txt +40 -0
- data_science_document_ai-1.59.0/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +146 -0
- data_science_document_ai-1.59.0/src/prompts/library/bookingConfirmation/evergreen/prompt.txt +56 -0
- data_science_document_ai-1.59.0/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +146 -0
- data_science_document_ai-1.59.0/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +61 -0
- data_science_document_ai-1.59.0/src/prompts/library/bookingConfirmation/maersk/placeholders.json +146 -0
- data_science_document_ai-1.59.0/src/prompts/library/bookingConfirmation/maersk/prompt.txt +59 -0
- data_science_document_ai-1.59.0/src/prompts/library/bookingConfirmation/msc/placeholders.json +146 -0
- data_science_document_ai-1.59.0/src/prompts/library/bookingConfirmation/msc/prompt.txt +76 -0
- data_science_document_ai-1.59.0/src/prompts/library/bookingConfirmation/oocl/placeholders.json +160 -0
- data_science_document_ai-1.59.0/src/prompts/library/bookingConfirmation/oocl/prompt.txt +49 -0
- data_science_document_ai-1.59.0/src/prompts/library/bookingConfirmation/other/placeholders.json +160 -0
- data_science_document_ai-1.59.0/src/prompts/library/bookingConfirmation/other/prompt.txt +81 -0
- data_science_document_ai-1.59.0/src/prompts/library/bookingConfirmation/yangming/placeholders.json +160 -0
- data_science_document_ai-1.59.0/src/prompts/library/bookingConfirmation/yangming/prompt.txt +60 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.59.0}/src/prompts/library/bundeskasse/other/placeholders.json +5 -5
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.59.0}/src/prompts/library/bundeskasse/other/prompt.txt +6 -4
- data_science_document_ai-1.59.0/src/prompts/library/customsAssessment/other/placeholders.json +70 -0
- data_science_document_ai-1.59.0/src/prompts/library/customsAssessment/other/prompt.txt +29 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.59.0}/src/prompts/library/customsInvoice/other/prompt.txt +2 -1
- data_science_document_ai-1.59.0/src/prompts/library/deliveryOrder/other/placeholders.json +82 -0
- data_science_document_ai-1.59.0/src/prompts/library/deliveryOrder/other/prompt.txt +36 -0
- data_science_document_ai-1.59.0/src/prompts/library/draftMbl/other/placeholders.json +80 -0
- data_science_document_ai-1.59.0/src/prompts/library/draftMbl/other/prompt.txt +34 -0
- data_science_document_ai-1.59.0/src/prompts/library/finalMbL/other/placeholders.json +80 -0
- data_science_document_ai-1.59.0/src/prompts/library/finalMbL/other/prompt.txt +34 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.59.0}/src/prompts/library/partnerInvoice/other/prompt.txt +3 -4
- data_science_document_ai-1.59.0/src/prompts/library/preprocessing/carrier/placeholders.json +14 -0
- data_science_document_ai-1.59.0/src/prompts/library/shippingInstruction/other/placeholders.json +115 -0
- data_science_document_ai-1.59.0/src/prompts/library/shippingInstruction/other/prompt.txt +28 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.59.0}/src/setup.py +25 -24
- data_science_document_ai-1.45.2/src/docai_processor_config.yaml +0 -64
- data_science_document_ai-1.45.2/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +0 -32
- data_science_document_ai-1.45.2/src/prompts/library/bookingConfirmation/evergreen/prompt.txt +0 -36
- data_science_document_ai-1.45.2/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +0 -32
- data_science_document_ai-1.45.2/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +0 -65
- data_science_document_ai-1.45.2/src/prompts/library/bookingConfirmation/maersk/placeholders.json +0 -32
- data_science_document_ai-1.45.2/src/prompts/library/bookingConfirmation/maersk/prompt.txt +0 -58
- data_science_document_ai-1.45.2/src/prompts/library/bookingConfirmation/msc/placeholders.json +0 -32
- data_science_document_ai-1.45.2/src/prompts/library/bookingConfirmation/msc/prompt.txt +0 -70
- data_science_document_ai-1.45.2/src/prompts/library/bookingConfirmation/oocl/placeholders.json +0 -32
- data_science_document_ai-1.45.2/src/prompts/library/bookingConfirmation/oocl/prompt.txt +0 -16
- data_science_document_ai-1.45.2/src/prompts/library/bookingConfirmation/other/placeholders.json +0 -32
- data_science_document_ai-1.45.2/src/prompts/library/bookingConfirmation/other/prompt.txt +0 -58
- data_science_document_ai-1.45.2/src/prompts/library/bookingConfirmation/yangming/placeholders.json +0 -32
- data_science_document_ai-1.45.2/src/prompts/library/bookingConfirmation/yangming/prompt.txt +0 -62
- data_science_document_ai-1.45.2/src/prompts/library/customsAssessment/other/prompt.txt +0 -42
- data_science_document_ai-1.45.2/src/prompts/library/deliveryOrder/other/placeholders.json +0 -29
- data_science_document_ai-1.45.2/src/prompts/library/deliveryOrder/other/prompt.txt +0 -50
- data_science_document_ai-1.45.2/src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -45
- data_science_document_ai-1.45.2/src/prompts/library/draftMbl/maersk/prompt.txt +0 -19
- data_science_document_ai-1.45.2/src/prompts/library/draftMbl/other/placeholders.json +0 -80
- data_science_document_ai-1.45.2/src/prompts/library/draftMbl/other/prompt.txt +0 -44
- data_science_document_ai-1.45.2/src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -44
- data_science_document_ai-1.45.2/src/prompts/library/finalMbL/maersk/prompt.txt +0 -19
- data_science_document_ai-1.45.2/src/prompts/library/finalMbL/other/prompt.txt +0 -44
- data_science_document_ai-1.45.2/src/prompts/library/preprocessing/carrier/placeholders.json +0 -30
- data_science_document_ai-1.45.2/src/prompts/library/shippingInstruction/other/prompt.txt +0 -16
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.59.0}/src/constants_sandbox.py +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.59.0}/src/docai.py +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.59.0}/src/io.py +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.59.0}/src/log_setup.py +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.59.0}/src/postprocessing/common.py +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.59.0}/src/postprocessing/postprocess_booking_confirmation.py +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.59.0}/src/postprocessing/postprocess_commercial_invoice.py +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.59.0}/src/prompts/library/commercialInvoice/other/placeholders.json +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.59.0}/src/prompts/library/commercialInvoice/other/prompt.txt +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.59.0}/src/prompts/library/customsInvoice/other/placeholders.json +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.59.0}/src/prompts/library/packingList/other/placeholders.json +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.59.0}/src/prompts/library/packingList/other/prompt.txt +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.59.0}/src/prompts/library/partnerInvoice/other/placeholders.json +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.59.0}/src/prompts/library/postprocessing/port_code/placeholders.json +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.59.0}/src/prompts/library/postprocessing/port_code/prompt_port_code.txt +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.59.0}/src/prompts/library/preprocessing/carrier/prompt.txt +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.59.0}/src/prompts/prompt_library.py +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.59.0}/src/tms.py +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.59.0}/src/utils.py +0 -0
{data_science_document_ai-1.45.2 → data_science_document_ai-1.59.0}/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "data-science-document-ai"
-version = "1.45.2"
+version = "1.59.0"
 description = "\"Document AI repo for data science\""
 authors = ["Naomi Nguyen <naomi.nguyen@forto.com>", "Kumar Rajendrababu <kumar.rajendrababu@forto.com>", "Igor Tonko <igor.tonko@forto.com>", "Osman Demirel <osman.demirel@forto.com>"]
 packages = [
{data_science_document_ai-1.45.2 → data_science_document_ai-1.59.0}/src/constants.py
@@ -20,10 +20,11 @@ project_parameters = {
     # Fuzzy lookup
     "g_model_fuzzy_lookup_folder": "fuzzy_lookup",
     "item_code_lookup": "line_item_kvp_table.json",
+    "intermodal_partners": "intermodal_partners.json",
     "invoice_classification_lookup": "invoice_classification.json",
     "reverse_charge_sentence_lookup": "reverse_charge_sentences.json",
     # Fuzzy logic params
-    "fuzzy_threshold_item_code":
+    "fuzzy_threshold_item_code": 92,
     "fuzzy_threshold_reverse_charge": 80,
     "fuzzy_threshold_invoice_classification": 70,
     # Chunking params
@@ -36,6 +37,8 @@ project_parameters = {
     # models metadata (confidence),
     "g_model_data_folder": "models",
     "local_model_data_folder": "data",
+    "if_use_docai": False,
+    "if_use_llm": True,  # Keep it always True
     "released_doc_types": {
         "bookingConfirmation",
         "packingList",
@@ -50,23 +53,6 @@ project_parameters = {
         "customsInvoice",
         "bundeskasse",
     },
-    "model_selector": {
-        "stable": {
-            "bookingConfirmation": 1,
-            "finalMbL": 0,
-            "draftMbl": 0,
-            "arrivalNotice": 0,
-            "shippingInstruction": 0,
-            "customsAssessment": 0,
-            "deliveryOrder": 0,
-            "partnerInvoice": 0,
-        },
-        "beta": {
-            "bookingConfirmation": 0,
-        },
-    },
-    # this is the model selector for the model to be used from the model_config.yaml
-    # file based on the environment, 0 mean the first model in the list
     # LLM model parameters
     "gemini_params": {
         "temperature": 0,
@@ -84,23 +70,15 @@ project_parameters = {
         "seed": 42,
         "model_id": "gemini-2.5-flash",
     },
-    # Key to combine the LLM results with the Doc Ai results
-    "key_to_combine": {
-        "bookingConfirmation": ["transportLegs"],
-        "finalMbL": ["containers"],
-        "draftMbl": ["containers"],
-        "customsAssessment": ["containers"],
-        "packingList": ["skuData"],
-        "commercialInvoice": ["skus"],
-        "shippingInstruction": ["containers"],
-        "partnerInvoice": ["lineItem"],
-        "customsInvoice": ["lineItem"],
-        "bundeskasse": ["lineItem"],
-    },
 }
 
 # Hardcoded rules for data points formatting that can't be based on label name alone
 formatting_rules = {
-    "bookingConfirmation": {
+    "bookingConfirmation": {
+        "pickUpDepotCode": "depot",
+        "dropOffDepotCode": "depot",
+        "gateInTerminalCode": "terminal",
+        "pickUpTerminalCode": "terminal",
+    },
     "deliveryOrder": {"pickUpTerminal": "terminal", "EmptyContainerDepot": "depot"},
 }
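The new `formatting_rules` entries above are keyed by document type and label. A minimal sketch of how such a table can be consulted; the `rule_for` helper is hypothetical and only illustrates the lookup, it is not part of the package:

```python
# Mirror of the formatting_rules dict from the diff above.
formatting_rules = {
    "bookingConfirmation": {
        "pickUpDepotCode": "depot",
        "dropOffDepotCode": "depot",
        "gateInTerminalCode": "terminal",
        "pickUpTerminalCode": "terminal",
    },
    "deliveryOrder": {"pickUpTerminal": "terminal", "EmptyContainerDepot": "depot"},
}


def rule_for(doc_type: str, label: str):
    """Return the formatting rule ('depot' / 'terminal') for a label, or None."""
    return formatting_rules.get(doc_type, {}).get(label)
```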
{data_science_document_ai-1.45.2 → data_science_document_ai-1.59.0}/src/excel_processing.py
@@ -11,9 +11,8 @@ import asyncio
 import numpy as np
 import pandas as pd
 
-from src.llm import prompt_excel_extraction
 from src.prompts.prompt_library import prompt_library
-from src.utils import estimate_page_count,
+from src.utils import estimate_page_count, get_excel_sheets
 
 
 async def extract_data_from_sheet(
@@ -29,11 +28,14 @@ async def extract_data_from_sheet(
     )
 
     # Prompt for the LLM JSON
-
+    prompt = prompt_library.library[doc_type]["other"]["prompt"]
+
+    # Join the worksheet content with the prompt
+    prompt = worksheet + "\n" + prompt
 
     try:
         result = await llm_client.get_unified_json_genai(
-
+            prompt,
             response_schema=response_schema,
             doc_type=doc_type,
         )
@@ -67,19 +69,7 @@ async def extract_data_from_excel(
 
     """
     # Generate the response structure
-    response_schema = (
-        prompt_library.library[input_doc_type]["other"]["placeholders"]
-        if input_doc_type
-        in [
-            "partnerInvoice",
-            "customsInvoice",
-            "bundeskasse",
-            "commercialInvoice",
-            "packingList",
-            "bookingConfirmation",
-        ]
-        else generate_schema_structure(params, input_doc_type)
-    )
+    response_schema = prompt_library.library[input_doc_type]["other"]["placeholders"]
 
     # Load the Excel file and get ONLY the "visible" sheet names
     sheets, workbook = get_excel_sheets(file_content, mime_type)
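The change above drops the old `prompt_excel_extraction` wrapper and instead prepends the worksheet's structured text directly to the library prompt. A tiny self-contained sketch of that concatenation; the helper name is illustrative, not from the package:

```python
def build_sheet_prompt(worksheet_text: str, library_prompt: str) -> str:
    """Prepend the sheet's structured text to the doc-type prompt,
    mirroring `prompt = worksheet + "\n" + prompt` in the diff."""
    return worksheet_text + "\n" + library_prompt


prompt = build_sheet_prompt("A1: Invoice No | B1: 42", "Extract fields as JSON.")
```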
{data_science_document_ai-1.45.2 → data_science_document_ai-1.59.0}/src/llm.py
@@ -201,33 +201,4 @@ class LlmClient:
         return response
 
 
-def prompt_excel_extraction(excel_structured_text):
-    """Write a prompt to extract data from Excel files.
-
-    Args:
-        excel_structured_text (str): The structured text of the Excel file.
-
-    Returns:
-        prompt str: The prompt for common json.
-    """
-    prompt = f"""{excel_structured_text}
-
-    Task: Fill in the following dictionary from the information in the given in the above excel data.
-
-    Instructions:
-    - Do not change the keys of the following dictionary.
-    - The values should be filled in as per the schema provided below.
-    - If an entity contains a 'display_name', consider its properties as child data points in the below format.
-    {{'data-field': {{
-        'child-data-field': 'type -occurrence_type- description',
-        }}
-    }}
-    - The entity with 'display_name' can be extracted multiple times. Please pay attention to the occurrence_type.
-    - Ensure the schema reflects the hierarchical relationship.
-    - Use the data field description to understand the context of the data.
-
-    """
-    return prompt
-
-
 # pylint: enable=all
{data_science_document_ai-1.45.2 → data_science_document_ai-1.59.0}/src/pdf_processing.py
@@ -32,7 +32,6 @@ from src.postprocessing.postprocess_partner_invoice import (
 from src.prompts.prompt_library import prompt_library
 from src.utils import (
     extract_top_pages,
-    generate_schema_structure,
     get_pdf_page_count,
     get_processor_name,
     run_background_tasks,
@@ -202,21 +201,6 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
     number_of_pages = get_pdf_page_count(file_content)
     logger.info(f"processing {input_doc_type} with {number_of_pages} pages...")
 
-    # get the schema placeholder from the Doc AI and generate the response structure
-    response_schema = (
-        prompt_library.library[input_doc_type]["other"]["placeholders"]
-        if input_doc_type
-        in [
-            "partnerInvoice",
-            "customsInvoice",
-            "bundeskasse",
-            "commercialInvoice",
-            "packingList",
-            "bookingConfirmation",
-        ]
-        else generate_schema_structure(params, input_doc_type)
-    )
-
     carrier = "other"
     carrier_schema = (
         prompt_library.library.get("preprocessing", {})
@@ -253,6 +237,9 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
     # get the related prompt from predefined prompt library
     prompt = prompt_library.library[input_doc_type][carrier]["prompt"]
 
+    # get the schema placeholder
+    response_schema = prompt_library.library[input_doc_type][carrier]["placeholders"]
+
     # Add page-number extraction for moderately large docs
     use_chunking = number_of_pages >= params["chunk_after"]
 
@@ -270,7 +257,11 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
     ):
         tasks.append(
             process_chunk_with_retry(
-                chunk,
+                chunk,
+                prompt,
+                response_schema,
+                llm_client,
+                input_doc_type,
             )
         )
 
@@ -362,8 +353,7 @@ async def extract_data_from_pdf_w_llm(params, input_doc_type, file_content, llm_
     # Add currency from the amount field
     if input_doc_type in ["commercialInvoice"]:
         result = postprocessing_commercial_invoice(result, params, input_doc_type)
-
-    result = postprocess_booking_confirmation(result)
+
     return result, llm_client.model_id
 
 
@@ -382,13 +372,14 @@ def combine_llm_results_w_doc_ai(
     Returns:
         combined result
     """
-    result =
-
-
+    result = remove_none_values(llm)
+
+    docAi = doc_ai.copy()
+    if not docAi:
         return result
 
     # Merge top-level keys
-    result.update({k: v for k, v in
+    result.update({k: v for k, v in docAi.items() if k not in result})
 
     if (
         input_doc_type
@@ -396,28 +387,28 @@ def combine_llm_results_w_doc_ai(
         and keys_to_combine
     ):
         result.update(
-            {key:
+            {key: docAi.get(key) for key in keys_to_combine if key in docAi.keys()}
         )
         return result
 
     # Handle specific key-based merging logic for multiple keys
     if keys_to_combine:
         for key in keys_to_combine:
-            if key in
+            if key in docAi.keys():
                 # Merge the list of dictionaries
-                # If the length of the
-                if len(
-                    result[key] =
+                # If the length of the docAi list is less than the LLM result, replace with the docAi list
+                if len(docAi[key]) < len(result[key]):
+                    result[key] = docAi[key]
                 else:
-                    # If the length of the
+                    # If the length of the docAi list is greater than or equal to the LLM result,
                     # add & merge the dictionaries
-                    if isinstance(
-                        for i in range(len(
+                    if isinstance(docAi[key], list):
+                        for i in range(len(docAi[key])):
                             if i == len(result[key]):
-                                result[key].append(
+                                result[key].append(docAi[key][i])
                             else:
-                                for sub_key in
-                                    result[key][i][sub_key] =
+                                for sub_key in docAi[key][i].keys():
+                                    result[key][i][sub_key] = docAi[key][i][sub_key]
     return result
 
 
@@ -511,13 +502,15 @@ async def data_extraction_manual_flow(
     page_count = None
     # Validate the file type
     if mime_type == "application/pdf":
+        if_use_docai = params["if_use_docai"]
+
         # Enable Doc Ai only for certain document types.
-        if_use_docai
-
-
-
-
-
+        if params["if_use_docai"]:
+            if_use_docai = (
+                True
+                if meta.documentTypeCode in params["model_config"]["stable"]
+                else False
+            )
 
         (
             extracted_data,
@@ -529,7 +522,7 @@ async def data_extraction_manual_flow(
             meta.documentTypeCode,
             processor_client,
             if_use_docai=if_use_docai,
-            if_use_llm=if_use_llm,
+            if_use_llm=params["if_use_llm"],
             llm_client=llm_client,
             isBetaTest=False,
         )
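The new body of `combine_llm_results_w_doc_ai` merges Doc AI output into the LLM result: LLM values win on top-level keys, and for each key in `keys_to_combine` the shorter Doc AI list replaces the LLM list, otherwise Doc AI sub-keys overwrite LLM sub-keys element-wise. A self-contained sketch of those rules; `remove_none_values` is approximated shallowly and the early-return doc-type branch is omitted, so treat this as an illustration, not the package function:

```python
def combine(llm: dict, doc_ai: dict, keys_to_combine: list) -> dict:
    """Sketch of the merge semantics in combine_llm_results_w_doc_ai."""
    # Shallow stand-in for remove_none_values(llm).
    result = {k: v for k, v in llm.items() if v is not None}
    if not doc_ai:
        return result

    # Top-level keys: keep LLM values, add Doc AI keys the LLM missed.
    result.update({k: v for k, v in doc_ai.items() if k not in result})

    for key in keys_to_combine:
        if key not in doc_ai:
            continue
        if len(doc_ai[key]) < len(result[key]):
            # Shorter Doc AI list replaces the LLM list wholesale.
            result[key] = doc_ai[key]
        elif isinstance(doc_ai[key], list):
            # Otherwise merge element-wise; Doc AI sub-keys overwrite.
            for i, item in enumerate(doc_ai[key]):
                if i == len(result[key]):
                    result[key].append(item)
                else:
                    result[key][i].update(item)
    return result
```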
{data_science_document_ai-1.45.2 → data_science_document_ai-1.59.0}/src/postprocessing/postprocess_partner_invoice.py
@@ -1,4 +1,6 @@
 """This module contains the postprocessing functions for the partner invoice."""
+from collections import defaultdict
+
 from rapidfuzz import fuzz, process
 
 from src.io import logger
@@ -103,9 +105,18 @@ def post_process_bundeskasse(aggregated_data):
         )
 
         # Check if the deferredDutyPayer is forto
-
-
-
+        KEYWORDS = {"de789147263644738", "forto", "009812"}
+
+        def is_forto_recipient(line_item: dict) -> bool:
+            values_to_check = [
+                line_item.get("deferredDutyPayer", {}).get("documentValue", ""),
+                line_item.get("vatId", {}).get("documentValue", ""),
+            ]
+
+            combined = " ".join(values_to_check).lower()
+            return any(keyword in combined for keyword in KEYWORDS)
+
+        if is_forto_recipient(line_item):
             is_recipient_forto = True
 
     update_recipient_and_vendor(aggregated_data, is_recipient_forto)
@@ -134,6 +145,20 @@ def update_recipient_and_vendor(aggregated_data, is_recipient_forto):
     ] = "Dasbachstraße 15, 54292 Trier, Germany"
 
 
+def select_unique_bank_account(bank_account):
+    # Select the unique bank account if multiple are present
+    if isinstance(bank_account, list) and bank_account:
+        best = defaultdict(lambda: None)
+
+        for item in bank_account:
+            dv = item["documentValue"]
+            if best[dv] is None or item["page"] < best[dv]["page"]:
+                best[dv] = item
+
+        unique = list(best.values())
+        return unique
+
+
 async def process_partner_invoice(params, aggregated_data, document_type_code):
     """Process the partner invoice data."""
     # Post process bundeskasse invoices
@@ -141,6 +166,11 @@ async def process_partner_invoice(params, aggregated_data, document_type_code):
         post_process_bundeskasse(aggregated_data)
         return
 
+    if "bankAccount" in aggregated_data:
+        aggregated_data["bankAccount"] = select_unique_bank_account(
+            aggregated_data["bankAccount"]
+        )
+
     line_items = aggregated_data.get("lineItem", [])
     # Add debug logging
     logger.info(f"Processing partnerInvoice with {len(line_items)} line items")
@@ -158,15 +188,20 @@ async def process_partner_invoice(params, aggregated_data, document_type_code):
         reverse_charge_info["formattedValue"] = reverse_charge_value
     reverse_charge = aggregated_data.pop("reverseChargeSentence", None)
 
+    # Partner Name
+    partner_name = aggregated_data.get("vendorName", {}).get("documentValue", None)
+
     # Process everything in one go
-    processed_items = await process_line_items_batch(
+    processed_items = await process_line_items_batch(
+        params, line_items, reverse_charge, partner_name
+    )
 
     # Update your main data structure
     aggregated_data["lineItem"] = processed_items
 
 
 async def process_line_items_batch(
-    params: dict, line_items: list[dict], reverse_charge=None
+    params: dict, line_items: list[dict], reverse_charge=None, partner_name=None
 ):
     """
     Processes all line items efficiently using a "Split-Apply-Combine" strategy.
@@ -204,23 +239,12 @@ async def process_line_items_batch(
 
     # Batch API Call for Embedding lookups
     if pending_line_items:
-
-        logger.info(f"Mapping {len(values_to_fetch)} line items from Embedding API...")
-
-        # Await the batch response {"desc1": "code1", "desc2": "code2"}
-        api_results = await get_tms_mappings(
-            input_list=values_to_fetch, embedding_type="line_items"
-        )
+        code_map = await fetch_line_item_codes(pending_line_items, partner_name, params)
 
-        # Merge API results back into original list
         for index, desc in pending_line_items.items():
-            # Get result from API response, or None if API failed for that item
-            forto_code = api_results.get(desc)
-
-            # Update the original item
             line_items[index]["itemCode"] = {
                 "documentValue": desc,
-                "formattedValue":
+                "formattedValue": code_map.get(desc),
                 "page": line_items[index]["lineItemDescription"].get("page"),
             }
@@ -229,8 +253,12 @@ async def process_line_items_batch(
     [
         item.update({"reverseChargeSentence": reverse_charge})
         for item in line_items
-        if
+        if (
+            (item.get("itemCode") and item["itemCode"]["formattedValue"] != "CDU")
+            or not item.get("itemCode")
+        )
     ]
+
     return line_items
 
 
@@ -272,11 +300,14 @@ def if_reverse_charge_sentence(sentence: str, params):
         return False
 
     # Check if the sentence is similar to any of the reverse charge sentences
-
-        sentence, reverse_charge_sentences, threshold
+    match, _ = get_fuzzy_match_score(
+        sentence, list(reverse_charge_sentences.keys()), threshold
     )
 
-
+    if match:
+        return reverse_charge_sentences[match]
+
+    return False
 
 
 def find_matching_lineitem(new_lineitem: str, kvp_dict: dict, threshold=90):
@@ -307,12 +338,13 @@ def find_matching_lineitem(new_lineitem: str, kvp_dict: dict, threshold=90):
         return None
 
 
-async def associate_forto_item_code(line_item_data, params):
+async def associate_forto_item_code(line_item_data, params, partner_name=None):
     """
     Associates Forto item codes to a list of line item descriptions.
     Args:
        line_item_data (dict): A dictionary where keys are original descriptions and values are cleaned descriptions.
        params (dict): Parameters containing lookup data and thresholds.
+       partner_name (str, optional): The name of the partner for context in matching. Defaults to None.
 
     Returns:
         list: A list of dictionaries with 'description' and 'itemCode' keys.
@@ -334,14 +366,51 @@ async def associate_forto_item_code(line_item_data, params):
 
     # Batch API Call for Embedding lookups
     if pending_line_items:
-
-            input_list=list(pending_line_items.values()),
-            embedding_type="line_items",
-        )
+        code_map = await fetch_line_item_codes(pending_line_items, partner_name, params)
 
-        # Merge API results back into original list
         for desc, f_desc in pending_line_items.items():
-
-
+            result.append(
+                {
+                    "description": desc,
+                    "itemCode": code_map.get(f_desc),
+                }
+            )
+
+    return result
+
+
+async def fetch_line_item_codes(
+    pending_line_items: dict,
+    partner_name: str | None,
+    params: dict,
+):
+    """Returns: {original_description: mapped_code_or_None}"""
+    t_mode = (
+        find_matching_lineitem(
+            partner_name.upper(),
+            params["lookup_data"]["intermodal_partners"],
+            threshold=87,
+        )
+        if partner_name
+        else None
+    )
 
+    unique_descs = list(set(pending_line_items.values()))
+    logger.info(f"Mapping {len(unique_descs)} line items from Embedding API...")
+
+    # Build API input map
+    api_input_map = {
+        desc: f"{t_mode} - {desc}" if t_mode else desc for desc in unique_descs
+    }
+
+    api_results = await get_tms_mappings(
+        input_list=list(api_input_map.values()),
+        embedding_type="line_items",
+    )
+
+    # Normalize response back to original descriptions
+    result = {
+        original_desc: api_results.get(api_desc)
+        for original_desc, api_desc in api_input_map.items()
+    }
     return result
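The `select_unique_bank_account` helper added above deduplicates bank-account entries by `documentValue`, keeping the occurrence from the earliest page. This standalone sketch reproduces that logic so the behavior can be exercised directly:

```python
from collections import defaultdict


def select_unique_bank_account(bank_account):
    """Deduplicate bank-account entries by documentValue,
    preferring the entry seen on the earliest page."""
    if isinstance(bank_account, list) and bank_account:
        best = defaultdict(lambda: None)
        for item in bank_account:
            dv = item["documentValue"]
            if best[dv] is None or item["page"] < best[dv]["page"]:
                best[dv] = item
        return list(best.values())


accounts = [
    {"documentValue": "DE89...", "page": 2},
    {"documentValue": "DE89...", "page": 1},
    {"documentValue": "FR14...", "page": 3},
]
unique = select_unique_bank_account(accounts)
```

Note that, as in the diff, a non-list or empty input falls through and the function returns `None` rather than an empty list.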
@@ -0,0 +1,70 @@
+{
+    "type": "OBJECT",
+    "properties": {
+        "bookingNumber": {
+            "type": "STRING",
+            "nullable": true,
+            "description": "The booking number associated with the Arrival Notice document. They are often referred to as 'Booking Number', 'Booking No.', 'Booking Ref.', 'Booking Reference', 'Booking ID', 'carrier's reference' or 'Order Ref'."
+        },
+        "destinationTerminal": {
+            "type": "STRING",
+            "nullable": true,
+            "description": "The terminal at the destination port where the container will be delivered."
+        },
+        "eta": {
+            "type": "STRING",
+            "nullable": true,
+            "description": "Estimated Time of Arrival (ETA) is the expected date when the shipment will arrive at its destination."
+        },
+        "mblNumber": {
+            "type": "STRING",
+            "nullable": true,
+            "description": "Bill of Lading number (B/L NO.), a document issued by the carrier."
+        },
+        "portOfDischarge": {
+            "type": "STRING",
+            "nullable": true,
+            "description": "The port where the goods are discharged from the vessel. This is the destination port for the shipment."
+        },
+        "vesselName": {
+            "type": "STRING",
+            "nullable": true,
+            "description": "The name of the vessel carrying the shipment."
+        },
+        "containers": {
+            "type": "ARRAY",
+            "items": {
+                "type": "OBJECT",
+                "properties": {
+                    "containerNumber": {
+                        "type": "STRING",
+                        "nullable": true,
+                        "description": "The unique identifier for each container. It always starts with 4 capital letters followed by 7 digits. Example: TEMU7972458."
+                    },
+                    "containerType": {
+                        "type": "STRING",
+                        "nullable": true,
+                        "description": "The size of the container associated with the containerNumber, such as 20ft, 40ft, 40HC, 20DC, etc."
+                    },
+                    "grossWeight": {
+                        "type": "STRING",
+                        "nullable": true,
+                        "description": "The gross weight of the container. Usually mentioned as G.W., GW, Gross Weight, etc."
+                    },
+                    "measurements": {
+                        "type": "STRING",
+                        "nullable": true,
+                        "description": "The volume of the container. Usually measured in cubic meters (cbm) or as dimensions, but volume in cbm is preferred."
+                    },
+                    "sealNumber": {
+                        "type": "STRING",
+                        "nullable": true,
+                        "description": "The seal number associated with the container number, but it is not the same as the container number."
+                    }
+                },
+                "required": ["containerNumber", "containerType", "grossWeight"]
+            }
+        }
+    },
+    "required": ["bookingNumber", "destinationTerminal", "eta", "portOfDischarge", "vesselName", "containers"]
+}
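A schema like this placeholders.json is meant to be passed as the model's response schema; a hypothetical downstream sanity check (the field names and the 4-letters-plus-7-digits rule come from the schema above, while the `check_response` helper itself is illustrative and not part of the package) might look like:

```python
import json
import re

# Top-level required fields, copied from the schema's "required" list.
SCHEMA_REQUIRED = [
    "bookingNumber", "destinationTerminal", "eta",
    "portOfDischarge", "vesselName", "containers",
]


def check_response(payload: dict) -> list:
    # Collect problems instead of raising, so callers can log them all.
    problems = [k for k in SCHEMA_REQUIRED if k not in payload]
    for container in payload.get("containers", []):
        num = container.get("containerNumber") or ""
        # 4 capital letters + 7 digits, per the containerNumber description;
        # spaces are stripped to tolerate forms like "CAIU 7222892".
        if num and not re.fullmatch(r"[A-Z]{4}\d{7}", num.replace(" ", "")):
            problems.append(f"bad containerNumber: {num}")
    return problems


sample = json.loads(
    '{"bookingNumber": "BK123", "destinationTerminal": null, '
    '"eta": "2024-05-01", "portOfDischarge": "Hamburg", '
    '"vesselName": "MSC OSCAR", '
    '"containers": [{"containerNumber": "TEMU7972458"}]}'
)
print(check_response(sample))  # []
```

Real enforcement of types and `nullable` would come from the LLM provider's structured-output feature; this only spot-checks what the schema cannot express, such as the container-number pattern.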
@@ -0,0 +1,40 @@
+<PERSONA> You are an efficient document entity data extraction specialist working for a Freight Forwarding company. </PERSONA>
+
+<TASK> Your task is to extract data from Arrival Notice documents as per the given response schema structure. </TASK>
+
+<CONTEXT>
+The Freight Forwarding company receives Arrival Notices from shipping lines.
+These documents contain various details related to the arrival of a shipment at the destination port, such as container numbers, estimated time of arrival, vessel details, and container information.
+They may be written in different languages, such as English, German, or Italian, and can appear in a variety of formats and layouts.
+Your role is to accurately extract specific entities from these Arrival Notices to support efficient processing and accurate record-keeping.
+</CONTEXT>
+
+<INSTRUCTIONS>
+- Populate fields as defined in the response schema.
+- Multiple container entries may exist; capture all instances under "containers".
+- Use the data field description to understand the context of the data.
+
+- bookingNumbers:
+    - Booking numbers are unique identifiers for shipments. They are often referred to as "Booking Number", "Booking No.", "Booking Ref.", "Booking Reference", "Booking ID", "SACO-Pos.", "Order Ref", "Unsere Referenz", or "Unsere Position".
+    - If there is a unique_id that starts with "S" followed by 6 or 8 digits, it is a shipmentID, not a bookingNumber.
+
+- destinationTerminal:
+    - Destination Terminal can also be referred to as "Destination Terminal", "Pickup Location", "Delivery Location", "Delivery Terminal", "Empfangsort", "Entladeort", or "Abladestelle".
+
+- mblNumbers:
+    - Commonly known as "Bill of Lading Number", "BILL OF LADING NO.", "BL Number", "BL No.", "B/L No.", "BL-Nr.", "B/L", "HBL No.", or "M-AWB Nummer".
+    - The Bill of Lading number maps to mblNumber; it is not a shipmentID even if it starts with "S".
+    - mblNumber from Hapag-Lloyd always starts with "HLC..." (e.g., "HLCUTS12303AWNT3") and is labelled "SEA WAYBILL" or "SWB-NR.".
+
+- eta:
+    - Estimated Time of Arrival (ETA) is the expected date and time when the shipment will arrive at the destination port.
+    - It can be referred to as "ETA", "Estimated Arrival", "Voraussichtliche Ankunft", "Ankunftszeit", "Arrivo", or "Due to arrive at Terminal".
+
+- vesselName:
+    - Vessel Name is the name of the ship carrying the cargo. It can be referred to as "Vessel", "Ship Name", "Schiff", "Schiffsname", "Nave", or "Vessel/Flight No.".
+
+- containers: Details of each container on the arrival notice. Make sure to extract each container's information separately.
+    - containerNumber: Container Number consists of 4 capital letters followed by 7 digits (e.g., TEMU7972458, CAIU 7222892).
+    - sealNumber: Seal numbers are unique identifiers for shipping seals. They are usually mentioned as seal numbers in the document, but they are definitely not container numbers.
+
+</INSTRUCTIONS>