data-science-document-ai 1.51.0__tar.gz → 1.60.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. {data_science_document_ai-1.51.0 → data_science_document_ai-1.60.1}/PKG-INFO +1 -1
  2. {data_science_document_ai-1.51.0 → data_science_document_ai-1.60.1}/pyproject.toml +1 -1
  3. {data_science_document_ai-1.51.0 → data_science_document_ai-1.60.1}/src/constants.py +10 -27
  4. data_science_document_ai-1.60.1/src/docai_processor_config.yaml +9 -0
  5. {data_science_document_ai-1.51.0 → data_science_document_ai-1.60.1}/src/pdf_processing.py +34 -29
  6. {data_science_document_ai-1.51.0 → data_science_document_ai-1.60.1}/src/postprocessing/postprocess_partner_invoice.py +82 -26
  7. data_science_document_ai-1.60.1/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +146 -0
  8. data_science_document_ai-1.60.1/src/prompts/library/bookingConfirmation/evergreen/prompt.txt +62 -0
  9. data_science_document_ai-1.60.1/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +146 -0
  10. data_science_document_ai-1.60.1/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +61 -0
  11. data_science_document_ai-1.60.1/src/prompts/library/bookingConfirmation/maersk/placeholders.json +146 -0
  12. data_science_document_ai-1.60.1/src/prompts/library/bookingConfirmation/maersk/prompt.txt +60 -0
  13. data_science_document_ai-1.60.1/src/prompts/library/bookingConfirmation/msc/placeholders.json +146 -0
  14. data_science_document_ai-1.60.1/src/prompts/library/bookingConfirmation/msc/prompt.txt +76 -0
  15. data_science_document_ai-1.60.1/src/prompts/library/bookingConfirmation/oocl/placeholders.json +160 -0
  16. data_science_document_ai-1.60.1/src/prompts/library/bookingConfirmation/oocl/prompt.txt +49 -0
  17. data_science_document_ai-1.60.1/src/prompts/library/bookingConfirmation/other/placeholders.json +160 -0
  18. data_science_document_ai-1.60.1/src/prompts/library/bookingConfirmation/other/prompt.txt +83 -0
  19. data_science_document_ai-1.60.1/src/prompts/library/bookingConfirmation/yangming/placeholders.json +160 -0
  20. data_science_document_ai-1.60.1/src/prompts/library/bookingConfirmation/yangming/prompt.txt +60 -0
  21. {data_science_document_ai-1.51.0 → data_science_document_ai-1.60.1}/src/prompts/library/customsInvoice/other/placeholders.json +1 -1
  22. {data_science_document_ai-1.51.0 → data_science_document_ai-1.60.1}/src/prompts/library/customsInvoice/other/prompt.txt +8 -3
  23. {data_science_document_ai-1.51.0 → data_science_document_ai-1.60.1}/src/prompts/library/partnerInvoice/other/placeholders.json +1 -1
  24. {data_science_document_ai-1.51.0 → data_science_document_ai-1.60.1}/src/prompts/library/partnerInvoice/other/prompt.txt +9 -6
  25. data_science_document_ai-1.60.1/src/prompts/library/preprocessing/carrier/prompt.txt +5 -0
  26. {data_science_document_ai-1.51.0 → data_science_document_ai-1.60.1}/src/setup.py +17 -9
  27. data_science_document_ai-1.51.0/src/docai_processor_config.yaml +0 -22
  28. data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +0 -32
  29. data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/evergreen/prompt.txt +0 -36
  30. data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +0 -32
  31. data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +0 -65
  32. data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/maersk/placeholders.json +0 -32
  33. data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/maersk/prompt.txt +0 -58
  34. data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/msc/placeholders.json +0 -32
  35. data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/msc/prompt.txt +0 -70
  36. data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/oocl/placeholders.json +0 -32
  37. data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/oocl/prompt.txt +0 -16
  38. data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/other/placeholders.json +0 -32
  39. data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/other/prompt.txt +0 -58
  40. data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/yangming/placeholders.json +0 -32
  41. data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/yangming/prompt.txt +0 -62
  42. data_science_document_ai-1.51.0/src/prompts/library/preprocessing/carrier/prompt.txt +0 -4
  43. {data_science_document_ai-1.51.0 → data_science_document_ai-1.60.1}/src/constants_sandbox.py +0 -0
  44. {data_science_document_ai-1.51.0 → data_science_document_ai-1.60.1}/src/docai.py +0 -0
  45. {data_science_document_ai-1.51.0 → data_science_document_ai-1.60.1}/src/excel_processing.py +0 -0
  46. {data_science_document_ai-1.51.0 → data_science_document_ai-1.60.1}/src/io.py +0 -0
  47. {data_science_document_ai-1.51.0 → data_science_document_ai-1.60.1}/src/llm.py +0 -0
  48. {data_science_document_ai-1.51.0 → data_science_document_ai-1.60.1}/src/log_setup.py +0 -0
  49. {data_science_document_ai-1.51.0 → data_science_document_ai-1.60.1}/src/postprocessing/common.py +0 -0
  50. {data_science_document_ai-1.51.0 → data_science_document_ai-1.60.1}/src/postprocessing/postprocess_booking_confirmation.py +0 -0
  51. {data_science_document_ai-1.51.0 → data_science_document_ai-1.60.1}/src/postprocessing/postprocess_commercial_invoice.py +0 -0
  52. {data_science_document_ai-1.51.0 → data_science_document_ai-1.60.1}/src/prompts/library/arrivalNotice/other/placeholders.json +0 -0
  53. {data_science_document_ai-1.51.0 → data_science_document_ai-1.60.1}/src/prompts/library/arrivalNotice/other/prompt.txt +0 -0
  54. {data_science_document_ai-1.51.0 → data_science_document_ai-1.60.1}/src/prompts/library/bundeskasse/other/placeholders.json +0 -0
  55. {data_science_document_ai-1.51.0 → data_science_document_ai-1.60.1}/src/prompts/library/bundeskasse/other/prompt.txt +0 -0
  56. {data_science_document_ai-1.51.0 → data_science_document_ai-1.60.1}/src/prompts/library/commercialInvoice/other/placeholders.json +0 -0
  57. {data_science_document_ai-1.51.0 → data_science_document_ai-1.60.1}/src/prompts/library/commercialInvoice/other/prompt.txt +0 -0
  58. {data_science_document_ai-1.51.0 → data_science_document_ai-1.60.1}/src/prompts/library/customsAssessment/other/placeholders.json +0 -0
  59. {data_science_document_ai-1.51.0 → data_science_document_ai-1.60.1}/src/prompts/library/customsAssessment/other/prompt.txt +0 -0
  60. {data_science_document_ai-1.51.0 → data_science_document_ai-1.60.1}/src/prompts/library/deliveryOrder/other/placeholders.json +0 -0
  61. {data_science_document_ai-1.51.0 → data_science_document_ai-1.60.1}/src/prompts/library/deliveryOrder/other/prompt.txt +0 -0
  62. {data_science_document_ai-1.51.0 → data_science_document_ai-1.60.1}/src/prompts/library/draftMbl/other/placeholders.json +0 -0
  63. {data_science_document_ai-1.51.0 → data_science_document_ai-1.60.1}/src/prompts/library/draftMbl/other/prompt.txt +0 -0
  64. {data_science_document_ai-1.51.0 → data_science_document_ai-1.60.1}/src/prompts/library/finalMbL/other/placeholders.json +0 -0
  65. {data_science_document_ai-1.51.0 → data_science_document_ai-1.60.1}/src/prompts/library/finalMbL/other/prompt.txt +0 -0
  66. {data_science_document_ai-1.51.0 → data_science_document_ai-1.60.1}/src/prompts/library/packingList/other/placeholders.json +0 -0
  67. {data_science_document_ai-1.51.0 → data_science_document_ai-1.60.1}/src/prompts/library/packingList/other/prompt.txt +0 -0
  68. {data_science_document_ai-1.51.0 → data_science_document_ai-1.60.1}/src/prompts/library/postprocessing/port_code/placeholders.json +0 -0
  69. {data_science_document_ai-1.51.0 → data_science_document_ai-1.60.1}/src/prompts/library/postprocessing/port_code/prompt_port_code.txt +0 -0
  70. {data_science_document_ai-1.51.0 → data_science_document_ai-1.60.1}/src/prompts/library/preprocessing/carrier/placeholders.json +0 -0
  71. {data_science_document_ai-1.51.0 → data_science_document_ai-1.60.1}/src/prompts/library/shippingInstruction/other/placeholders.json +0 -0
  72. {data_science_document_ai-1.51.0 → data_science_document_ai-1.60.1}/src/prompts/library/shippingInstruction/other/prompt.txt +0 -0
  73. {data_science_document_ai-1.51.0 → data_science_document_ai-1.60.1}/src/prompts/prompt_library.py +0 -0
  74. {data_science_document_ai-1.51.0 → data_science_document_ai-1.60.1}/src/tms.py +0 -0
  75. {data_science_document_ai-1.51.0 → data_science_document_ai-1.60.1}/src/utils.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: data-science-document-ai
- Version: 1.51.0
+ Version: 1.60.1
  Summary: "Document AI repo for data science"
  Author: Naomi Nguyen
  Author-email: naomi.nguyen@forto.com
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "data-science-document-ai"
- version = "1.51.0"
+ version = "1.60.1"
  description = "\"Document AI repo for data science\""
  authors = ["Naomi Nguyen <naomi.nguyen@forto.com>", "Kumar Rajendrababu <kumar.rajendrababu@forto.com>", "Igor Tonko <igor.tonko@forto.com>", "Osman Demirel <osman.demirel@forto.com>"]
  packages = [
@@ -20,10 +20,11 @@ project_parameters = {
      # Fuzzy lookup
      "g_model_fuzzy_lookup_folder": "fuzzy_lookup",
      "item_code_lookup": "line_item_kvp_table.json",
+     "intermodal_partners": "intermodal_partners.json",
      "invoice_classification_lookup": "invoice_classification.json",
      "reverse_charge_sentence_lookup": "reverse_charge_sentences.json",
      # Fuzzy logic params
-     "fuzzy_threshold_item_code": 90,
+     "fuzzy_threshold_item_code": 92,
      "fuzzy_threshold_reverse_charge": 80,
      "fuzzy_threshold_invoice_classification": 70,
      # Chunking params
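The item-code fuzzy threshold is tightened here from 90 to 92. A minimal sketch of how such a score cutoff gates a lookup match — using the stdlib `difflib` scorer as a stand-in for the package's rapidfuzz matching, with invented codes:

```python
from difflib import SequenceMatcher

def match_item_code(description, lookup, threshold=92):
    # Return the mapped code only if the best candidate clears the threshold (0-100 scale).
    best_key, best_score = None, 0.0
    for key in lookup:
        score = SequenceMatcher(None, description, key).ratio() * 100
        if score > best_score:
            best_key, best_score = key, score
    return lookup[best_key] if best_score >= threshold else None

codes = {"OCEAN FREIGHT": "OF-001", "TERMINAL HANDLING": "THC-01"}
print(match_item_code("OCEAN FREIGTH", codes))      # misspelling scores ~92.3, clears 92
print(match_item_code("OCEAN FREIGTH", codes, 93))  # same input fails a stricter cutoff
```

Raising the threshold by two points trades a few near-miss matches for fewer false positives, which is presumably why the fallback embedding lookup added elsewhere in this release picks up the remainder.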
@@ -36,6 +37,8 @@ project_parameters = {
      # models metadata (confidence),
      "g_model_data_folder": "models",
      "local_model_data_folder": "data",
+     "if_use_docai": False,
+     "if_use_llm": True,  # Keep it always True
      "released_doc_types": {
          "bookingConfirmation",
          "packingList",
@@ -50,16 +53,6 @@ project_parameters = {
          "customsInvoice",
          "bundeskasse",
      },
-     "model_selector": {
-         "stable": {
-             "bookingConfirmation": 1,
-         },
-         "beta": {
-             "bookingConfirmation": 0,
-         },
-     },
-     # this is the model selector for the model to be used from the model_config.yaml
-     # file based on the environment, 0 mean the first model in the list
      # LLM model parameters
      "gemini_params": {
          "temperature": 0,
@@ -77,25 +70,15 @@ project_parameters = {
          "seed": 42,
          "model_id": "gemini-2.5-flash",
      },
-     # Key to combine the LLM results with the Doc Ai results
-     "key_to_combine": {
-         "bookingConfirmation": ["transportLegs"],
-         "arrivalNotice": ["containers"],
-         "finalMbL": ["containers"],
-         "draftMbl": ["containers"],
-         "deliveryOrder": ["Equipment", "TransportLeg"],
-         "customsAssessment": ["containers"],
-         "packingList": ["skuData"],
-         "commercialInvoice": ["skus"],
-         "shippingInstruction": ["containers"],
-         "partnerInvoice": ["lineItem"],
-         "customsInvoice": ["lineItem"],
-         "bundeskasse": ["lineItem"],
-     },
  }
 
  # Hardcoded rules for data points formatting that can't be based on label name alone
  formatting_rules = {
-     "bookingConfirmation": {"pickUpTerminal": "depot", "gateInTerminal": "terminal"},
+     "bookingConfirmation": {
+         "pickUpDepotCode": "depot",
+         "dropOffDepotCode": "depot",
+         "gateInTerminalCode": "terminal",
+         "pickUpTerminalCode": "terminal",
+     },
      "deliveryOrder": {"pickUpTerminal": "terminal", "EmptyContainerDepot": "depot"},
  }
@@ -0,0 +1,9 @@
+ models_project_id: "738250249861"
+ model_config:
+   stable:
+     bookingConfirmation:
+       - id: "dc3e714cd168aeaa"
+         details:
+           display_name: "doc_cap_bookingConfirmation"
+           author: "reet.kanjilal@forto.com"
+           created_date: ""
@@ -32,7 +32,6 @@ from src.postprocessing.postprocess_partner_invoice import (
  from src.prompts.prompt_library import prompt_library
  from src.utils import (
      extract_top_pages,
-     generate_schema_structure,
      get_pdf_page_count,
      get_processor_name,
      run_background_tasks,
@@ -202,9 +201,6 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
      number_of_pages = get_pdf_page_count(file_content)
      logger.info(f"processing {input_doc_type} with {number_of_pages} pages...")
 
-     # get the schema placeholder
-     response_schema = prompt_library.library[input_doc_type]["other"]["placeholders"]
-
      carrier = "other"
      carrier_schema = (
          prompt_library.library.get("preprocessing", {})
@@ -241,6 +237,9 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
      # get the related prompt from predefined prompt library
      prompt = prompt_library.library[input_doc_type][carrier]["prompt"]
 
+     # get the schema placeholder
+     response_schema = prompt_library.library[input_doc_type][carrier]["placeholders"]
+
      # Add page-number extraction for moderately large docs
      use_chunking = number_of_pages >= params["chunk_after"]
 
@@ -258,7 +257,11 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
      ):
          tasks.append(
              process_chunk_with_retry(
-                 chunk, prompt, response_schema, llm_client, input_doc_type
+                 chunk,
+                 prompt,
+                 response_schema,
+                 llm_client,
+                 input_doc_type,
              )
          )
 
@@ -350,8 +353,7 @@ async def extract_data_from_pdf_w_llm(params, input_doc_type, file_content, llm_
      # Add currency from the amount field
      if input_doc_type in ["commercialInvoice"]:
          result = postprocessing_commercial_invoice(result, params, input_doc_type)
-     elif input_doc_type == "bookingConfirmation":
-         result = postprocess_booking_confirmation(result)
+
      return result, llm_client.model_id
 
 
@@ -370,13 +372,14 @@ def combine_llm_results_w_doc_ai(
      Returns:
          combined result
      """
-     result = doc_ai.copy()
-     llm = remove_none_values(llm)
-     if not llm:
+     result = remove_none_values(llm)
+
+     docAi = doc_ai.copy()
+     if not docAi:
          return result
 
      # Merge top-level keys
-     result.update({k: v for k, v in llm.items() if k not in result})
+     result.update({k: v for k, v in docAi.items() if k not in result})
 
      if (
          input_doc_type
@@ -384,28 +387,28 @@ def combine_llm_results_w_doc_ai(
          and keys_to_combine
      ):
          result.update(
-             {key: llm.get(key) for key in keys_to_combine if key in llm.keys()}
+             {key: docAi.get(key) for key in keys_to_combine if key in docAi.keys()}
          )
          return result
 
      # Handle specific key-based merging logic for multiple keys
      if keys_to_combine:
          for key in keys_to_combine:
-             if key in llm.keys():
+             if key in docAi.keys():
                  # Merge the list of dictionaries
-                 # If the length of the LLM list is less than the Doc AI result, replace with the LLM list
-                 if len(llm[key]) < len(result[key]):
-                     result[key] = llm[key]
+                 # If the length of the docAi list is less than the LLM result, replace with the docAi list
+                 if len(docAi[key]) < len(result[key]):
+                     result[key] = docAi[key]
                  else:
-                     # If the length of the LLM list is greater than or equal to the Doc AI result,
+                     # If the length of the docAi list is greater than or equal to the LLM result,
                      # add & merge the dictionaries
-                     if isinstance(llm[key], list):
-                         for i in range(len(llm[key])):
+                     if isinstance(docAi[key], list):
+                         for i in range(len(docAi[key])):
                              if i == len(result[key]):
-                                 result[key].append(llm[key][i])
+                                 result[key].append(docAi[key][i])
                              else:
-                                 for sub_key in llm[key][i].keys():
-                                     result[key][i][sub_key] = llm[key][i][sub_key]
+                                 for sub_key in docAi[key][i].keys():
+                                     result[key][i][sub_key] = docAi[key][i][sub_key]
      return result
 
 
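The rewritten `combine_llm_results_w_doc_ai` inverts the old precedence: the cleaned LLM output is now the base, Doc AI only fills missing top-level keys, and for each combine key a shorter Doc AI list replaces the LLM list outright while an equal-or-longer one is merged element-wise with Doc AI fields overwriting. A standalone sketch of that precedence, with the doc-type guard omitted and sample payloads invented:

```python
def merge_llm_first(llm, doc_ai, keys_to_combine):
    # LLM result is the base; Doc AI only fills missing top-level keys.
    result = {k: v for k, v in llm.items() if v is not None}
    result.update({k: v for k, v in doc_ai.items() if k not in result})
    for key in keys_to_combine:
        if key not in doc_ai:
            continue
        if len(doc_ai[key]) < len(result[key]):
            result[key] = doc_ai[key]           # shorter Doc AI list wins outright
        else:
            for i, row in enumerate(doc_ai[key]):
                if i == len(result[key]):
                    result[key].append(row)     # extra Doc AI rows are appended
                else:
                    result[key][i].update(row)  # Doc AI fields overwrite per row
    return result

llm = {"currency": "EUR", "lineItem": [{"code": "OF-001"}]}
doc_ai = {"total": 100, "lineItem": [{"code": "THC-01", "page": 1}, {"page": 2}]}
print(merge_llm_first(llm, doc_ai, ["lineItem"]))
```

Note the sketch (like the diffed code) assumes each combine key present in `doc_ai` is also present in the merged base by the time the list merge runs.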
@@ -499,13 +502,15 @@ async def data_extraction_manual_flow(
      page_count = None
      # Validate the file type
      if mime_type == "application/pdf":
+         if_use_docai = params["if_use_docai"]
+
          # Enable Doc Ai only for certain document types.
-         if_use_docai = (
-             True if meta.documentTypeCode in params["model_config"]["stable"] else False
-         )
-         if_use_llm = (
-             True if meta.documentTypeCode in params["key_to_combine"].keys() else False
-         )
+         if params["if_use_docai"]:
+             if_use_docai = (
+                 True
+                 if meta.documentTypeCode in params["model_config"]["stable"]
+                 else False
+             )
 
          (
              extracted_data,
@@ -517,7 +522,7 @@ async def data_extraction_manual_flow(
              meta.documentTypeCode,
              processor_client,
              if_use_docai=if_use_docai,
-             if_use_llm=if_use_llm,
+             if_use_llm=params["if_use_llm"],
              llm_client=llm_client,
              isBetaTest=False,
          )
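With `model_selector` and `key_to_combine` removed from constants, extraction routing now hangs off two params flags: `if_use_docai` gates Doc AI globally (and, only when globally enabled, per stable doc type), while `if_use_llm` comes straight from params. A condensed sketch of the new gating (the function name and sample params are mine, not the package's):

```python
def resolve_flags(params, doc_type):
    # Doc AI only if globally enabled AND a stable model exists for this doc type;
    # the LLM flag is taken from params unconditionally.
    use_docai = params["if_use_docai"] and doc_type in params["model_config"]["stable"]
    return use_docai, params["if_use_llm"]

params = {
    "if_use_docai": False,
    "if_use_llm": True,
    "model_config": {"stable": {"bookingConfirmation": []}},
}
print(resolve_flags(params, "bookingConfirmation"))  # (False, True)
```

Since this release ships with `if_use_docai: False`, the practical effect is that every document type goes through the LLM path only.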
@@ -1,4 +1,6 @@
  """This module contains the postprocessing functions for the partner invoice."""
+ from collections import defaultdict
+
  from rapidfuzz import fuzz, process
 
  from src.io import logger
@@ -143,6 +145,20 @@ def update_recipient_and_vendor(aggregated_data, is_recipient_forto):
      ] = "Dasbachstraße 15, 54292 Trier, Germany"
 
 
+ def select_unique_bank_account(bank_account):
+     # Select the unique bank account if multiple are present
+     if isinstance(bank_account, list) and bank_account:
+         best = defaultdict(lambda: None)
+
+         for item in bank_account:
+             dv = item["documentValue"]
+             if best[dv] is None or item["page"] < best[dv]["page"]:
+                 best[dv] = item
+
+         unique = list(best.values())
+         return unique
+
+
  async def process_partner_invoice(params, aggregated_data, document_type_code):
      """Process the partner invoice data."""
      # Post process bundeskasse invoices
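The new `select_unique_bank_account` deduplicates repeated bank-account entries by `documentValue`, keeping the occurrence from the earliest page. A runnable sketch of that behavior with invented records; one caveat: the diffed version implicitly returns `None` for empty or non-list input, whereas this sketch passes such input through unchanged:

```python
from collections import defaultdict

def select_unique_bank_account(bank_account):
    # Keep one entry per documentValue (e.g. IBAN), preferring the lowest page number.
    if isinstance(bank_account, list) and bank_account:
        best = defaultdict(lambda: None)
        for item in bank_account:
            dv = item["documentValue"]
            if best[dv] is None or item["page"] < best[dv]["page"]:
                best[dv] = item
        return list(best.values())
    return bank_account  # sketch-only fallback: leave non-list/empty input untouched

accounts = [
    {"documentValue": "DE89 3704", "page": 2},
    {"documentValue": "DE89 3704", "page": 1},
    {"documentValue": "FR76 3000", "page": 3},
]
print(select_unique_bank_account(accounts))
```

Insertion order of `defaultdict` preserves first-seen order of account values, so the output stays stable across runs.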
@@ -150,6 +166,11 @@ async def process_partner_invoice(params, aggregated_data, document_type_code):
          post_process_bundeskasse(aggregated_data)
          return
 
+     if "bankAccount" in aggregated_data:
+         aggregated_data["bankAccount"] = select_unique_bank_account(
+             aggregated_data["bankAccount"]
+         )
+
      line_items = aggregated_data.get("lineItem", [])
      # Add debug logging
      logger.info(f"Processing partnerInvoice with {len(line_items)} line items")
@@ -167,15 +188,20 @@ async def process_partner_invoice(params, aggregated_data, document_type_code):
          reverse_charge_info["formattedValue"] = reverse_charge_value
      reverse_charge = aggregated_data.pop("reverseChargeSentence", None)
 
+     # Partner Name
+     partner_name = aggregated_data.get("vendorName", {}).get("documentValue", None)
+
      # Process everything in one go
-     processed_items = await process_line_items_batch(params, line_items, reverse_charge)
+     processed_items = await process_line_items_batch(
+         params, line_items, reverse_charge, partner_name
+     )
 
      # Update your main data structure
      aggregated_data["lineItem"] = processed_items
 
 
  async def process_line_items_batch(
-     params: dict, line_items: list[dict], reverse_charge=None
+     params: dict, line_items: list[dict], reverse_charge=None, partner_name=None
  ):
      """
      Processes all line items efficiently using a "Split-Apply-Combine" strategy.
@@ -213,23 +239,12 @@ async def process_line_items_batch(
 
      # Batch API Call for Embedding lookups
      if pending_line_items:
-         values_to_fetch = list(set(pending_line_items.values()))
-         logger.info(f"Mapping {len(values_to_fetch)} line items from Embedding API...")
-
-         # Await the batch response {"desc1": "code1", "desc2": "code2"}
-         api_results = await get_tms_mappings(
-             input_list=values_to_fetch, embedding_type="line_items"
-         )
+         code_map = await fetch_line_item_codes(pending_line_items, partner_name, params)
 
-         # Merge API results back into original list
          for index, desc in pending_line_items.items():
-             # Get result from API response, or None if API failed for that item
-             forto_code = api_results.get(desc)
-
-             # Update the original item
              line_items[index]["itemCode"] = {
                  "documentValue": desc,
-                 "formattedValue": forto_code,  # Might be None if API failed
+                 "formattedValue": code_map.get(desc),
                  "page": line_items[index]["lineItemDescription"].get("page"),
              }
 
@@ -285,11 +300,14 @@ def if_reverse_charge_sentence(sentence: str, params):
          return False
 
      # Check if the sentence is similar to any of the reverse charge sentences
-     _, is_reverse_charge = get_fuzzy_match_score(
-         sentence, reverse_charge_sentences, threshold
+     match, _ = get_fuzzy_match_score(
+         sentence, list(reverse_charge_sentences.keys()), threshold
      )
 
-     return is_reverse_charge
+     if match:
+         return reverse_charge_sentences[match]
+
+     return False
 
 
  def find_matching_lineitem(new_lineitem: str, kvp_dict: dict, threshold=90):
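`if_reverse_charge_sentence` now returns the lookup's mapped value for the matched sentence rather than a bare boolean, so callers receive the canonical reverse-charge marker. A sketch of the new contract with a simplified signature and a stand-in `get_fuzzy_match_score` built on stdlib `difflib` (the real helper and its params plumbing are not shown in the diff):

```python
from difflib import SequenceMatcher

def get_fuzzy_match_score(sentence, candidates, threshold):
    # Stand-in scorer: best candidate by ratio (0-100), or (None, False) below threshold.
    best, best_score = None, 0.0
    for cand in candidates:
        score = SequenceMatcher(None, sentence.lower(), cand.lower()).ratio() * 100
        if score > best_score:
            best, best_score = cand, score
    return (best, True) if best_score >= threshold else (None, False)

def if_reverse_charge_sentence(sentence, reverse_charge_sentences, threshold=80):
    # New contract: return the mapped value for a fuzzy-matched sentence, else False.
    match, _ = get_fuzzy_match_score(
        sentence, list(reverse_charge_sentences.keys()), threshold
    )
    if match:
        return reverse_charge_sentences[match]
    return False

lookup = {"VAT reverse charge applies": "reverse_charge"}
print(if_reverse_charge_sentence("VAT reverse-charge applies.", lookup))
```

Passing `list(reverse_charge_sentences.keys())` (as the diff does) lets the lookup stay a dict mapping sentence variants to one canonical value.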
@@ -320,12 +338,13 @@ def find_matching_lineitem(new_lineitem: str, kvp_dict: dict, threshold=90):
      return None
 
 
- async def associate_forto_item_code(line_item_data, params):
+ async def associate_forto_item_code(line_item_data, params, partner_name=None):
      """
      Associates Forto item codes to a list of line item descriptions.
      Args:
          line_item_data (dict): A dictionary where keys are original descriptions and values are cleaned descriptions.
          params (dict): Parameters containing lookup data and thresholds.
+         partner_name (str, optional): The name of the partner for context in matching. Defaults to None.
 
      Returns:
          list: A list of dictionaries with 'description' and 'itemCode' keys.
@@ -347,14 +366,51 @@ async def associate_forto_item_code(line_item_data, params, partner_name=None):
 
      # Batch API Call for Embedding lookups
      if pending_line_items:
-         api_results = await get_tms_mappings(
-             input_list=list(pending_line_items.values()),
-             embedding_type="line_items",
-         )
+         code_map = await fetch_line_item_codes(pending_line_items, partner_name, params)
 
-         # Merge API results back into original list
          for desc, f_desc in pending_line_items.items():
-             code = api_results.get(f_desc)
-             result.append({"description": desc, "itemCode": code})
+             result.append(
+                 {
+                     "description": desc,
+                     "itemCode": code_map.get(f_desc),
+                 }
+             )
+
+     return result
 
+
+ async def fetch_line_item_codes(
+     pending_line_items: dict,
+     partner_name: str | None,
+     params: dict,
+ ):
+     """Returns: {original_description: mapped_code_or_None}"""
+     t_mode = (
+         find_matching_lineitem(
+             partner_name.upper(),
+             params["lookup_data"]["intermodal_partners"],
+             threshold=87,
+         )
+         if partner_name
+         else None
+     )
+
+     unique_descs = list(set(pending_line_items.values()))
+     logger.info(f"Mapping {len(unique_descs)} line items from Embedding API...")
+
+     # Build API input map
+     api_input_map = {
+         desc: f"{t_mode} - {desc}" if t_mode else desc for desc in unique_descs
+     }
+
+     api_results = await get_tms_mappings(
+         input_list=list(api_input_map.values()),
+         embedding_type="line_items",
+     )
+
+     # Normalize response back to original descriptions
+     result = {
+         original_desc: api_results.get(api_desc)
+         for original_desc, api_desc in api_input_map.items()
+     }
      return result
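The new `fetch_line_item_codes` prefixes each unique description with the partner's transport mode (when the vendor name fuzzy-matches the new `intermodal_partners` lookup) before the embedding call, then maps the API's answers back onto the original descriptions. A stubbed sketch of just that prefix-and-normalize round trip, where `fake_api` stands in for `get_tms_mappings`:

```python
import asyncio

async def fetch_codes_sketch(pending, t_mode, lookup_api):
    # Map each unique description to its API input, prefixed with the transport mode.
    unique = list(set(pending.values()))
    api_input = {d: f"{t_mode} - {d}" if t_mode else d for d in unique}
    api_results = await lookup_api(list(api_input.values()))
    # Normalize the response back onto the original descriptions.
    return {orig: sent_code for orig, sent_code in
            ((orig, api_results.get(sent)) for orig, sent in api_input.items())}

async def fake_api(inputs):
    # Stand-in for get_tms_mappings: echo a pseudo-code per input sentence.
    return {s: f"code:{s}" for s in inputs}

out = asyncio.run(fetch_codes_sketch({0: "Trucking"}, "RAIL", fake_api))
print(out)  # {'Trucking': 'code:RAIL - Trucking'}
```

Keeping the original-to-prefixed map around means callers never see the transport-mode-decorated strings, only the descriptions they submitted.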
@@ -0,0 +1,146 @@
+ {
+   "type": "OBJECT",
+   "properties": {
+     "bookingNumber": {
+       "type": "STRING",
+       "nullable": true,
+       "description": "A unique identifier assigned to the shipment booking, used for tracking and reference. They are often referred to as 'Booking No.', 'Booking Reference', 'Our Reference', or 'Order Ref'."
+     },
+     "contractNumber": {
+       "type": "STRING",
+       "nullable": true,
+       "description": "It's a contract number between the carrier and Forto Logistics SE & Co KG. Shipment Id 'S' followed by 6, 7, or 8 digits e.g. S9486358 is not a contract number."
+     },
+     "pickUpTerminalCode": {
+       "type": "STRING",
+       "nullable": true,
+       "description": "The specific terminal for cargo pickup during the import shipment."
+     },
+     "gateInTerminalCode": {
+       "type": "STRING",
+       "nullable": true,
+       "description": "The specific terminal where cargo is gated in especially Export terminal delivery address. E.g., FULL RETURN TO or Export terminal name."
+     },
+     "proformaDate": {
+       "type": "STRING",
+       "nullable": true,
+       "description": "The date considered to apply the rates and charges specified in the booking confirmation"
+     },
+     "cyCutOff": {
+       "type": "STRING",
+       "nullable": true,
+       "description": "The datetime by which the cargo to be delivered to the Container Yard. It can be found with keys CARGO CUT OFF DATE/TIME"
+     },
+     "gateInReference": {
+       "type": "STRING",
+       "nullable": true,
+       "description": "A reference code for cargo entering the terminal to drop the loaded cargo for Export. Sometimes it can be 'Booking Number'."
+     },
+     "mblNumber": {
+       "type": "STRING",
+       "nullable": true,
+       "description": "Bill of Lading number (B/L NO.), a document issued by the carrier."
+     },
+     "pickUpReference": {
+       "type": "STRING",
+       "nullable": true,
+       "description": "A reference code for cargo pickup during the import shipment. Sometimes it can be 'Our Reference'."
+     },
+     "siCutOff": {
+       "type": "STRING",
+       "nullable": true,
+       "description": "The deadline datetime for submitting the Shipping Instructions (SI) to the carrier. It can be found with keys DOC CUT OFF DATE/TIME"
+     },
+     "vgmCutOff": {
+       "type": "STRING",
+       "nullable": true,
+       "description": "The deadline datetime for submitting the Verified Gross Mass (VGM) to the carrier. It can be found with keys VGM DEADLINE, VGM DUE, VGM CUT OFF."
+     },
+     "containers": {
+       "type": "ARRAY",
+       "items": {
+         "type": "OBJECT",
+         "properties": {
+           "containerType": {
+             "type": "STRING",
+             "nullable": true,
+             "description": "The size / type of the container, such as 20ft, 40ft, 40HC, 20DC etc under Type/Size column."
+           },
+           "pickUpDepotCode": {
+             "type": "STRING",
+             "nullable": true,
+             "description": "The depot code where the empty container will be picked up. It is identified as Empty Pick Up AT Depot or Export Empty Pick Up Depot(s)."
+           },
+           "dropOffDepotCode": {
+             "type": "STRING",
+             "nullable": true,
+             "description": "The depot code where the empty container will be dropped off."
+           }
+         }
+       },
+       "required": ["containerType", "pickupDepotCode", "dropoffDepotCode"]
+     },
+     "transportLegs": {
+       "type": "ARRAY",
+       "items": {
+         "type": "OBJECT",
+         "properties": {
+           "eta": {
+             "type": "STRING",
+             "nullable": true,
+             "description": "Estimated Time of Arrival (ETA) is the expected date when the shipment will arrive at its destination."
+           },
+           "etd": {
+             "type": "STRING",
+             "nullable": true,
+             "description": "Estimated Time of Departure (ETD) is the expected date when the shipment will leave the origin port."
+           },
+           "imoNumber": {
+             "type": "STRING",
+             "nullable": true,
+             "description": "The International Maritime Organization number for a specific leg. It can be found as IMO No, IMO number."
+           },
+           "portOfDischarge": {
+             "type": "STRING",
+             "nullable": true,
+             "description": "The port where the goods are discharged from the vessel. This is the destination port for the shipment. It can be found at POD, Port of Discharge, To, Discharge Port"
+           },
+           "portOfLoading": {
+             "type": "STRING",
+             "nullable": true,
+             "description": "The port where the goods are loaded onto the vessel. This is the origin port for the shipment. It can be found at POL, Port of Loading, From, Load Port"
+           },
+           "vesselName": {
+             "type": "STRING",
+             "nullable": true,
+             "description": "The name of the vessel carrying the shipment. It can be found at VESSEL/VOYAGE e.g., MOL EMERALD"
+           },
+           "voyage": {
+             "type": "STRING",
+             "nullable": true,
+             "description": "The journey or route taken by the vessel for a specific leg. It can be found at VESSEL/VOYAGE e.g., 087E"
+           }
+         }
+       },
+       "required": [
+         "eta",
+         "etd",
+         "portOfDischarge",
+         "portOfLoading",
+         "vesselName",
+         "voyage"
+       ]
+     },
+     "carrierAddress": {
+       "type": "STRING",
+       "nullable": true,
+       "description": "The address of the carrier who provides service and issued the document."
+     },
+     "carrierName": {
+       "type": "STRING",
+       "nullable": true,
+       "description": "The name of the carrier who issued the document. Extract full name. e,g, Evergreen Line."
+     }
+   },
+   "required": ["bookingNumber", "transportLegs", "containers", "cyCutOff", "vgmCutOff", "siCutOff"]
+ }
@@ -0,0 +1,62 @@
+ <PERSONA> You are an efficient document entity data extraction specialist working for a Freight Forwarding company. <PERSONA>
+
+ <TASK> Your task is to extract data from Booking Confirmation documents as per the given response schema structure. <TASK>
+
+ <CONTEXT>
+ The Freight Forwarding company receives Booking Confirmation from EverGreen Carrier (Shipping Lines) partner.
+ These Booking Confirmations contain various details related to booking, container pick up and drop off depot details, vessel details, as well as other transport Legs data.
+ They may be written in different languages such as English, German, Vietnamese, Chinese, and other European languages, and can appear in a variety of formats and layouts.
+ Your role is to accurately extract specific entities from these Booking Confirmations to support efficient processing and accurate record-keeping.
+
+ To provide context on the journey of a containers for both Export and Import shipments,
+ For Export shipment: An empty container is picked up from a depot (pickupDepotCode) using a pickUpReference and goods loaded into it at a warehouse. Then the loaded container / cargo is transported back to a Container Yard or gateInTerminal before the cyCutOff date for further shipping processes. Then the POL of the First TransportLeg may start from the gateInTerminal or a different POL too.
+ For Import Shipment: The loaded container / cargo arrives at a port of discharge then picked up at pickUpTerminal using pickUpReference. After delivery, an empty container is returned to a depot (dropOffDepotCode).
+ <CONTEXT>
+
+ <INSTRUCTIONS>
+ - Populate fields as defined in the response schema.
+ - Use the data field description to understand the context of the data.
+
+ - Containers: Need to extract Depot details per Container Type. Multiple Containers entries may exist, capture all instances under "Containers".
+   - containerType: The type of container (e.g., 20FT, 40FT, 20ft, 40ft, 40HC, 20DC, etc...).
+   - pickupDepotCode: The code of the depot where the empty container is picked up.It is identified as Empty Pick Up AT Depot or Export Empty Pick Up Depot(s).
+   - dropOffDepotCode: The code of the depot where the empty container is dropped off. It is identified as Import Empty Drop Off Depot(s). Full Return To is not the drop off depot.
+
+
+ - transportLegs: Multiple Transport Legs entries may exist, capture all instances under "transportLegs". Make sure the order of the legs are important.
+   - eta: The estimated time of arrival for a specific leg.
+   - etd: The estimated time of departure for a specific leg. ETD DATE above the PORT OF DISCHARGING information.
+   - imoNumber: The International Maritime Organization number for a specific leg.
+   - portOfDischarge: The port where cargo is unloaded for a specific leg.
+   - portOfLoading: The port where cargo is loaded for a specific leg.
+   - vesselName: The name of the vessel for a specific leg. Can be found at VESSEL/VOYAGE (e.g., EVER LAUREL).
+   - voyage: The journey or route taken by the vessel for a specific leg. It can be found at VESSEL/VOYAGE e.g., 087E.
+
+ IMPORTANT explanation for the transportLegs part as follows:
+ - There is at least one leg in each document.
+ - 'eta' must be equal or later than 'etd'!
+ - Multiple legs are possible. When there are multiple legs,
+   - Sequential Sorting: You must manually re-order legs based on etd then eta, regardless of their order in the source text.
+   - "T/S PORT OF LOADING" indicates the presence of a multi-leg journey.
+   - Transhipment Handling: Treat any mentioned "T/S PORT OF LOADING" as the bridge between two legs (Discharge for Leg A and Loading for Leg B).
+   - The Connectivity Rule: For any sequence of legs, the Port of Discharge of the previous leg must match the Port of Loading of the following leg.
+   - First T/S PORT OF LOADING is the Port of Discharge for the first transportLegs and Port of Loading for the second transportLegs.
+   - Second T/S PORT OF LOADING is the Port of Discharge for the second transportLegs and Port of Loading for the third transportLegs.
+   - Timeline Integrity: Ensure a "No Time Travel" policy: The eta of a previous leg must be earlier than or equal to the etd of the following leg.
+
+ Structure of Multiple Leg Sequence & Mapping
+ Leg 1 (Initial):
+   - `portOfLoading`: PORT OF LOADING.
+   - `portOfDischarge`: T/S PORT OF LOADING.
+   - `vesselName`: VESSEL/VOYAGE (ignore parentheses).
+   - `etd`: ETD DATE above the PORT OF DISCHARGING information.
+   - `eta`: ETA DATE below first T/S PORT OF LOADING
+
+ Leg 2 (Intermediate): Trigger: Only if T/S PORT OF LOADING exists.
+   - `portOfLoading`: First T/S PORT OF LOADING. POD of Leg 1.
+   - `portOfDischarge`: Second T/S PORT OF LOADING (if exists), otherwise PORT OF DISCHARGE before the FINAL DESTINATION.
+   - `vesselName`: EST. CONNECTING VESSEL / VOY.
+   - `etd`: ETD DATE after first T/S PORT OF LOADING
+   - `eta`: ETA DATE after second T/S PORT OF LOADING (if exists), otherwise ETA next to the FINAL DESTINATION section.
+
+ <INSTRUCTIONS>