data-science-document-ai 1.51.0__tar.gz → 1.58.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/PKG-INFO +1 -1
  2. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/pyproject.toml +1 -1
  3. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/constants.py +10 -27
  4. data_science_document_ai-1.58.0/src/docai_processor_config.yaml +9 -0
  5. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/pdf_processing.py +34 -29
  6. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/postprocessing/common.py +35 -0
  7. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/postprocessing/postprocess_partner_invoice.py +82 -26
  8. data_science_document_ai-1.58.0/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +146 -0
  9. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/bookingConfirmation/evergreen/prompt.txt +21 -17
  10. data_science_document_ai-1.58.0/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +146 -0
  11. data_science_document_ai-1.58.0/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +61 -0
  12. data_science_document_ai-1.58.0/src/prompts/library/bookingConfirmation/maersk/placeholders.json +146 -0
  13. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/bookingConfirmation/maersk/prompt.txt +10 -1
  14. data_science_document_ai-1.58.0/src/prompts/library/bookingConfirmation/msc/placeholders.json +146 -0
  15. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/bookingConfirmation/msc/prompt.txt +10 -1
  16. data_science_document_ai-1.58.0/src/prompts/library/bookingConfirmation/oocl/placeholders.json +160 -0
  17. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/bookingConfirmation/oocl/prompt.txt +11 -3
  18. data_science_document_ai-1.58.0/src/prompts/library/bookingConfirmation/other/placeholders.json +160 -0
  19. data_science_document_ai-1.58.0/src/prompts/library/bookingConfirmation/other/prompt.txt +57 -0
  20. data_science_document_ai-1.58.0/src/prompts/library/bookingConfirmation/yangming/placeholders.json +160 -0
  21. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/bookingConfirmation/yangming/prompt.txt +11 -1
  22. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/customsInvoice/other/prompt.txt +2 -1
  23. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/partnerInvoice/other/prompt.txt +3 -4
  24. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/setup.py +17 -9
  25. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/utils.py +6 -2
  26. data_science_document_ai-1.51.0/src/docai_processor_config.yaml +0 -22
  27. data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +0 -32
  28. data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +0 -32
  29. data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +0 -65
  30. data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/maersk/placeholders.json +0 -32
  31. data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/msc/placeholders.json +0 -32
  32. data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/oocl/placeholders.json +0 -32
  33. data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/other/placeholders.json +0 -32
  34. data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/other/prompt.txt +0 -58
  35. data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/yangming/placeholders.json +0 -32
  36. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/constants_sandbox.py +0 -0
  37. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/docai.py +0 -0
  38. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/excel_processing.py +0 -0
  39. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/io.py +0 -0
  40. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/llm.py +0 -0
  41. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/log_setup.py +0 -0
  42. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/postprocessing/postprocess_booking_confirmation.py +0 -0
  43. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/postprocessing/postprocess_commercial_invoice.py +0 -0
  44. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/arrivalNotice/other/placeholders.json +0 -0
  45. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/arrivalNotice/other/prompt.txt +0 -0
  46. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/bundeskasse/other/placeholders.json +0 -0
  47. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/bundeskasse/other/prompt.txt +0 -0
  48. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/commercialInvoice/other/placeholders.json +0 -0
  49. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/commercialInvoice/other/prompt.txt +0 -0
  50. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/customsAssessment/other/placeholders.json +0 -0
  51. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/customsAssessment/other/prompt.txt +0 -0
  52. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/customsInvoice/other/placeholders.json +0 -0
  53. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/deliveryOrder/other/placeholders.json +0 -0
  54. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/deliveryOrder/other/prompt.txt +0 -0
  55. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/draftMbl/other/placeholders.json +0 -0
  56. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/draftMbl/other/prompt.txt +0 -0
  57. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/finalMbL/other/placeholders.json +0 -0
  58. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/finalMbL/other/prompt.txt +0 -0
  59. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/packingList/other/placeholders.json +0 -0
  60. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/packingList/other/prompt.txt +0 -0
  61. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/partnerInvoice/other/placeholders.json +0 -0
  62. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/postprocessing/port_code/placeholders.json +0 -0
  63. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/postprocessing/port_code/prompt_port_code.txt +0 -0
  64. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/preprocessing/carrier/placeholders.json +0 -0
  65. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/preprocessing/carrier/prompt.txt +0 -0
  66. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/shippingInstruction/other/placeholders.json +0 -0
  67. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/shippingInstruction/other/prompt.txt +0 -0
  68. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/prompt_library.py +0 -0
  69. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/tms.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-science-document-ai
3
- Version: 1.51.0
3
+ Version: 1.58.0
4
4
  Summary: "Document AI repo for data science"
5
5
  Author: Naomi Nguyen
6
6
  Author-email: naomi.nguyen@forto.com
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "data-science-document-ai"
3
- version = "1.51.0"
3
+ version = "1.58.0"
4
4
  description = "\"Document AI repo for data science\""
5
5
  authors = ["Naomi Nguyen <naomi.nguyen@forto.com>", "Kumar Rajendrababu <kumar.rajendrababu@forto.com>", "Igor Tonko <igor.tonko@forto.com>", "Osman Demirel <osman.demirel@forto.com>"]
6
6
  packages = [
@@ -20,10 +20,11 @@ project_parameters = {
20
20
  # Fuzzy lookup
21
21
  "g_model_fuzzy_lookup_folder": "fuzzy_lookup",
22
22
  "item_code_lookup": "line_item_kvp_table.json",
23
+ "intermodal_partners": "intermodal_partners.json",
23
24
  "invoice_classification_lookup": "invoice_classification.json",
24
25
  "reverse_charge_sentence_lookup": "reverse_charge_sentences.json",
25
26
  # Fuzzy logic params
26
- "fuzzy_threshold_item_code": 90,
27
+ "fuzzy_threshold_item_code": 92,
27
28
  "fuzzy_threshold_reverse_charge": 80,
28
29
  "fuzzy_threshold_invoice_classification": 70,
29
30
  # Chunking params
@@ -36,6 +37,8 @@ project_parameters = {
36
37
  # models metadata (confidence),
37
38
  "g_model_data_folder": "models",
38
39
  "local_model_data_folder": "data",
40
+ "if_use_docai": False,
41
+ "if_use_llm": True, # Keep it always True
39
42
  "released_doc_types": {
40
43
  "bookingConfirmation",
41
44
  "packingList",
@@ -50,16 +53,6 @@ project_parameters = {
50
53
  "customsInvoice",
51
54
  "bundeskasse",
52
55
  },
53
- "model_selector": {
54
- "stable": {
55
- "bookingConfirmation": 1,
56
- },
57
- "beta": {
58
- "bookingConfirmation": 0,
59
- },
60
- },
61
- # this is the model selector for the model to be used from the model_config.yaml
62
- # file based on the environment, 0 mean the first model in the list
63
56
  # LLM model parameters
64
57
  "gemini_params": {
65
58
  "temperature": 0,
@@ -77,25 +70,15 @@ project_parameters = {
77
70
  "seed": 42,
78
71
  "model_id": "gemini-2.5-flash",
79
72
  },
80
- # Key to combine the LLM results with the Doc Ai results
81
- "key_to_combine": {
82
- "bookingConfirmation": ["transportLegs"],
83
- "arrivalNotice": ["containers"],
84
- "finalMbL": ["containers"],
85
- "draftMbl": ["containers"],
86
- "deliveryOrder": ["Equipment", "TransportLeg"],
87
- "customsAssessment": ["containers"],
88
- "packingList": ["skuData"],
89
- "commercialInvoice": ["skus"],
90
- "shippingInstruction": ["containers"],
91
- "partnerInvoice": ["lineItem"],
92
- "customsInvoice": ["lineItem"],
93
- "bundeskasse": ["lineItem"],
94
- },
95
73
  }
96
74
 
97
75
  # Hardcoded rules for data points formatting that can't be based on label name alone
98
76
  formatting_rules = {
99
- "bookingConfirmation": {"pickUpTerminal": "depot", "gateInTerminal": "terminal"},
77
+ "bookingConfirmation": {
78
+ "pickUpDepotCode": "depot",
79
+ "dropOffDepotCode": "depot",
80
+ "gateInTerminalCode": "terminal",
81
+ "pickUpTerminalCode": "terminal",
82
+ },
100
83
  "deliveryOrder": {"pickUpTerminal": "terminal", "EmptyContainerDepot": "depot"},
101
84
  }
@@ -0,0 +1,9 @@
1
+ models_project_id: "738250249861"
2
+ model_config:
3
+ stable:
4
+ bookingConfirmation:
5
+ - id: "dc3e714cd168aeaa"
6
+ details:
7
+ display_name: "doc_cap_bookingConfirmation"
8
+ author: "reet.kanjilal@forto.com"
9
+ created_date: ""
@@ -32,7 +32,6 @@ from src.postprocessing.postprocess_partner_invoice import (
32
32
  from src.prompts.prompt_library import prompt_library
33
33
  from src.utils import (
34
34
  extract_top_pages,
35
- generate_schema_structure,
36
35
  get_pdf_page_count,
37
36
  get_processor_name,
38
37
  run_background_tasks,
@@ -202,9 +201,6 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
202
201
  number_of_pages = get_pdf_page_count(file_content)
203
202
  logger.info(f"processing {input_doc_type} with {number_of_pages} pages...")
204
203
 
205
- # get the schema placeholder
206
- response_schema = prompt_library.library[input_doc_type]["other"]["placeholders"]
207
-
208
204
  carrier = "other"
209
205
  carrier_schema = (
210
206
  prompt_library.library.get("preprocessing", {})
@@ -241,6 +237,9 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
241
237
  # get the related prompt from predefined prompt library
242
238
  prompt = prompt_library.library[input_doc_type][carrier]["prompt"]
243
239
 
240
+ # get the schema placeholder
241
+ response_schema = prompt_library.library[input_doc_type][carrier]["placeholders"]
242
+
244
243
  # Add page-number extraction for moderately large docs
245
244
  use_chunking = number_of_pages >= params["chunk_after"]
246
245
 
@@ -258,7 +257,11 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
258
257
  ):
259
258
  tasks.append(
260
259
  process_chunk_with_retry(
261
- chunk, prompt, response_schema, llm_client, input_doc_type
260
+ chunk,
261
+ prompt,
262
+ response_schema,
263
+ llm_client,
264
+ input_doc_type,
262
265
  )
263
266
  )
264
267
 
@@ -350,8 +353,7 @@ async def extract_data_from_pdf_w_llm(params, input_doc_type, file_content, llm_
350
353
  # Add currency from the amount field
351
354
  if input_doc_type in ["commercialInvoice"]:
352
355
  result = postprocessing_commercial_invoice(result, params, input_doc_type)
353
- elif input_doc_type == "bookingConfirmation":
354
- result = postprocess_booking_confirmation(result)
356
+
355
357
  return result, llm_client.model_id
356
358
 
357
359
 
@@ -370,13 +372,14 @@ def combine_llm_results_w_doc_ai(
370
372
  Returns:
371
373
  combined result
372
374
  """
373
- result = doc_ai.copy()
374
- llm = remove_none_values(llm)
375
- if not llm:
375
+ result = remove_none_values(llm)
376
+
377
+ docAi = doc_ai.copy()
378
+ if not docAi:
376
379
  return result
377
380
 
378
381
  # Merge top-level keys
379
- result.update({k: v for k, v in llm.items() if k not in result})
382
+ result.update({k: v for k, v in docAi.items() if k not in result})
380
383
 
381
384
  if (
382
385
  input_doc_type
@@ -384,28 +387,28 @@ def combine_llm_results_w_doc_ai(
384
387
  and keys_to_combine
385
388
  ):
386
389
  result.update(
387
- {key: llm.get(key) for key in keys_to_combine if key in llm.keys()}
390
+ {key: docAi.get(key) for key in keys_to_combine if key in docAi.keys()}
388
391
  )
389
392
  return result
390
393
 
391
394
  # Handle specific key-based merging logic for multiple keys
392
395
  if keys_to_combine:
393
396
  for key in keys_to_combine:
394
- if key in llm.keys():
397
+ if key in docAi.keys():
395
398
  # Merge the list of dictionaries
396
- # If the length of the LLM list is less than the Doc AI result, replace with the LLM list
397
- if len(llm[key]) < len(result[key]):
398
- result[key] = llm[key]
399
+ # If the length of the docAi list is less than the LLM result, replace with the docAi list
400
+ if len(docAi[key]) < len(result[key]):
401
+ result[key] = docAi[key]
399
402
  else:
400
- # If the length of the LLM list is greater than or equal to the Doc AI result,
403
+ # If the length of the docAi list is greater than or equal to the LLM result,
401
404
  # add & merge the dictionaries
402
- if isinstance(llm[key], list):
403
- for i in range(len(llm[key])):
405
+ if isinstance(docAi[key], list):
406
+ for i in range(len(docAi[key])):
404
407
  if i == len(result[key]):
405
- result[key].append(llm[key][i])
408
+ result[key].append(docAi[key][i])
406
409
  else:
407
- for sub_key in llm[key][i].keys():
408
- result[key][i][sub_key] = llm[key][i][sub_key]
410
+ for sub_key in docAi[key][i].keys():
411
+ result[key][i][sub_key] = docAi[key][i][sub_key]
409
412
  return result
410
413
 
411
414
 
@@ -499,13 +502,15 @@ async def data_extraction_manual_flow(
499
502
  page_count = None
500
503
  # Validate the file type
501
504
  if mime_type == "application/pdf":
505
+ if_use_docai = params["if_use_docai"]
506
+
502
507
  # Enable Doc Ai only for certain document types.
503
- if_use_docai = (
504
- True if meta.documentTypeCode in params["model_config"]["stable"] else False
505
- )
506
- if_use_llm = (
507
- True if meta.documentTypeCode in params["key_to_combine"].keys() else False
508
- )
508
+ if params["if_use_docai"]:
509
+ if_use_docai = (
510
+ True
511
+ if meta.documentTypeCode in params["model_config"]["stable"]
512
+ else False
513
+ )
509
514
 
510
515
  (
511
516
  extracted_data,
@@ -517,7 +522,7 @@ async def data_extraction_manual_flow(
517
522
  meta.documentTypeCode,
518
523
  processor_client,
519
524
  if_use_docai=if_use_docai,
520
- if_use_llm=if_use_llm,
525
+ if_use_llm=params["if_use_llm"],
521
526
  llm_client=llm_client,
522
527
  isBetaTest=False,
523
528
  )
@@ -723,10 +723,45 @@ async def format_all_entities(result, document_type_code, params, mime_type):
723
723
  if document_type_code in ["partnerInvoice", "bundeskasse"]:
724
724
  await process_partner_invoice(params, aggregated_data, document_type_code)
725
725
 
726
+ if document_type_code in ["bookingConfirmation"]:
727
+ aggregated_data["legalEntity"] = await get_legal_entity(
728
+ aggregated_data.get("carrierName", {}).get("documentValue", None),
729
+ aggregated_data.get("carrierAddress", {}).get("documentValue", None),
730
+ )
731
+
726
732
  logger.info("Data Extraction completed successfully")
727
733
  return aggregated_data
728
734
 
729
735
 
736
+ async def get_legal_entity(name, address):
737
+ """Get legal entity mapping from TMS mappings.
738
+
739
+ Args:
740
+ name (str): The name of the legal entity. Mandatory.
741
+ address (str): The address of the legal entity. Optional for better matching.
742
+
743
+ Returns:
744
+ dict or None: The mapping result from TMS embeddings, or None if not found.
745
+ """
746
+ # Name is mandatory for legal entity mapping
747
+ if not name:
748
+ return {"documentValue": None, "formattedValue": None}
749
+
750
+ # Build input safely
751
+ input_text = name if not address else f"{name} | {address}"
752
+
753
+ api_results = await get_tms_mappings(
754
+ input_list=[input_text],
755
+ embedding_type="legal_entities",
756
+ input_key="partnerNameAddress",
757
+ )
758
+
759
+ return {
760
+ "documentValue": None,
761
+ "formattedValue": api_results.get(input_text),
762
+ }
763
+
764
+
730
765
  def add_text_without_space(text):
731
766
  """If the cleaned text is different from the original text, append it.
732
767
  Useful for port names like QUINHON - Quinhon"""
@@ -1,4 +1,6 @@
1
1
  """This module contains the postprocessing functions for the partner invoice."""
2
+ from collections import defaultdict
3
+
2
4
  from rapidfuzz import fuzz, process
3
5
 
4
6
  from src.io import logger
@@ -143,6 +145,20 @@ def update_recipient_and_vendor(aggregated_data, is_recipient_forto):
143
145
  ] = "Dasbachstraße 15, 54292 Trier, Germany"
144
146
 
145
147
 
148
+ def select_unique_bank_account(bank_account):
149
+ # Select the unique bank account if multiple are present
150
+ if isinstance(bank_account, list) and bank_account:
151
+ best = defaultdict(lambda: None)
152
+
153
+ for item in bank_account:
154
+ dv = item["documentValue"]
155
+ if best[dv] is None or item["page"] < best[dv]["page"]:
156
+ best[dv] = item
157
+
158
+ unique = list(best.values())
159
+ return unique
160
+
161
+
146
162
  async def process_partner_invoice(params, aggregated_data, document_type_code):
147
163
  """Process the partner invoice data."""
148
164
  # Post process bundeskasse invoices
@@ -150,6 +166,11 @@ async def process_partner_invoice(params, aggregated_data, document_type_code):
150
166
  post_process_bundeskasse(aggregated_data)
151
167
  return
152
168
 
169
+ if "bankAccount" in aggregated_data:
170
+ aggregated_data["bankAccount"] = select_unique_bank_account(
171
+ aggregated_data["bankAccount"]
172
+ )
173
+
153
174
  line_items = aggregated_data.get("lineItem", [])
154
175
  # Add debug logging
155
176
  logger.info(f"Processing partnerInvoice with {len(line_items)} line items")
@@ -167,15 +188,20 @@ async def process_partner_invoice(params, aggregated_data, document_type_code):
167
188
  reverse_charge_info["formattedValue"] = reverse_charge_value
168
189
  reverse_charge = aggregated_data.pop("reverseChargeSentence", None)
169
190
 
191
+ # Partner Name
192
+ partner_name = aggregated_data.get("vendorName", {}).get("documentValue", None)
193
+
170
194
  # Process everything in one go
171
- processed_items = await process_line_items_batch(params, line_items, reverse_charge)
195
+ processed_items = await process_line_items_batch(
196
+ params, line_items, reverse_charge, partner_name
197
+ )
172
198
 
173
199
  # Update your main data structure
174
200
  aggregated_data["lineItem"] = processed_items
175
201
 
176
202
 
177
203
  async def process_line_items_batch(
178
- params: dict, line_items: list[dict], reverse_charge=None
204
+ params: dict, line_items: list[dict], reverse_charge=None, partner_name=None
179
205
  ):
180
206
  """
181
207
  Processes all line items efficiently using a "Split-Apply-Combine" strategy.
@@ -213,23 +239,12 @@ async def process_line_items_batch(
213
239
 
214
240
  # Batch API Call for Embedding lookups
215
241
  if pending_line_items:
216
- values_to_fetch = list(set(pending_line_items.values()))
217
- logger.info(f"Mapping {len(values_to_fetch)} line items from Embedding API...")
218
-
219
- # Await the batch response {"desc1": "code1", "desc2": "code2"}
220
- api_results = await get_tms_mappings(
221
- input_list=values_to_fetch, embedding_type="line_items"
222
- )
242
+ code_map = await fetch_line_item_codes(pending_line_items, partner_name, params)
223
243
 
224
- # Merge API results back into original list
225
244
  for index, desc in pending_line_items.items():
226
- # Get result from API response, or None if API failed for that item
227
- forto_code = api_results.get(desc)
228
-
229
- # Update the original item
230
245
  line_items[index]["itemCode"] = {
231
246
  "documentValue": desc,
232
- "formattedValue": forto_code, # Might be None if API failed
247
+ "formattedValue": code_map.get(desc),
233
248
  "page": line_items[index]["lineItemDescription"].get("page"),
234
249
  }
235
250
 
@@ -285,11 +300,14 @@ def if_reverse_charge_sentence(sentence: str, params):
285
300
  return False
286
301
 
287
302
  # Check if the sentence is similar to any of the reverse charge sentences
288
- _, is_reverse_charge = get_fuzzy_match_score(
289
- sentence, reverse_charge_sentences, threshold
303
+ match, _ = get_fuzzy_match_score(
304
+ sentence, list(reverse_charge_sentences.keys()), threshold
290
305
  )
291
306
 
292
- return is_reverse_charge
307
+ if match:
308
+ return reverse_charge_sentences[match]
309
+
310
+ return False
293
311
 
294
312
 
295
313
  def find_matching_lineitem(new_lineitem: str, kvp_dict: dict, threshold=90):
@@ -320,12 +338,13 @@ def find_matching_lineitem(new_lineitem: str, kvp_dict: dict, threshold=90):
320
338
  return None
321
339
 
322
340
 
323
- async def associate_forto_item_code(line_item_data, params):
341
+ async def associate_forto_item_code(line_item_data, params, partner_name=None):
324
342
  """
325
343
  Associates Forto item codes to a list of line item descriptions.
326
344
  Args:
327
345
  line_item_data (dict): A dictionary where keys are original descriptions and values are cleaned descriptions.
328
346
  params (dict): Parameters containing lookup data and thresholds.
347
+ partner_name (str, optional): The name of the partner for context in matching. Defaults to None.
329
348
 
330
349
  Returns:
331
350
  list: A list of dictionaries with 'description' and 'itemCode' keys.
@@ -347,14 +366,51 @@ async def associate_forto_item_code(line_item_data, params):
347
366
 
348
367
  # Batch API Call for Embedding lookups
349
368
  if pending_line_items:
350
- api_results = await get_tms_mappings(
351
- input_list=list(pending_line_items.values()),
352
- embedding_type="line_items",
353
- )
369
+ code_map = await fetch_line_item_codes(pending_line_items, partner_name, params)
354
370
 
355
- # Merge API results back into original list
356
371
  for desc, f_desc in pending_line_items.items():
357
- code = api_results.get(f_desc)
358
- result.append({"description": desc, "itemCode": code})
372
+ result.append(
373
+ {
374
+ "description": desc,
375
+ "itemCode": code_map.get(f_desc),
376
+ }
377
+ )
378
+
379
+ return result
359
380
 
381
+
382
+ async def fetch_line_item_codes(
383
+ pending_line_items: dict,
384
+ partner_name: str | None,
385
+ params: dict,
386
+ ):
387
+ """Returns: {original_description: mapped_code_or_None}"""
388
+ t_mode = (
389
+ find_matching_lineitem(
390
+ partner_name.upper(),
391
+ params["lookup_data"]["intermodal_partners"],
392
+ threshold=87,
393
+ )
394
+ if partner_name
395
+ else None
396
+ )
397
+
398
+ unique_descs = list(set(pending_line_items.values()))
399
+ logger.info(f"Mapping {len(unique_descs)} line items from Embedding API...")
400
+
401
+ # Build API input map
402
+ api_input_map = {
403
+ desc: f"{t_mode} - {desc}" if t_mode else desc for desc in unique_descs
404
+ }
405
+
406
+ api_results = await get_tms_mappings(
407
+ input_list=list(api_input_map.values()),
408
+ embedding_type="line_items",
409
+ )
410
+
411
+ # Normalize response back to original descriptions
412
+ result = {
413
+ original_desc: api_results.get(api_desc)
414
+ for original_desc, api_desc in api_input_map.items()
415
+ }
360
416
  return result
@@ -0,0 +1,146 @@
1
+ {
2
+ "type": "OBJECT",
3
+ "properties": {
4
+ "bookingNumber": {
5
+ "type": "STRING",
6
+ "nullable": true,
7
+ "description": "A unique identifier assigned to the shipment booking, used for tracking and reference. They are often referred to as 'Booking No.', 'Booking Reference', 'Our Reference', or 'Order Ref'."
8
+ },
9
+ "contractNumber": {
10
+ "type": "STRING",
11
+ "nullable": true,
12
+ "description": "It's a contract number between the carrier and Forto Logistics SE & Co KG."
13
+ },
14
+ "pickUpTerminalCode": {
15
+ "type": "STRING",
16
+ "nullable": true,
17
+ "description": "The specific terminal for cargo pickup during the import shipment."
18
+ },
19
+ "gateInTerminalCode": {
20
+ "type": "STRING",
21
+ "nullable": true,
22
+ "description": "The specific terminal where cargo is gated in especially Export terminal delivery address. E.g., Export terminal delivery address, Export terminal location, or Export terminal name."
23
+ },
24
+ "performaDate": {
25
+ "type": "STRING",
26
+ "nullable": true,
27
+ "description": "The date considered to apply the rates and charges specified in the booking confirmation"
28
+ },
29
+ "cyCutOff": {
30
+ "type": "STRING",
31
+ "nullable": true,
32
+ "description": "The datetime by which the cargo to be delivered to the Container Yard. It can be found with keys FCL delivery cut-off, FCL DG delivery cut-off, CY CUT OFF, CY Closing."
33
+ },
34
+ "gateInReference": {
35
+ "type": "STRING",
36
+ "nullable": true,
37
+ "description": "A reference code for cargo entering the terminal to drop the loaded cargo for Export. Sometimes it can be 'Our Reference'."
38
+ },
39
+ "mblNumber": {
40
+ "type": "STRING",
41
+ "nullable": true,
42
+ "description": "Bill of Lading number (B/L NO.), a document issued by the carrier."
43
+ },
44
+ "pickUpReference": {
45
+ "type": "STRING",
46
+ "nullable": true,
47
+ "description": "A reference code for cargo pickup during the import shipment. Sometimes it can be 'Our Reference'."
48
+ },
49
+ "siCutOff": {
50
+ "type": "STRING",
51
+ "nullable": true,
52
+ "description": "The deadline datetime for submitting the Shipping Instructions (SI) to the carrier. It can be found with keys Shipping Instruction Closing."
53
+ },
54
+ "vgmCutOff": {
55
+ "type": "STRING",
56
+ "nullable": true,
57
+ "description": "The deadline datetime for submitting the Verified Gross Mass (VGM) to the carrier. It can be found with keys VGM DEADLINE, VGM DUE, VGM CUT OFF."
58
+ },
59
+ "containers": {
60
+ "type": "ARRAY",
61
+ "items": {
62
+ "type": "OBJECT",
63
+ "properties": {
64
+ "containerType": {
65
+ "type": "STRING",
66
+ "nullable": true,
67
+ "description": "The size / type of the container, such as 20ft, 40ft, 40HC, 20DC etc under Type/Size column."
68
+ },
69
+ "pickUpDepotCode": {
70
+ "type": "STRING",
71
+ "nullable": true,
72
+ "description": "The depot code where the empty container will be picked up. It is identified as Empty Pick Up Depot or Export Empty Pick Up Depot(s)."
73
+ },
74
+ "dropOffDepotCode": {
75
+ "type": "STRING",
76
+ "nullable": true,
77
+ "description": "The depot code where the empty container will be dropped off."
78
+ }
79
+ }
80
+ },
81
+ "required": ["containerType", "pickupDepotCode", "dropoffDepotCode"]
82
+ },
83
+ "transportLegs": {
84
+ "type": "ARRAY",
85
+ "items": {
86
+ "type": "OBJECT",
87
+ "properties": {
88
+ "eta": {
89
+ "type": "STRING",
90
+ "nullable": true,
91
+ "description": "Estimated Time of Arrival (ETA) is the expected date when the shipment will arrive at its destination."
92
+ },
93
+ "etd": {
94
+ "type": "STRING",
95
+ "nullable": true,
96
+ "description": "Estimated Time of Departure (ETD) is the expected date when the shipment will leave the origin port."
97
+ },
98
+ "imoNumber": {
99
+ "type": "STRING",
100
+ "nullable": true,
101
+ "description": "The International Maritime Organization number for a specific leg. It can be found as IMO No, IMO number."
102
+ },
103
+ "portOfDischarge": {
104
+ "type": "STRING",
105
+ "nullable": true,
106
+ "description": "The port where the goods are discharged from the vessel. This is the destination port for the shipment. It can be found at POD, Port of Discharge, To, Discharge Port"
107
+ },
108
+ "portOfLoading": {
109
+ "type": "STRING",
110
+ "nullable": true,
111
+ "description": "The port where the goods are loaded onto the vessel. This is the origin port for the shipment. It can be found at POL, Port of Loading, From, Load Port"
112
+ },
113
+ "vesselName": {
114
+ "type": "STRING",
115
+ "nullable": true,
116
+ "description": "The name of the vessel carrying the shipment. It can be found at vessel, INTENDED VESSEL/VOYAGE"
117
+ },
118
+ "voyage": {
119
+ "type": "STRING",
120
+ "nullable": true,
121
+ "description": "The journey or route taken by the vessel for a specific leg. It can be found at Voy. no, INTENDED VESSEL/VOYAGE"
122
+ }
123
+ }
124
+ },
125
+ "required": [
126
+ "eta",
127
+ "etd",
128
+ "portOfDischarge",
129
+ "portOfLoading",
130
+ "vesselName",
131
+ "voyage"
132
+ ]
133
+ },
134
+ "carrierAddress": {
135
+ "type": "STRING",
136
+ "nullable": true,
137
+ "description": "The address of the carrier who provides service and issued the document."
138
+ },
139
+ "carrierName": {
140
+ "type": "STRING",
141
+ "nullable": true,
142
+ "description": "The name of the carrier who issued the document, e.g., Hapag-Lloyd."
143
+ }
144
+ },
145
+ "required": ["bookingNumber", "transportLegs", "containers", "cyCutOff", "vgmCutOff", "siCutOff"]
146
+ }
@@ -1,6 +1,14 @@
1
- your task is to extract the text value of the following entities and page numbers starting from 0 where the value was found in the document:
2
- ```json
3
- {
1
+ <PERSONA> You are an efficient document entity data extraction specialist working for a Freight Forwarding company. </PERSONA>
2
+
3
+ <TASK> Your task is to extract data from Booking Confirmation documents as per the given response schema structure. </TASK>
4
+
5
+ <CONTEXT>
6
+ The Freight Forwarding company receives Booking Confirmation from EverGreen Carrier (Shipping Lines) partner.
7
+ These Booking Confirmations contain various details related to booking, container pick up and drop off depot details, vessel details, as well as other transport Legs data.
8
+ They may be written in different languages such as English, German, Vietnamese, Chinese, and other European languages, and can appear in a variety of formats and layouts.
9
+ Your role is to accurately extract specific entities from these Booking Confirmations to support efficient processing and accurate record-keeping.
10
+ </CONTEXT>
11
+
4
12
  "mblNumber": "Extract the value after the label 'BOOKING NO.'.",
5
13
  "gateInReference": "Extract the value after the label 'BOOKING NO.'.",
6
14
  "pickUpReference": "Extract the value after the label 'BOOKING NO.'.",
@@ -14,23 +22,19 @@ your task is to extract the text value of the following entities and page number
14
22
  "portOfDischarge": "Extract the text after the label 'PORT OF DISCHARGING:' and before 'FINAL DESTINATION'.",
15
23
  "pickUpTerminal": "Extract the text after the label 'EMPTY PICK UP AT:' removing any extra spaces or line breaks.",
16
24
  "gateInTerminal": "Extract the text after the label 'FULL RETURN TO:' removing any extra spaces or line breaks.",
17
- "transportLegs": [
18
- {
19
- "portOfLoading": "For the first leg, use the extracted 'portOfLoading'.",
20
- "portOfDischarge": "Extract the text after the label 'T/S PORT OF LOADING:'.",
21
- "vesselName": "For the first leg, use the extracted 'vesselName'.",
22
- "voyage": "Voyage is a code of numbers and letters sometimes separated by '-'. For the first leg, use the extracted 'voyage'.",
23
- "eta": "Extract the date after the label 'ETA DATE' that appears within the section starting with 'FINAL DESTINATION:' and ending with 'T/S PORT OF LOADING:'.",
24
- "etd": "Extract the date after the label 'ETD DATE' that appears within the section starting with 'PORT OF LOADING:' and ending with 'FINAL DESTINATION:'.",
25
- },
26
- {
25
+
26
+ "transportLegs":
27
+ "portOfLoading": "For the first leg, use the extracted 'portOfLoading'.",
28
+ "portOfDischarge": "Extract the text after the label 'T/S PORT OF LOADING:'.",
29
+ "vesselName": "For the first leg, use the extracted 'vesselName'.",
30
+ "voyage": "Voyage is a code of numbers and letters sometimes separated by '-'. For the first leg, use the extracted 'voyage'.",
31
+ "eta": "Extract the date after the label 'ETA DATE' that appears within the section starting with 'FINAL DESTINATION:' and ending with 'T/S PORT OF LOADING:'.",
32
+ "etd": "Extract the date after the label 'ETD DATE' that appears within the section starting with 'PORT OF LOADING:' and ending with 'FINAL DESTINATION:'.",
33
+
34
+
27
35
  "portOfLoading": "For the second leg, use the 'portOfDischarge' from the previous leg.",
28
36
  "portOfDischarge": "For the second leg, use the extracted 'portOfDischarge' from the main extraction.",
29
37
  "vesselName": "Extract the text after the label 'EST. CONNECT VSL/VOY:' and before the hyphen and numbers.",
30
38
  "voyage": "Voyage is a code of numbers and letters sometimes separated by '-'. Extract the code after the label 'EST. CONNECT VSL/VOY:' and after the vessel name.",
31
39
  "eta": "Extract the date after the label 'ETA DATE' that is after the line that contains 'T/S PORT OF LOADING'",
32
40
  "etd": "Extract the date after the label 'ETD DATE' that is related to the 'EST. CONNECT VSL/VOY:'. "
33
- }
34
- ]
35
- }
36
- ```