PyPI - data-science-document-ai - Versions diffs - 1.40.4__tar.gz → 1.42.0__tar.gz - Mend

data-science-document-ai 1.40.4.tar.gz → 1.42.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (66) hide show

{data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: data-science-document-ai
-Version: 1.40.4
+Version: 1.42.0
 Summary: "Document AI repo for data science"
 Author: Naomi Nguyen
 Author-email: naomi.nguyen@forto.com

{data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "data-science-document-ai"
-version = "1.40.4"
+version = "1.42.0"
 description = "\"Document AI repo for data science\""
 authors = ["Naomi Nguyen <naomi.nguyen@forto.com>", "Kumar Rajendrababu <kumar.rajendrababu@forto.com>", "Igor Tonko <igor.tonko@forto.com>", "Osman Demirel <osman.demirel@forto.com>"]
 packages = [

{data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/docai.py RENAMED Viewed

@@ -3,11 +3,16 @@ import re
 from google.cloud import documentai
-from src.io import delete_folder_from_bucket, logger, upload_pdf_to_bucket
+from src.io import (
+    delete_folder_from_bucket,
+    get_gcp_labels,
+    logger,
+    upload_pdf_to_bucket,
+)
 from src.utils import cache_on_disk
-async def _process_pdf_w_docai(image_content, client, processor_name):
+async def _process_pdf_w_docai(image_content, client, processor_name, doc_type=None):
     """Process the PDF using Document AI.
     Args:
@@ -15,6 +20,7 @@ async def _process_pdf_w_docai(image_content, client, processor_name):
         client: The Document AI client.
         processor_name (str): The name of the processor to be used.
                             e.g.: projects/{project_id}/locations/{location}/processor/{processor_id}
+        doc_type (str, optional): Document type for cost tracking labels.
     Returns:
         The processed document.
@@ -24,10 +30,11 @@ async def _process_pdf_w_docai(image_content, client, processor_name):
         content=image_content, mime_type="application/pdf"
     )
-    # Configure the process request
+    # Configure the process request with labels for cost tracking
     request = documentai.ProcessRequest(
         name=processor_name,
         raw_document=raw_document,  # field_mask=field_mask
+        labels=get_gcp_labels(doc_type=doc_type),
     )
     result = await cache_on_disk(client.process_document, request=request)
@@ -35,7 +42,7 @@ async def _process_pdf_w_docai(image_content, client, processor_name):
 async def _batch_process_pdf_w_docai(
-    params, image_content, client, processor_name, timeout=1200
+    params, image_content, client, processor_name, timeout=1200, doc_type=None
 ):
     """Process the PDF using Document AI Batch Process API.
@@ -45,6 +52,7 @@ async def _batch_process_pdf_w_docai(
         processor_name (str): The name of the processor to be used.
                             e.g.: projects/{project_id}/locations/{location}/processor/{processor_id}
         timeout (int, optional): The timeout in seconds. Defaults to 1200.
+        doc_type (str, optional): Document type for cost tracking labels.
     Returns:
         The processed document.
@@ -72,11 +80,12 @@ async def _batch_process_pdf_w_docai(
     # Where to write results
     output_config = documentai.DocumentOutputConfig(gcs_output_config=gcs_output_config)
-    # The full resource name of the processor
+    # The full resource name of the processor with labels for cost tracking
     request = documentai.BatchProcessRequest(
         name=processor_name,
         input_documents=input_config,
         document_output_config=output_config,
+        labels=get_gcp_labels(doc_type=doc_type),
     )
     # BatchProcess returns a Long Running Operation (LRO)

{data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/excel_processing.py RENAMED Viewed

@@ -2,6 +2,8 @@
 # flake8: noqa: E402
 import logging
+from src.postprocessing.common import llm_prediction_to_tuples
 logger = logging.getLogger(__name__)
 import asyncio
@@ -14,7 +16,9 @@ from src.llm import prompt_excel_extraction
 from src.utils import generate_schema_structure, get_excel_sheets
-async def extract_data_from_sheet(params, sheet_name, sheet, response_schema):
+async def extract_data_from_sheet(
+    params, sheet_name, sheet, response_schema, doc_type=None
+):
     logger.info(f"Processing sheet: {sheet_name}")
     excel_content = pd.DataFrame(sheet.values)
     # Convert to Markdown format for the LLM model
@@ -30,6 +34,7 @@ async def extract_data_from_sheet(params, sheet_name, sheet, response_schema):
         result = await params["LlmClient"].get_unified_json_genai(
             prompt_docai,
             response_schema=response_schema,
+            doc_type=doc_type,
         )
     except Exception as e:
         result = {}
@@ -67,12 +72,17 @@ async def extract_data_from_excel(
     # Excel files may contain multiple sheets. Extract data from each sheet
     sheet_extract_tasks = [
         extract_data_from_sheet(
-            params, sheet_name, workbook[sheet_name], response_schema
+            params,
+            sheet_name,
+            workbook[sheet_name],
+            response_schema,
+            doc_type=input_doc_type,
         )
         for sheet_name in sheets
     ]
     extracted_data = {k: v for k, v in await asyncio.gather(*sheet_extract_tasks)}
-    stored_data = json.dumps(extracted_data)
+    # Convert LLM prediction dictionary to tuples of (value, page_number).
+    extracted_data = llm_prediction_to_tuples(extracted_data)
-    return extracted_data, stored_data, params["gemini_params"]["model_id"]
+    return extracted_data, extracted_data, params["gemini_params"]["model_id"]

{data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/io.py RENAMED Viewed

@@ -11,6 +11,28 @@ from pathlib import Path
 from google.cloud import bigquery, storage
+from src.constants import project_parameters
+def get_gcp_labels(**extra_labels):
+    """Generate standardized GCP labels for cost tracking.
+    Args:
+        **extra_labels: Additional custom labels
+    Returns:
+        dict: Labels dictionary with keys normalized (lowercase, hyphens, max 63 chars)
+    """
+    labels = {
+        "ds-project-name": project_parameters["project_name"],
+        "ds-env": os.getenv("CLUSTER", "local").lower(),
+    }
+    # Add any extra labels passed in
+    labels.update({k.lower(): str(v).lower() for k, v in extra_labels.items()})
+    return labels
 def get_bq_client(params):
     """Get Google BigQuery client."""
@@ -18,7 +40,7 @@ def get_bq_client(params):
     job_config = bigquery.QueryJobConfig(
         allow_large_results=True,
         # flatten_results=True,
-        labels={"project-name": params["project_name"]},
+        labels=get_gcp_labels(),
     )
     return bq_client, job_config
@@ -112,3 +134,6 @@ def download_dir_from_bucket(bucket, directory_cloud, directory_local) -> bool:
         Path(directory).mkdir(parents=True, exist_ok=True)
         blob.download_to_filename(directory_local / Path(blob.name))
     return result
+# type: ignore

{data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/llm.py RENAMED Viewed

@@ -15,6 +15,7 @@ from vertexai.generative_models import (
     Part,
 )
+from src.io import get_gcp_labels
 from src.utils import cache_on_disk
@@ -69,6 +70,7 @@ class LlmClient:
         document: str = None,
         response_schema: dict = None,
         response_mime_type: str = "application/json",
+        doc_type: str = None,
     ):
         """Ask the Gemini model a question.
@@ -76,6 +78,7 @@ class LlmClient:
             prompt (str): The prompt to send to the model.
             document (str, optional): An optional document to provide context.
             response_schema (dict, optional): Defines a specific response schema for the model.
+            doc_type (str, optional): Document type for cost tracking labels.
         Returns:
             str: The response from the model.
@@ -96,12 +99,13 @@ class LlmClient:
             # Prepare inputs for the model
             inputs = [document, prompt] if document else prompt
-            # Generate the response
+            # Generate the response with labels for cost tracking
             model_response = await cache_on_disk(
                 self.geminy_client.generate_content_async,
                 contents=inputs,
                 generation_config=config,
                 safety_settings=self.safety_config,
+                labels=get_gcp_labels(doc_type=doc_type),
             )
             response_text = model_response.text
@@ -113,7 +117,7 @@ class LlmClient:
             return "{}"
     async def get_unified_json_genai(
-        self, prompt, document=None, response_schema=None, model="gemini"
+        self, prompt, document=None, response_schema=None, model="gemini", doc_type=None
     ):
         """Send a prompt to a Google Cloud AI Platform model and returns the generated json.
@@ -122,6 +126,7 @@ class LlmClient:
             document: Content of the PDF document
             response_schema: The schema to use for the response
             model (str): The model to use for the response ["gemini" or "chatGPT"]. Default is "gemini".
+            doc_type (str, optional): Document type for cost tracking labels.
         Returns:
             dict: The generated json from the model.
@@ -131,7 +136,9 @@ class LlmClient:
             response = await self.ask_chatgpt(prompt, document, response_schema)
         else:
             # Default to Gemini
-            response = await self.ask_gemini(prompt, document, response_schema)
+            response = await self.ask_gemini(
+                prompt, document, response_schema, doc_type=doc_type
+            )
         try:
             return json.loads(response)

{data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/pdf_processing.py RENAMED Viewed

@@ -14,7 +14,11 @@ from google.cloud.documentai_v1 import Document as docaiv1_document
 from src.docai import _batch_process_pdf_w_docai, _process_pdf_w_docai
 from src.excel_processing import extract_data_from_excel
-from src.postprocessing.common import format_all_entities, remove_none_values
+from src.postprocessing.common import (
+    format_all_entities,
+    llm_prediction_to_tuples,
+    remove_none_values,
+)
 from src.postprocessing.postprocess_booking_confirmation import (
     postprocess_booking_confirmation,
 )
@@ -30,11 +34,14 @@ from src.utils import (
     generate_schema_structure,
     get_processor_name,
     run_background_tasks,
+    transform_schema_strings,
     validate_based_on_schema,
 )
-async def process_file_w_docai(params, image_content, client, processor_name):
+async def process_file_w_docai(
+    params, image_content, client, processor_name, doc_type=None
+):
     """
     Process a file using Document AI.
@@ -43,6 +50,7 @@ async def process_file_w_docai(params, image_content, client, processor_name):
         image_content (bytes): The file to be processed. It can be bytes object.
         client: The Document AI client.
         processor_name (str): The name of the processor to be used.
+        doc_type (str, optional): Document type for cost tracking labels.
     Returns:
         The processed document.
@@ -54,7 +62,9 @@ async def process_file_w_docai(params, image_content, client, processor_name):
     try:
         logger.info("Processing document...")
-        result = await _process_pdf_w_docai(image_content, client, processor_name)
+        result = await _process_pdf_w_docai(
+            image_content, client, processor_name, doc_type=doc_type
+        )
     except Exception as e:
         if e.reason == "PAGE_LIMIT_EXCEEDED":
             logger.warning(
@@ -63,7 +73,7 @@ async def process_file_w_docai(params, image_content, client, processor_name):
             # Process the document in batch method (offline processing)
             try:
                 result = await _batch_process_pdf_w_docai(
-                    params, image_content, client, processor_name
+                    params, image_content, client, processor_name, doc_type=doc_type
                 )
             except Exception as batch_e:
                 logger.error(f"Error processing document {batch_e}.")
@@ -93,7 +103,7 @@ async def extract_data_from_pdf_w_docai(
         )
     result = await process_file_w_docai(
-        params, file_content, processor_client, processor_name
+        params, file_content, processor_client, processor_name, doc_type=input_doc_type
     )
     # Create an entity object to store the result in gcs
@@ -104,9 +114,22 @@ async def extract_data_from_pdf_w_docai(
     # Extract entities from the result
     for entity in result.entities:
         value = (
-            {child.type_: child.mention_text for child in entity.properties}
+            {
+                child.type_: (
+                    child.mention_text,
+                    child.page_anchor.page_refs[0].page
+                    if hasattr(child.page_anchor.page_refs[0], "page")
+                    else 0,
+                )
+                for child in entity.properties
+            }
             if entity.properties
-            else entity.mention_text
+            else (
+                entity.mention_text,
+                entity.page_anchor.page_refs[0].page
+                if hasattr(entity.page_anchor.page_refs[0], "page")
+                else 0,
+            )
         )
         aggregated_data[entity.type_].append(value)
@@ -137,7 +160,9 @@ async def extract_data_from_pdf_w_docai(
     return aggregated_data, result_for_store, processor_version
-async def identify_carrier(document, llm_client, prompt, response_schema):
+async def identify_carrier(
+    document, llm_client, prompt, response_schema, doc_type=None
+):
     """Identify the carrier from the Booking Confirmation document."""
     result = await llm_client.ask_gemini(
@@ -145,6 +170,7 @@ async def identify_carrier(document, llm_client, prompt, response_schema):
         document=document,
         response_schema=response_schema,
         response_mime_type="text/x.enum",
+        doc_type=doc_type,
     )
     if result:
@@ -201,7 +227,11 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
         # identify carrier for customized prompting
         carrier = await identify_carrier(
-            document, llm_client, carrier_prompt, carrier_schema
+            document,
+            llm_client,
+            carrier_prompt,
+            carrier_schema,
+            doc_type=input_doc_type,
         )
     if input_doc_type == "bookingConfirmation":
@@ -218,8 +248,14 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
         # generate the result with LLM (gemini)
         result = await llm_client.get_unified_json_genai(
-            prompt=prompt, document=document, response_schema=response_schema
+            prompt=prompt,
+            document=document,
+            response_schema=response_schema,
+            doc_type=input_doc_type,
         )
+        result = llm_prediction_to_tuples(result)
         return result
     return {}

{data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/postprocessing/common.py RENAMED Viewed

@@ -380,11 +380,18 @@ async def format_label(entity_k, entity_value, document_type_code, params):
             ]
         )
         return entity_k, [v for _, v in format_tasks]
+    if isinstance(entity_value, tuple):
+        page = entity_value[1]
+        entity_value = entity_value[0]
+    else:
+        page = -1
     entity_key = entity_k.lower()
     formatted_value = None
     if entity_key.startswith("port"):
-        formatted_value = await get_port_code_ai(entity_value, llm_client)
+        formatted_value = await get_port_code_ai(
+            entity_value, llm_client, doc_type=document_type_code
+        )
     elif (entity_key == "containertype") or (entity_key == "containersize"):
         formatted_value = get_tms_mappings(entity_value, "container_types")
@@ -474,18 +481,19 @@ async def format_label(entity_k, entity_value, document_type_code, params):
     result = {
         "documentValue": entity_value,
         "formattedValue": formatted_value,
+        "page": page,
     }
     return entity_k, result
-async def get_port_code_ai(port: str, llm_client):
+async def get_port_code_ai(port: str, llm_client, doc_type=None):
     """Get port code using AI model."""
-    port_llm = await get_port_code_llm(port, llm_client)
+    port_llm = await get_port_code_llm(port, llm_client, doc_type=doc_type)
     return get_tms_mappings(port, "ports", port_llm)
-async def get_port_code_llm(port: str, llm_client):
+async def get_port_code_llm(port: str, llm_client, doc_type=None):
     if (
         "postprocessing" in prompt_library.library.keys()
         and "port_code" in prompt_library.library["postprocessing"].keys()
@@ -512,7 +520,7 @@ async def get_port_code_llm(port: str, llm_client):
         }
         response = await llm_client.get_unified_json_genai(
-            prompt, response_schema=response_schema, model="chatgpt"
+            prompt, response_schema=response_schema, model="chatgpt", doc_type=doc_type
         )
         try:
             mapped_port = response["port"]
@@ -616,3 +624,24 @@ def remove_stop_words(lineitem: str):
         .upper()
         .strip()
     )
+def llm_prediction_to_tuples(llm_prediction):
+    """Convert LLM prediction dictionary to tuples of (value, page_number)."""
+    if isinstance(llm_prediction, dict):
+        if "page_number" in llm_prediction.keys() and "value" in llm_prediction.keys():
+            if llm_prediction["value"]:
+                try:
+                    page_number = int(llm_prediction["page_number"])
+                except:  # noqa: E722
+                    page_number = -1
+                return (llm_prediction["value"], page_number)
+            return None
+        for key, value in llm_prediction.items():
+            llm_prediction[key] = llm_prediction_to_tuples(
+                llm_prediction.get(key, value)
+            )
+    elif isinstance(llm_prediction, list):
+        for i, item in enumerate(llm_prediction):
+            llm_prediction[i] = llm_prediction_to_tuples(item)
+    return llm_prediction

data_science_document_ai-1.42.0/src/prompts/library/bookingConfirmation/evergreen/placeholders.json ADDED Viewed

@@ -0,0 +1,32 @@
+{
+  "type": "OBJECT",
+  "properties": {
+    "cfsCutOff": {"type": "STRING", "nullable": true, "description": "the date by which an LCL (Less than Container Load) shipment needs to be checked in to a CFS (Container Freight Station) to meet its scheduled sailing"},
+    "bookingNumber": {"type": "STRING", "nullable": true},
+    "cyCutOff": {"type": "STRING", "nullable": true},
+    "gateInReference": {"type": "STRING", "nullable": true},
+    "gateInTerminal": {"type": "STRING", "nullable": true},
+    "mblNumber": {"type": "STRING", "nullable": true},
+    "pickUpReference": {"type": "STRING", "nullable": true},
+    "pickUpTerminal": {"type": "STRING", "nullable": true},
+    "siCutOff": {"type": "STRING", "nullable": true},
+    "vgmCutOff": {"type": "STRING", "nullable": true},
+    "transportLegs": {
+      "type": "ARRAY",
+      "items": {
+        "type": "OBJECT",
+        "properties": {
+            "eta": {"type": "STRING", "nullable": true},
+            "etd": {"type": "STRING", "nullable": true},
+            "imoNumber": {"type": "STRING", "nullable": true},
+            "portOfDischarge": {"type": "STRING", "nullable": true},
+            "portOfLoading": {"type": "STRING", "nullable": true},
+            "vesselName": {"type": "STRING", "nullable": true},
+            "voyage": {"type": "STRING", "nullable": true}
+          },
+        "required": []
+      }
+    }
+  },
+  "required": []
+}

{data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/bookingConfirmation/evergreen/prompt.txt RENAMED Viewed

@@ -1,3 +1,4 @@
+your task is to extract the text value of the following entities and page numbers starting from 0 where the value was found in the document:
 ```json
 {
 "mblNumber": "Extract the value after the label 'BOOKING NO.'.",

data_science_document_ai-1.42.0/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json ADDED Viewed

@@ -0,0 +1,32 @@
+{
+  "type": "OBJECT",
+  "properties": {
+    "cfsCutOff": {"type": "STRING", "nullable": true, "description": "the date by which an LCL (Less than Container Load) shipment needs to be checked in to a CFS (Container Freight Station) to meet its scheduled sailing"},
+    "bookingNumber": {"type": "STRING", "nullable": true},
+    "cyCutOff": {"type": "STRING", "nullable": true},
+    "gateInReference": {"type": "STRING", "nullable": true},
+    "gateInTerminal": {"type": "STRING", "nullable": true},
+    "mblNumber": {"type": "STRING", "nullable": true},
+    "pickUpReference": {"type": "STRING", "nullable": true},
+    "pickUpTerminal": {"type": "STRING", "nullable": true},
+    "siCutOff": {"type": "STRING", "nullable": true},
+    "vgmCutOff": {"type": "STRING", "nullable": true},
+    "transportLegs": {
+      "type": "ARRAY",
+      "items": {
+        "type": "OBJECT",
+        "properties": {
+            "eta": {"type": "STRING", "nullable": true},
+            "etd": {"type": "STRING", "nullable": true},
+            "imoNumber": {"type": "STRING", "nullable": true},
+            "portOfDischarge": {"type": "STRING", "nullable": true},
+            "portOfLoading": {"type": "STRING", "nullable": true},
+            "vesselName": {"type": "STRING", "nullable": true},
+            "voyage": {"type": "STRING", "nullable": true}
+          },
+        "required": []
+      }
+    }
+  },
+  "required": []
+}

{data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt RENAMED Viewed

@@ -18,7 +18,7 @@ transportLegs:
     vesselName: The name of the vessel for a specific leg.
     voyage: The journey or route taken by the vessel for a specific leg.
-your task is to extract the text value of the following entities:
+your task is to extract the text value of the following entities and page numbers starting from 0 where the value was found in the document:
 SCHEMA_PLACEHOLDER
 Keywords for datapoints:

data_science_document_ai-1.42.0/src/prompts/library/bookingConfirmation/maersk/placeholders.json ADDED Viewed

@@ -0,0 +1,32 @@
+{
+  "type": "OBJECT",
+  "properties": {
+    "bookingNumber": {"type": "STRING", "nullable": true},
+    "cfsCutOff": {"type": "STRING", "nullable": true, "description": "the date by which an LCL (Less than Container Load) shipment needs to be checked in to a CFS (Container Freight Station) to meet its scheduled sailing"},
+    "cyCutOff": {"type": "STRING", "nullable": true},
+    "gateInReference": {"type": "STRING", "nullable": true},
+    "gateInTerminal": {"type": "STRING", "nullable": true},
+    "mblNumber": {"type": "STRING", "nullable": true},
+    "pickUpReference": {"type": "STRING", "nullable": true},
+    "pickUpTerminal": {"type": "STRING", "nullable": true},
+    "siCutOff": {"type": "STRING", "nullable": true},
+    "vgmCutOff": {"type": "STRING", "nullable": true},
+    "transportLegs": {
+      "type": "ARRAY",
+      "items": {
+        "type": "OBJECT",
+        "properties": {
+            "eta": {"type": "STRING", "nullable": true},
+            "etd": {"type": "STRING", "nullable": true},
+            "imoNumber": {"type": "STRING", "nullable": true},
+            "portOfDischarge": {"type": "STRING", "nullable": true},
+            "portOfLoading": {"type": "STRING", "nullable": true},
+            "vesselName": {"type": "STRING", "nullable": true},
+            "voyage": {"type": "STRING", "nullable": true}
+          },
+        "required": []
+      }
+    }
+  },
+  "required": []
+}

{data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/bookingConfirmation/maersk/prompt.txt RENAMED Viewed

@@ -18,7 +18,7 @@ transportLegs:
     vesselName: The name of the vessel for a specific leg.
     voyage: The journey or route taken by the vessel for a specific leg.
-your task is to extract the text value of the following entities:
+your task is to extract the text value of the following entities and page numbers starting from 0 where the value was found in the document:
 SCHEMA_PLACEHOLDER
 Keywords for datapoints:

data_science_document_ai-1.42.0/src/prompts/library/bookingConfirmation/msc/placeholders.json ADDED Viewed

@@ -0,0 +1,32 @@
+{
+  "type": "OBJECT",
+  "properties": {
+    "cfsCutOff": {"type": "STRING", "nullable": true, "description": "the date by which an LCL (Less than Container Load) shipment needs to be checked in to a CFS (Container Freight Station) to meet its scheduled sailing"},
+    "bookingNumber": {"type": "STRING", "nullable": true},
+    "cyCutOff": {"type": "STRING", "nullable": true},
+    "gateInReference": {"type": "STRING", "nullable": true},
+    "gateInTerminal": {"type": "STRING", "nullable": true},
+    "mblNumber": {"type": "STRING", "nullable": true},
+    "pickUpReference": {"type": "STRING", "nullable": true},
+    "pickUpTerminal": {"type": "STRING", "nullable": true},
+    "siCutOff": {"type": "STRING", "nullable": true},
+    "vgmCutOff": {"type": "STRING", "nullable": true},
+    "transportLegs": {
+      "type": "ARRAY",
+      "items": {
+        "type": "OBJECT",
+        "properties": {
+            "eta": {"type": "STRING", "nullable": true},
+            "etd": {"type": "STRING", "nullable": true},
+            "imoNumber": {"type": "STRING", "nullable": true},
+            "portOfDischarge": {"type": "STRING", "nullable": true},
+            "portOfLoading": {"type": "STRING", "nullable": true},
+            "vesselName": {"type": "STRING", "nullable": true},
+            "voyage": {"type": "STRING", "nullable": true}
+          },
+        "required": []
+      }
+    }
+  },
+  "required": []
+}

{data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/bookingConfirmation/msc/prompt.txt RENAMED Viewed

@@ -18,7 +18,7 @@ transportLegs:
     vesselName: The name of the vessel for a specific leg.
     voyage: The journey or route taken by the vessel for a specific leg.
-your task is to extract the text value of the following entities:
+your task is to extract the text value of the following entities and page numbers starting from 0 where the value was found in the document:
 SCHEMA_PLACEHOLDER
 Further explanation and Keywords for the transportLegs part as follows. The below 2 conditions is crucial. Take attention here:

data_science_document_ai-1.42.0/src/prompts/library/bookingConfirmation/oocl/placeholders.json ADDED Viewed

@@ -0,0 +1,32 @@
+{
+  "type": "OBJECT",
+  "properties": {
+    "cfsCutOff": {"type": "STRING", "nullable": true, "description": "the date by which an LCL (Less than Container Load) shipment needs to be checked in to a CFS (Container Freight Station) to meet its scheduled sailing"},
+    "bookingNumber": {"type": "STRING", "nullable": true},
+    "cyCutOff": {"type": "STRING", "nullable": true},
+    "gateInReference": {"type": "STRING", "nullable": true},
+    "gateInTerminal": {"type": "STRING", "nullable": true},
+    "mblNumber": {"type": "STRING", "nullable": true},
+    "pickUpReference": {"type": "STRING", "nullable": true},
+    "pickUpTerminal": {"type": "STRING", "nullable": true},
+    "siCutOff": {"type": "STRING", "nullable": true},
+    "vgmCutOff": {"type": "STRING", "nullable": true},
+    "transportLegs": {
+      "type": "ARRAY",
+      "items": {
+        "type": "OBJECT",
+        "properties": {
+            "eta": {"type": "STRING", "nullable": true},
+            "etd": {"type": "STRING", "nullable": true},
+            "portOfDischarge": {"type": "STRING", "nullable": true},
+            "portOfLoading": {"type": "STRING", "nullable": true},
+            "vesselName": {"type": "STRING", "nullable": true},
+            "voyage": {"type": "STRING", "nullable": true},
+            "imoNumber": {"type": "STRING", "nullable": true}
+        },
+        "required": []
+      }
+    }
+  },
+  "required": []
+}

data-science-document-ai 1.40.4__tar.gz → 1.42.0__tar.gz

data-science-document-ai 1.40.4.tar.gz → 1.42.0.tar.gz