data-science-document-ai 1.43.3__py3-none-any.whl → 1.43.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_science_document_ai-1.43.3.dist-info → data_science_document_ai-1.43.5.dist-info}/METADATA +1 -1
- {data_science_document_ai-1.43.3.dist-info → data_science_document_ai-1.43.5.dist-info}/RECORD +12 -12
- src/excel_processing.py +19 -9
- src/pdf_processing.py +22 -9
- src/postprocessing/common.py +40 -14
- src/prompts/library/bundeskasse/other/prompt.txt +1 -1
- src/prompts/library/commercialInvoice/other/prompt.txt +1 -1
- src/prompts/library/customsInvoice/other/prompt.txt +1 -1
- src/prompts/library/partnerInvoice/other/prompt.txt +1 -1
- src/prompts/prompt_library.py +0 -4
- src/utils.py +0 -3
- {data_science_document_ai-1.43.3.dist-info → data_science_document_ai-1.43.5.dist-info}/WHEEL +0 -0
{data_science_document_ai-1.43.3.dist-info → data_science_document_ai-1.43.5.dist-info}/RECORD
RENAMED
@@ -2,12 +2,12 @@ src/constants.py,sha256=rpYIecVLIBLh98YrJ8e5gdvM0bqrXJZWIKgFkUSn69g,3513
 src/constants_sandbox.py,sha256=Iu6HdjCoNSmOX0AwoL9qUQkhq_ZnIN5U9e-Q2UfNuGc,547
 src/docai.py,sha256=dHuR0ehVjUi1CnoNvdp_yxJtpU_HFXqAZ61ywdz7BEo,5655
 src/docai_processor_config.yaml,sha256=81NUGs-u8UFJm6mc0ZOeeNQlhe9h0f35GhjTcwErvTA,1717
-src/excel_processing.py,sha256=
+src/excel_processing.py,sha256=PdypkXHf-hln5cq5TyJ_IVybZk-rJF1NKZ50KXuOSdY,3390
 src/io.py,sha256=tOJpMyI-mP1AaXKG4UFudH47MHWzjWBgVahFJUcjGfs,4749
 src/llm.py,sha256=OE4IEIqcM-hYK9U7e0x1rAfcqdpeo4iXPHBp64L5Qz0,8199
 src/log_setup.py,sha256=RhHnpXqcl-ii4EJzRt47CF2R-Q3YPF68tepg_Kg7tkw,2895
-src/pdf_processing.py,sha256=
-src/postprocessing/common.py,sha256=
+src/pdf_processing.py,sha256=DaFM8ioERj7YeC8Yjki_dfSnKt0lf7DB14ks9i4OAfA,17741
+src/postprocessing/common.py,sha256=fU3ECfnR0rpF21DnVYM2YM7kPEB4gRJuMasyrNupsaA,23026
 src/postprocessing/postprocess_booking_confirmation.py,sha256=nK32eDiBNbauyQz0oCa9eraysku8aqzrcoRFoWVumDU,4827
 src/postprocessing/postprocess_commercial_invoice.py,sha256=3I8ijluTZcOs_sMnFZxfkAPle0UFQ239EMuvZfDZVPg,1028
 src/postprocessing/postprocess_partner_invoice.py,sha256=koGR7dN37FqJcepdzkrzNBHuBBUuCp_3CrteScASqyE,10590
@@ -26,12 +26,12 @@ src/prompts/library/bookingConfirmation/other/prompt.txt,sha256=kUK7NgVNDYFMnqOc
 src/prompts/library/bookingConfirmation/yangming/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
 src/prompts/library/bookingConfirmation/yangming/prompt.txt,sha256=fYKfusDajDFw0v54-nv2iAqUSp2yCeOzc6G7AFe-h2w,3226
 src/prompts/library/bundeskasse/other/placeholders.json,sha256=1ll8AI58F2zRDSwQq_r0gxQdxlQB521l5CuiJ-8G6us,4068
-src/prompts/library/bundeskasse/other/prompt.txt,sha256=
+src/prompts/library/bundeskasse/other/prompt.txt,sha256=MBv4MIMASMstkzDS7H0q_pNJbPQeadP1vcmhCRrpjQ4,2906
 src/prompts/library/commercialInvoice/other/placeholders.json,sha256=zUK2mg9MnHiEQRYF6VgTiUiq68WGy5f7_4qL63CWyR0,4700
-src/prompts/library/commercialInvoice/other/prompt.txt,sha256=
+src/prompts/library/commercialInvoice/other/prompt.txt,sha256=CJapcVrmcvynJUanETDklkzU-0N9hHdhq5wL4MK7OIY,2683
 src/prompts/library/customsAssessment/other/prompt.txt,sha256=XSqWa3k9LM7dTiJtX8AKTp_0x5Z0pCNRKNUWaywwBlY,2191
 src/prompts/library/customsInvoice/other/placeholders.json,sha256=BnWYtl4sPooTHb_EHRIlrPawBrfHI8_QVas8zytbqyY,12172
-src/prompts/library/customsInvoice/other/prompt.txt,sha256=
+src/prompts/library/customsInvoice/other/prompt.txt,sha256=daSRssY8zcboCJCuqbLqehGR5dJs_wp4hOZHRol3KqU,9595
 src/prompts/library/deliveryOrder/other/placeholders.json,sha256=7fjqag3kCVMV4mJ52dTjAcLtaBX0paXrDrW48vQVZSk,1250
 src/prompts/library/deliveryOrder/other/prompt.txt,sha256=y3QjN54e8PplEJngNlxoykbdrToBefS3r8gWixCbjfE,2468
 src/prompts/library/draftMbl/hapag-lloyd/prompt.txt,sha256=4FxiO1eHkimZVQZXU6gGNikuDVAWNniYvY8FUdVhpvk,2327
@@ -44,16 +44,16 @@ src/prompts/library/finalMbL/other/prompt.txt,sha256=pj-kgPV51upLhDppSKfhc2s5ylg
 src/prompts/library/packingList/other/placeholders.json,sha256=cGUUvEFoi4Lm0BAiyD29KbNFbUgzO1s7eit_qK3F0ig,4478
 src/prompts/library/packingList/other/prompt.txt,sha256=6Q9d0KBG6YWmNtzFivvmtQmitaUE2jytfwwc5YwsUgQ,2872
 src/prompts/library/partnerInvoice/other/placeholders.json,sha256=NX6ADT4gxLpP90uoNCYDbmfBvROxxVWRKK0lRFy1n9s,10897
-src/prompts/library/partnerInvoice/other/prompt.txt,sha256=
+src/prompts/library/partnerInvoice/other/prompt.txt,sha256=4WGEQ6EiOtQxB7iwKy_Hg0PQzCEoFbjJUwEawwTgWiw,7775
 src/prompts/library/postprocessing/port_code/placeholders.json,sha256=2TiXf3zSzrglOMPtDOlCntIa5RSvyZQAKG2-IgrCY5A,22
 src/prompts/library/postprocessing/port_code/prompt_port_code.txt,sha256=--1wunSqEr2ox958lEhjO-0JFBfOLzA3qfKYIzG_Iok,884
 src/prompts/library/preprocessing/carrier/placeholders.json,sha256=1UmrQNqBEsjLIpOO-a39Az6bQ_g1lxDGlwqZFU3IEt0,408
 src/prompts/library/preprocessing/carrier/prompt.txt,sha256=NLvRZQCZ6aWC1yTr7Q93jK5z7Vi_b4HBaiFYYnIsO-w,134
 src/prompts/library/shippingInstruction/other/prompt.txt,sha256=dT2e-dPuvuz0rVYpwmok_1dWQ2Oa8Qy9NGZ6CCLOUI4,1468
-src/prompts/prompt_library.py,sha256=
+src/prompts/prompt_library.py,sha256=VJWHeXN-s501C2GiidIIvQQuZdU6T1R27hE2dKBiI40,2555
 src/setup.py,sha256=M-p5c8M9ejKcSZ9N86VtmtPc4TYLxe1_4_dxf6jpfVc,7262
 src/tms.py,sha256=UXbIo1QE--hIX6NZi5Qyp2R_CP338syrY9pCTPrfgnE,1741
-src/utils.py,sha256=
-data_science_document_ai-1.43.
-data_science_document_ai-1.43.
-data_science_document_ai-1.43.
+src/utils.py,sha256=Ro4FEYo28VgJwTy842MkNrK5MIAWglW0CmDcfDEhmAo,16514
+data_science_document_ai-1.43.5.dist-info/METADATA,sha256=2XHEh0gDLvzPfNKgt1mwIx4THUV5dgIFLK3K2tWFgqQ,2152
+data_science_document_ai-1.43.5.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+data_science_document_ai-1.43.5.dist-info/RECORD,,

src/excel_processing.py
CHANGED
@@ -9,17 +9,17 @@ from src.postprocessing.common import llm_prediction_to_tuples
 logger = logging.getLogger(__name__)
 
 import asyncio
-import json
 
 import numpy as np
 import pandas as pd
 
 from src.llm import prompt_excel_extraction
+from src.prompts.prompt_library import prompt_library
 from src.utils import estimate_page_count, generate_schema_structure, get_excel_sheets
 
 
 async def extract_data_from_sheet(
-
+    llm_client, sheet_name, sheet, response_schema, doc_type=None
 ):
     logger.info(f"Processing sheet: {sheet_name}")
     excel_content = pd.DataFrame(sheet.values).dropna(how="all", axis=1)
@@ -34,7 +34,7 @@ async def extract_data_from_sheet(
     prompt_docai = prompt_excel_extraction(worksheet)
 
     try:
-        result = await
+        result = await llm_client.get_unified_json_genai(
             prompt_docai,
             response_schema=response_schema,
             doc_type=doc_type,
@@ -51,6 +51,7 @@ async def extract_data_from_excel(
     input_doc_type,
     file_content,
     mime_type,
+    llm_client,
 ):
     """Extract data from the Excel file.
 
@@ -59,6 +60,7 @@ async def extract_data_from_excel(
         input_doc_type (str): The type of the document.
         file_content (bytes): The content of the Excel file to process.
         mime_type (str): The MIME type of the file.
+        llm_client: The LLM client to use for data extraction.
 
     Returns:
         formatted_data (list): A list of dictionaries containing the extracted data.
@@ -67,7 +69,18 @@ async def extract_data_from_excel(
 
     """
     # Generate the response structure
-    response_schema =
+    response_schema = (
+        prompt_library.library[input_doc_type]["other"]["placeholders"]
+        if input_doc_type
+        in [
+            "partnerInvoice",
+            "customsInvoice",
+            "bundeskasse",
+            "commercialInvoice",
+            "packingList",
+        ]
+        else generate_schema_structure(params, input_doc_type)
+    )
 
     # Load the Excel file and get ONLY the "visible" sheet names
     sheets, workbook = get_excel_sheets(file_content, mime_type)
@@ -84,7 +97,7 @@ async def extract_data_from_excel(
     # Excel files may contain multiple sheets. Extract data from each sheet
     sheet_extract_tasks = [
         extract_data_from_sheet(
-
+            llm_client,
             sheet_name,
             workbook[sheet_name],
             response_schema,
@@ -94,7 +107,4 @@ async def extract_data_from_excel(
     ]
     extracted_data = {k: v for k, v in await asyncio.gather(*sheet_extract_tasks)}
 
-
-    extracted_data = llm_prediction_to_tuples(extracted_data)
-
-    return extracted_data, extracted_data, params["gemini_params"]["model_id"]
+    return extracted_data, extracted_data, llm_client.model_id

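The substantive change in excel_processing.py is that extract_data_from_excel no longer resolves the model from params["gemini_params"] itself: the caller now supplies an llm_client, and for the invoice-style document types the response schema comes straight from the prompt library placeholders rather than generate_schema_structure. A rough caller sketch under those assumptions (keyword names not visible in this diff, and the example MIME type, are illustrative only):

from src.excel_processing import extract_data_from_excel

async def extract_invoice_workbook(params, file_content):
    # The caller picks the client; in 1.43.3 the function derived the model id internally.
    llm_client = params["LlmClient_Flash"]  # assumed key, mirroring the selection in pdf_processing.py
    extracted, raw, model_id = await extract_data_from_excel(
        params=params,
        input_doc_type="commercialInvoice",  # schema taken from prompt_library placeholders
        file_content=file_content,
        mime_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        llm_client=llm_client,
    )
    return extracted, model_id

Note also that the llm_prediction_to_tuples call was dropped here, so Excel extractions are no longer converted to (value, page) tuples; that conversion now happens only in the PDF path shown below.
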
src/pdf_processing.py
CHANGED
@@ -36,6 +36,7 @@ from src.utils import (
     get_pdf_page_count,
     get_processor_name,
     run_background_tasks,
+    transform_schema_strings,
     validate_based_on_schema,
 )
 
@@ -199,6 +200,7 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
         if input_doc_type == "bundeskasse"
         else file_content
     )
+    number_of_pages = get_pdf_page_count(file_content)
 
     # convert file_content to required document
     document = llm_client.prepare_document_for_gemini(file_content)
@@ -254,6 +256,13 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
         # get the related prompt from predefined prompt library
         prompt = prompt_library.library[input_doc_type][carrier]["prompt"]
 
+        # Update schema to extract value-page_number pairs
+        if number_of_pages > 1:
+            response_schema = transform_schema_strings(response_schema)
+
+            # Update the prompt to instruct LLM to include page numbers
+            prompt += "\nFor each field, provide the page number where the information was found. The page numbering starts from 0."
+
         # generate the result with LLM (gemini)
         result = await llm_client.get_unified_json_genai(
             prompt=prompt,
@@ -262,7 +271,7 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
             doc_type=input_doc_type,
         )
 
-        result = llm_prediction_to_tuples(result)
+        result = llm_prediction_to_tuples(result, number_of_pages)
 
         return result
     return {}
@@ -342,15 +351,9 @@ async def extract_data_by_doctype(
     processor_client,
     if_use_docai,
     if_use_llm,
+    llm_client,
     isBetaTest=False,
 ):
-    # Select LLM client (Using 2.5 Pro model only for PI and customsInvoice)
-    llm_client = (
-        params["LlmClient_Flash"]
-        if input_doc_type not in ["customsInvoice", "partnerInvoice"]
-        else params["LlmClient"]
-    )
-
     async def extract_w_docai():
         return await extract_data_from_pdf_w_docai(
             params=params,
@@ -420,6 +423,14 @@ async def data_extraction_manual_flow(
     """
    # Get the start time for processing
    start_time = asyncio.get_event_loop().time()
+
+    # Select LLM client (Using 2.5 Pro model only for PI and customsInvoice)
+    llm_client = (
+        params["LlmClient_Flash"]
+        if meta.documentTypeCode not in ["customsInvoice", "partnerInvoice"]
+        else params["LlmClient"]
+    )
+
     page_count = None
     # Validate the file type
     if mime_type == "application/pdf":
@@ -442,6 +453,7 @@ async def data_extraction_manual_flow(
             processor_client,
             if_use_docai=if_use_docai,
             if_use_llm=if_use_llm,
+            llm_client=llm_client,
             isBetaTest=False,
         )
         page_count = get_pdf_page_count(file_content)
@@ -453,6 +465,7 @@ async def data_extraction_manual_flow(
             input_doc_type=meta.documentTypeCode,
             file_content=file_content,
             mime_type=mime_type,
+            llm_client=llm_client,
         )
 
         # Get sheet count from dd-trace span (set in extract_data_from_excel)
@@ -472,7 +485,7 @@ async def data_extraction_manual_flow(
     )
     # Create the result dictionary with the extracted data
     extracted_data = await format_all_entities(
-        extracted_data, meta.documentTypeCode, params
+        extracted_data, meta.documentTypeCode, params, mime_type
     )
     result = {
         "id": meta.id,

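The page-number plumbing above only applies to documents with more than one page: get_pdf_page_count is taken up front, transform_schema_strings rewrites the response schema (that function lives in src/utils.py, whose body is not part of this diff), and the prompt gains an explicit 0-based page-numbering instruction. Judging by how the output is consumed in llm_prediction_to_tuples, the rewrite presumably turns each string field of the schema into a value/page_number pair; a speculative sketch of that behaviour, not the packaged implementation:

def transform_schema_strings_sketch(schema):
    """Guess at the rewrite: wrap string leaves into {value, page_number} objects."""
    if isinstance(schema, dict):
        if schema.get("type") == "string":
            return {
                "type": "object",
                "properties": {
                    "value": schema,
                    "page_number": {"type": "integer"},
                },
            }
        # Recurse into nested properties / items of the schema.
        return {key: transform_schema_strings_sketch(value) for key, value in schema.items()}
    if isinstance(schema, list):
        return [transform_schema_strings_sketch(item) for item in schema]
    return schema

Single-page documents keep the untouched schema, and the page is filled in later as 0 by llm_prediction_to_tuples.
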
src/postprocessing/common.py
CHANGED
@@ -372,27 +372,30 @@ def clean_item_description(lineitem: str, remove_numbers: bool = True):
     return re.sub(r"\s{2,}", " ", lineitem).strip()
 
 
-async def format_label(entity_k, entity_value, document_type_code, params):
+async def format_label(entity_k, entity_value, document_type_code, params, mime_type):
     llm_client = params["LlmClient"]
     if isinstance(entity_value, dict):  # if it's a nested entity
         format_tasks = [
-            format_label(sub_k, sub_v, document_type_code, params)
+            format_label(sub_k, sub_v, document_type_code, params, mime_type)
             for sub_k, sub_v in entity_value.items()
         ]
         return entity_k, {k: v for k, v in await asyncio.gather(*format_tasks)}
     if isinstance(entity_value, list):
         format_tasks = await asyncio.gather(
             *[
-                format_label(entity_k, sub_v, document_type_code, params)
+                format_label(entity_k, sub_v, document_type_code, params, mime_type)
                 for sub_v in entity_value
             ]
         )
         return entity_k, [v for _, v in format_tasks]
-
-
-    entity_value
-
-
+
+    if mime_type == "application/pdf":
+        if isinstance(entity_value, tuple):
+            page = entity_value[1]
+            entity_value = entity_value[0]
+        else:
+            page = -1
+
     entity_key = entity_k.lower()
     formatted_value = None
 
@@ -493,8 +496,10 @@ async def format_label(entity_k, entity_value, document_type_code, params):
     result = {
         "documentValue": entity_value,
         "formattedValue": formatted_value,
-        "page": page,
     }
+    if mime_type == "application/pdf":
+        result["page"] = page
+
     return entity_k, result
 
 
@@ -593,7 +598,7 @@ def decimal_convertor(value, quantity=False):
     return value
 
 
-async def format_all_entities(result, document_type_code, params):
+async def format_all_entities(result, document_type_code, params, mime_type):
     """Format the entity values in the result dictionary."""
     # Since we treat `customsInvoice` same as `partnerInvoice`
     document_type_code = (
@@ -608,7 +613,9 @@ async def format_all_entities(result, document_type_code, params):
         return {}
 
     # Format all entities recursively
-    _, aggregated_data = await format_label(
+    _, aggregated_data = await format_label(
+        None, result, document_type_code, params, mime_type
+    )
 
     # Process partner invoice on lineitem mapping and reverse charge sentence
     if document_type_code in ["partnerInvoice", "bundeskasse"]:
@@ -644,8 +651,24 @@ def remove_stop_words(lineitem: str):
     )
 
 
-def llm_prediction_to_tuples(llm_prediction):
+def llm_prediction_to_tuples(llm_prediction, number_of_pages=-1):
     """Convert LLM prediction dictionary to tuples of (value, page_number)."""
+
+    # If only 1 page, simply pair each value with page number 0
+    if number_of_pages == 1:
+        if isinstance(llm_prediction, dict):
+            return {
+                k: llm_prediction_to_tuples(v, number_of_pages)
+                for k, v in llm_prediction.items()
+            }
+        elif isinstance(llm_prediction, list):
+            return [
+                llm_prediction_to_tuples(v, number_of_pages) for v in llm_prediction
+            ]
+        else:
+            return (llm_prediction, 0) if llm_prediction else None
+
+    # logic for multi-page predictions
     if isinstance(llm_prediction, dict):
         if "page_number" in llm_prediction.keys() and "value" in llm_prediction.keys():
             if llm_prediction["value"]:
@@ -655,11 +678,14 @@ def llm_prediction_to_tuples(llm_prediction):
                     page_number = -1
                 return (llm_prediction["value"], page_number)
             return None
+
         for key, value in llm_prediction.items():
             llm_prediction[key] = llm_prediction_to_tuples(
-                llm_prediction.get(key, value)
+                llm_prediction.get(key, value), number_of_pages
             )
+
     elif isinstance(llm_prediction, list):
         for i, item in enumerate(llm_prediction):
-            llm_prediction[i] = llm_prediction_to_tuples(item)
+            llm_prediction[i] = llm_prediction_to_tuples(item, number_of_pages)
+
     return llm_prediction

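llm_prediction_to_tuples therefore has two paths: for single-page documents every non-empty value is simply paired with page 0, while for multi-page documents the per-field value/page_number objects requested by the transformed schema are collapsed into (value, page) tuples, which format_label later unpacks into a "page" key for PDF inputs only. A small worked example, assuming the lines hidden by this hunk read the page from the "page_number" key as the visible code implies:

from src.postprocessing.common import llm_prediction_to_tuples

single_page = {"invoiceNumber": "INV-1", "amount": "", "skus": ["A", "B"]}
llm_prediction_to_tuples(single_page, number_of_pages=1)
# {"invoiceNumber": ("INV-1", 0), "amount": None, "skus": [("A", 0), ("B", 0)]}

multi_page = {
    "invoiceNumber": {"value": "INV-1", "page_number": 0},
    "skus": [{"value": "A", "page_number": 2}],
}
llm_prediction_to_tuples(multi_page, number_of_pages=3)
# {"invoiceNumber": ("INV-1", 0), "skus": [("A", 2)]}
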
src/prompts/library/bundeskasse/other/prompt.txt
CHANGED
@@ -1,6 +1,6 @@
 <PERSONA> You are an efficient document entity data extraction specialist working for a Freight Forwarding company. <PERSONA>
 
-<TASK>Your task is to extract data
+<TASK> Your task is to extract data from customs invoice documents as per the given response schema structure. <TASK>
 
 <CONTEXT>
 The Freight Forwarding company receives Customs invoices from Customs Brokers called Bundeskasse.

src/prompts/library/commercialInvoice/other/prompt.txt
CHANGED
@@ -2,7 +2,7 @@ Task: You are a document entity extraction specialist. Given a document, your ta
 
 Extract all the data points from the given document.
 Each data point is part of a master field called skus. There may be multiple skus entries in a document.
-Your task is to extract the text value of the entities and page numbers starting from 0
+Your task is to extract the text value of the entities and page numbers starting from 0 where the value was found in the document.
 
 
 Instructions:

src/prompts/library/customsInvoice/other/prompt.txt
CHANGED
@@ -1,6 +1,6 @@
 <PERSONA> You are an efficient document entity data extraction specialist working for a Freight Forwarding company. <PERSONA>
 
-<TASK>Your task is to extract data
+<TASK> Your task is to extract data from invoice documents as per the given response schema structure. <TASK>
 
 <CONTEXT>
 The Freight Forwarding company receives invoices from Carrier (Shipping Lines) partners and Customs Brokers. These include Partner Invoices (COGS Invoices) and COGS Customs Invoices.

src/prompts/library/partnerInvoice/other/prompt.txt
CHANGED
@@ -1,6 +1,6 @@
 <PERSONA> You are an efficient document entity data extraction specialist working for a Freight Forwarding company. <PERSONA>
 
-<TASK>Your task is to extract data
+<TASK> Your task is to extract data from invoice documents as per the given response schema structure. <TASK>
 
 <CONTEXT>
 The Freight Forwarding company receives invoices from Carrier (Shipping Lines) partners and Customs Brokers. These include Partner Invoices (COGS Invoices) and COGS Customs Invoices.

src/prompts/prompt_library.py
CHANGED
@@ -4,8 +4,6 @@ import os
 from pathlib import Path
 from typing import Dict
 
-from src.utils import transform_schema_strings
-
 
 class PromptLibrary:
     """
@@ -43,8 +41,6 @@ class PromptLibrary:
                 if file == "placeholders.json":
                     with open(path_to_library / prompt_type / prompt_subtype / file) as f:
                         placeholders = json.load(f)
-                    if prompt_type not in ["postprocessing", "preprocessing"]:
-                        placeholders = transform_schema_strings(placeholders)
                     self.library[prompt_type][prompt_subtype][
                         "placeholders"
                     ] = placeholders

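With the loader change above, placeholders are stored exactly as they appear in each placeholders.json; the value/page_number wrapping that used to happen once at import time is now applied per request in pdf_processing.py, and only when the document has more than one page. Illustrative access, using the library layout already relied on elsewhere in this diff:

from src.prompts.prompt_library import prompt_library

# Raw response schema, with no page_number wrapping applied at load time any more.
schema = prompt_library.library["partnerInvoice"]["other"]["placeholders"]
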
src/utils.py
CHANGED
{data_science_document_ai-1.43.3.dist-info → data_science_document_ai-1.43.5.dist-info}/WHEEL
RENAMED
File without changes