data-science-document-ai 1.43.6__tar.gz → 1.51.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/PKG-INFO +1 -1
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/pyproject.toml +1 -1
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/constants.py +6 -8
- data_science_document_ai-1.51.0/src/docai_processor_config.yaml +22 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/excel_processing.py +7 -18
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/io.py +23 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/llm.py +0 -29
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/pdf_processing.py +118 -53
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/postprocessing/common.py +132 -25
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/postprocessing/postprocess_partner_invoice.py +137 -58
- data_science_document_ai-1.51.0/src/prompts/library/arrivalNotice/other/placeholders.json +70 -0
- data_science_document_ai-1.51.0/src/prompts/library/arrivalNotice/other/prompt.txt +40 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bundeskasse/other/placeholders.json +5 -5
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bundeskasse/other/prompt.txt +6 -4
- data_science_document_ai-1.51.0/src/prompts/library/customsAssessment/other/placeholders.json +70 -0
- data_science_document_ai-1.51.0/src/prompts/library/customsAssessment/other/prompt.txt +29 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/customsInvoice/other/prompt.txt +1 -1
- data_science_document_ai-1.51.0/src/prompts/library/deliveryOrder/other/placeholders.json +82 -0
- data_science_document_ai-1.51.0/src/prompts/library/deliveryOrder/other/prompt.txt +36 -0
- data_science_document_ai-1.51.0/src/prompts/library/draftMbl/other/placeholders.json +80 -0
- data_science_document_ai-1.51.0/src/prompts/library/draftMbl/other/prompt.txt +34 -0
- data_science_document_ai-1.51.0/src/prompts/library/finalMbL/other/placeholders.json +80 -0
- data_science_document_ai-1.51.0/src/prompts/library/finalMbL/other/prompt.txt +34 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/partnerInvoice/other/prompt.txt +4 -2
- data_science_document_ai-1.51.0/src/prompts/library/preprocessing/carrier/placeholders.json +14 -0
- data_science_document_ai-1.51.0/src/prompts/library/shippingInstruction/other/placeholders.json +115 -0
- data_science_document_ai-1.51.0/src/prompts/library/shippingInstruction/other/prompt.txt +28 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/setup.py +9 -16
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/utils.py +63 -41
- data_science_document_ai-1.43.6/src/docai_processor_config.yaml +0 -64
- data_science_document_ai-1.43.6/src/prompts/library/customsAssessment/other/prompt.txt +0 -42
- data_science_document_ai-1.43.6/src/prompts/library/deliveryOrder/other/placeholders.json +0 -29
- data_science_document_ai-1.43.6/src/prompts/library/deliveryOrder/other/prompt.txt +0 -50
- data_science_document_ai-1.43.6/src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -45
- data_science_document_ai-1.43.6/src/prompts/library/draftMbl/maersk/prompt.txt +0 -19
- data_science_document_ai-1.43.6/src/prompts/library/draftMbl/other/placeholders.json +0 -80
- data_science_document_ai-1.43.6/src/prompts/library/draftMbl/other/prompt.txt +0 -44
- data_science_document_ai-1.43.6/src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -44
- data_science_document_ai-1.43.6/src/prompts/library/finalMbL/maersk/prompt.txt +0 -19
- data_science_document_ai-1.43.6/src/prompts/library/finalMbL/other/prompt.txt +0 -44
- data_science_document_ai-1.43.6/src/prompts/library/preprocessing/carrier/placeholders.json +0 -30
- data_science_document_ai-1.43.6/src/prompts/library/shippingInstruction/other/prompt.txt +0 -16
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/constants_sandbox.py +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/docai.py +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/log_setup.py +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/postprocessing/postprocess_booking_confirmation.py +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/postprocessing/postprocess_commercial_invoice.py +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bookingConfirmation/evergreen/prompt.txt +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bookingConfirmation/maersk/placeholders.json +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bookingConfirmation/maersk/prompt.txt +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bookingConfirmation/msc/placeholders.json +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bookingConfirmation/msc/prompt.txt +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bookingConfirmation/oocl/placeholders.json +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bookingConfirmation/oocl/prompt.txt +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bookingConfirmation/other/placeholders.json +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bookingConfirmation/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bookingConfirmation/yangming/placeholders.json +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bookingConfirmation/yangming/prompt.txt +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/commercialInvoice/other/placeholders.json +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/commercialInvoice/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/customsInvoice/other/placeholders.json +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/packingList/other/placeholders.json +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/packingList/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/partnerInvoice/other/placeholders.json +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/postprocessing/port_code/placeholders.json +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/postprocessing/port_code/prompt_port_code.txt +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/preprocessing/carrier/prompt.txt +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/prompt_library.py +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/tms.py +0 -0
{data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/pyproject.toml
RENAMED

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "data-science-document-ai"
-version = "1.43.6"
+version = "1.51.0"
 description = "\"Document AI repo for data science\""
 authors = ["Naomi Nguyen <naomi.nguyen@forto.com>", "Kumar Rajendrababu <kumar.rajendrababu@forto.com>", "Igor Tonko <igor.tonko@forto.com>", "Osman Demirel <osman.demirel@forto.com>"]
 packages = [
{data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/constants.py
RENAMED

@@ -23,9 +23,12 @@ project_parameters = {
     "invoice_classification_lookup": "invoice_classification.json",
     "reverse_charge_sentence_lookup": "reverse_charge_sentences.json",
     # Fuzzy logic params
-    "fuzzy_threshold_item_code":
+    "fuzzy_threshold_item_code": 90,
     "fuzzy_threshold_reverse_charge": 80,
     "fuzzy_threshold_invoice_classification": 70,
+    # Chunking params
+    "chunk_size": 1,  # page (do not change this without changing the page number logic)
+    "chunk_after": 10,  # pages
     # Big Query
     "g_ai_gbq_db_schema": "document_ai",
     "g_ai_gbq_db_table_out": "document_ai_api_calls_v1",
@@ -50,13 +53,6 @@ project_parameters = {
     "model_selector": {
         "stable": {
             "bookingConfirmation": 1,
-            "finalMbL": 0,
-            "draftMbl": 0,
-            "arrivalNotice": 0,
-            "shippingInstruction": 0,
-            "customsAssessment": 0,
-            "deliveryOrder": 0,
-            "partnerInvoice": 0,
         },
         "beta": {
             "bookingConfirmation": 0,
@@ -84,8 +80,10 @@ project_parameters = {
     # Key to combine the LLM results with the Doc Ai results
     "key_to_combine": {
         "bookingConfirmation": ["transportLegs"],
+        "arrivalNotice": ["containers"],
         "finalMbL": ["containers"],
         "draftMbl": ["containers"],
+        "deliveryOrder": ["Equipment", "TransportLeg"],
         "customsAssessment": ["containers"],
         "packingList": ["skuData"],
         "commercialInvoice": ["skus"],
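For reference, the two new chunking knobs gate the page-chunked LLM path added in src/pdf_processing.py below; a minimal sketch of how they are consumed (the params dict is a stand-in for project_parameters):

    # Stand-in for project_parameters; see process_file_w_llm further down in this diff.
    params = {"chunk_size": 1, "chunk_after": 10}

    for number_of_pages in (3, 10, 25):
        use_chunking = number_of_pages >= params["chunk_after"]
        print(number_of_pages, "chunked" if use_chunking else "single pass")
    # 3 single pass, 10 chunked, 25 chunked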
data_science_document_ai-1.51.0/src/docai_processor_config.yaml
ADDED

@@ -0,0 +1,22 @@
+models_project_id: "738250249861"
+model_config:
+  stable:
+    bookingConfirmation:
+      - id: "dc3e714cd168aeaa"
+        details:
+          display_name: "doc_cap_bookingConfirmation"
+          author: "reet.kanjilal@forto.com"
+          created_date: ""
+      - id: "3c280b11bdb3ed89"
+        details:
+          display_name: "doc_cap_BC_mlg"
+          author: "igor.tonko@forto.com"
+          created_date: ""
+
+  beta:
+    bookingConfirmation:
+      - id: "3c280b11bdb3ed89"
+        details:
+          display_name: "doc_cap_BC_mlg"
+          author: "igor.tonko@forto.com"
+          created_date: ""
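This trimmed YAML registry replaces the 64-line version removed from 1.43.6 (see the deleted file in the list above). A minimal loading sketch, assuming PyYAML; the index-based selection mirrors the `model_selector` integers in src/constants.py, which plausibly index into these per-channel model lists:

    import yaml

    with open("src/docai_processor_config.yaml") as fh:
        cfg = yaml.safe_load(fh)

    channel, doc_type, index = "stable", "bookingConfirmation", 1  # model_selector value
    model = cfg["model_config"][channel][doc_type][index]
    print(model["id"], model["details"]["display_name"])  # 3c280b11bdb3ed89 doc_cap_BC_mlg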
{data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/excel_processing.py
RENAMED

@@ -4,8 +4,6 @@ import logging
 
 from ddtrace import tracer
 
-from src.postprocessing.common import llm_prediction_to_tuples
-
 logger = logging.getLogger(__name__)
 
 import asyncio
@@ -13,9 +11,8 @@ import asyncio
 import numpy as np
 import pandas as pd
 
-from src.llm import prompt_excel_extraction
 from src.prompts.prompt_library import prompt_library
-from src.utils import estimate_page_count,
+from src.utils import estimate_page_count, get_excel_sheets
 
 
 async def extract_data_from_sheet(
@@ -31,11 +28,14 @@ async def extract_data_from_sheet(
     )
 
     # Prompt for the LLM JSON
-
+    prompt = prompt_library.library[doc_type]["other"]["prompt"]
+
+    # Join the worksheet content with the prompt
+    prompt = worksheet + "\n" + prompt
 
     try:
         result = await llm_client.get_unified_json_genai(
-
+            prompt,
             response_schema=response_schema,
             doc_type=doc_type,
         )
@@ -69,18 +69,7 @@ async def extract_data_from_excel(
 
     """
     # Generate the response structure
-    response_schema = (
-        prompt_library.library[input_doc_type]["other"]["placeholders"]
-        if input_doc_type
-        in [
-            "partnerInvoice",
-            "customsInvoice",
-            "bundeskasse",
-            "commercialInvoice",
-            "packingList",
-        ]
-        else generate_schema_structure(params, input_doc_type)
-    )
+    response_schema = prompt_library.library[input_doc_type]["other"]["placeholders"]
 
     # Load the Excel file and get ONLY the "visible" sheet names
     sheets, workbook = get_excel_sheets(file_content, mime_type)
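Sheet extraction now builds its prompt directly from the prompt library instead of the deleted prompt_excel_extraction helper; a sketch of the assembly (the worksheet text is invented):

    from src.prompts.prompt_library import prompt_library

    doc_type = "partnerInvoice"
    worksheet = "Sheet1:\nInvoice No | Amount\nINV-1 | 100.00"  # structured sheet text

    # Document text first, extraction instructions after, as in extract_data_from_sheet.
    prompt = worksheet + "\n" + prompt_library.library[doc_type]["other"]["prompt"]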
{data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/io.py
RENAMED

@@ -156,4 +156,27 @@ def download_dir_from_bucket(bucket, directory_cloud, directory_local) -> bool:
     return result
 
 
+def bq_logs(data_to_insert, params):
+    """Insert logs into Google BigQuery.
+
+    Args:
+        data_to_insert (list): The data to insert into BigQuery.
+        params (dict): The parameters dictionary.
+    """
+    # Use the pre-initialized BigQuery client
+    bq_client = params["bq_client"]
+    # Get the table string
+    table_string = f"{params['g_ai_project_name']}.{params['g_ai_gbq_db_schema']}.{params['g_ai_gbq_db_table_out']}"
+
+    logger.info(f"Log table: {table_string}")
+    # Insert the rows into the table
+    insert_logs = bq_client.insert_rows_json(table_string, data_to_insert)
+
+    # Check if there were any errors inserting the rows
+    if not insert_logs:
+        logger.info("New rows have been added.")
+    else:
+        logger.info("Errors occurred while inserting rows: ", insert_logs)
+
+
 # type: ignore
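A usage sketch for the new helper (google-cloud-bigquery's insert_rows_json returns an empty list on success and per-row error dicts otherwise, which is what the `if not insert_logs` check relies on); project and row values are invented:

    from google.cloud import bigquery

    from src.io import bq_logs

    params = {
        "bq_client": bigquery.Client(),              # pre-initialized client
        "g_ai_project_name": "my-gcp-project",       # assumption: set elsewhere in params
        "g_ai_gbq_db_schema": "document_ai",
        "g_ai_gbq_db_table_out": "document_ai_api_calls_v1",
    }
    rows = [{"doc_type": "partnerInvoice", "status": "ok"}]  # hypothetical payload
    bq_logs(rows, params)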
{data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/llm.py
RENAMED

@@ -201,33 +201,4 @@ class LlmClient:
         return response
 
 
-def prompt_excel_extraction(excel_structured_text):
-    """Write a prompt to extract data from Excel files.
-
-    Args:
-        excel_structured_text (str): The structured text of the Excel file.
-
-    Returns:
-        prompt str: The prompt for common json.
-    """
-    prompt = f"""{excel_structured_text}
-
-    Task: Fill in the following dictionary from the information in the given in the above excel data.
-
-    Instructions:
-    - Do not change the keys of the following dictionary.
-    - The values should be filled in as per the schema provided below.
-    - If an entity contains a 'display_name', consider its properties as child data points in the below format.
-    {{'data-field': {{
-        'child-data-field': 'type -occurrence_type- description',
-        }}
-    }}
-    - The entity with 'display_name' can be extracted multiple times. Please pay attention to the occurrence_type.
-    - Ensure the schema reflects the hierarchical relationship.
-    - Use the data field description to understand the context of the data.
-
-    """
-    return prompt
-
-
 # pylint: enable=all
{data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/pdf_processing.py
RENAMED

@@ -36,6 +36,7 @@ from src.utils import (
     get_pdf_page_count,
     get_processor_name,
     run_background_tasks,
+    split_pdf_into_chunks,
     transform_schema_strings,
     validate_based_on_schema,
 )
@@ -195,46 +196,32 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
         result (dict): The structured data extracted from the document, formatted as JSON.
     """
     # Bundeskasse invoices contains all the required information in the first 3 pages.
-
-        extract_top_pages(file_content, num_pages=5)
-
-        else file_content
-    )
+    if input_doc_type == "bundeskasse":
+        file_content = extract_top_pages(file_content, num_pages=5)
+
     number_of_pages = get_pdf_page_count(file_content)
+    logger.info(f"processing {input_doc_type} with {number_of_pages} pages...")
 
-    #
-
-
-    # get the schema placeholder from the Doc AI and generate the response structure
-    response_schema = (
-        prompt_library.library[input_doc_type]["other"]["placeholders"]
-        if input_doc_type
-        in [
-            "partnerInvoice",
-            "customsInvoice",
-            "bundeskasse",
-            "commercialInvoice",
-            "packingList",
-        ]
-        else generate_schema_structure(params, input_doc_type)
-    )
+    # get the schema placeholder
+    response_schema = prompt_library.library[input_doc_type]["other"]["placeholders"]
 
     carrier = "other"
-
-
-
-
-
-    )
-    carrier_schema = prompt_library.library["preprocessing"]["carrier"][
-        "placeholders"
-    ][input_doc_type]
+    carrier_schema = (
+        prompt_library.library.get("preprocessing", {})
+        .get("carrier", {})
+        .get("placeholders", {})
+        .get(input_doc_type)
+    )
 
+    if carrier_schema:
         carrier_prompt = prompt_library.library["preprocessing"]["carrier"]["prompt"]
         carrier_prompt = carrier_prompt.replace(
             "DOCUMENT_TYPE_PLACEHOLDER", input_doc_type
         )
 
+        # convert file_content to required document
+        document = llm_client.prepare_document_for_gemini(file_content)
+
         # identify carrier for customized prompting
         carrier = await identify_carrier(
             document,
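The lookup is now failure-tolerant: document types without a carrier placeholder fall through to the default `carrier = "other"` instead of raising. A toy version of the change (library contents invented):

    library = {
        "preprocessing": {
            "carrier": {"placeholders": {"bookingConfirmation": {"type": "OBJECT"}}}
        }
    }

    for doc_type in ("bookingConfirmation", "partnerInvoice"):
        carrier_schema = (
            library.get("preprocessing", {})
            .get("carrier", {})
            .get("placeholders", {})
            .get(doc_type)  # None instead of KeyError when absent
        )
        print(doc_type, "->", "identify carrier" if carrier_schema else "keep 'other'")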
@@ -244,37 +231,115 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
         doc_type=input_doc_type,
     )
 
-
-    response_schema = prompt_library.library[input_doc_type][carrier][
-        "placeholders"
-    ]
-
+    # Select prompt
     if (
-        input_doc_type in prompt_library.library
-
+        input_doc_type not in prompt_library.library
+        or carrier not in prompt_library.library[input_doc_type]
     ):
-
-        prompt = prompt_library.library[input_doc_type][carrier]["prompt"]
+        return {}
 
-
-
-        response_schema = transform_schema_strings(response_schema)
+    # get the related prompt from predefined prompt library
+    prompt = prompt_library.library[input_doc_type][carrier]["prompt"]
 
-
-
+    # Add page-number extraction for moderately large docs
+    use_chunking = number_of_pages >= params["chunk_after"]
 
-
-
-
-
-
-
+    # Update schema and prompt to extract value-page_number pairs
+    if not use_chunking and number_of_pages > 1:
+        response_schema = transform_schema_strings(response_schema)
+        prompt += "\nFor each field, provide the page number where the information was found. The page numbering starts from 0."
+
+    tasks = []
+    # Process in chunks if number of pages exceeds threshold and Process all chunks concurrently
+    for chunk in (
+        split_pdf_into_chunks(file_content, chunk_size=params["chunk_size"])
+        if use_chunking
+        else [file_content]
+    ):
+        tasks.append(
+            process_chunk_with_retry(
+                chunk, prompt, response_schema, llm_client, input_doc_type
+            )
         )
 
-
+    results = await asyncio.gather(*tasks, return_exceptions=True)
 
-
-
+    if use_chunking:
+        return merge_llm_results(results, response_schema)
+    else:
+        return llm_prediction_to_tuples(results[0], number_of_pages=number_of_pages)
+
+
+async def process_chunk_with_retry(
+    chunk_content, prompt, response_schema, llm_client, input_doc_type, retries=2
+):
+    """Process a chunk with retries in case of failure."""
+    for attempt in range(1, retries + 1):
+        try:
+            return await process_chunk(
+                chunk_content=chunk_content,
+                prompt=prompt,
+                response_schema=response_schema,
+                llm_client=llm_client,
+                input_doc_type=input_doc_type,
+            )
+        except Exception as e:
+            logger.error(f"Chunk failed on attempt {attempt}: {e}")
+            if attempt == retries:
+                raise
+            await asyncio.sleep(1)  # small backoff
+
+
+async def process_chunk(
+    chunk_content, prompt, response_schema, llm_client, input_doc_type
+):
+    """Process a chunk with Gemini."""
+    document = llm_client.prepare_document_for_gemini(chunk_content)
+    return await llm_client.get_unified_json_genai(
+        prompt=prompt,
+        document=document,
+        response_schema=response_schema,
+        doc_type=input_doc_type,
+    )
+
+
+def merge_llm_results(results, response_schema):
+    """Merge LLM results from multiple chunks."""
+    merged = {}
+    for i, result in enumerate(results):
+        if not isinstance(result, dict):
+            continue
+        # Add page number to all values coming from this chunk
+        result = llm_prediction_to_tuples(result, number_of_pages=1, page_number=i)
+
+        # Merge the result into the final merged dictionary
+        for key, value in result.items():
+            field_type = (
+                response_schema["properties"].get(key, {}).get("type", "").upper()
+            )
+
+            if key not in merged:
+                if field_type == "ARRAY":
+                    # append the values as a list
+                    merged[key] = (
+                        value if isinstance(value, list) else ([value] if value else [])
+                    )
+                else:
+                    merged[key] = value
+                continue
+
+            if field_type == "ARRAY":
+                # append list contents across chunks
+                if isinstance(value, list):
+                    merged[key].extend(value)
+                else:
+                    merged[key].append(value)
+
+            # take first non-null value only
+            if merged[key] in (None, "", [], {}):
+                merged[key] = value
+
+    return merged
 
 
 async def extract_data_from_pdf_w_llm(params, input_doc_type, file_content, llm_client):
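The merge rules are easiest to see on toy data: ARRAY fields concatenate across chunks, scalar fields keep the first non-empty value, and each chunk's index doubles as the page number (hence the chunk_size=1 warning in src/constants.py). An invented two-chunk trace of the core rules:

    schema = {"properties": {"containers": {"type": "ARRAY"}, "blNumber": {"type": "STRING"}}}
    chunk_results = [
        {"blNumber": "", "containers": [{"containerNumber": "TCNU1234567"}]},            # chunk/page 0
        {"blNumber": "HLCUHAM123", "containers": [{"containerNumber": "MSKU7654321"}]},  # chunk/page 1
    ]

    merged = {}
    for result in chunk_results:
        for key, value in result.items():
            if schema["properties"].get(key, {}).get("type") == "ARRAY":
                merged.setdefault(key, []).extend(value)   # arrays grow across chunks
            elif merged.get(key) in (None, "", [], {}):
                merged[key] = value                        # first non-empty scalar wins

    print(len(merged["containers"]), merged["blNumber"])   # 2 HLCUHAM123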
{data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/postprocessing/common.py
RENAMED

@@ -12,7 +12,7 @@ from src.constants import formatting_rules
 from src.io import logger
 from src.postprocessing.postprocess_partner_invoice import process_partner_invoice
 from src.prompts.prompt_library import prompt_library
-from src.utils import get_tms_mappings
+from src.utils import batch_fetch_all_mappings, get_tms_mappings
 
 tms_domain = os.environ["TMS_DOMAIN"]
 
@@ -134,8 +134,11 @@ def extract_number(data_field_value):
         formatted_value: string
 
     """
+    # Remove container size pattern like 20FT, 40HC, etc from 1 x 40HC
+    value = remove_unwanted_patterns(data_field_value)
+
     formatted_value = ""
-    for c in data_field_value:
+    for c in value:
         if c.isnumeric() or c in [",", ".", "-"]:
             formatted_value += c
 
@@ -320,9 +323,12 @@ def remove_unwanted_patterns(lineitem: str):
     lineitem = lineitem.replace("HIGH CUBE", "")
 
     # Remove container size e.g., 20FT, 40HC, etc.
-
-
-
+    pattern = [
+        f"{s}{t}"
+        for s in ("20|22|40|45".split("|"))
+        for t in ("FT|HC|DC|HD|GP|OT|RF|FR|TK|DV".split("|"))
+    ]
+    lineitem = re.sub(r"|".join(pattern), "", lineitem, flags=re.IGNORECASE).strip()
 
     return lineitem
 
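The comprehension expands to a 40-token alternation (20FT, 20HC, ..., 45DV); a quick check of the behavior, which extract_number above now relies on so that "1 x 40HC" yields 1 rather than 140:

    import re

    pattern = [
        f"{s}{t}"
        for s in "20|22|40|45".split("|")
        for t in "FT|HC|DC|HD|GP|OT|RF|FR|TK|DV".split("|")
    ]
    regex = r"|".join(pattern)  # "20FT|20HC|...|45DV"

    print(re.sub(regex, "", "1 x 40HC", flags=re.IGNORECASE).strip())  # "1 x"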
@@ -372,18 +378,45 @@ def clean_item_description(lineitem: str, remove_numbers: bool = True):
     return re.sub(r"\s{2,}", " ", lineitem).strip()
 
 
-async def format_label(entity_k, entity_value, document_type_code, params, mime_type):
+async def format_label(
+    entity_k,
+    entity_value,
+    document_type_code,
+    params,
+    mime_type,
+    container_map,
+    terminal_map,
+    depot_map,
+):
     llm_client = params["LlmClient"]
     if isinstance(entity_value, dict):  # if it's a nested entity
         format_tasks = [
-            format_label(sub_k, sub_v, document_type_code, params, mime_type)
+            format_label(
+                sub_k,
+                sub_v,
+                document_type_code,
+                params,
+                mime_type,
+                container_map,
+                terminal_map,
+                depot_map,
+            )
             for sub_k, sub_v in entity_value.items()
         ]
         return entity_k, {k: v for k, v in await asyncio.gather(*format_tasks)}
     if isinstance(entity_value, list):
         format_tasks = await asyncio.gather(
             *[
-                format_label(entity_k, sub_v, document_type_code, params, mime_type)
+                format_label(
+                    entity_k,
+                    sub_v,
+                    document_type_code,
+                    params,
+                    mime_type,
+                    container_map,
+                    terminal_map,
+                    depot_map,
+                )
                 for sub_v in entity_value
             ]
         )
@@ -405,13 +438,13 @@ async def format_label(entity_k, entity_value, document_type_code, params, mime_
     )
 
     elif (entity_key == "containertype") or (entity_key == "containersize"):
-        formatted_value =
+        formatted_value = container_map.get(entity_value)
 
     elif check_formatting_rule(entity_k, document_type_code, "terminal"):
-        formatted_value =
+        formatted_value = terminal_map.get(entity_value)
 
     elif check_formatting_rule(entity_k, document_type_code, "depot"):
-        formatted_value =
+        formatted_value = depot_map.get(entity_value)
 
     elif entity_key.startswith(("eta", "etd", "duedate", "issuedate", "servicedate")):
         try:
@@ -507,7 +540,8 @@ async def get_port_code_ai(port: str, llm_client, doc_type=None):
     """Get port code using AI model."""
     port_llm = await get_port_code_llm(port, llm_client, doc_type=doc_type)
 
-
+    result = await get_tms_mappings(port, "ports", port_llm)
+    return result.get(port, None)
 
 
 async def get_port_code_llm(port: str, llm_client, doc_type=None):
@@ -598,6 +632,74 @@ def decimal_convertor(value, quantity=False):
     return value
 
 
+async def collect_mapping_requests(entity_value, document_type_code):
+    """Collect all unique container types, terminals, and depots from the entity value."""
+    # Sets to store unique values
+    container_types = set()
+    terminals = set()
+    depots = set()
+
+    def walk(key, value):
+        key_lower = key.lower()
+
+        # nested dict
+        if isinstance(value, dict):
+            for k, v in value.items():
+                walk(k, v)
+
+        # list of values
+        elif isinstance(value, list):
+            for item in value:
+                walk(key, item)
+
+        # leaf node
+        else:
+            if key_lower in ("containertype", "containersize"):
+                # Take only "20DV" from ('20DV', 0) if it's a tuple
+                container_types.add(value[0]) if isinstance(
+                    value, tuple
+                ) else container_types.add(value)
+
+            elif check_formatting_rule(key, document_type_code, "terminal"):
+                terminals.add(value[0]) if isinstance(value, tuple) else terminals.add(
+                    value
+                )
+
+            elif check_formatting_rule(key, document_type_code, "depot"):
+                depots.add(value[0]) if isinstance(value, tuple) else depots.add(value)
+
+    walk("root", entity_value)
+
+    return container_types, terminals, depots
+
+
+async def format_all_labels(entity_data, document_type_code, params, mime_type):
+    """Format all labels in the entity data using cached mappings."""
+    # Collect all mapping values needed
+    container_req, terminal_req, depot_req = await collect_mapping_requests(
+        entity_data, document_type_code
+    )
+
+    # Batch fetch mappings
+    container_map, terminal_map, depot_map = await batch_fetch_all_mappings(
+        container_req, terminal_req, depot_req
+    )
+
+    # Format labels using cached mappings
+    _, result = await format_label(
+        "root",
+        entity_data,
+        document_type_code,
+        params,
+        mime_type,
+        container_map,
+        terminal_map,
+        depot_map,
+    )
+
+    return _, result
+
+
 async def format_all_entities(result, document_type_code, params, mime_type):
     """Format the entity values in the result dictionary."""
     # Since we treat `customsInvoice` same as `partnerInvoice`
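The new flow is collect-then-batch: one recursive pass gathers every value that needs a TMS lookup, one batched call resolves them, and format_label then reads from plain dicts instead of fetching per leaf. A toy illustration of the collection pass (data invented; the tuples are the (value, page_number) pairs produced upstream):

    entity_data = {
        "containers": [
            {"containerType": ("40HC", 0), "terminal": ("CTA", 1)},
            {"containerType": ("20DV", 2)},
        ]
    }

    def collect(value, found):
        # One pass over the nested result gathers every value needing a lookup.
        if isinstance(value, dict):
            for k, v in value.items():
                if k.lower() in ("containertype", "containersize"):
                    found.add(v[0] if isinstance(v, tuple) else v)
                else:
                    collect(v, found)
        elif isinstance(value, list):
            for item in value:
                collect(item, found)

    container_types = set()
    collect(entity_data, container_types)
    print(container_types)  # {'40HC', '20DV'} -> one batched lookup, not one per leaf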
@@ -613,13 +715,13 @@ async def format_all_entities(result, document_type_code, params, mime_type):
         return {}
 
     # Format all entities recursively
-    _, aggregated_data = await
-
+    _, aggregated_data = await format_all_labels(
+        result, document_type_code, params, mime_type
     )
 
     # Process partner invoice on lineitem mapping and reverse charge sentence
     if document_type_code in ["partnerInvoice", "bundeskasse"]:
-        process_partner_invoice(params, aggregated_data, document_type_code)
+        await process_partner_invoice(params, aggregated_data, document_type_code)
 
     logger.info("Data Extraction completed successfully")
     return aggregated_data
@@ -651,41 +753,46 @@ def remove_stop_words(lineitem: str):
     )
 
 
-def llm_prediction_to_tuples(llm_prediction, number_of_pages=-1):
+def llm_prediction_to_tuples(llm_prediction, number_of_pages=-1, page_number=None):
     """Convert LLM prediction dictionary to tuples of (value, page_number)."""
-
     # If only 1 page, simply pair each value with page number 0
     if number_of_pages == 1:
+        effective_page = 0 if page_number is None else page_number
         if isinstance(llm_prediction, dict):
             return {
-                k: llm_prediction_to_tuples(
+                k: llm_prediction_to_tuples(
+                    v, number_of_pages, page_number=effective_page
+                )
                 for k, v in llm_prediction.items()
             }
         elif isinstance(llm_prediction, list):
             return [
-                llm_prediction_to_tuples(v, number_of_pages)
+                llm_prediction_to_tuples(v, number_of_pages, page_number=effective_page)
+                for v in llm_prediction
             ]
         else:
-            return (llm_prediction,
+            return (llm_prediction, effective_page) if llm_prediction else None
 
     # logic for multi-page predictions
     if isinstance(llm_prediction, dict):
        if "page_number" in llm_prediction.keys() and "value" in llm_prediction.keys():
            if llm_prediction["value"]:
                try:
-
+                    _page_number = int(llm_prediction["page_number"])
                except:  # noqa: E722
-
-                    return (llm_prediction["value"],
+                    _page_number = -1
+                return (llm_prediction["value"], _page_number)
            return None
 
        for key, value in llm_prediction.items():
            llm_prediction[key] = llm_prediction_to_tuples(
-                llm_prediction.get(key, value), number_of_pages
+                llm_prediction.get(key, value), number_of_pages, page_number
            )
 
    elif isinstance(llm_prediction, list):
        for i, item in enumerate(llm_prediction):
-            llm_prediction[i] = llm_prediction_to_tuples(
+            llm_prediction[i] = llm_prediction_to_tuples(
+                item, number_of_pages, page_number
+            )
 
    return llm_prediction