data-science-document-ai 1.43.7__py3-none-any.whl → 1.45.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-science-document-ai
3
- Version: 1.43.7
3
+ Version: 1.45.0
4
4
  Summary: "Document AI repo for data science"
5
5
  Author: Naomi Nguyen
6
6
  Author-email: naomi.nguyen@forto.com
@@ -1,16 +1,16 @@
1
- src/constants.py,sha256=rpYIecVLIBLh98YrJ8e5gdvM0bqrXJZWIKgFkUSn69g,3513
1
+ src/constants.py,sha256=HKHP9MqkLrC6pHgOt0XX2F8j6kbupXJ4HscClDwMBaM,3656
2
2
  src/constants_sandbox.py,sha256=Iu6HdjCoNSmOX0AwoL9qUQkhq_ZnIN5U9e-Q2UfNuGc,547
3
3
  src/docai.py,sha256=dHuR0ehVjUi1CnoNvdp_yxJtpU_HFXqAZ61ywdz7BEo,5655
4
4
  src/docai_processor_config.yaml,sha256=81NUGs-u8UFJm6mc0ZOeeNQlhe9h0f35GhjTcwErvTA,1717
5
- src/excel_processing.py,sha256=PdypkXHf-hln5cq5TyJ_IVybZk-rJF1NKZ50KXuOSdY,3390
6
- src/io.py,sha256=tOJpMyI-mP1AaXKG4UFudH47MHWzjWBgVahFJUcjGfs,4749
5
+ src/excel_processing.py,sha256=_vP2q1xEIeyjO8TvZlSTeEM-M1PMceyDSuYGfyZeceY,3361
6
+ src/io.py,sha256=rYjXVLlriEacw1uNuPIYhg12bXNu48Qs9GYMY2YcVTE,5563
7
7
  src/llm.py,sha256=OE4IEIqcM-hYK9U7e0x1rAfcqdpeo4iXPHBp64L5Qz0,8199
8
8
  src/log_setup.py,sha256=RhHnpXqcl-ii4EJzRt47CF2R-Q3YPF68tepg_Kg7tkw,2895
9
- src/pdf_processing.py,sha256=DaFM8ioERj7YeC8Yjki_dfSnKt0lf7DB14ks9i4OAfA,17741
10
- src/postprocessing/common.py,sha256=fU3ECfnR0rpF21DnVYM2YM7kPEB4gRJuMasyrNupsaA,23026
9
+ src/pdf_processing.py,sha256=lzvoza9itpEyl-rcBQbIcWuFxUAvF_Qyc-OpuPQWWMk,20354
10
+ src/postprocessing/common.py,sha256=ao9_hnBXgLv4HOyj_6I00CSDGRiwG8IP_HPg_1Yjzmw,25883
11
11
  src/postprocessing/postprocess_booking_confirmation.py,sha256=nK32eDiBNbauyQz0oCa9eraysku8aqzrcoRFoWVumDU,4827
12
12
  src/postprocessing/postprocess_commercial_invoice.py,sha256=3I8ijluTZcOs_sMnFZxfkAPle0UFQ239EMuvZfDZVPg,1028
13
- src/postprocessing/postprocess_partner_invoice.py,sha256=koGR7dN37FqJcepdzkrzNBHuBBUuCp_3CrteScASqyE,10590
13
+ src/postprocessing/postprocess_partner_invoice.py,sha256=LZcMZfJeLdcbYqPemO8gn9SmJxv-NPmb4uVCT3lKg18,12341
14
14
  src/prompts/library/bookingConfirmation/evergreen/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
15
15
  src/prompts/library/bookingConfirmation/evergreen/prompt.txt,sha256=5ivskCG831M2scW3oqQaoltXIyHV-n6DYUygWycXxjw,2755
16
16
  src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json,sha256=hMPNt9s3LuxR85AxYy7bPcCDleug6gSwVjefm3ismWY,1405
@@ -53,7 +53,7 @@ src/prompts/library/shippingInstruction/other/prompt.txt,sha256=dT2e-dPuvuz0rVYp
53
53
  src/prompts/prompt_library.py,sha256=VJWHeXN-s501C2GiidIIvQQuZdU6T1R27hE2dKBiI40,2555
54
54
  src/setup.py,sha256=M-p5c8M9ejKcSZ9N86VtmtPc4TYLxe1_4_dxf6jpfVc,7262
55
55
  src/tms.py,sha256=UXbIo1QE--hIX6NZi5Qyp2R_CP338syrY9pCTPrfgnE,1741
56
- src/utils.py,sha256=iUFjfIKXl_MwkPXPMfK0ZAB9aZ__N6e8mWTBbBiPki4,16568
57
- data_science_document_ai-1.43.7.dist-info/METADATA,sha256=lajB-JuTBbL2uMTIlvdZ3rJiw5n9BFzTcXnIEYfgIj4,2152
58
- data_science_document_ai-1.43.7.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
59
- data_science_document_ai-1.43.7.dist-info/RECORD,,
56
+ src/utils.py,sha256=Ow5_Jals88o8mbZ1BoHfZpHZoCfig_UQb5aalH-mpWE,17278
57
+ data_science_document_ai-1.45.0.dist-info/METADATA,sha256=VblAnSZ_nlqjlEJtl0-ETS6tuELw9pThEKwxAxXomjA,2152
58
+ data_science_document_ai-1.45.0.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
59
+ data_science_document_ai-1.45.0.dist-info/RECORD,,
src/constants.py CHANGED
@@ -26,6 +26,9 @@ project_parameters = {
26
26
  "fuzzy_threshold_item_code": 70,
27
27
  "fuzzy_threshold_reverse_charge": 80,
28
28
  "fuzzy_threshold_invoice_classification": 70,
29
+ # Chunking params
30
+ "chunk_size": 1, # page (do not change this without changing the page number logic)
31
+ "chunk_after": 10, # pages
29
32
  # Big Query
30
33
  "g_ai_gbq_db_schema": "document_ai",
31
34
  "g_ai_gbq_db_table_out": "document_ai_api_calls_v1",
src/excel_processing.py CHANGED
@@ -4,8 +4,6 @@ import logging
4
4
 
5
5
  from ddtrace import tracer
6
6
 
7
- from src.postprocessing.common import llm_prediction_to_tuples
8
-
9
7
  logger = logging.getLogger(__name__)
10
8
 
11
9
  import asyncio
@@ -78,6 +76,7 @@ async def extract_data_from_excel(
78
76
  "bundeskasse",
79
77
  "commercialInvoice",
80
78
  "packingList",
79
+ "bookingConfirmation",
81
80
  ]
82
81
  else generate_schema_structure(params, input_doc_type)
83
82
  )
src/io.py CHANGED
@@ -156,4 +156,27 @@ def download_dir_from_bucket(bucket, directory_cloud, directory_local) -> bool:
156
156
  return result
157
157
 
158
158
 
159
+ def bq_logs(data_to_insert, params):
160
+ """Insert logs into Google BigQuery.
161
+
162
+ Args:
163
+ data_to_insert (list): The data to insert into BigQuery.
164
+ params (dict): The parameters dictionary.
165
+ """
166
+ # Use the pre-initialized BigQuery client
167
+ bq_client = params["bq_client"]
168
+ # Get the table string
169
+ table_string = f"{params['g_ai_project_name']}.{params['g_ai_gbq_db_schema']}.{params['g_ai_gbq_db_table_out']}"
170
+
171
+ logger.info(f"Log table: {table_string}")
172
+ # Insert the rows into the table
173
+ insert_logs = bq_client.insert_rows_json(table_string, data_to_insert)
174
+
175
+ # Check if there were any errors inserting the rows
176
+ if not insert_logs:
177
+ logger.info("New rows have been added.")
178
+ else:
179
+ logger.info("Errors occurred while inserting rows: ", insert_logs)
180
+
181
+
159
182
  # type: ignore
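
Note: `bq_logs` moves here from src/utils.py essentially unchanged. A hedged usage sketch, assuming google-cloud-bigquery is installed and credentials are configured; `insert_rows_json` is the library's standard streaming insert, and the project name and row payload below are placeholders:

```python
from google.cloud import bigquery

# from src.io import bq_logs  # inside the package

params = {
    "bq_client": bigquery.Client(),            # pre-initialized client, as bq_logs expects
    "g_ai_project_name": "my-gcp-project",     # placeholder project
    "g_ai_gbq_db_schema": "document_ai",
    "g_ai_gbq_db_table_out": "document_ai_api_calls_v1",
}

rows = [{"doc_type": "partnerInvoice", "status": "ok"}]  # keys must match the table schema

# bq_logs(rows, params)  # streams the rows and logs whether insert_rows_json reported errors
```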
src/pdf_processing.py CHANGED
@@ -36,6 +36,7 @@ from src.utils import (
36
36
  get_pdf_page_count,
37
37
  get_processor_name,
38
38
  run_background_tasks,
39
+ split_pdf_into_chunks,
39
40
  transform_schema_strings,
40
41
  validate_based_on_schema,
41
42
  )
@@ -195,15 +196,11 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
195
196
  result (dict): The structured data extracted from the document, formatted as JSON.
196
197
  """
197
198
  # Bundeskasse invoices contains all the required information in the first 3 pages.
198
- file_content = (
199
- extract_top_pages(file_content, num_pages=5)
200
- if input_doc_type == "bundeskasse"
201
- else file_content
202
- )
203
- number_of_pages = get_pdf_page_count(file_content)
199
+ if input_doc_type == "bundeskasse":
200
+ file_content = extract_top_pages(file_content, num_pages=5)
204
201
 
205
- # convert file_content to required document
206
- document = llm_client.prepare_document_for_gemini(file_content)
202
+ number_of_pages = get_pdf_page_count(file_content)
203
+ logger.info(f"processing {input_doc_type} with {number_of_pages} pages...")
207
204
 
208
205
  # get the schema placeholder from the Doc AI and generate the response structure
209
206
  response_schema = (
@@ -215,26 +212,28 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
215
212
  "bundeskasse",
216
213
  "commercialInvoice",
217
214
  "packingList",
215
+ "bookingConfirmation",
218
216
  ]
219
217
  else generate_schema_structure(params, input_doc_type)
220
218
  )
221
219
 
222
220
  carrier = "other"
223
- if (
224
- "preprocessing" in prompt_library.library.keys()
225
- and "carrier" in prompt_library.library["preprocessing"].keys()
226
- and input_doc_type
227
- in prompt_library.library["preprocessing"]["carrier"]["placeholders"].keys()
228
- ):
229
- carrier_schema = prompt_library.library["preprocessing"]["carrier"][
230
- "placeholders"
231
- ][input_doc_type]
221
+ carrier_schema = (
222
+ prompt_library.library.get("preprocessing", {})
223
+ .get("carrier", {})
224
+ .get("placeholders", {})
225
+ .get(input_doc_type)
226
+ )
232
227
 
228
+ if carrier_schema:
233
229
  carrier_prompt = prompt_library.library["preprocessing"]["carrier"]["prompt"]
234
230
  carrier_prompt = carrier_prompt.replace(
235
231
  "DOCUMENT_TYPE_PLACEHOLDER", input_doc_type
236
232
  )
237
233
 
234
+ # convert file_content to required document
235
+ document = llm_client.prepare_document_for_gemini(file_content)
236
+
238
237
  # identify carrier for customized prompting
239
238
  carrier = await identify_carrier(
240
239
  document,
@@ -244,37 +243,115 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
244
243
  doc_type=input_doc_type,
245
244
  )
246
245
 
247
- if input_doc_type == "bookingConfirmation":
248
- response_schema = prompt_library.library[input_doc_type][carrier][
249
- "placeholders"
250
- ]
251
-
246
+ # Select prompt
252
247
  if (
253
- input_doc_type in prompt_library.library.keys()
254
- and carrier in prompt_library.library[input_doc_type].keys()
248
+ input_doc_type not in prompt_library.library
249
+ or carrier not in prompt_library.library[input_doc_type]
255
250
  ):
256
- # get the related prompt from predefined prompt library
257
- prompt = prompt_library.library[input_doc_type][carrier]["prompt"]
251
+ return {}
258
252
 
259
- # Update schema to extract value-page_number pairs
260
- if number_of_pages > 1:
261
- response_schema = transform_schema_strings(response_schema)
253
+ # get the related prompt from predefined prompt library
254
+ prompt = prompt_library.library[input_doc_type][carrier]["prompt"]
262
255
 
263
- # Update the prompt to instruct LLM to include page numbers
264
- prompt += "\nFor each field, provide the page number where the information was found. The page numbering starts from 0."
256
+ # Add page-number extraction for moderately large docs
257
+ use_chunking = number_of_pages >= params["chunk_after"]
265
258
 
266
- # generate the result with LLM (gemini)
267
- result = await llm_client.get_unified_json_genai(
268
- prompt=prompt,
269
- document=document,
270
- response_schema=response_schema,
271
- doc_type=input_doc_type,
259
+ # Update schema and prompt to extract value-page_number pairs
260
+ if not use_chunking and number_of_pages > 1:
261
+ response_schema = transform_schema_strings(response_schema)
262
+ prompt += "\nFor each field, provide the page number where the information was found. The page numbering starts from 0."
263
+
264
+ tasks = []
265
+ # Process in chunks if number of pages exceeds threshold and Process all chunks concurrently
266
+ for chunk in (
267
+ split_pdf_into_chunks(file_content, chunk_size=params["chunk_size"])
268
+ if use_chunking
269
+ else [file_content]
270
+ ):
271
+ tasks.append(
272
+ process_chunk_with_retry(
273
+ chunk, prompt, response_schema, llm_client, input_doc_type
274
+ )
272
275
  )
273
276
 
274
- result = llm_prediction_to_tuples(result, number_of_pages)
277
+ results = await asyncio.gather(*tasks, return_exceptions=True)
275
278
 
276
- return result
277
- return {}
279
+ if use_chunking:
280
+ return merge_llm_results(results, response_schema)
281
+ else:
282
+ return llm_prediction_to_tuples(results[0], number_of_pages=number_of_pages)
283
+
284
+
285
+ async def process_chunk_with_retry(
286
+ chunk_content, prompt, response_schema, llm_client, input_doc_type, retries=2
287
+ ):
288
+ """Process a chunk with retries in case of failure."""
289
+ for attempt in range(1, retries + 1):
290
+ try:
291
+ return await process_chunk(
292
+ chunk_content=chunk_content,
293
+ prompt=prompt,
294
+ response_schema=response_schema,
295
+ llm_client=llm_client,
296
+ input_doc_type=input_doc_type,
297
+ )
298
+ except Exception as e:
299
+ logger.error(f"Chunk failed on attempt {attempt}: {e}")
300
+ if attempt == retries:
301
+ raise
302
+ await asyncio.sleep(1) # small backoff
303
+
304
+
305
+ async def process_chunk(
306
+ chunk_content, prompt, response_schema, llm_client, input_doc_type
307
+ ):
308
+ """Process a chunk with Gemini."""
309
+ document = llm_client.prepare_document_for_gemini(chunk_content)
310
+ return await llm_client.get_unified_json_genai(
311
+ prompt=prompt,
312
+ document=document,
313
+ response_schema=response_schema,
314
+ doc_type=input_doc_type,
315
+ )
316
+
317
+
318
+ def merge_llm_results(results, response_schema):
319
+ """Merge LLM results from multiple chunks."""
320
+ merged = {}
321
+ for i, result in enumerate(results):
322
+ if not isinstance(result, dict):
323
+ continue
324
+ # Add page number to all values coming from this chunk
325
+ result = llm_prediction_to_tuples(result, number_of_pages=1, page_number=i)
326
+
327
+ # Merge the result into the final merged dictionary
328
+ for key, value in result.items():
329
+ field_type = (
330
+ response_schema["properties"].get(key, {}).get("type", "").upper()
331
+ )
332
+
333
+ if key not in merged:
334
+ if field_type == "ARRAY":
335
+ # append the values as a list
336
+ merged[key] = (
337
+ value if isinstance(value, list) else ([value] if value else [])
338
+ )
339
+ else:
340
+ merged[key] = value
341
+ continue
342
+
343
+ if field_type == "ARRAY":
344
+ # append list contents across chunks
345
+ if isinstance(value, list):
346
+ merged[key].extend(value)
347
+ else:
348
+ merged[key].append(value)
349
+
350
+ # take first non-null value only
351
+ if merged[key] in (None, "", [], {}):
352
+ merged[key] = value
353
+
354
+ return merged
278
355
 
279
356
 
280
357
  async def extract_data_from_pdf_w_llm(params, input_doc_type, file_content, llm_client):
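
Note: taken together, the rewritten `process_file_w_llm` now splits long PDFs into chunks, fans the chunks out concurrently with per-chunk retries, and merges the per-chunk predictions. A simplified, self-contained sketch of that fan-out/merge shape; `process_one` and `process_document` are stand-ins for `process_chunk_with_retry` and the gather/merge code above, not package functions:

```python
import asyncio

async def process_one(chunk: bytes, page_index: int) -> dict:
    # Stand-in for a Gemini call: pretend each chunk yields one field tagged with its page.
    return {"invoiceNumber": (f"INV-{page_index}", page_index)}

async def process_document(chunks: list) -> dict:
    tasks = [process_one(chunk, i) for i, chunk in enumerate(chunks)]
    # return_exceptions=True keeps one failed chunk from sinking the whole document,
    # mirroring the asyncio.gather call in process_file_w_llm.
    results = await asyncio.gather(*tasks, return_exceptions=True)

    merged = {}
    for result in results:
        if not isinstance(result, dict):  # skip failed chunks, as merge_llm_results does
            continue
        for key, value in result.items():
            # Keep the first value seen; the real merge_llm_results also appends ARRAY
            # fields across chunks and backfills empty values.
            merged.setdefault(key, value)
    return merged

print(asyncio.run(process_document([b"page0", b"page1", b"page2"])))
```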
src/postprocessing/common.py CHANGED
@@ -12,7 +12,7 @@ from src.constants import formatting_rules
12
12
  from src.io import logger
13
13
  from src.postprocessing.postprocess_partner_invoice import process_partner_invoice
14
14
  from src.prompts.prompt_library import prompt_library
15
- from src.utils import get_tms_mappings
15
+ from src.utils import batch_fetch_all_mappings, get_tms_mappings
16
16
 
17
17
  tms_domain = os.environ["TMS_DOMAIN"]
18
18
 
@@ -372,18 +372,45 @@ def clean_item_description(lineitem: str, remove_numbers: bool = True):
372
372
  return re.sub(r"\s{2,}", " ", lineitem).strip()
373
373
 
374
374
 
375
- async def format_label(entity_k, entity_value, document_type_code, params, mime_type):
375
+ async def format_label(
376
+ entity_k,
377
+ entity_value,
378
+ document_type_code,
379
+ params,
380
+ mime_type,
381
+ container_map,
382
+ terminal_map,
383
+ depot_map,
384
+ ):
376
385
  llm_client = params["LlmClient"]
377
386
  if isinstance(entity_value, dict): # if it's a nested entity
378
387
  format_tasks = [
379
- format_label(sub_k, sub_v, document_type_code, params, mime_type)
388
+ format_label(
389
+ sub_k,
390
+ sub_v,
391
+ document_type_code,
392
+ params,
393
+ mime_type,
394
+ container_map,
395
+ terminal_map,
396
+ depot_map,
397
+ )
380
398
  for sub_k, sub_v in entity_value.items()
381
399
  ]
382
400
  return entity_k, {k: v for k, v in await asyncio.gather(*format_tasks)}
383
401
  if isinstance(entity_value, list):
384
402
  format_tasks = await asyncio.gather(
385
403
  *[
386
- format_label(entity_k, sub_v, document_type_code, params, mime_type)
404
+ format_label(
405
+ entity_k,
406
+ sub_v,
407
+ document_type_code,
408
+ params,
409
+ mime_type,
410
+ container_map,
411
+ terminal_map,
412
+ depot_map,
413
+ )
387
414
  for sub_v in entity_value
388
415
  ]
389
416
  )
@@ -405,13 +432,13 @@ async def format_label(entity_k, entity_value, document_type_code, params, mime_
405
432
  )
406
433
 
407
434
  elif (entity_key == "containertype") or (entity_key == "containersize"):
408
- formatted_value = get_tms_mappings(entity_value, "container_types")
435
+ formatted_value = container_map.get(entity_value)
409
436
 
410
437
  elif check_formatting_rule(entity_k, document_type_code, "terminal"):
411
- formatted_value = get_tms_mappings(entity_value, "terminals")
438
+ formatted_value = terminal_map.get(entity_value)
412
439
 
413
440
  elif check_formatting_rule(entity_k, document_type_code, "depot"):
414
- formatted_value = get_tms_mappings(entity_value, "depots")
441
+ formatted_value = depot_map.get(entity_value)
415
442
 
416
443
  elif entity_key.startswith(("eta", "etd", "duedate", "issuedate", "servicedate")):
417
444
  try:
@@ -507,7 +534,8 @@ async def get_port_code_ai(port: str, llm_client, doc_type=None):
507
534
  """Get port code using AI model."""
508
535
  port_llm = await get_port_code_llm(port, llm_client, doc_type=doc_type)
509
536
 
510
- return get_tms_mappings(port, "ports", port_llm)
537
+ result = await get_tms_mappings(port, "ports", port_llm)
538
+ return result.get(port, None)
511
539
 
512
540
 
513
541
  async def get_port_code_llm(port: str, llm_client, doc_type=None):
@@ -598,6 +626,74 @@ def decimal_convertor(value, quantity=False):
598
626
  return value
599
627
 
600
628
 
629
+ async def collect_mapping_requests(entity_value, document_type_code):
630
+ """Collect all unique container types, terminals, and depots from the entity value."""
631
+ # Sets to store unique values
632
+ container_types = set()
633
+ terminals = set()
634
+ depots = set()
635
+
636
+ def walk(key, value):
637
+ key_lower = key.lower()
638
+
639
+ # nested dict
640
+ if isinstance(value, dict):
641
+ for k, v in value.items():
642
+ walk(k, v)
643
+
644
+ # list of values
645
+ elif isinstance(value, list):
646
+ for item in value:
647
+ walk(key, item)
648
+
649
+ # leaf node
650
+ else:
651
+ if key_lower in ("containertype", "containersize"):
652
+ # Take only "20DV" from ('20DV', 0) if it's a tuple
653
+ container_types.add(value[0]) if isinstance(
654
+ value, tuple
655
+ ) else container_types.add(value)
656
+
657
+ elif check_formatting_rule(key, document_type_code, "terminal"):
658
+ terminals.add(value[0]) if isinstance(value, tuple) else terminals.add(
659
+ value
660
+ )
661
+
662
+ elif check_formatting_rule(key, document_type_code, "depot"):
663
+ depots.add(value[0]) if isinstance(value, tuple) else depots.add(value)
664
+
665
+ walk("root", entity_value)
666
+
667
+ return container_types, terminals, depots
668
+
669
+
670
+ async def format_all_labels(entity_data, document_type_code, params, mime_type):
671
+ """Format all labels in the entity data using cached mappings."""
672
+ # Collect all mapping values needed
673
+ container_req, terminal_req, depot_req = await collect_mapping_requests(
674
+ entity_data, document_type_code
675
+ )
676
+
677
+ # Batch fetch mappings
678
+ container_map, terminal_map, depot_map = await batch_fetch_all_mappings(
679
+ container_req, terminal_req, depot_req
680
+ )
681
+
682
+ # Format labels using cached mappings
683
+ _, result = await format_label(
684
+ "root",
685
+ entity_data,
686
+ document_type_code,
687
+ params,
688
+ mime_type,
689
+ container_map,
690
+ terminal_map,
691
+ depot_map,
692
+ )
693
+
694
+ return _, result
695
+
696
+
601
697
  async def format_all_entities(result, document_type_code, params, mime_type):
602
698
  """Format the entity values in the result dictionary."""
603
699
  # Since we treat `customsInvoice` same as `partnerInvoice`
@@ -613,13 +709,13 @@ async def format_all_entities(result, document_type_code, params, mime_type):
613
709
  return {}
614
710
 
615
711
  # Format all entities recursively
616
- _, aggregated_data = await format_label(
617
- None, result, document_type_code, params, mime_type
712
+ _, aggregated_data = await format_all_labels(
713
+ result, document_type_code, params, mime_type
618
714
  )
619
715
 
620
716
  # Process partner invoice on lineitem mapping and reverse charge sentence
621
717
  if document_type_code in ["partnerInvoice", "bundeskasse"]:
622
- process_partner_invoice(params, aggregated_data, document_type_code)
718
+ await process_partner_invoice(params, aggregated_data, document_type_code)
623
719
 
624
720
  logger.info("Data Extraction completed successfully")
625
721
  return aggregated_data
@@ -651,41 +747,46 @@ def remove_stop_words(lineitem: str):
651
747
  )
652
748
 
653
749
 
654
- def llm_prediction_to_tuples(llm_prediction, number_of_pages=-1):
750
+ def llm_prediction_to_tuples(llm_prediction, number_of_pages=-1, page_number=None):
655
751
  """Convert LLM prediction dictionary to tuples of (value, page_number)."""
656
-
657
752
  # If only 1 page, simply pair each value with page number 0
658
753
  if number_of_pages == 1:
754
+ effective_page = 0 if page_number is None else page_number
659
755
  if isinstance(llm_prediction, dict):
660
756
  return {
661
- k: llm_prediction_to_tuples(v, number_of_pages)
757
+ k: llm_prediction_to_tuples(
758
+ v, number_of_pages, page_number=effective_page
759
+ )
662
760
  for k, v in llm_prediction.items()
663
761
  }
664
762
  elif isinstance(llm_prediction, list):
665
763
  return [
666
- llm_prediction_to_tuples(v, number_of_pages) for v in llm_prediction
764
+ llm_prediction_to_tuples(v, number_of_pages, page_number=effective_page)
765
+ for v in llm_prediction
667
766
  ]
668
767
  else:
669
- return (llm_prediction, 0) if llm_prediction else None
768
+ return (llm_prediction, effective_page) if llm_prediction else None
670
769
 
671
770
  # logic for multi-page predictions
672
771
  if isinstance(llm_prediction, dict):
673
772
  if "page_number" in llm_prediction.keys() and "value" in llm_prediction.keys():
674
773
  if llm_prediction["value"]:
675
774
  try:
676
- page_number = int(llm_prediction["page_number"])
775
+ _page_number = int(llm_prediction["page_number"])
677
776
  except: # noqa: E722
678
- page_number = -1
679
- return (llm_prediction["value"], page_number)
777
+ _page_number = -1
778
+ return (llm_prediction["value"], _page_number)
680
779
  return None
681
780
 
682
781
  for key, value in llm_prediction.items():
683
782
  llm_prediction[key] = llm_prediction_to_tuples(
684
- llm_prediction.get(key, value), number_of_pages
783
+ llm_prediction.get(key, value), number_of_pages, page_number
685
784
  )
686
785
 
687
786
  elif isinstance(llm_prediction, list):
688
787
  for i, item in enumerate(llm_prediction):
689
- llm_prediction[i] = llm_prediction_to_tuples(item, number_of_pages)
788
+ llm_prediction[i] = llm_prediction_to_tuples(
789
+ item, number_of_pages, page_number
790
+ )
690
791
 
691
792
  return llm_prediction
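
Note: entity formatting now collects every container type, terminal, and depot up front, fetches the three mapping dictionaries in a single concurrent batch, and hands the resulting maps down to `format_label`, instead of calling the mapping service once per value. A rough sketch of that collect / batch-fetch / apply shape; `fetch_mappings` and `format_entities` are illustrative stand-ins for `batch_fetch_all_mappings` and `format_all_labels`:

```python
import asyncio

async def fetch_mappings(container_types: set) -> dict:
    # Stand-in for one batched call to the TMS mappings service.
    await asyncio.sleep(0)
    return {value: value.upper() for value in container_types}

async def format_entities(entities: dict) -> dict:
    # 1) Collect every value that needs a lookup (collect_mapping_requests).
    wanted = {v for k, v in entities.items() if k in ("containertype", "containersize")}
    # 2) One batched fetch instead of one request per value.
    container_map = await fetch_mappings(wanted)
    # 3) Apply the cached map while formatting (format_label now receives the maps).
    return {k: container_map.get(v, v) for k, v in entities.items()}

print(asyncio.run(format_entities({"containertype": "20dv", "blNumber": "ABC123"})))
```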
src/postprocessing/postprocess_partner_invoice.py CHANGED
@@ -1,7 +1,5 @@
1
1
  """This module contains the postprocessing functions for the partner invoice."""
2
- from concurrent.futures import ThreadPoolExecutor
3
-
4
- from fuzzywuzzy import fuzz
2
+ from rapidfuzz import fuzz, process
5
3
 
6
4
  from src.io import logger
7
5
  from src.utils import get_tms_mappings
@@ -136,7 +134,7 @@ def update_recipient_and_vendor(aggregated_data, is_recipient_forto):
136
134
  ] = "Dasbachstraße 15, 54292 Trier, Germany"
137
135
 
138
136
 
139
- def process_partner_invoice(params, aggregated_data, document_type_code):
137
+ async def process_partner_invoice(params, aggregated_data, document_type_code):
140
138
  """Process the partner invoice data."""
141
139
  # Post process bundeskasse invoices
142
140
  if document_type_code == "bundeskasse":
@@ -160,27 +158,76 @@ def process_partner_invoice(params, aggregated_data, document_type_code):
160
158
  reverse_charge_info["formattedValue"] = reverse_charge_value
161
159
  reverse_charge = aggregated_data.pop("reverseChargeSentence", None)
162
160
 
163
- # Process each line item
164
- for line_item in line_items:
165
- if line_item.get("lineItemDescription", None) is not None:
166
- line_item["itemCode"] = associate_forto_item_code(
167
- line_item["lineItemDescription"]["formattedValue"],
168
- params,
169
- )
161
+ # Process everything in one go
162
+ processed_items = await process_line_items_batch(params, line_items, reverse_charge)
170
163
 
171
- # Add page number for the consistency
172
- line_item["itemCode"]["page"] = line_item["lineItemDescription"]["page"]
164
+ # Update your main data structure
165
+ aggregated_data["lineItem"] = processed_items
173
166
 
174
- if reverse_charge:
175
- # Distribute reverseChargeSentence to all line items
176
- line_item["reverseChargeSentence"] = reverse_charge
177
- line_item["reverseChargeSentence"]["page"] = reverse_charge["page"]
178
167
 
168
+ async def process_line_items_batch(
169
+ params: dict, line_items: list[dict], reverse_charge=None
170
+ ):
171
+ """
172
+ Processes all line items efficiently using a "Split-Apply-Combine" strategy.
173
+ """
174
+ # To store items that need external API lookup
175
+ pending_line_items = {}
176
+
177
+ # Check Fuzzy Matching
178
+ logger.info(f"Mapping line item codes with Fuzzy matching....")
179
+ for i, item in enumerate(line_items):
180
+ description_obj = item.get("lineItemDescription")
181
+
182
+ if not description_obj or not description_obj.get("formattedValue"):
183
+ continue
184
+ # Get the formatted description text
185
+ desc = description_obj["formattedValue"]
186
+
187
+ # Find Fuzzy Match
188
+ matched_code = find_matching_lineitem(
189
+ desc,
190
+ params["lookup_data"]["item_code"],
191
+ params["fuzzy_threshold_item_code"],
192
+ )
193
+
194
+ if matched_code:
195
+ # Set the code to the line item
196
+ item["itemCode"] = {
197
+ "documentValue": desc,
198
+ "formattedValue": matched_code,
199
+ "page": description_obj.get("page"),
200
+ }
201
+ else:
202
+ # Store for batch API call
203
+ pending_line_items[i] = desc
204
+
205
+ # Batch API Call for Embedding lookups
206
+ if pending_line_items:
207
+ values_to_fetch = list(set(pending_line_items.values()))
208
+ logger.info(f"Mapping {len(values_to_fetch)} line items from Embedding API...")
209
+
210
+ # Await the batch response {"desc1": "code1", "desc2": "code2"}
211
+ api_results = await get_tms_mappings(
212
+ input_list=values_to_fetch, embedding_type="line_items"
213
+ )
214
+
215
+ # Merge API results back into original list
216
+ for index, desc in pending_line_items.items():
217
+ # Get result from API response, or None if API failed for that item
218
+ forto_code = api_results.get(desc)
219
+
220
+ # Update the original item
221
+ line_items[index]["itemCode"] = {
222
+ "documentValue": desc,
223
+ "formattedValue": forto_code, # Might be None if API failed
224
+ "page": line_items[index]["lineItemDescription"].get("page"),
225
+ }
179
226
 
180
- def compute_score(args):
181
- """Compute the fuzzy matching score between a new line item and a key."""
182
- new_lineitem, key = args
183
- return key, fuzz.ratio(new_lineitem, key)
227
+ # Add reverse charge here if exists
228
+ if reverse_charge:
229
+ [item.update({"reverseChargeSentence": reverse_charge}) for item in line_items]
230
+ return line_items
184
231
 
185
232
 
186
233
  def get_fuzzy_match_score(target: str, sentences: list, threshold: int):
@@ -195,16 +242,18 @@ def get_fuzzy_match_score(target: str, sentences: list, threshold: int):
195
242
  tuple: (best_match, score) if above threshold, else (None, 0)
196
243
  """
197
244
  # Use multiprocessing to find the best match
198
- with ThreadPoolExecutor() as executor:
199
- results = executor.map(compute_score, [(target, s) for s in sentences])
245
+ result = process.extractOne(
246
+ target, sentences, scorer=fuzz.WRatio, score_cutoff=threshold
247
+ )
200
248
 
201
- # Find the best match and score
202
- best_match, best_score = max(results, key=lambda x: x[1], default=(None, 0))
249
+ if result is None:
250
+ return None, False
203
251
 
204
- # return best_match, best_score
205
- # If the best match score is above a threshold (e.g., 80), return it
206
- if best_score >= threshold:
207
- return best_match, True
252
+ match, score, index = result
253
+
254
+ # return best_match if the best match score is above a threshold (e.g., 80)
255
+ if match:
256
+ return match, True
208
257
 
209
258
  return None, False
210
259
 
@@ -236,46 +285,59 @@ def find_matching_lineitem(new_lineitem: str, kvp_dict: dict, threshold=90):
236
285
  Returns:
237
286
  str: The best matching 'Forto SLI' value from the dictionary.
238
287
  """
239
- new_lineitem = new_lineitem.upper()
240
-
241
288
  # Check if the new line item is already in the dictionary
242
289
  if new_lineitem in kvp_dict:
243
290
  return kvp_dict[new_lineitem]
244
291
 
245
292
  # Get the best fuzzy match score for the extracted line item
246
- best_match, _ = get_fuzzy_match_score(
247
- new_lineitem, list(kvp_dict.keys()), threshold
293
+ match, _ = get_fuzzy_match_score(
294
+ new_lineitem,
295
+ list(kvp_dict.keys()),
296
+ threshold,
248
297
  )
249
298
 
250
- return kvp_dict.get(best_match, None)
299
+ if match:
300
+ # find the code from the kvp_dict
301
+ return kvp_dict[match]
251
302
 
303
+ return None
252
304
 
253
- def associate_forto_item_code(input_string, params):
254
- """
255
- Finds a match for the input string using fuzzy matching first, then embedding fallback.
256
-
257
- 1. Tries to find a fuzzy match for input_string against the keys in
258
- mapping_data using RapidFuzz, requiring a score >= fuzzy_threshold.
259
- 2. If found, returns the corresponding value from mapping_data.
260
- 3. If not found above threshold, calls the embedding_fallback function.
261
305
 
306
+ async def associate_forto_item_code(line_item_data, params):
307
+ """
308
+ Associates Forto item codes to a list of line item descriptions.
262
309
  Args:
263
- input_string: The string to find a match for.
264
- params: Parameters containing the lookup data and fuzzy threshold.
310
+ line_item_data (dict): A dictionary where keys are original descriptions and values are cleaned descriptions.
311
+ params (dict): Parameters containing lookup data and thresholds.
265
312
 
266
313
  Returns:
267
- The matched value (from fuzzy match or embedding), or None if no match found.
314
+ list: A list of dictionaries with 'description' and 'itemCode' keys.
268
315
  """
269
- # Get the Forto item code using fuzzy matching
270
- forto_item_code = find_matching_lineitem(
271
- new_lineitem=input_string,
272
- kvp_dict=params["lookup_data"]["item_code"], # TODO: Parse the KVP dictionary
273
- threshold=params["fuzzy_threshold_item_code"],
274
- )
275
316
 
276
- if forto_item_code is None:
277
- # 2. Fallback to embedding function if no good fuzzy match
278
- forto_item_code = get_tms_mappings(input_string, "line_items")
317
+ result = []
318
+ pending_line_items = {}
319
+ for desc, f_desc in line_item_data.items():
320
+ # Get the Forto item code using fuzzy matching
321
+ code = find_matching_lineitem(
322
+ new_lineitem=f_desc,
323
+ kvp_dict=params["lookup_data"]["item_code"],
324
+ threshold=params["fuzzy_threshold_item_code"],
325
+ )
326
+ if code:
327
+ result.append({"description": desc, "itemCode": code})
328
+ else:
329
+ pending_line_items[desc] = f_desc
330
+
331
+ # Batch API Call for Embedding lookups
332
+ if pending_line_items:
333
+ api_results = await get_tms_mappings(
334
+ input_list=list(pending_line_items.values()),
335
+ embedding_type="line_items",
336
+ )
337
+
338
+ # Merge API results back into original list
339
+ for desc, f_desc in pending_line_items.items():
340
+ code = api_results.get(f_desc)
341
+ result.append({"description": desc, "itemCode": code})
279
342
 
280
- result = {"documentValue": input_string, "formattedValue": forto_item_code}
281
343
  return result
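
Note: the fuzzywuzzy + ThreadPoolExecutor scoring loop is replaced by rapidfuzz's `process.extractOne`, which scores every candidate natively and applies the cutoff itself, returning `(match, score, index)` or `None`. A standalone example of that call; the strings and threshold are made up:

```python
from rapidfuzz import fuzz, process

choices = ["OCEAN FREIGHT", "TERMINAL HANDLING CHARGE", "CUSTOMS CLEARANCE"]

result = process.extractOne(
    "OCEAN FRIEGHT",              # misspelled query
    choices,
    scorer=fuzz.WRatio,
    score_cutoff=70,
)
if result is not None:
    match, score, index = result
    print(match, round(score, 1))  # likely "OCEAN FREIGHT" with a score above 70
else:
    print("no match above threshold")
```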
src/utils.py CHANGED
@@ -6,16 +6,16 @@ import json
6
6
  import os
7
7
  import pickle
8
8
  from datetime import datetime
9
- from typing import Literal
9
+ from typing import Any, Dict, List, Literal, Optional
10
10
 
11
+ import httpx
11
12
  import numpy as np
12
13
  import openpyxl
13
14
  import pandas as pd
14
- import requests
15
15
  from google.cloud import documentai_v1beta3 as docu_ai_beta
16
16
  from pypdf import PdfReader, PdfWriter
17
17
 
18
- from src.io import get_storage_client, logger
18
+ from src.io import bq_logs, get_storage_client, logger
19
19
 
20
20
 
21
21
  def get_pdf_page_count(pdf_bytes):
@@ -31,29 +31,6 @@ def get_pdf_page_count(pdf_bytes):
31
31
  return len(reader.pages)
32
32
 
33
33
 
34
- def bq_logs(data_to_insert, params):
35
- """Insert logs into Google BigQuery.
36
-
37
- Args:
38
- data_to_insert (list): The data to insert into BigQuery.
39
- params (dict): The parameters dictionary.
40
- """
41
- # Use the pre-initialized BigQuery client
42
- bq_client = params["bq_client"]
43
- # Get the table string
44
- table_string = f"{params['g_ai_project_name']}.{params['g_ai_gbq_db_schema']}.{params['g_ai_gbq_db_table_out']}"
45
-
46
- logger.info(f"Log table: {table_string}")
47
- # Insert the rows into the table
48
- insert_logs = bq_client.insert_rows_json(table_string, data_to_insert)
49
-
50
- # Check if there were any errors inserting the rows
51
- if not insert_logs:
52
- logger.info("New rows have been added.")
53
- else:
54
- logger.info("Errors occurred while inserting rows: ", insert_logs)
55
-
56
-
57
34
  async def get_data_set_schema_from_docai(
58
35
  schema_client, project_id=None, location=None, processor_id=None, name=None
59
36
  ):
@@ -383,9 +360,9 @@ def extract_top_pages(pdf_bytes, num_pages=4):
383
360
  return output.getvalue()
384
361
 
385
362
 
386
- def get_tms_mappings(
387
- input_list: list[str], embedding_type: str, llm_ports: list[str] = None
388
- ):
363
+ async def get_tms_mappings(
364
+ input_list: List[str], embedding_type: str, llm_ports: Optional[List[str]] = None
365
+ ) -> Dict[str, Any]:
389
366
  """Get TMS mappings for the given values.
390
367
 
391
368
  Args:
@@ -395,39 +372,66 @@ def get_tms_mappings(
395
372
  llm_ports (list[str], optional): List of LLM ports to use. Defaults to None.
396
373
 
397
374
  Returns:
398
- dict: A dictionary with the mapping results.
375
+ dict or string: A dictionary or a string with the mapping results.
399
376
  """
400
- # To test the API locally, port-forward the embedding service in the sandbox to 8080:80
401
- # If you want to launch uvicorn from the tms-embedding repo, then use --port 8080 in the config file
402
377
  base_url = (
403
378
  "http://0.0.0.0:8080/"
404
379
  if os.getenv("CLUSTER") is None
405
380
  else "http://tms-mappings.api.svc.cluster.local./"
406
381
  )
407
382
 
383
+ # Ensure clean inputs
384
+ if not input_list:
385
+ return {}
386
+
408
387
  # Ensure input_list is a list
409
388
  if not isinstance(input_list, list):
410
389
  input_list = [input_list]
411
390
 
412
391
  # Always send a dict with named keys
413
392
  payload = {embedding_type: input_list}
393
+
414
394
  if llm_ports:
415
395
  payload["llm_ports"] = llm_ports if isinstance(llm_ports, list) else [llm_ports]
416
396
 
417
397
  # Make the POST request to the TMS mappings API
418
- url = f"{base_url}/{embedding_type}"
419
- response = requests.post(url=url, json=payload)
398
+ url = f"{base_url}{embedding_type}"
420
399
 
421
- if response.status_code != 200:
422
- logger.error(
423
- f"Error from TMS mappings API: {response.status_code} - {response.text}"
424
- )
400
+ # Use a timeout so the code doesn't hang forever
401
+ timeout = httpx.Timeout(60.0, connect=10.0)
402
+
403
+ async with httpx.AsyncClient(timeout=timeout) as client:
404
+ try:
405
+ response = await client.post(url, json=payload)
406
+ response.raise_for_status()
425
407
 
426
- formatted_values = (
427
- response.json().get("response", {}).get("data", {}).get(input_list[0], None)
408
+ # Structure expected: {"response": {"data": {"desc1": "code1", "desc2": "code2"}}}
409
+ return response.json().get("response", {}).get("data", {})
410
+
411
+ except httpx.HTTPStatusError as exc:
412
+ logger.error(
413
+ f"Error from TMS mappings API: {exc.response.status_code} - {exc.response.text}"
414
+ )
415
+ return {}
416
+
417
+
418
+ async def batch_fetch_all_mappings(container_types, terminals, depots):
419
+ """Batch fetch all mappings for container types, terminals, and depots."""
420
+ # run batch calls concurrently
421
+ results = await asyncio.gather(
422
+ get_tms_mappings(list(container_types), "container_types"),
423
+ get_tms_mappings(list(terminals), "terminals"),
424
+ get_tms_mappings(list(depots), "depots"),
428
425
  )
429
426
 
430
- return formatted_values
427
+ batch_container_map, batch_terminal_map, batch_depot_map = results
428
+
429
+ # Convert lists of tuples to dicts if necessary
430
+ return (
431
+ dict(batch_container_map or {}),
432
+ dict(batch_terminal_map or {}),
433
+ dict(batch_depot_map or {}),
434
+ )
431
435
 
432
436
 
433
437
  def transform_schema_strings(schema):
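
Note: `get_tms_mappings` is now async and calls the mappings service through httpx with an explicit timeout, while `batch_fetch_all_mappings` runs three of those lookups concurrently. A bare-bones sketch of the same client pattern; the endpoint URL, payload keys, and sample values are placeholders:

```python
import asyncio
import httpx

async def post_mapping(endpoint: str, key: str, values: list) -> dict:
    timeout = httpx.Timeout(60.0, connect=10.0)  # same shape as in get_tms_mappings
    async with httpx.AsyncClient(timeout=timeout) as client:
        try:
            response = await client.post(endpoint, json={key: values})
            response.raise_for_status()
            return response.json().get("response", {}).get("data", {})
        except httpx.HTTPError as exc:           # covers status and transport errors
            print(f"mapping lookup failed: {exc}")
            return {}

async def main():
    # Three independent lookups run concurrently, as in batch_fetch_all_mappings.
    base = "http://localhost:8080"
    results = await asyncio.gather(
        post_mapping(f"{base}/container_types", "container_types", ["20DV"]),
        post_mapping(f"{base}/terminals", "terminals", ["CTA Hamburg"]),
        post_mapping(f"{base}/depots", "depots", ["Depot X"]),
    )
    print(results)

# asyncio.run(main())  # requires a reachable tms-mappings service
```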
@@ -502,3 +506,21 @@ def estimate_page_count(sheet):
502
506
  else:
503
507
  return None
504
508
  return np.ceil(pg_cnt / 500)
509
+
510
+
511
+ def split_pdf_into_chunks(file_content: bytes, chunk_size: int = 1):
512
+ """Split PDF into smaller page chunks."""
513
+ pdf = PdfReader(io.BytesIO(file_content))
514
+ total_pages = len(pdf.pages)
515
+
516
+ # TODO: update the chunk_size based on doc length. However, it breaks the page number extraction logic.
517
+ for i in range(0, total_pages, chunk_size):
518
+ writer = PdfWriter()
519
+ for j in range(i, min(i + chunk_size, total_pages)):
520
+ writer.add_page(pdf.pages[j])
521
+
522
+ buffer = io.BytesIO()
523
+ writer.write(buffer)
524
+ buffer.seek(0)
525
+
526
+ yield buffer.getvalue()
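
Note: a quick round-trip check of the new `split_pdf_into_chunks` generator, using pypdf to build a blank five-page PDF in memory. The local copy of the helper mirrors the version in this diff so the snippet runs without the package installed:

```python
import io
from pypdf import PdfReader, PdfWriter

# Build a blank 5-page PDF to stand in for a real document.
writer = PdfWriter()
for _ in range(5):
    writer.add_blank_page(width=595, height=842)  # roughly A4 in points
buffer = io.BytesIO()
writer.write(buffer)
pdf_bytes = buffer.getvalue()

# from src.utils import split_pdf_into_chunks  # inside the package
def split_pdf_into_chunks(file_content: bytes, chunk_size: int = 1):
    """Local copy for illustration; mirrors the helper added in this diff."""
    pdf = PdfReader(io.BytesIO(file_content))
    for i in range(0, len(pdf.pages), chunk_size):
        out = PdfWriter()
        for j in range(i, min(i + chunk_size, len(pdf.pages))):
            out.add_page(pdf.pages[j])
        chunk_buffer = io.BytesIO()
        out.write(chunk_buffer)
        yield chunk_buffer.getvalue()

chunks = list(split_pdf_into_chunks(pdf_bytes, chunk_size=2))
print(len(chunks), [len(PdfReader(io.BytesIO(c)).pages) for c in chunks])  # 3 chunks: 2, 2, 1 pages
```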