data-science-document-ai 1.42.5__py3-none-any.whl → 1.56.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_science_document_ai-1.42.5.dist-info → data_science_document_ai-1.56.1.dist-info}/METADATA +2 -2
- {data_science_document_ai-1.42.5.dist-info → data_science_document_ai-1.56.1.dist-info}/RECORD +34 -31
- src/constants.py +7 -10
- src/docai_processor_config.yaml +0 -56
- src/excel_processing.py +24 -14
- src/io.py +23 -0
- src/llm.py +0 -29
- src/pdf_processing.py +156 -51
- src/postprocessing/common.py +172 -28
- src/postprocessing/postprocess_partner_invoice.py +194 -59
- src/prompts/library/arrivalNotice/other/placeholders.json +70 -0
- src/prompts/library/arrivalNotice/other/prompt.txt +40 -0
- src/prompts/library/bundeskasse/other/placeholders.json +5 -5
- src/prompts/library/bundeskasse/other/prompt.txt +7 -5
- src/prompts/library/commercialInvoice/other/placeholders.json +125 -0
- src/prompts/library/commercialInvoice/other/prompt.txt +1 -1
- src/prompts/library/customsAssessment/other/placeholders.json +70 -0
- src/prompts/library/customsAssessment/other/prompt.txt +24 -37
- src/prompts/library/customsInvoice/other/prompt.txt +4 -3
- src/prompts/library/deliveryOrder/other/placeholders.json +80 -27
- src/prompts/library/deliveryOrder/other/prompt.txt +26 -40
- src/prompts/library/draftMbl/other/placeholders.json +33 -33
- src/prompts/library/draftMbl/other/prompt.txt +34 -44
- src/prompts/library/finalMbL/other/placeholders.json +80 -0
- src/prompts/library/finalMbL/other/prompt.txt +34 -44
- src/prompts/library/packingList/other/placeholders.json +98 -0
- src/prompts/library/partnerInvoice/other/prompt.txt +8 -7
- src/prompts/library/preprocessing/carrier/placeholders.json +0 -16
- src/prompts/library/shippingInstruction/other/placeholders.json +115 -0
- src/prompts/library/shippingInstruction/other/prompt.txt +26 -14
- src/prompts/prompt_library.py +0 -4
- src/setup.py +15 -16
- src/utils.py +120 -68
- src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -45
- src/prompts/library/draftMbl/maersk/prompt.txt +0 -19
- src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -44
- src/prompts/library/finalMbL/maersk/prompt.txt +0 -19
- {data_science_document_ai-1.42.5.dist-info → data_science_document_ai-1.56.1.dist-info}/WHEEL +0 -0
src/pdf_processing.py
CHANGED
```diff
@@ -9,6 +9,7 @@ logger = logging.getLogger(__name__)
 import asyncio
 from collections import defaultdict
 
+from ddtrace import tracer
 from fastapi import HTTPException
 from google.cloud.documentai_v1 import Document as docaiv1_document
 
@@ -31,9 +32,10 @@ from src.postprocessing.postprocess_partner_invoice import (
 from src.prompts.prompt_library import prompt_library
 from src.utils import (
     extract_top_pages,
-
+    get_pdf_page_count,
     get_processor_name,
     run_background_tasks,
+    split_pdf_into_chunks,
     transform_schema_strings,
     validate_based_on_schema,
 )
```
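The chunking feature in this release rests on the two new `src.utils` helpers imported above. Their implementations are not part of this diff; a minimal sketch of what they could look like, assuming the PDFs are manipulated with pypdf (the helper names match the imports, the bodies are assumptions):

```python
# Hypothetical implementations of the two helpers imported above; the real
# code lives in src/utils and is not shown in this diff.
from io import BytesIO

from pypdf import PdfReader, PdfWriter


def get_pdf_page_count(file_content: bytes) -> int:
    """Count the pages of an in-memory PDF without rendering it."""
    return len(PdfReader(BytesIO(file_content)).pages)


def split_pdf_into_chunks(file_content: bytes, chunk_size: int) -> list[bytes]:
    """Slice an in-memory PDF into consecutive chunks of at most chunk_size pages."""
    reader = PdfReader(BytesIO(file_content))
    chunks = []
    for start in range(0, len(reader.pages), chunk_size):
        writer = PdfWriter()
        for page in reader.pages[start : start + chunk_size]:
            writer.add_page(page)
        buffer = BytesIO()
        writer.write(buffer)
        chunks.append(buffer.getvalue())
    return chunks
```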
```diff
@@ -193,38 +195,32 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
         result (dict): The structured data extracted from the document, formatted as JSON.
     """
     # Bundeskasse invoices contains all the required information in the first 3 pages.
-    file_content = (
-        extract_top_pages(file_content, num_pages=5)
-        if input_doc_type == "bundeskasse"
-        else file_content
-    )
+    if input_doc_type == "bundeskasse":
+        file_content = extract_top_pages(file_content, num_pages=5)
 
-
-
+    number_of_pages = get_pdf_page_count(file_content)
+    logger.info(f"processing {input_doc_type} with {number_of_pages} pages...")
 
-    # get the schema placeholder
-    response_schema = (
-        prompt_library.library[input_doc_type]["other"]["placeholders"]
-        if input_doc_type in ["partnerInvoice", "customsInvoice", "bundeskasse"]
-        else generate_schema_structure(params, input_doc_type)
-    )
+    # get the schema placeholder
+    response_schema = prompt_library.library[input_doc_type]["other"]["placeholders"]
 
     carrier = "other"
-
-
-
-
-
-    )
-    carrier_schema = prompt_library.library["preprocessing"]["carrier"][
-        "placeholders"
-    ][input_doc_type]
+    carrier_schema = (
+        prompt_library.library.get("preprocessing", {})
+        .get("carrier", {})
+        .get("placeholders", {})
+        .get(input_doc_type)
+    )
 
+    if carrier_schema:
         carrier_prompt = prompt_library.library["preprocessing"]["carrier"]["prompt"]
         carrier_prompt = carrier_prompt.replace(
             "DOCUMENT_TYPE_PLACEHOLDER", input_doc_type
         )
 
+        # convert file_content to required document
+        document = llm_client.prepare_document_for_gemini(file_content)
+
         # identify carrier for customized prompting
         carrier = await identify_carrier(
             document,
@@ -234,30 +230,119 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
             doc_type=input_doc_type,
         )
 
-
-    response_schema = prompt_library.library[input_doc_type][carrier][
-        "placeholders"
-    ]
-
+    # Select prompt
     if (
-        input_doc_type in prompt_library.library
-
+        input_doc_type not in prompt_library.library
+        or carrier not in prompt_library.library[input_doc_type]
     ):
-
-
-
-
-
-
-
-
-
+        return {}
+
+    # get the related prompt from predefined prompt library
+    prompt = prompt_library.library[input_doc_type][carrier]["prompt"]
+
+    # Add page-number extraction for moderately large docs
+    use_chunking = number_of_pages >= params["chunk_after"]
+
+    # Update schema and prompt to extract value-page_number pairs
+    if not use_chunking and number_of_pages > 1:
+        response_schema = transform_schema_strings(response_schema)
+        prompt += "\nFor each field, provide the page number where the information was found. The page numbering starts from 0."
+
+    tasks = []
+    # Process in chunks if number of pages exceeds threshold and Process all chunks concurrently
+    for chunk in (
+        split_pdf_into_chunks(file_content, chunk_size=params["chunk_size"])
+        if use_chunking
+        else [file_content]
+    ):
+        tasks.append(
+            process_chunk_with_retry(
+                chunk,
+                prompt,
+                response_schema,
+                llm_client,
+                input_doc_type,
+            )
         )
 
-
+    results = await asyncio.gather(*tasks, return_exceptions=True)
 
-
-
+    if use_chunking:
+        return merge_llm_results(results, response_schema)
+    else:
+        return llm_prediction_to_tuples(results[0], number_of_pages=number_of_pages)
+
+
+async def process_chunk_with_retry(
+    chunk_content, prompt, response_schema, llm_client, input_doc_type, retries=2
+):
+    """Process a chunk with retries in case of failure."""
+    for attempt in range(1, retries + 1):
+        try:
+            return await process_chunk(
+                chunk_content=chunk_content,
+                prompt=prompt,
+                response_schema=response_schema,
+                llm_client=llm_client,
+                input_doc_type=input_doc_type,
+            )
+        except Exception as e:
+            logger.error(f"Chunk failed on attempt {attempt}: {e}")
+            if attempt == retries:
+                raise
+            await asyncio.sleep(1)  # small backoff
+
+
+async def process_chunk(
+    chunk_content, prompt, response_schema, llm_client, input_doc_type
+):
+    """Process a chunk with Gemini."""
+    document = llm_client.prepare_document_for_gemini(chunk_content)
+    return await llm_client.get_unified_json_genai(
+        prompt=prompt,
+        document=document,
+        response_schema=response_schema,
+        doc_type=input_doc_type,
+    )
+
+
+def merge_llm_results(results, response_schema):
+    """Merge LLM results from multiple chunks."""
+    merged = {}
+    for i, result in enumerate(results):
+        if not isinstance(result, dict):
+            continue
+        # Add page number to all values coming from this chunk
+        result = llm_prediction_to_tuples(result, number_of_pages=1, page_number=i)
+
+        # Merge the result into the final merged dictionary
+        for key, value in result.items():
+            field_type = (
+                response_schema["properties"].get(key, {}).get("type", "").upper()
+            )
+
+            if key not in merged:
+                if field_type == "ARRAY":
+                    # append the values as a list
+                    merged[key] = (
+                        value if isinstance(value, list) else ([value] if value else [])
+                    )
+                else:
+                    merged[key] = value
+                continue
+
+            if field_type == "ARRAY":
+                # append list contents across chunks
+                if isinstance(value, list):
+                    merged[key].extend(value)
+                else:
+                    merged[key].append(value)
+
+            # take first non-null value only
+            if merged[key] in (None, "", [], {}):
+                merged[key] = value
+
    return merged
 
 
 async def extract_data_from_pdf_w_llm(params, input_doc_type, file_content, llm_client):
```
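A toy walk-through of the merge semantics above (the schema and field names are invented for illustration): each chunk result is first converted so every leaf carries its chunk index as the page number, then array fields are concatenated across chunks while scalar fields keep the first non-empty value.

```python
# Illustrative only: the schema and field names below are made up.
from src.pdf_processing import merge_llm_results

schema = {
    "properties": {
        "containerNumbers": {"type": "array"},
        "invoiceNumber": {"type": "string"},
    }
}
chunk_results = [
    {"containerNumbers": ["MSKU1234567"], "invoiceNumber": None},      # chunk 0
    {"containerNumbers": ["TCLU7654321"], "invoiceNumber": "INV-42"},  # chunk 1
]
merged = merge_llm_results(chunk_results, schema)
# merged["containerNumbers"] == [("MSKU1234567", 0), ("TCLU7654321", 1)]
# merged["invoiceNumber"]    == ("INV-42", 1)   # first non-empty value wins
```

Note that `asyncio.gather(..., return_exceptions=True)` delivers a failed chunk as an exception object; the `isinstance(result, dict)` guard skips it, so a partially failed document still yields a merged result.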
```diff
@@ -334,15 +419,9 @@ async def extract_data_by_doctype(
     processor_client,
     if_use_docai,
     if_use_llm,
+    llm_client,
     isBetaTest=False,
 ):
-    # Select LLM client (Using 2.5 Flash model for Bundeskasse)
-    llm_client = (
-        params["LlmClient_Flash"]
-        if input_doc_type == "bundeskasse"
-        else params["LlmClient"]
-    )
-
     async def extract_w_docai():
         return await extract_data_from_pdf_w_docai(
             params=params,
@@ -391,6 +470,7 @@ async def data_extraction_manual_flow(
     meta,
     processor_client,
     schema_client,
+    use_default_logging=False,
 ):
     """
     Process a PDF file and extract data from it.
@@ -411,6 +491,15 @@
     """
     # Get the start time for processing
     start_time = asyncio.get_event_loop().time()
+
+    # Select LLM client (Using 2.5 Pro model only for PI and customsInvoice)
+    llm_client = (
+        params["LlmClient_Flash"]
+        if meta.documentTypeCode not in ["customsInvoice", "partnerInvoice"]
+        else params["LlmClient"]
+    )
+
+    page_count = None
     # Validate the file type
     if mime_type == "application/pdf":
         # Enable Doc Ai only for certain document types.
@@ -432,8 +521,10 @@
             processor_client,
             if_use_docai=if_use_docai,
             if_use_llm=if_use_llm,
+            llm_client=llm_client,
             isBetaTest=False,
         )
+        page_count = get_pdf_page_count(file_content)
 
     elif "excel" in mime_type or "spreadsheet" in mime_type:
         # Extract data from the Excel file
@@ -442,8 +533,19 @@
             input_doc_type=meta.documentTypeCode,
             file_content=file_content,
             mime_type=mime_type,
+            llm_client=llm_client,
         )
 
+        # Get sheet count from dd-trace span (set in extract_data_from_excel)
+        # Note: we use the span metric instead of len(extracted_data) because
+        # some sheets may fail extraction and not appear in extracted_data
+        span = tracer.current_span()
+        page_count = span.get_metric("est_page_count") if span else len(extracted_data)
+        if page_count > 100:
+            logger.warning(
+                f"Check logic. Count of sheets in excel file is weirdly large: {page_count}"
+            )
+
     else:
         raise HTTPException(
             status_code=400,
```
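For spreadsheets, `page_count` is read back from the active dd-trace span. The counterpart write presumably happens inside `extract_data_from_excel` in `src/excel_processing.py` (not shown in this diff); it would look roughly like this:

```python
# Hypothetical counterpart (assumed, not shown in this diff): record the sheet
# count on the active ddtrace span so the caller can read it back with
# span.get_metric("est_page_count").
from ddtrace import tracer


def record_estimated_page_count(sheet_count: int) -> None:
    span = tracer.current_span()
    if span:
        span.set_metric("est_page_count", sheet_count)
```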
```diff
@@ -451,7 +553,7 @@
         )
     # Create the result dictionary with the extracted data
     extracted_data = await format_all_entities(
-        extracted_data, meta.documentTypeCode, params
+        extracted_data, meta.documentTypeCode, params, mime_type
     )
     result = {
         "id": meta.id,
@@ -466,7 +568,9 @@
     logger.info(f"Time taken to process the document: {round(elapsed_time, 4)} seconds")
 
     # Schedule background tasks without using FastAPI's BackgroundTasks
-    if os.getenv("CLUSTER") != "ode":  # skip data export to bigquery in ODE environment
+    if (
+        os.getenv("CLUSTER") != "ode"
+    ) & use_default_logging:  # skip data export to bigquery in ODE environment
         asyncio.create_task(
             run_background_tasks(
                 params,
@@ -477,6 +581,7 @@
                 processor_version,
                 mime_type,
                 elapsed_time,
+                page_count,
             )
         )
     return result
```
src/postprocessing/common.py
CHANGED
```diff
@@ -12,7 +12,7 @@ from src.constants import formatting_rules
 from src.io import logger
 from src.postprocessing.postprocess_partner_invoice import process_partner_invoice
 from src.prompts.prompt_library import prompt_library
-from src.utils import get_tms_mappings
+from src.utils import batch_fetch_all_mappings, get_tms_mappings
 
 tms_domain = os.environ["TMS_DOMAIN"]
 
@@ -134,8 +134,11 @@ def extract_number(data_field_value):
         formatted_value: string
 
     """
+    # Remove container size pattern like 20FT, 40HC, etc from 1 x 40HC
+    value = remove_unwanted_patterns(data_field_value)
+
     formatted_value = ""
-    for c in data_field_value:
+    for c in value:
         if c.isnumeric() or c in [",", ".", "-"]:
             formatted_value += c
 
```
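Effect of the new pre-cleaning step on a typical container quantity, assuming `extract_number` returns the collected digit string as before:

```python
from src.postprocessing.common import extract_number

# Previously the digits of the container size leaked into the number:
# "1 x 40HC" -> "140". With remove_unwanted_patterns applied first, the
# "40HC" token is stripped and only the real quantity survives.
extract_number("1 x 40HC")  # -> "1"
```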
```diff
@@ -319,6 +322,14 @@ def remove_unwanted_patterns(lineitem: str):
     # Remove "HIGH CUBE"
     lineitem = lineitem.replace("HIGH CUBE", "")
 
+    # Remove container size e.g., 20FT, 40HC, etc.
+    pattern = [
+        f"{s}{t}"
+        for s in ("20|22|40|45".split("|"))
+        for t in ("FT|HC|DC|HD|GP|OT|RF|FR|TK|DV".split("|"))
+    ]
+    lineitem = re.sub(r"|".join(pattern), "", lineitem, flags=re.IGNORECASE).strip()
+
     return lineitem
 
 
```
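The comprehension expands to forty alternatives (`20FT`, `20HC`, …, `45DV`). A quick demonstration of the resulting pattern:

```python
import re

# Same construction as in the diff above.
pattern = [
    f"{s}{t}"
    for s in "20|22|40|45".split("|")
    for t in "FT|HC|DC|HD|GP|OT|RF|FR|TK|DV".split("|")
]
regex = r"|".join(pattern)  # "20FT|20HC|...|45DV"

print(re.sub(regex, "", "1 x 40HC HIGH CUBE", flags=re.IGNORECASE).strip())
# -> "1 x  HIGH CUBE"  (the size token is gone; other steps clean up the rest)
```

The alternation is unanchored, so it would also match these tokens inside longer words; in practice the inputs are short line-item strings where that is unlikely to matter.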
```diff
@@ -349,42 +360,75 @@ def clean_item_description(lineitem: str, remove_numbers: bool = True):
     # Remove the currency codes
     lineitem = re.sub(currency_codes_pattern, "", lineitem, flags=re.IGNORECASE)
 
+    # remove other patterns
+    lineitem = remove_unwanted_patterns(lineitem)
+
     # Remove numbers from the line item
     if (
         remove_numbers
     ):  # Do not remove numbers for the reverse charge sentence as it contains Article number
         lineitem = re.sub(r"\d+", "", lineitem)
 
-    # remove other patterns
-    lineitem = remove_unwanted_patterns(lineitem)
-
     # remove special chars
     lineitem = re.sub(r"[^A-Za-z0-9\s]", " ", lineitem).strip()
 
+    # Remove x from lineitem like 10 x
+    lineitem = re.sub(r"\b[xX]\b", " ", lineitem).strip()
+
     return re.sub(r"\s{2,}", " ", lineitem).strip()
 
 
-async def format_label(entity_k, entity_value, document_type_code, params):
+async def format_label(
+    entity_k,
+    entity_value,
+    document_type_code,
+    params,
+    mime_type,
+    container_map,
+    terminal_map,
+    depot_map,
+):
     llm_client = params["LlmClient"]
     if isinstance(entity_value, dict):  # if it's a nested entity
         format_tasks = [
-            format_label(sub_k, sub_v, document_type_code, params)
+            format_label(
+                sub_k,
+                sub_v,
+                document_type_code,
+                params,
+                mime_type,
+                container_map,
+                terminal_map,
+                depot_map,
+            )
             for sub_k, sub_v in entity_value.items()
         ]
         return entity_k, {k: v for k, v in await asyncio.gather(*format_tasks)}
     if isinstance(entity_value, list):
         format_tasks = await asyncio.gather(
             *[
-                format_label(entity_k, sub_v, document_type_code, params)
+                format_label(
+                    entity_k,
+                    sub_v,
+                    document_type_code,
+                    params,
+                    mime_type,
+                    container_map,
+                    terminal_map,
+                    depot_map,
+                )
                 for sub_v in entity_value
             ]
         )
         return entity_k, [v for _, v in format_tasks]
-    if isinstance(entity_value, tuple):
-        page = entity_value[1]
-        entity_value = entity_value[0]
-    else:
-        page = -1
+
+    if mime_type == "application/pdf":
+        if isinstance(entity_value, tuple):
+            page = entity_value[1]
+            entity_value = entity_value[0]
+        else:
+            page = -1
+
     entity_key = entity_k.lower()
     formatted_value = None
 
@@ -394,13 +438,13 @@ async def format_label(entity_k, entity_value, document_type_code, params):
     )
 
     elif (entity_key == "containertype") or (entity_key == "containersize"):
-        formatted_value =
+        formatted_value = container_map.get(entity_value)
 
     elif check_formatting_rule(entity_k, document_type_code, "terminal"):
-        formatted_value =
+        formatted_value = terminal_map.get(entity_value)
 
     elif check_formatting_rule(entity_k, document_type_code, "depot"):
-        formatted_value =
+        formatted_value = depot_map.get(entity_value)
 
     elif entity_key.startswith(("eta", "etd", "duedate", "issuedate", "servicedate")):
         try:
@@ -421,7 +465,10 @@ async def format_label(entity_k, entity_value, document_type_code, params):
         except ValueError as e:
             logger.info(f"ParserError: {e}")
 
-    elif
+    elif (
+        entity_key in ["invoicenumber", "creditnoteinvoicenumber"]
+        and document_type_code == "bundeskasse"
+    ):
         formatted_value = clean_invoice_number(entity_value)
 
     elif entity_key in ("shipmentid", "partnerreference"):
@@ -482,8 +529,10 @@ async def format_label(entity_k, entity_value, document_type_code, params):
     result = {
         "documentValue": entity_value,
         "formattedValue": formatted_value,
-        "page": page,
     }
+    if mime_type == "application/pdf":
+        result["page"] = page
+
     return entity_k, result
 
 
@@ -491,7 +540,8 @@ async def get_port_code_ai(port: str, llm_client, doc_type=None):
     """Get port code using AI model."""
     port_llm = await get_port_code_llm(port, llm_client, doc_type=doc_type)
 
-
+    result = await get_tms_mappings(port, "ports", port_llm)
+    return result.get(port, None)
 
 
 async def get_port_code_llm(port: str, llm_client, doc_type=None):
@@ -582,7 +632,75 @@ def decimal_convertor(value, quantity=False):
         return value
 
 
-async def format_all_entities(result, document_type_code, params):
+async def collect_mapping_requests(entity_value, document_type_code):
+    """Collect all unique container types, terminals, and depots from the entity value."""
+    # Sets to store unique values
+    container_types = set()
+    terminals = set()
+    depots = set()
+
+    def walk(key, value):
+        key_lower = key.lower()
+
+        # nested dict
+        if isinstance(value, dict):
+            for k, v in value.items():
+                walk(k, v)
+
+        # list of values
+        elif isinstance(value, list):
+            for item in value:
+                walk(key, item)
+
+        # leaf node
+        else:
+            if key_lower in ("containertype", "containersize"):
+                # Take only "20DV" from ('20DV', 0) if it's a tuple
+                container_types.add(value[0]) if isinstance(
+                    value, tuple
+                ) else container_types.add(value)
+
+            elif check_formatting_rule(key, document_type_code, "terminal"):
+                terminals.add(value[0]) if isinstance(value, tuple) else terminals.add(
+                    value
+                )
+
+            elif check_formatting_rule(key, document_type_code, "depot"):
+                depots.add(value[0]) if isinstance(value, tuple) else depots.add(value)
+
+    walk("root", entity_value)
+
+    return container_types, terminals, depots
+
+
+async def format_all_labels(entity_data, document_type_code, params, mime_type):
+    """Format all labels in the entity data using cached mappings."""
+    # Collect all mapping values needed
+    container_req, terminal_req, depot_req = await collect_mapping_requests(
+        entity_data, document_type_code
+    )
+
+    # Batch fetch mappings
+    container_map, terminal_map, depot_map = await batch_fetch_all_mappings(
+        container_req, terminal_req, depot_req
+    )
+
+    # Format labels using cached mappings
+    _, result = await format_label(
+        "root",
+        entity_data,
+        document_type_code,
+        params,
+        mime_type,
+        container_map,
+        terminal_map,
+        depot_map,
+    )
+
+    return _, result
+
+
+async def format_all_entities(result, document_type_code, params, mime_type):
     """Format the entity values in the result dictionary."""
     # Since we treat `customsInvoice` same as `partnerInvoice`
     document_type_code = (
```
|
|
|
597
715
|
return {}
|
|
598
716
|
|
|
599
717
|
# Format all entities recursively
|
|
600
|
-
_, aggregated_data = await
|
|
718
|
+
_, aggregated_data = await format_all_labels(
|
|
719
|
+
result, document_type_code, params, mime_type
|
|
720
|
+
)
|
|
601
721
|
|
|
602
722
|
# Process partner invoice on lineitem mapping and reverse charge sentence
|
|
603
723
|
if document_type_code in ["partnerInvoice", "bundeskasse"]:
|
|
604
|
-
process_partner_invoice(params, aggregated_data, document_type_code)
|
|
724
|
+
await process_partner_invoice(params, aggregated_data, document_type_code)
|
|
605
725
|
|
|
606
726
|
logger.info("Data Extraction completed successfully")
|
|
607
727
|
return aggregated_data
|
|
@@ -633,22 +753,46 @@ def remove_stop_words(lineitem: str):
|
|
|
633
753
|
)
|
|
634
754
|
|
|
635
755
|
|
|
636
|
-
def llm_prediction_to_tuples(llm_prediction):
|
|
756
|
+
def llm_prediction_to_tuples(llm_prediction, number_of_pages=-1, page_number=None):
|
|
637
757
|
"""Convert LLM prediction dictionary to tuples of (value, page_number)."""
|
|
758
|
+
# If only 1 page, simply pair each value with page number 0
|
|
759
|
+
if number_of_pages == 1:
|
|
760
|
+
effective_page = 0 if page_number is None else page_number
|
|
761
|
+
if isinstance(llm_prediction, dict):
|
|
762
|
+
return {
|
|
763
|
+
k: llm_prediction_to_tuples(
|
|
764
|
+
v, number_of_pages, page_number=effective_page
|
|
765
|
+
)
|
|
766
|
+
for k, v in llm_prediction.items()
|
|
767
|
+
}
|
|
768
|
+
elif isinstance(llm_prediction, list):
|
|
769
|
+
return [
|
|
770
|
+
llm_prediction_to_tuples(v, number_of_pages, page_number=effective_page)
|
|
771
|
+
for v in llm_prediction
|
|
772
|
+
]
|
|
773
|
+
else:
|
|
774
|
+
return (llm_prediction, effective_page) if llm_prediction else None
|
|
775
|
+
|
|
776
|
+
# logic for multi-page predictions
|
|
638
777
|
if isinstance(llm_prediction, dict):
|
|
639
778
|
if "page_number" in llm_prediction.keys() and "value" in llm_prediction.keys():
|
|
640
779
|
if llm_prediction["value"]:
|
|
641
780
|
try:
|
|
642
|
-
|
|
781
|
+
_page_number = int(llm_prediction["page_number"])
|
|
643
782
|
except: # noqa: E722
|
|
644
|
-
|
|
645
|
-
return (llm_prediction["value"],
|
|
783
|
+
_page_number = -1
|
|
784
|
+
return (llm_prediction["value"], _page_number)
|
|
646
785
|
return None
|
|
786
|
+
|
|
647
787
|
for key, value in llm_prediction.items():
|
|
648
788
|
llm_prediction[key] = llm_prediction_to_tuples(
|
|
649
|
-
llm_prediction.get(key, value)
|
|
789
|
+
llm_prediction.get(key, value), number_of_pages, page_number
|
|
650
790
|
)
|
|
791
|
+
|
|
651
792
|
elif isinstance(llm_prediction, list):
|
|
652
793
|
for i, item in enumerate(llm_prediction):
|
|
653
|
-
llm_prediction[i] = llm_prediction_to_tuples(
|
|
794
|
+
llm_prediction[i] = llm_prediction_to_tuples(
|
|
795
|
+
item, number_of_pages, page_number
|
|
796
|
+
)
|
|
797
|
+
|
|
654
798
|
return llm_prediction
|