data-science-document-ai 1.13.0__py3-none-any.whl → 1.56.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_science_document_ai-1.13.0.dist-info → data_science_document_ai-1.56.1.dist-info}/METADATA +7 -2
- data_science_document_ai-1.56.1.dist-info/RECORD +60 -0
- {data_science_document_ai-1.13.0.dist-info → data_science_document_ai-1.56.1.dist-info}/WHEEL +1 -1
- src/constants.py +42 -12
- src/constants_sandbox.py +2 -22
- src/docai.py +18 -7
- src/docai_processor_config.yaml +0 -64
- src/excel_processing.py +34 -15
- src/io.py +74 -6
- src/llm.py +12 -34
- src/pdf_processing.py +228 -78
- src/postprocessing/common.py +495 -618
- src/postprocessing/postprocess_partner_invoice.py +383 -27
- src/prompts/library/arrivalNotice/other/placeholders.json +70 -0
- src/prompts/library/arrivalNotice/other/prompt.txt +40 -0
- src/prompts/library/bookingConfirmation/evergreen/placeholders.json +17 -17
- src/prompts/library/bookingConfirmation/evergreen/prompt.txt +1 -0
- src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +18 -18
- src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +1 -1
- src/prompts/library/bookingConfirmation/maersk/placeholders.json +17 -17
- src/prompts/library/bookingConfirmation/maersk/prompt.txt +1 -1
- src/prompts/library/bookingConfirmation/msc/placeholders.json +17 -17
- src/prompts/library/bookingConfirmation/msc/prompt.txt +1 -1
- src/prompts/library/bookingConfirmation/oocl/placeholders.json +17 -17
- src/prompts/library/bookingConfirmation/oocl/prompt.txt +3 -1
- src/prompts/library/bookingConfirmation/other/placeholders.json +17 -17
- src/prompts/library/bookingConfirmation/other/prompt.txt +1 -1
- src/prompts/library/bookingConfirmation/yangming/placeholders.json +17 -17
- src/prompts/library/bookingConfirmation/yangming/prompt.txt +1 -1
- src/prompts/library/bundeskasse/other/placeholders.json +113 -0
- src/prompts/library/bundeskasse/other/prompt.txt +48 -0
- src/prompts/library/commercialInvoice/other/placeholders.json +125 -0
- src/prompts/library/commercialInvoice/other/prompt.txt +2 -1
- src/prompts/library/customsAssessment/other/placeholders.json +67 -16
- src/prompts/library/customsAssessment/other/prompt.txt +24 -37
- src/prompts/library/customsInvoice/other/placeholders.json +205 -0
- src/prompts/library/customsInvoice/other/prompt.txt +105 -0
- src/prompts/library/deliveryOrder/other/placeholders.json +79 -28
- src/prompts/library/deliveryOrder/other/prompt.txt +26 -40
- src/prompts/library/draftMbl/other/placeholders.json +33 -33
- src/prompts/library/draftMbl/other/prompt.txt +34 -44
- src/prompts/library/finalMbL/other/placeholders.json +34 -34
- src/prompts/library/finalMbL/other/prompt.txt +34 -44
- src/prompts/library/packingList/other/placeholders.json +98 -0
- src/prompts/library/packingList/other/prompt.txt +1 -1
- src/prompts/library/partnerInvoice/other/placeholders.json +165 -45
- src/prompts/library/partnerInvoice/other/prompt.txt +82 -44
- src/prompts/library/preprocessing/carrier/placeholders.json +0 -16
- src/prompts/library/shippingInstruction/other/placeholders.json +115 -0
- src/prompts/library/shippingInstruction/other/prompt.txt +28 -15
- src/setup.py +73 -63
- src/utils.py +207 -30
- data_science_document_ai-1.13.0.dist-info/RECORD +0 -55
- src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -44
- src/prompts/library/draftMbl/maersk/prompt.txt +0 -17
- src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -44
- src/prompts/library/finalMbL/maersk/prompt.txt +0 -17
src/llm.py
CHANGED
```diff
@@ -15,6 +15,7 @@ from vertexai.generative_models import (
     Part,
 )
 
+from src.io import get_gcp_labels
 from src.utils import cache_on_disk
 
 
@@ -28,12 +29,12 @@ class LlmClient:
         # Initialize the model parameters
         self.model_params = {
             "temperature": parameters.get("temperature", 0),
-            "max_output_tokens": parameters.get("maxOutputTokens",
+            "max_output_tokens": parameters.get("maxOutputTokens", 65536),
             "top_p": parameters.get("top_p", 0.8),
             "top_k": parameters.get("top_k", 40),
             "seed": parameters.get("seed", 42),
         }
-        self.model_id = parameters.get("model_id", "gemini-
+        self.model_id = parameters.get("model_id", "gemini-2.5-flash")
         # Initialize the safety configuration
         self.safety_config = {
             HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
@@ -69,6 +70,7 @@ class LlmClient:
         document: str = None,
         response_schema: dict = None,
         response_mime_type: str = "application/json",
+        doc_type: str = None,
     ):
         """Ask the Gemini model a question.
 
@@ -76,6 +78,7 @@ class LlmClient:
             prompt (str): The prompt to send to the model.
             document (str, optional): An optional document to provide context.
             response_schema (dict, optional): Defines a specific response schema for the model.
+            doc_type (str, optional): Document type for cost tracking labels.
 
         Returns:
             str: The response from the model.
@@ -96,12 +99,13 @@ class LlmClient:
         # Prepare inputs for the model
         inputs = [document, prompt] if document else prompt
 
-        # Generate the response
+        # Generate the response with labels for cost tracking
         model_response = await cache_on_disk(
             self.geminy_client.generate_content_async,
             contents=inputs,
             generation_config=config,
             safety_settings=self.safety_config,
+            labels=get_gcp_labels(doc_type=doc_type),
         )
 
         response_text = model_response.text
```
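The new `labels=get_gcp_labels(doc_type=doc_type)` argument attaches user labels to every Gemini request so spend can be attributed per document type in GCP billing exports. The helper lives in `src/io.py` and its body is not part of this diff; a minimal hypothetical sketch, assuming it only builds a label dict that satisfies GCP's label rules (lowercase letters, digits, `-` and `_`, at most 63 characters):

```python
import re


def get_gcp_labels(doc_type: str = None) -> dict:
    """Hypothetical sketch of the src/io.py helper (actual body not in this diff)."""

    def _normalize(value: str) -> str:
        # GCP label values: lowercase letters, digits, '-' and '_', max 63 chars
        return re.sub(r"[^a-z0-9_-]", "-", value.lower())[:63]

    labels = {"service": "data-science-document-ai"}  # assumed base label
    if doc_type:
        labels["doc_type"] = _normalize(doc_type)
    return labels
```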
```diff
@@ -113,7 +117,7 @@ class LlmClient:
             return "{}"
 
     async def get_unified_json_genai(
-        self, prompt, document=None, response_schema=None, model="gemini"
+        self, prompt, document=None, response_schema=None, model="gemini", doc_type=None
     ):
         """Send a prompt to a Google Cloud AI Platform model and returns the generated json.
 
@@ -122,6 +126,7 @@ class LlmClient:
             document: Content of the PDF document
             response_schema: The schema to use for the response
             model (str): The model to use for the response ["gemini" or "chatGPT"]. Default is "gemini".
+            doc_type (str, optional): Document type for cost tracking labels.
 
         Returns:
             dict: The generated json from the model.
@@ -131,7 +136,9 @@ class LlmClient:
             response = await self.ask_chatgpt(prompt, document, response_schema)
         else:
             # Default to Gemini
-            response = await self.ask_gemini(
+            response = await self.ask_gemini(
+                prompt, document, response_schema, doc_type=doc_type
+            )
 
         try:
             return json.loads(response)
```
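With these changes, `doc_type` threads from the public JSON entry point down to the Gemini call. A usage sketch (the `document` argument is assumed to be whatever `prepare_document_for_gemini` returns, e.g. a `Part` built from PDF bytes; the field name is illustrative):

```python
async def extract_invoice_total(llm_client, document):
    """Illustrative caller of get_unified_json_genai with cost-tracking labels."""
    response_schema = {
        "type": "OBJECT",
        "properties": {"totalAmount": {"type": "STRING"}},  # hypothetical field
    }
    return await llm_client.get_unified_json_genai(
        prompt="Extract the invoice total amount.",
        document=document,
        response_schema=response_schema,
        doc_type="partnerInvoice",  # ends up in the request's billing labels
    )
```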
```diff
@@ -194,33 +201,4 @@ class LlmClient:
         return response
 
 
-def prompt_excel_extraction(excel_structured_text):
-    """Write a prompt to extract data from Excel files.
-
-    Args:
-        excel_structured_text (str): The structured text of the Excel file.
-
-    Returns:
-        prompt str: The prompt for common json.
-    """
-    prompt = f"""{excel_structured_text}
-
-    Task: Fill in the following dictionary from the information in the given in the above excel data.
-
-    Instructions:
-    - Do not change the keys of the following dictionary.
-    - The values should be filled in as per the schema provided below.
-    - If an entity contains a 'display_name', consider its properties as child data points in the below format.
-    {{'data-field': {{
-        'child-data-field': 'type -occurrence_type- description',
-        }}
-    }}
-    - The entity with 'display_name' can be extracted multiple times. Please pay attention to the occurrence_type.
-    - Ensure the schema reflects the hierarchical relationship.
-    - Use the data field description to understand the context of the data.
-
-    """
-    return prompt
-
-
 # pylint: enable=all
```
src/pdf_processing.py
CHANGED
```diff
@@ -2,19 +2,24 @@
 # flake8: noqa: E402
 
 import logging
+import os
 
 logger = logging.getLogger(__name__)
 
 import asyncio
-import random
 from collections import defaultdict
 
+from ddtrace import tracer
 from fastapi import HTTPException
 from google.cloud.documentai_v1 import Document as docaiv1_document
 
 from src.docai import _batch_process_pdf_w_docai, _process_pdf_w_docai
 from src.excel_processing import extract_data_from_excel
-from src.postprocessing.common import
+from src.postprocessing.common import (
+    format_all_entities,
+    llm_prediction_to_tuples,
+    remove_none_values,
+)
 from src.postprocessing.postprocess_booking_confirmation import (
     postprocess_booking_confirmation,
 )
@@ -26,14 +31,19 @@ from src.postprocessing.postprocess_partner_invoice import (
 )
 from src.prompts.prompt_library import prompt_library
 from src.utils import (
-
+    extract_top_pages,
+    get_pdf_page_count,
     get_processor_name,
     run_background_tasks,
+    split_pdf_into_chunks,
+    transform_schema_strings,
     validate_based_on_schema,
 )
 
 
-async def process_file_w_docai(
+async def process_file_w_docai(
+    params, image_content, client, processor_name, doc_type=None
+):
     """
     Process a file using Document AI.
 
@@ -42,6 +52,7 @@ async def process_file_w_docai(params, image_content, client, processor_name):
         image_content (bytes): The file to be processed. It can be bytes object.
         client: The Document AI client.
         processor_name (str): The name of the processor to be used.
+        doc_type (str, optional): Document type for cost tracking labels.
 
     Returns:
         The processed document.
@@ -53,7 +64,9 @@ async def process_file_w_docai(params, image_content, client, processor_name):
 
     try:
         logger.info("Processing document...")
-        result = await _process_pdf_w_docai(
+        result = await _process_pdf_w_docai(
+            image_content, client, processor_name, doc_type=doc_type
+        )
     except Exception as e:
         if e.reason == "PAGE_LIMIT_EXCEEDED":
             logger.warning(
@@ -62,7 +75,7 @@ async def process_file_w_docai(params, image_content, client, processor_name):
         # Process the document in batch method (offline processing)
         try:
             result = await _batch_process_pdf_w_docai(
-                params, image_content, client, processor_name
+                params, image_content, client, processor_name, doc_type=doc_type
             )
         except Exception as batch_e:
             logger.error(f"Error processing document {batch_e}.")
@@ -92,7 +105,7 @@ async def extract_data_from_pdf_w_docai(
     )
 
     result = await process_file_w_docai(
-        params, file_content, processor_client, processor_name
+        params, file_content, processor_client, processor_name, doc_type=input_doc_type
    )
 
     # Create an entity object to store the result in gcs
@@ -103,9 +116,22 @@ async def extract_data_from_pdf_w_docai(
     # Extract entities from the result
     for entity in result.entities:
         value = (
-            {
+            {
+                child.type_: (
+                    child.mention_text,
+                    child.page_anchor.page_refs[0].page
+                    if hasattr(child.page_anchor.page_refs[0], "page")
+                    else 0,
+                )
+                for child in entity.properties
+            }
             if entity.properties
-            else
+            else (
+                entity.mention_text,
+                entity.page_anchor.page_refs[0].page
+                if hasattr(entity.page_anchor.page_refs[0], "page")
+                else 0,
+            )
         )
         aggregated_data[entity.type_].append(value)
```
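Each extracted value is now a `(mention_text, page_number)` tuple instead of a bare string, with page 0 as the fallback when no page anchor is present, so downstream formatting can report where a field was found. Illustratively (entity names hypothetical), `aggregated_data` ends up shaped like:

```python
# Hypothetical shape of aggregated_data after the loop above
aggregated_data = {
    # simple entity: list of (mention_text, page_number) tuples
    "invoiceNumber": [("INV-12345", 0)],
    # entity with child properties: list of {child_type: (text, page)} dicts
    "lineItem": [
        {"description": ("Ocean freight", 1), "amount": ("1,200.00", 1)},
        {"description": ("Terminal handling", 2), "amount": ("310.00", 2)},
    ],
}
```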
```diff
@@ -121,7 +147,7 @@ async def extract_data_from_pdf_w_docai(
     ):
         aggregated_data = postprocess_booking_confirmation(aggregated_data)
         logger.info("Transport Legs assembled successfully")
-    elif input_doc_type
+    elif input_doc_type in ["partnerInvoice", "customsInvoice"]:
         aggregated_data = postprocessing_partner_invoice(aggregated_data)
         logger.info("Partner Invoice naming changed successfully")
 
@@ -136,7 +162,9 @@ async def extract_data_from_pdf_w_docai(
     return aggregated_data, result_for_store, processor_version
 
 
-async def identify_carrier(
+async def identify_carrier(
+    document, llm_client, prompt, response_schema, doc_type=None
+):
     """Identify the carrier from the Booking Confirmation document."""
 
     result = await llm_client.ask_gemini(
```
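`identify_carrier` relies on Gemini's controlled generation: with `response_mime_type="text/x.enum"` the model must answer with exactly one value from an enum schema. The actual schemas live in `src/prompts/library/preprocessing/carrier/placeholders.json` (not shown in this diff); a plausible sketch, with the enum values inferred from the carrier folders in the prompt library:

```python
# Plausible carrier enum schema (actual contents live in
# src/prompts/library/preprocessing/carrier/placeholders.json)
carrier_schema = {
    "type": "STRING",
    "enum": ["evergreen", "hapag-lloyd", "maersk", "msc", "oocl", "yangming", "other"],
}
```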
```diff
@@ -144,92 +172,183 @@ async def identify_carrier(document, llm_client, prompt, response_schema):
         document=document,
         response_schema=response_schema,
         response_mime_type="text/x.enum",
+        doc_type=doc_type,
     )
 
     if result:
-        result = result.lower()
+        result = result.strip().lower()
     else:
         result = "other"
     return result
 
 
-async def process_file_w_llm(
-    params, file_content, input_doc_type, schema_client, llm_client
-):
+async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
     """Process a document using a language model (gemini) to extract structured data.
 
     Args:
         params (dict): The project parameters.
         file_content (str): The content of the file to be processed.
         input_doc_type (str): The type of document, used to select the appropriate prompt from the prompt library.
-        schema_client (object): The schema client object.
         llm_client: The LLM client object.
 
     Returns:
         result (dict): The structured data extracted from the document, formatted as JSON.
     """
-    #
-
+    # Bundeskasse invoices contains all the required information in the first 3 pages.
+    if input_doc_type == "bundeskasse":
+        file_content = extract_top_pages(file_content, num_pages=5)
+
+    number_of_pages = get_pdf_page_count(file_content)
+    logger.info(f"processing {input_doc_type} with {number_of_pages} pages...")
 
-    # get the schema placeholder
-    response_schema =
+    # get the schema placeholder
+    response_schema = prompt_library.library[input_doc_type]["other"]["placeholders"]
 
     carrier = "other"
-
-
-
-
-
-    )
-    carrier_schema = prompt_library.library["preprocessing"]["carrier"][
-        "placeholders"
-    ][input_doc_type]
+    carrier_schema = (
+        prompt_library.library.get("preprocessing", {})
+        .get("carrier", {})
+        .get("placeholders", {})
+        .get(input_doc_type)
+    )
 
+    if carrier_schema:
         carrier_prompt = prompt_library.library["preprocessing"]["carrier"]["prompt"]
         carrier_prompt = carrier_prompt.replace(
             "DOCUMENT_TYPE_PLACEHOLDER", input_doc_type
         )
 
+        # convert file_content to required document
+        document = llm_client.prepare_document_for_gemini(file_content)
+
         # identify carrier for customized prompting
         carrier = await identify_carrier(
-            document,
+            document,
+            llm_client,
+            carrier_prompt,
+            carrier_schema,
+            doc_type=input_doc_type,
         )
 
-    #
-    if input_doc_type == "bookingConfirmation":
-        response_schema = prompt_library.library[input_doc_type][carrier.lower()][
-            "placeholders"
-        ]
-    # There is one more additional field in partnerInvoice
-    # the reverseChargeSentence is added on later so its not available in Doc Ai schema.
-    elif input_doc_type == "partnerInvoice":
-        response_schema = prompt_library.library[input_doc_type][carrier.lower()][
-            "placeholders"
-        ]
-
+    # Select prompt
     if (
-        input_doc_type in prompt_library.library
-
+        input_doc_type not in prompt_library.library
+        or carrier not in prompt_library.library[input_doc_type]
     ):
-
-
+        return {}
+
+    # get the related prompt from predefined prompt library
+    prompt = prompt_library.library[input_doc_type][carrier]["prompt"]
+
+    # Add page-number extraction for moderately large docs
+    use_chunking = number_of_pages >= params["chunk_after"]
+
+    # Update schema and prompt to extract value-page_number pairs
+    if not use_chunking and number_of_pages > 1:
+        response_schema = transform_schema_strings(response_schema)
+        prompt += "\nFor each field, provide the page number where the information was found. The page numbering starts from 0."
 
-
-
-
+    tasks = []
+    # Process in chunks if number of pages exceeds threshold and Process all chunks concurrently
+    for chunk in (
+        split_pdf_into_chunks(file_content, chunk_size=params["chunk_size"])
+        if use_chunking
+        else [file_content]
+    ):
+        tasks.append(
+            process_chunk_with_retry(
+                chunk,
+                prompt,
+                response_schema,
+                llm_client,
+                input_doc_type,
+            )
         )
-
-
+
+    results = await asyncio.gather(*tasks, return_exceptions=True)
+
+    if use_chunking:
+        return merge_llm_results(results, response_schema)
+    else:
+        return llm_prediction_to_tuples(results[0], number_of_pages=number_of_pages)
+
+
+async def process_chunk_with_retry(
+    chunk_content, prompt, response_schema, llm_client, input_doc_type, retries=2
+):
+    """Process a chunk with retries in case of failure."""
+    for attempt in range(1, retries + 1):
+        try:
+            return await process_chunk(
+                chunk_content=chunk_content,
+                prompt=prompt,
+                response_schema=response_schema,
+                llm_client=llm_client,
+                input_doc_type=input_doc_type,
+            )
+        except Exception as e:
+            logger.error(f"Chunk failed on attempt {attempt}: {e}")
+            if attempt == retries:
+                raise
+            await asyncio.sleep(1)  # small backoff
 
 
-async def
-
+async def process_chunk(
+    chunk_content, prompt, response_schema, llm_client, input_doc_type
 ):
+    """Process a chunk with Gemini."""
+    document = llm_client.prepare_document_for_gemini(chunk_content)
+    return await llm_client.get_unified_json_genai(
+        prompt=prompt,
+        document=document,
+        response_schema=response_schema,
+        doc_type=input_doc_type,
+    )
+
+
+def merge_llm_results(results, response_schema):
+    """Merge LLM results from multiple chunks."""
+    merged = {}
+    for i, result in enumerate(results):
+        if not isinstance(result, dict):
+            continue
+        # Add page number to all values coming from this chunk
+        result = llm_prediction_to_tuples(result, number_of_pages=1, page_number=i)
+
+        # Merge the result into the final merged dictionary
+        for key, value in result.items():
+            field_type = (
+                response_schema["properties"].get(key, {}).get("type", "").upper()
+            )
+
+            if key not in merged:
+                if field_type == "ARRAY":
+                    # append the values as a list
+                    merged[key] = (
+                        value if isinstance(value, list) else ([value] if value else [])
+                    )
+                else:
+                    merged[key] = value
+                continue
+
+            if field_type == "ARRAY":
+                # append list contents across chunks
+                if isinstance(value, list):
+                    merged[key].extend(value)
+                else:
+                    merged[key].append(value)
+
+            # take first non-null value only
+            if merged[key] in (None, "", [], {}):
+                merged[key] = value
+
+    return merged
+
+
 async def extract_data_from_pdf_w_llm(params, input_doc_type, file_content, llm_client):
     """Extract data from the PDF file."""
     # Process the document using LLM
-    result = await process_file_w_llm(
-        params, file_content, input_doc_type, schema_client, llm_client
-    )
+    result = await process_file_w_llm(params, file_content, input_doc_type, llm_client)
 
     # Add currency from the amount field
     if input_doc_type in ["commercialInvoice"]:
```
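Large documents are now fanned out: `split_pdf_into_chunks` (imported from `src.utils`, whose implementation is not part of this diff) slices the PDF into fixed-size page ranges, and each chunk is processed concurrently through `asyncio.gather`. Because `return_exceptions=True` is used, a chunk that exhausts its retries shows up as an exception object in `results`, which `merge_llm_results` skips via its `isinstance(result, dict)` check. A minimal sketch of such a splitter, assuming `pypdf` and a bytes-in/bytes-out contract:

```python
from io import BytesIO

from pypdf import PdfReader, PdfWriter


def split_pdf_into_chunks(file_content: bytes, chunk_size: int) -> list:
    """Sketch: split a PDF into consecutive chunks of chunk_size pages each."""
    reader = PdfReader(BytesIO(file_content))
    chunks = []
    for start in range(0, len(reader.pages), chunk_size):
        writer = PdfWriter()
        for page in reader.pages[start:start + chunk_size]:
            writer.add_page(page)
        buffer = BytesIO()
        writer.write(buffer)
        chunks.append(buffer.getvalue())
    return chunks
```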
```diff
@@ -277,8 +396,8 @@ def combine_llm_results_w_doc_ai(
     for key in keys_to_combine:
         if key in llm.keys():
             # Merge the list of dictionaries
+            # If the length of the LLM list is less than the Doc AI result, replace with the LLM list
             if len(llm[key]) < len(result[key]):
-                # If the length of the LLM list is less than the Doc AI result, replace with the LLM list
                 result[key] = llm[key]
             else:
                 # If the length of the LLM list is greater than or equal to the Doc AI result,
@@ -298,14 +417,11 @@ async def extract_data_by_doctype(
     file_content,
     input_doc_type,
     processor_client,
-    schema_client,
     if_use_docai,
     if_use_llm,
+    llm_client,
     isBetaTest=False,
 ):
-    # Select LLM client
-    llm_client = params["LlmClient"]
-
     async def extract_w_docai():
         return await extract_data_from_pdf_w_docai(
             params=params,
@@ -320,7 +436,6 @@ async def extract_data_by_doctype(
             params=params,
             input_doc_type=input_doc_type,
             file_content=file_content,
-            schema_client=schema_client,
             llm_client=llm_client,
         )
 
@@ -355,7 +470,7 @@ async def data_extraction_manual_flow(
     meta,
     processor_client,
     schema_client,
-
+    use_default_logging=False,
 ):
     """
     Process a PDF file and extract data from it.
@@ -367,7 +482,6 @@ async def data_extraction_manual_flow(
         meta (DocumentMeta): Metadata associated with the document.
         processor_client (DocumentProcessorClient): Client for the Document AI processor.
         schema_client (DocumentSchemaClient): Client for the Document AI schema.
-        embed_manager (EmbeddingsManager): Manager for embeddings.
 
     Returns:
         dict: A dictionary containing the processed document information.
@@ -375,9 +489,23 @@ async def data_extraction_manual_flow(
     Raises:
         Refer to reasons in 400 error response examples.
     """
+    # Get the start time for processing
+    start_time = asyncio.get_event_loop().time()
+
+    # Select LLM client (Using 2.5 Pro model only for PI and customsInvoice)
+    llm_client = (
+        params["LlmClient_Flash"]
+        if meta.documentTypeCode not in ["customsInvoice", "partnerInvoice"]
+        else params["LlmClient"]
+    )
+
+    page_count = None
     # Validate the file type
     if mime_type == "application/pdf":
-
+        # Enable Doc Ai only for certain document types.
+        if_use_docai = (
+            True if meta.documentTypeCode in params["model_config"]["stable"] else False
+        )
         if_use_llm = (
             True if meta.documentTypeCode in params["key_to_combine"].keys() else False
         )
@@ -391,11 +519,12 @@ async def data_extraction_manual_flow(
             file_content,
             meta.documentTypeCode,
             processor_client,
-            schema_client,
             if_use_docai=if_use_docai,
             if_use_llm=if_use_llm,
+            llm_client=llm_client,
             isBetaTest=False,
         )
+        page_count = get_pdf_page_count(file_content)
 
     elif "excel" in mime_type or "spreadsheet" in mime_type:
         # Extract data from the Excel file
@@ -403,10 +532,20 @@ async def data_extraction_manual_flow(
             params=params,
             input_doc_type=meta.documentTypeCode,
             file_content=file_content,
-            schema_client=schema_client,
             mime_type=mime_type,
+            llm_client=llm_client,
         )
 
+        # Get sheet count from dd-trace span (set in extract_data_from_excel)
+        # Note: we use the span metric instead of len(extracted_data) because
+        # some sheets may fail extraction and not appear in extracted_data
+        span = tracer.current_span()
+        page_count = span.get_metric("est_page_count") if span else len(extracted_data)
+        if page_count > 100:
+            logger.warning(
+                f"Check logic. Count of sheets in excel file is weirdly large: {page_count}"
+            )
+
     else:
         raise HTTPException(
             status_code=400,
```
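The Excel branch reads its page (sheet) count back from the active Datadog span rather than from `extracted_data`, since sheets that fail extraction never make it into the result. For that to work, `extract_data_from_excel` must record the metric on the same trace; a sketch of the producer side (the metric name comes from this diff, the surrounding function is assumed):

```python
from ddtrace import tracer


def record_sheet_count(sheet_count: int) -> None:
    """Sketch: store a metric on the current span so a caller further up
    the trace can read it back with span.get_metric("est_page_count")."""
    span = tracer.current_span()
    if span is not None:
        span.set_metric("est_page_count", sheet_count)
```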
```diff
@@ -414,7 +553,7 @@ async def data_extraction_manual_flow(
     )
     # Create the result dictionary with the extracted data
     extracted_data = await format_all_entities(
-        extracted_data,
+        extracted_data, meta.documentTypeCode, params, mime_type
     )
     result = {
         "id": meta.id,
@@ -422,16 +561,27 @@ async def data_extraction_manual_flow(
         "data": extracted_data,
         "processor_version": processor_version,
     }
+
+    # Log the time taken for processing
+    end_time = asyncio.get_event_loop().time()
+    elapsed_time = end_time - start_time
+    logger.info(f"Time taken to process the document: {round(elapsed_time, 4)} seconds")
+
     # Schedule background tasks without using FastAPI's BackgroundTasks
-
-
-
-
-
-
-
-
-
+    if (
+        os.getenv("CLUSTER") != "ode"
+    ) & use_default_logging:  # skip data export to bigquery in ODE environment
+        asyncio.create_task(
+            run_background_tasks(
+                params,
+                meta.id,
+                meta.documentTypeCode,
+                extracted_data,
+                store_data,
+                processor_version,
+                mime_type,
+                elapsed_time,
+                page_count,
+            )
         )
-        )
     return result
```