data-science-document-ai 1.40.3__py3-none-any.whl → 1.51.0__py3-none-any.whl

This diff shows the differences between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
Files changed (56)
  1. {data_science_document_ai-1.40.3.dist-info → data_science_document_ai-1.51.0.dist-info}/METADATA +2 -2
  2. data_science_document_ai-1.51.0.dist-info/RECORD +60 -0
  3. src/constants.py +6 -10
  4. src/docai.py +14 -5
  5. src/docai_processor_config.yaml +0 -56
  6. src/excel_processing.py +34 -13
  7. src/io.py +69 -1
  8. src/llm.py +10 -32
  9. src/pdf_processing.py +192 -54
  10. src/postprocessing/common.py +246 -44
  11. src/postprocessing/postprocess_partner_invoice.py +139 -85
  12. src/prompts/library/arrivalNotice/other/placeholders.json +70 -0
  13. src/prompts/library/arrivalNotice/other/prompt.txt +40 -0
  14. src/prompts/library/bookingConfirmation/evergreen/placeholders.json +17 -17
  15. src/prompts/library/bookingConfirmation/evergreen/prompt.txt +1 -0
  16. src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +18 -18
  17. src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +1 -1
  18. src/prompts/library/bookingConfirmation/maersk/placeholders.json +17 -17
  19. src/prompts/library/bookingConfirmation/maersk/prompt.txt +1 -1
  20. src/prompts/library/bookingConfirmation/msc/placeholders.json +17 -17
  21. src/prompts/library/bookingConfirmation/msc/prompt.txt +1 -1
  22. src/prompts/library/bookingConfirmation/oocl/placeholders.json +17 -17
  23. src/prompts/library/bookingConfirmation/oocl/prompt.txt +3 -1
  24. src/prompts/library/bookingConfirmation/other/placeholders.json +17 -17
  25. src/prompts/library/bookingConfirmation/other/prompt.txt +1 -1
  26. src/prompts/library/bookingConfirmation/yangming/placeholders.json +17 -17
  27. src/prompts/library/bookingConfirmation/yangming/prompt.txt +1 -1
  28. src/prompts/library/bundeskasse/other/placeholders.json +25 -25
  29. src/prompts/library/bundeskasse/other/prompt.txt +8 -6
  30. src/prompts/library/commercialInvoice/other/placeholders.json +125 -0
  31. src/prompts/library/commercialInvoice/other/prompt.txt +2 -1
  32. src/prompts/library/customsAssessment/other/placeholders.json +67 -16
  33. src/prompts/library/customsAssessment/other/prompt.txt +24 -37
  34. src/prompts/library/customsInvoice/other/placeholders.json +20 -20
  35. src/prompts/library/customsInvoice/other/prompt.txt +4 -4
  36. src/prompts/library/deliveryOrder/other/placeholders.json +79 -28
  37. src/prompts/library/deliveryOrder/other/prompt.txt +26 -40
  38. src/prompts/library/draftMbl/other/placeholders.json +33 -33
  39. src/prompts/library/draftMbl/other/prompt.txt +34 -44
  40. src/prompts/library/finalMbL/other/placeholders.json +34 -34
  41. src/prompts/library/finalMbL/other/prompt.txt +34 -44
  42. src/prompts/library/packingList/other/placeholders.json +98 -0
  43. src/prompts/library/packingList/other/prompt.txt +1 -1
  44. src/prompts/library/partnerInvoice/other/placeholders.json +2 -23
  45. src/prompts/library/partnerInvoice/other/prompt.txt +7 -18
  46. src/prompts/library/preprocessing/carrier/placeholders.json +0 -16
  47. src/prompts/library/shippingInstruction/other/placeholders.json +115 -0
  48. src/prompts/library/shippingInstruction/other/prompt.txt +28 -15
  49. src/setup.py +13 -16
  50. src/utils.py +157 -45
  51. data_science_document_ai-1.40.3.dist-info/RECORD +0 -59
  52. src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -44
  53. src/prompts/library/draftMbl/maersk/prompt.txt +0 -17
  54. src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -44
  55. src/prompts/library/finalMbL/maersk/prompt.txt +0 -17
  56. {data_science_document_ai-1.40.3.dist-info → data_science_document_ai-1.51.0.dist-info}/WHEEL +0 -0
{data_science_document_ai-1.40.3.dist-info → data_science_document_ai-1.51.0.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: data-science-document-ai
- Version: 1.40.3
+ Version: 1.51.0
  Summary: "Document AI repo for data science"
  Author: Naomi Nguyen
  Author-email: naomi.nguyen@forto.com
@@ -38,7 +38,7 @@ Requires-Dist: pdf2image (>=1.17.0,<2.0.0)
  Requires-Dist: pgzip (>=0.3.5,<0.4.0)
  Requires-Dist: pyarrow (==16.1.0)
  Requires-Dist: pymupdf (>=1.23.26,<2.0.0)
- Requires-Dist: pypdf2 (>=3.0.1,<4.0.0)
+ Requires-Dist: pypdf (>=6.1.2,<7.0.0)
  Requires-Dist: python-dotenv (>=1.0.0,<2.0.0)
  Requires-Dist: python-multipart (>=0.0.7,<0.0.8)
  Requires-Dist: rapidfuzz (>=3.12.2,<4.0.0)
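The most consequential dependency change here is the swap of the long-unmaintained `pypdf2` for its successor `pypdf`, which renames the import downstream (the package's call sites live in `src/pdf_processing.py`, changed in this release but not shown in this section). A minimal sketch of what the migration looks like, assuming straightforward reader usage:

```python
# Before, with PyPDF2 (now unmaintained):
# from PyPDF2 import PdfReader

# After, with pypdf (pinned here to >=6.1.2,<7.0.0):
from io import BytesIO

from pypdf import PdfReader

def page_count(pdf_bytes: bytes) -> int:
    """Return the number of pages in an in-memory PDF."""
    return len(PdfReader(BytesIO(pdf_bytes)).pages)
```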
data_science_document_ai-1.51.0.dist-info/RECORD ADDED
@@ -0,0 +1,60 @@
+ src/constants.py,sha256=k5bBnJN-kmXiAtIAlz6Kg6fDyR9n0DuIudCZ9ZHO_Jw,3528
+ src/constants_sandbox.py,sha256=Iu6HdjCoNSmOX0AwoL9qUQkhq_ZnIN5U9e-Q2UfNuGc,547
+ src/docai.py,sha256=dHuR0ehVjUi1CnoNvdp_yxJtpU_HFXqAZ61ywdz7BEo,5655
+ src/docai_processor_config.yaml,sha256=4yKKZPvFCA-3S56jDYSqMGKXGFND-768OiU2seRiAzE,604
+ src/excel_processing.py,sha256=TRgAzSHvL1WKbUgjHtpXL701bPhiWGH7kk3S6e1UPaA,3074
+ src/io.py,sha256=rYjXVLlriEacw1uNuPIYhg12bXNu48Qs9GYMY2YcVTE,5563
+ src/llm.py,sha256=a7UYA4ITUNjzct_2fHgM-bma_XWc28VC0FV71g9tnUI,7137
+ src/log_setup.py,sha256=RhHnpXqcl-ii4EJzRt47CF2R-Q3YPF68tepg_Kg7tkw,2895
+ src/pdf_processing.py,sha256=oKVPnIu_keiN17XLOGImeyJ4iMT2H51x4OD1Tp9yw1s,19992
+ src/postprocessing/common.py,sha256=dagAg0hZGuZc03bXdfOolxekewMEVUfz917IGCiAtWI,26118
+ src/postprocessing/postprocess_booking_confirmation.py,sha256=nK32eDiBNbauyQz0oCa9eraysku8aqzrcoRFoWVumDU,4827
+ src/postprocessing/postprocess_commercial_invoice.py,sha256=3I8ijluTZcOs_sMnFZxfkAPle0UFQ239EMuvZfDZVPg,1028
+ src/postprocessing/postprocess_partner_invoice.py,sha256=Fv4Y6Lc8e6aFFcwX0kLOal2y4TrR-XfAzjtuQnBwo0o,12815
+ src/prompts/library/arrivalNotice/other/placeholders.json,sha256=1vzly1amgyKt3jr2JJQbb24kNZsnI289iduvoUo5dJU,3061
+ src/prompts/library/arrivalNotice/other/prompt.txt,sha256=QNuU-BvMA8VbdupVNapad4O3WmCotH5cKNxImRMbKDk,2906
+ src/prompts/library/bookingConfirmation/evergreen/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
+ src/prompts/library/bookingConfirmation/evergreen/prompt.txt,sha256=5ivskCG831M2scW3oqQaoltXIyHV-n6DYUygWycXxjw,2755
+ src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json,sha256=hMPNt9s3LuxR85AxYy7bPcCDleug6gSwVjefm3ismWY,1405
+ src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt,sha256=XgfhrFTXLJ467L4Cer77K0KTPtWTg_-QJXCsltvLlpI,3430
+ src/prompts/library/bookingConfirmation/maersk/placeholders.json,sha256=6p_IQMA1PUgGZqjf_by4ja9jK27ba4loYhEpIa7Oxx4,1406
+ src/prompts/library/bookingConfirmation/maersk/prompt.txt,sha256=t-yh1dOrcRa0fm0VPFC1xCRBf0R0Zjp9j_Hb31aZS1w,3223
+ src/prompts/library/bookingConfirmation/msc/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
+ src/prompts/library/bookingConfirmation/msc/prompt.txt,sha256=_Jfioislp7SNs2BEXoklvnTPVXe6Z0M6myD1IWnBFYQ,4705
+ src/prompts/library/bookingConfirmation/oocl/placeholders.json,sha256=JTtWvLSsoxN7huXY8ZNqqPkODM-DOs5wu3YvNHOna3k,1404
+ src/prompts/library/bookingConfirmation/oocl/prompt.txt,sha256=xNTrJdUtDalcP3AKkfRiOnHjAdRCbcTvehcBQKurRj0,2201
+ src/prompts/library/bookingConfirmation/other/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
+ src/prompts/library/bookingConfirmation/other/prompt.txt,sha256=kUK7NgVNDYFMnqOcIblCwWSw2SC0YQEtHsYrspiVUMo,3379
+ src/prompts/library/bookingConfirmation/yangming/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
+ src/prompts/library/bookingConfirmation/yangming/prompt.txt,sha256=fYKfusDajDFw0v54-nv2iAqUSp2yCeOzc6G7AFe-h2w,3226
+ src/prompts/library/bundeskasse/other/placeholders.json,sha256=7xKzi_ypkIICO9nrEl45W9G7-h33uWVRVWnpg2b5lUg,4288
+ src/prompts/library/bundeskasse/other/prompt.txt,sha256=miNYoqRZEd6Z1LNisTahX1-tenzr5kEpRA6gvPH7NCw,3316
+ src/prompts/library/commercialInvoice/other/placeholders.json,sha256=zUK2mg9MnHiEQRYF6VgTiUiq68WGy5f7_4qL63CWyR0,4700
+ src/prompts/library/commercialInvoice/other/prompt.txt,sha256=CJapcVrmcvynJUanETDklkzU-0N9hHdhq5wL4MK7OIY,2683
+ src/prompts/library/customsAssessment/other/placeholders.json,sha256=scIV--C9HNWAQbU9zEz3GT_FoAvJqbfuY85YUtt7t-Q,3850
+ src/prompts/library/customsAssessment/other/prompt.txt,sha256=z3FuoHZ588Pz1WBJDW7ISAC3J6n7hPJCcS92CdHDTFw,2494
+ src/prompts/library/customsInvoice/other/placeholders.json,sha256=BnWYtl4sPooTHb_EHRIlrPawBrfHI8_QVas8zytbqyY,12172
+ src/prompts/library/customsInvoice/other/prompt.txt,sha256=1dR73TQZJAfO9dKl-h7VhiJkdli498IV4e5JgBlOoYw,9695
+ src/prompts/library/deliveryOrder/other/placeholders.json,sha256=j-9F4V3yDg4610PPsOwU3oOj_S9vAvAB9Ix155WGIwc,3827
+ src/prompts/library/deliveryOrder/other/prompt.txt,sha256=RD076vq0x0IjoEVQfh-G0u4nxITCpgKZGrwMlR9YAvk,2695
+ src/prompts/library/draftMbl/other/placeholders.json,sha256=Gn8kQ8cMmrzRGLSFH7_8wO1_j2jxhqHd4zeivZP2SjU,4304
+ src/prompts/library/draftMbl/other/prompt.txt,sha256=4RjlGT2OFmcBCUJhuCnO9GtmCn3vVesXHi_ml2g3dK8,2386
+ src/prompts/library/finalMbL/other/placeholders.json,sha256=Gn8kQ8cMmrzRGLSFH7_8wO1_j2jxhqHd4zeivZP2SjU,4304
+ src/prompts/library/finalMbL/other/prompt.txt,sha256=cyeKjK94sepqXiLEeZKB4VpmT0-nqXALP4dih-B67M8,2386
+ src/prompts/library/packingList/other/placeholders.json,sha256=cGUUvEFoi4Lm0BAiyD29KbNFbUgzO1s7eit_qK3F0ig,4478
+ src/prompts/library/packingList/other/prompt.txt,sha256=6Q9d0KBG6YWmNtzFivvmtQmitaUE2jytfwwc5YwsUgQ,2872
+ src/prompts/library/partnerInvoice/other/placeholders.json,sha256=NX6ADT4gxLpP90uoNCYDbmfBvROxxVWRKK0lRFy1n9s,10897
+ src/prompts/library/partnerInvoice/other/prompt.txt,sha256=vMk-FBq9XkWiFiCf36t43DcIKNYh7IcGAsnfXq8vqio,8052
+ src/prompts/library/postprocessing/port_code/placeholders.json,sha256=2TiXf3zSzrglOMPtDOlCntIa5RSvyZQAKG2-IgrCY5A,22
+ src/prompts/library/postprocessing/port_code/prompt_port_code.txt,sha256=--1wunSqEr2ox958lEhjO-0JFBfOLzA3qfKYIzG_Iok,884
+ src/prompts/library/preprocessing/carrier/placeholders.json,sha256=tQeVDtvembhVqvel9vGoy4qcKp1hOvg-bLCgZRdQj0g,192
+ src/prompts/library/preprocessing/carrier/prompt.txt,sha256=NLvRZQCZ6aWC1yTr7Q93jK5z7Vi_b4HBaiFYYnIsO-w,134
+ src/prompts/library/shippingInstruction/other/placeholders.json,sha256=eK4AeMfORkGMWVYcqH7NjB56Zb4swHTvcQD5UQbTryg,6374
+ src/prompts/library/shippingInstruction/other/prompt.txt,sha256=CbrqlKMtB-sVY-8E460KP1KNmz169YVPMrH3-uEldPg,2135
+ src/prompts/prompt_library.py,sha256=VJWHeXN-s501C2GiidIIvQQuZdU6T1R27hE2dKBiI40,2555
+ src/setup.py,sha256=EHfAl3Pvb082dl_s6Tk93IjtE3vBmrW_fp2GW4955HQ,6952
+ src/tms.py,sha256=UXbIo1QE--hIX6NZi5Qyp2R_CP338syrY9pCTPrfgnE,1741
+ src/utils.py,sha256=Ow5_Jals88o8mbZ1BoHfZpHZoCfig_UQb5aalH-mpWE,17278
+ data_science_document_ai-1.51.0.dist-info/METADATA,sha256=mQBXhk_NZlceIozn434C-du7ESoNGyhgGJxQRZYYLNs,2152
+ data_science_document_ai-1.51.0.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+ data_science_document_ai-1.51.0.dist-info/RECORD,,
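Each RECORD entry has the form `path,sha256=<digest>,<size>`, where the digest is the unpadded URL-safe base64 encoding of the file's SHA-256 hash, per the wheel spec. A small helper to verify an installed file against its entry:

```python
import base64
import hashlib

def record_hash(path: str) -> str:
    """RECORD-style digest: unpadded urlsafe-base64 of the file's SHA-256."""
    with open(path, "rb") as fh:
        digest = hashlib.sha256(fh.read()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode()

# record_hash("src/constants.py") should reproduce the first entry above:
# "sha256=k5bBnJN-kmXiAtIAlz6Kg6fDyR9n0DuIudCZ9ZHO_Jw"
```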
src/constants.py CHANGED
@@ -23,9 +23,12 @@ project_parameters = {
      "invoice_classification_lookup": "invoice_classification.json",
      "reverse_charge_sentence_lookup": "reverse_charge_sentences.json",
      # Fuzzy logic params
-     "fuzzy_threshold_item_code": 70,
+     "fuzzy_threshold_item_code": 90,
      "fuzzy_threshold_reverse_charge": 80,
      "fuzzy_threshold_invoice_classification": 70,
+     # Chunking params
+     "chunk_size": 1,  # page (do not change this without changing the page number logic)
+     "chunk_after": 10,  # pages
      # Big Query
      "g_ai_gbq_db_schema": "document_ai",
      "g_ai_gbq_db_table_out": "document_ai_api_calls_v1",
@@ -50,15 +53,6 @@ project_parameters = {
      "model_selector": {
          "stable": {
              "bookingConfirmation": 1,
-             "packingList": 0,
-             "commercialInvoice": 0,
-             "finalMbL": 0,
-             "draftMbl": 0,
-             "arrivalNotice": 0,
-             "shippingInstruction": 0,
-             "customsAssessment": 0,
-             "deliveryOrder": 0,
-             "partnerInvoice": 0,
          },
          "beta": {
              "bookingConfirmation": 0,
@@ -86,8 +80,10 @@ project_parameters = {
      # Key to combine the LLM results with the Doc Ai results
      "key_to_combine": {
          "bookingConfirmation": ["transportLegs"],
+         "arrivalNotice": ["containers"],
          "finalMbL": ["containers"],
          "draftMbl": ["containers"],
+         "deliveryOrder": ["Equipment", "TransportLeg"],
          "customsAssessment": ["containers"],
          "packingList": ["skuData"],
          "commercialInvoice": ["skus"],
src/docai.py CHANGED
@@ -3,11 +3,16 @@ import re
 
  from google.cloud import documentai
 
- from src.io import delete_folder_from_bucket, logger, upload_pdf_to_bucket
+ from src.io import (
+     delete_folder_from_bucket,
+     get_gcp_labels,
+     logger,
+     upload_pdf_to_bucket,
+ )
  from src.utils import cache_on_disk
 
 
- async def _process_pdf_w_docai(image_content, client, processor_name):
+ async def _process_pdf_w_docai(image_content, client, processor_name, doc_type=None):
      """Process the PDF using Document AI.
 
      Args:
@@ -15,6 +20,7 @@ async def _process_pdf_w_docai(image_content, client, processor_name):
          client: The Document AI client.
          processor_name (str): The name of the processor to be used.
              e.g.: projects/{project_id}/locations/{location}/processor/{processor_id}
+         doc_type (str, optional): Document type for cost tracking labels.
 
      Returns:
          The processed document.
@@ -24,10 +30,11 @@ async def _process_pdf_w_docai(image_content, client, processor_name):
          content=image_content, mime_type="application/pdf"
      )
 
-     # Configure the process request
+     # Configure the process request with labels for cost tracking
      request = documentai.ProcessRequest(
          name=processor_name,
          raw_document=raw_document,  # field_mask=field_mask
+         labels=get_gcp_labels(doc_type=doc_type),
      )
      result = await cache_on_disk(client.process_document, request=request)
 
@@ -35,7 +42,7 @@ async def _process_pdf_w_docai(image_content, client, processor_name):
 
 
  async def _batch_process_pdf_w_docai(
-     params, image_content, client, processor_name, timeout=1200
+     params, image_content, client, processor_name, timeout=1200, doc_type=None
  ):
      """Process the PDF using Document AI Batch Process API.
 
@@ -45,6 +52,7 @@ async def _batch_process_pdf_w_docai(
          processor_name (str): The name of the processor to be used.
              e.g.: projects/{project_id}/locations/{location}/processor/{processor_id}
          timeout (int, optional): The timeout in seconds. Defaults to 1200.
+         doc_type (str, optional): Document type for cost tracking labels.
 
      Returns:
          The processed document.
@@ -72,11 +80,12 @@ async def _batch_process_pdf_w_docai(
      # Where to write results
      output_config = documentai.DocumentOutputConfig(gcs_output_config=gcs_output_config)
 
-     # The full resource name of the processor
+     # The full resource name of the processor with labels for cost tracking
      request = documentai.BatchProcessRequest(
          name=processor_name,
          input_documents=input_config,
          document_output_config=output_config,
+         labels=get_gcp_labels(doc_type=doc_type),
      )
 
      # BatchProcess returns a Long Running Operation (LRO)
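With these two changes, both the synchronous and batch paths attach the same cost-tracking labels to every Document AI request, so spend can be broken down by document type in GCP billing. A hedged usage sketch (the client construction, processor name, and input file are placeholders, and the helper is assumed to return the parsed document):

```python
import asyncio

from google.cloud import documentai

from src.docai import _process_pdf_w_docai
from src.io import get_gcp_labels

async def main():
    client = documentai.DocumentProcessorServiceAsyncClient()
    processor_name = "projects/my-project/locations/eu/processors/abc123"  # placeholder

    with open("invoice.pdf", "rb") as fh:
        pdf_bytes = fh.read()

    # The labels attached to the request look roughly like this
    # (see get_gcp_labels in src/io.py below):
    print(get_gcp_labels(doc_type="partnerInvoice"))
    # {'ds-project-name': ..., 'ds-env': 'local', 'doc_type': 'partnerinvoice'}

    document = await _process_pdf_w_docai(
        pdf_bytes, client, processor_name, doc_type="partnerInvoice"
    )
    print(document)

asyncio.run(main())
```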
src/docai_processor_config.yaml CHANGED
@@ -13,62 +13,6 @@ model_config:
            author: "igor.tonko@forto.com"
            created_date: ""
 
-     packingList:
-       - id: "d967005bd9d45aeb"
-         details:
-           display_name: "doc_cap_packingList"
-           author: "kumar.rajendrababu@forto.com"
-           created_date: ""
-
-     commercialInvoice:
-       - id: "7d37236207f75758"
-         details:
-           display_name: "doc_cap_commercialInvoice"
-           author: "kumar.rajendrababu@forto.com"
-           created_date: ""
-
-     finalMbL:
-       - id: "1eda2f22d64b1b89"
-         details:
-           display_name: "doc_cap_finalMbL"
-           author: "igor.tonko@forto.com"
-           created_date: ""
-
-     draftMbl:
-       - id: "1eda2f22d64b1b89"
-         details:
-           display_name: "doc_cap_finalMbL"
-           author: "igor.tonko@forto.com"
-           created_date: ""
-
-     shippingInstruction:
-       - id: "c77a0a515d99a8ba"
-         details:
-           display_name: "doc_cap_shippingInstruction"
-           author: "kumar.rajendrababu@forto.com"
-           created_date: ""
-
-     arrivalNotice:
-       - id: "748b2e2b9161dcf3"
-         details:
-           display_name: "doc_cap_arrivalNotice"
-           author: "osman.demirel@forto.com"
-           created_date: ""
-
-     customsAssessment:
-       - id: "c464a18d82fad9be"
-         details:
-           display_name: "doc_cap_customsAssessment"
-           author: "igor.tonko@forto.com"
-           created_date: ""
-
-     deliveryOrder:
-       - id: "2245a72c7a5dbf5f"
-         details:
-           display_name: "doc_cap_releaseNote"
-           author: "igor.tonko@forto.com"
-           created_date: ""
-
    beta:
      bookingConfirmation:
        - id: "3c280b11bdb3ed89"
src/excel_processing.py CHANGED
@@ -2,21 +2,25 @@
  # flake8: noqa: E402
  import logging
 
+ from ddtrace import tracer
+
  logger = logging.getLogger(__name__)
 
  import asyncio
- import json
 
  import numpy as np
  import pandas as pd
 
- from src.llm import prompt_excel_extraction
- from src.utils import generate_schema_structure, get_excel_sheets
+ from src.prompts.prompt_library import prompt_library
+ from src.utils import estimate_page_count, get_excel_sheets
 
 
- async def extract_data_from_sheet(params, sheet_name, sheet, response_schema):
+ async def extract_data_from_sheet(
+     llm_client, sheet_name, sheet, response_schema, doc_type=None
+ ):
      logger.info(f"Processing sheet: {sheet_name}")
-     excel_content = pd.DataFrame(sheet.values)
+     excel_content = pd.DataFrame(sheet.values).dropna(how="all", axis=1)
+
      # Convert to Markdown format for the LLM model
      worksheet = (
          "This is from a excel. Pay attention to the cell position:\n"
@@ -24,12 +28,16 @@ async def extract_data_from_sheet(params, sheet_name, sheet, response_schema):
      )
 
      # Prompt for the LLM JSON
-     prompt_docai = prompt_excel_extraction(worksheet)
+     prompt = prompt_library.library[doc_type]["other"]["prompt"]
+
+     # Join the worksheet content with the prompt
+     prompt = worksheet + "\n" + prompt
 
      try:
-         result = await params["LlmClient"].get_unified_json_genai(
-             prompt_docai,
+         result = await llm_client.get_unified_json_genai(
+             prompt,
              response_schema=response_schema,
+             doc_type=doc_type,
          )
      except Exception as e:
          result = {}
@@ -43,6 +51,7 @@ async def extract_data_from_excel(
      input_doc_type,
      file_content,
      mime_type,
+     llm_client,
  ):
      """Extract data from the Excel file.
 
@@ -51,6 +60,7 @@ async def extract_data_from_excel(
          input_doc_type (str): The type of the document.
          file_content (bytes): The content of the Excel file to process.
          mime_type (str): The MIME type of the file.
+         llm_client: The LLM client to use for data extraction.
 
      Returns:
          formatted_data (list): A list of dictionaries containing the extracted data.
@@ -59,20 +69,31 @@
 
      """
      # Generate the response structure
-     response_schema = generate_schema_structure(params, input_doc_type)
+     response_schema = prompt_library.library[input_doc_type]["other"]["placeholders"]
 
      # Load the Excel file and get ONLY the "visible" sheet names
      sheets, workbook = get_excel_sheets(file_content, mime_type)
 
+     # Track the number of sheets in dd-trace
+     span = tracer.current_span()
+     if span:
+         estimated_page_counts = [
+             estimate_page_count(workbook[sheet]) for sheet in sheets
+         ]
+         est_page_count = sum(estimated_page_counts)
+         span.set_metric("est_page_count", est_page_count)
+
      # Excel files may contain multiple sheets. Extract data from each sheet
      sheet_extract_tasks = [
          extract_data_from_sheet(
-             params, sheet_name, workbook[sheet_name], response_schema
+             llm_client,
+             sheet_name,
+             workbook[sheet_name],
+             response_schema,
+             doc_type=input_doc_type,
          )
          for sheet_name in sheets
      ]
      extracted_data = {k: v for k, v in await asyncio.gather(*sheet_extract_tasks)}
 
-     stored_data = json.dumps(extracted_data)
-
-     return extracted_data, stored_data, params["gemini_params"]["model_id"]
+     return extracted_data, extracted_data, llm_client.model_id
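Worth noting how a sheet now reaches the model: the worksheet is rendered as a Markdown-style table and prepended to the doc-type prompt pulled from the prompt library, replacing the old `prompt_excel_extraction` helper. A rough illustration of the payload shape; the exact table-rendering call is not visible in this diff, so `to_markdown()` (which requires `tabulate`) is an assumption:

```python
import pandas as pd

# Toy stand-in for openpyxl's sheet.values; the third column is entirely
# empty and is dropped by .dropna(how="all", axis=1).
rows = [["Item", "Qty", None], ["Widget", 3, None]]
excel_content = pd.DataFrame(rows).dropna(how="all", axis=1)

worksheet = (
    "This is from a excel. Pay attention to the cell position:\n"
    + excel_content.to_markdown()  # assumed rendering; requires `tabulate`
)

# The doc-type prompt is appended after the table, mirroring
# extract_data_from_sheet above (placeholder prompt text here).
prompt = worksheet + "\n" + "<prompt_library.library[doc_type]['other']['prompt']>"
print(prompt)
```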
src/io.py CHANGED
@@ -12,13 +12,55 @@ from pathlib import Path
  from google.cloud import bigquery, storage
 
 
+ def get_gcp_labels(**extra_labels):
+     """Generate standardized GCP labels for cost tracking.
+
+     Args:
+         **extra_labels: Additional custom labels
+
+     Returns:
+         dict: Labels dictionary with keys normalized (lowercase, hyphens, max 63 chars)
+     """
+     project_name = os.getenv("PROJECT_NAME")
+
+     # If not set, detect once and cache it
+     if not project_name:
+         # Try pyproject.toml first
+         try:
+             import toml
+
+             pyproject_path = Path(__file__).parent.parent / "pyproject.toml"
+             if pyproject_path.exists():
+                 config = toml.load(pyproject_path)
+                 project_name = config.get("tool", {}).get("poetry", {}).get("name")
+         except Exception:
+             pass
+
+         # Fallback to unknown
+         if not project_name:
+             project_name = "unknown"
+
+         # Cache it
+         os.environ["PROJECT_NAME"] = project_name
+
+     labels = {
+         "ds-project-name": project_name.lower(),
+         "ds-env": os.getenv("CLUSTER", "local").lower(),
+     }
+
+     # Add any extra labels
+     labels.update({k.lower(): str(v).lower() for k, v in extra_labels.items()})
+
+     return labels
+
+
  def get_bq_client(params):
      """Get Google BigQuery client."""
      bq_client = bigquery.Client(project=params["g_ai_project_name"])
      job_config = bigquery.QueryJobConfig(
          allow_large_results=True,
          # flatten_results=True,
-         labels={"project-name": params["project_name"]},
+         labels=get_gcp_labels(),
      )
      return bq_client, job_config
 
@@ -112,3 +154,29 @@ def download_dir_from_bucket(bucket, directory_cloud, directory_local) -> bool:
          Path(directory).mkdir(parents=True, exist_ok=True)
          blob.download_to_filename(directory_local / Path(blob.name))
      return result
+
+
+ def bq_logs(data_to_insert, params):
+     """Insert logs into Google BigQuery.
+
+     Args:
+         data_to_insert (list): The data to insert into BigQuery.
+         params (dict): The parameters dictionary.
+     """
+     # Use the pre-initialized BigQuery client
+     bq_client = params["bq_client"]
+     # Get the table string
+     table_string = f"{params['g_ai_project_name']}.{params['g_ai_gbq_db_schema']}.{params['g_ai_gbq_db_table_out']}"
+
+     logger.info(f"Log table: {table_string}")
+     # Insert the rows into the table
+     insert_logs = bq_client.insert_rows_json(table_string, data_to_insert)
+
+     # Check if there were any errors inserting the rows
+     if not insert_logs:
+         logger.info("New rows have been added.")
+     else:
+         logger.info("Errors occurred while inserting rows: ", insert_logs)
+
+
+ # type: ignore
src/llm.py CHANGED
@@ -15,6 +15,7 @@ from vertexai.generative_models import (
      Part,
  )
 
+ from src.io import get_gcp_labels
  from src.utils import cache_on_disk
 
 
@@ -69,6 +70,7 @@
          document: str = None,
          response_schema: dict = None,
          response_mime_type: str = "application/json",
+         doc_type: str = None,
      ):
          """Ask the Gemini model a question.
 
@@ -76,6 +78,7 @@
              prompt (str): The prompt to send to the model.
              document (str, optional): An optional document to provide context.
              response_schema (dict, optional): Defines a specific response schema for the model.
+             doc_type (str, optional): Document type for cost tracking labels.
 
          Returns:
              str: The response from the model.
@@ -96,12 +99,13 @@
          # Prepare inputs for the model
          inputs = [document, prompt] if document else prompt
 
-         # Generate the response
+         # Generate the response with labels for cost tracking
          model_response = await cache_on_disk(
              self.geminy_client.generate_content_async,
              contents=inputs,
              generation_config=config,
              safety_settings=self.safety_config,
+             labels=get_gcp_labels(doc_type=doc_type),
          )
 
          response_text = model_response.text
@@ -113,7 +117,7 @@
              return "{}"
 
      async def get_unified_json_genai(
-         self, prompt, document=None, response_schema=None, model="gemini"
+         self, prompt, document=None, response_schema=None, model="gemini", doc_type=None
      ):
          """Send a prompt to a Google Cloud AI Platform model and returns the generated json.
 
@@ -122,6 +126,7 @@
              document: Content of the PDF document
              response_schema: The schema to use for the response
              model (str): The model to use for the response ["gemini" or "chatGPT"]. Default is "gemini".
+             doc_type (str, optional): Document type for cost tracking labels.
 
          Returns:
              dict: The generated json from the model.
@@ -131,7 +136,9 @@
              response = await self.ask_chatgpt(prompt, document, response_schema)
          else:
              # Default to Gemini
-             response = await self.ask_gemini(prompt, document, response_schema)
+             response = await self.ask_gemini(
+                 prompt, document, response_schema, doc_type=doc_type
+             )
 
          try:
              return json.loads(response)
@@ -194,33 +201,4 @@
          return response
 
 
- def prompt_excel_extraction(excel_structured_text):
-     """Write a prompt to extract data from Excel files.
-
-     Args:
-         excel_structured_text (str): The structured text of the Excel file.
-
-     Returns:
-         prompt str: The prompt for common json.
-     """
-     prompt = f"""{excel_structured_text}
-
-     Task: Fill in the following dictionary from the information in the given in the above excel data.
-
-     Instructions:
-     - Do not change the keys of the following dictionary.
-     - The values should be filled in as per the schema provided below.
-     - If an entity contains a 'display_name', consider its properties as child data points in the below format.
-     {{'data-field': {{
-         'child-data-field': 'type -occurrence_type- description',
-     }}
-     }}
-     - The entity with 'display_name' can be extracted multiple times. Please pay attention to the occurrence_type.
-     - Ensure the schema reflects the hierarchical relationship.
-     - Use the data field description to understand the context of the data.
-
-     """
-     return prompt
-
-
  # pylint: enable=all
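Taken together, these changes thread a single `doc_type` argument from the public entry points down to the Vertex AI call, where it becomes a billing label via `get_gcp_labels`. A hedged end-to-end sketch; the `LlmClient` constructor is not shown in this diff, so its arguments are omitted:

```python
import asyncio

from src.llm import LlmClient

async def main():
    llm_client = LlmClient()  # constructor arguments not shown in this diff

    schema = {"type": "object", "properties": {"blNumber": {"type": "string"}}}

    # doc_type flows: get_unified_json_genai -> ask_gemini -> get_gcp_labels
    result = await llm_client.get_unified_json_genai(
        "Extract the bill of lading number from the document.",
        response_schema=schema,
        doc_type="draftMbl",
    )
    print(result)  # parsed dict; {} when the model reply is empty or unparsable

asyncio.run(main())
```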