data-science-document-ai 1.51.1__py3-none-any.whl → 1.53.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: data-science-document-ai
-Version: 1.51.1
+Version: 1.53.0
 Summary: "Document AI repo for data science"
 Author: Naomi Nguyen
 Author-email: naomi.nguyen@forto.com
@@ -6,11 +6,11 @@ src/excel_processing.py,sha256=TRgAzSHvL1WKbUgjHtpXL701bPhiWGH7kk3S6e1UPaA,3074
 src/io.py,sha256=rYjXVLlriEacw1uNuPIYhg12bXNu48Qs9GYMY2YcVTE,5563
 src/llm.py,sha256=a7UYA4ITUNjzct_2fHgM-bma_XWc28VC0FV71g9tnUI,7137
 src/log_setup.py,sha256=RhHnpXqcl-ii4EJzRt47CF2R-Q3YPF68tepg_Kg7tkw,2895
-src/pdf_processing.py,sha256=oKVPnIu_keiN17XLOGImeyJ4iMT2H51x4OD1Tp9yw1s,19992
+src/pdf_processing.py,sha256=ER-gwh_YfJ-bMqh3nI8K89CZPAdPNnwjLmJ-5cnn1Rk,20469
 src/postprocessing/common.py,sha256=dagAg0hZGuZc03bXdfOolxekewMEVUfz917IGCiAtWI,26118
 src/postprocessing/postprocess_booking_confirmation.py,sha256=nK32eDiBNbauyQz0oCa9eraysku8aqzrcoRFoWVumDU,4827
 src/postprocessing/postprocess_commercial_invoice.py,sha256=3I8ijluTZcOs_sMnFZxfkAPle0UFQ239EMuvZfDZVPg,1028
-src/postprocessing/postprocess_partner_invoice.py,sha256=Fv4Y6Lc8e6aFFcwX0kLOal2y4TrR-XfAzjtuQnBwo0o,12815
+src/postprocessing/postprocess_partner_invoice.py,sha256=V9ANqlSBBO2_YEyfyCms7vjhUi9pjlGeRfUhMQQ4F6c,13507
 src/prompts/library/arrivalNotice/other/placeholders.json,sha256=1vzly1amgyKt3jr2JJQbb24kNZsnI289iduvoUo5dJU,3061
 src/prompts/library/arrivalNotice/other/prompt.txt,sha256=QNuU-BvMA8VbdupVNapad4O3WmCotH5cKNxImRMbKDk,2906
 src/prompts/library/bookingConfirmation/evergreen/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
@@ -55,6 +55,6 @@ src/prompts/prompt_library.py,sha256=VJWHeXN-s501C2GiidIIvQQuZdU6T1R27hE2dKBiI40
 src/setup.py,sha256=EHfAl3Pvb082dl_s6Tk93IjtE3vBmrW_fp2GW4955HQ,6952
 src/tms.py,sha256=UXbIo1QE--hIX6NZi5Qyp2R_CP338syrY9pCTPrfgnE,1741
 src/utils.py,sha256=Ow5_Jals88o8mbZ1BoHfZpHZoCfig_UQb5aalH-mpWE,17278
-data_science_document_ai-1.51.1.dist-info/METADATA,sha256=vJ9ivHgPOvyMkfqtL2893McNMupkNvNHYJd95IF4CMQ,2152
-data_science_document_ai-1.51.1.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
-data_science_document_ai-1.51.1.dist-info/RECORD,,
+data_science_document_ai-1.53.0.dist-info/METADATA,sha256=s3O_vRgHuHypiv9A6R2NWiwexz4SYjjmuKARhcpwgCo,2152
+data_science_document_ai-1.53.0.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+data_science_document_ai-1.53.0.dist-info/RECORD,,
src/pdf_processing.py CHANGED
@@ -250,6 +250,7 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
     prompt += "\nFor each field, provide the page number where the information was found. The page numbering starts from 0."

     tasks = []
+    semaphore = asyncio.Semaphore(50)
     # Process in chunks if number of pages exceeds threshold and Process all chunks concurrently
     for chunk in (
         split_pdf_into_chunks(file_content, chunk_size=params["chunk_size"])
@@ -257,8 +258,8 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
         else [file_content]
     ):
         tasks.append(
-            process_chunk_with_retry(
-                chunk, prompt, response_schema, llm_client, input_doc_type
+            process_chunk_with_semaphore(
+                semaphore, chunk, prompt, response_schema, llm_client, input_doc_type
             )
         )

@@ -270,6 +271,25 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
     return llm_prediction_to_tuples(results[0], number_of_pages=number_of_pages)


+async def process_chunk_with_semaphore(
+    semaphore,
+    chunk_content,
+    prompt,
+    response_schema,
+    llm_client,
+    input_doc_type,
+):
+    """Process a chunk with a semaphore to limit concurrency."""
+    async with semaphore:
+        return await process_chunk_with_retry(
+            chunk_content,
+            prompt,
+            response_schema,
+            llm_client,
+            input_doc_type,
+        )
+
+
 async def process_chunk_with_retry(
     chunk_content, prompt, response_schema, llm_client, input_doc_type, retries=2
 ):
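
Note: the new `process_chunk_with_semaphore` wrapper bounds how many chunk requests hit the LLM concurrently. `asyncio.gather` still schedules every chunk task at once, but only 50 of them can hold the semaphore, and therefore be mid-request, at any moment. Below is a minimal, self-contained sketch of the same pattern; the names and the `process_chunk` stub are illustrative, not the package's code.

```python
import asyncio


async def process_chunk(chunk: str) -> str:
    # Stand-in for the per-chunk LLM call.
    await asyncio.sleep(0.1)
    return f"processed {chunk}"


async def process_with_limit(semaphore: asyncio.Semaphore, chunk: str) -> str:
    # Only tasks currently holding the semaphore run the chunk call;
    # the rest wait here instead of opening additional requests.
    async with semaphore:
        return await process_chunk(chunk)


async def main() -> None:
    semaphore = asyncio.Semaphore(50)  # same cap the diff hardcodes
    chunks = [f"chunk-{i}" for i in range(200)]
    tasks = [process_with_limit(semaphore, c) for c in chunks]
    results = await asyncio.gather(*tasks)  # all scheduled, at most 50 in flight
    print(len(results))


if __name__ == "__main__":
    asyncio.run(main())
```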
src/postprocessing/postprocess_partner_invoice.py CHANGED
@@ -1,4 +1,6 @@
 """This module contains the postprocessing functions for the partner invoice."""
+from collections import defaultdict
+
 from rapidfuzz import fuzz, process

 from src.io import logger
@@ -143,6 +145,20 @@ def update_recipient_and_vendor(aggregated_data, is_recipient_forto):
     ] = "Dasbachstraße 15, 54292 Trier, Germany"


+def select_unique_bank_account(bank_account):
+    # Select the unique bank account if multiple are present
+    if isinstance(bank_account, list) and bank_account:
+        best = defaultdict(lambda: None)
+
+        for item in bank_account:
+            dv = item["documentValue"]
+            if best[dv] is None or item["page"] < best[dv]["page"]:
+                best[dv] = item
+
+        unique = list(best.values())
+        return unique
+
+
 async def process_partner_invoice(params, aggregated_data, document_type_code):
     """Process the partner invoice data."""
     # Post process bundeskasse invoices
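
Note: `select_unique_bank_account` deduplicates the extracted `bankAccount` entries by their `documentValue`, keeping the occurrence with the lowest page number, so the same account detected on several pages collapses to a single entry; as written, inputs that are not non-empty lists fall through and yield `None`. A small illustration of the keep-earliest-page rule with made-up data (a plain dict stands in for the module's `defaultdict`):

```python
# Made-up extraction output: the same IBAN detected on pages 2 and 0.
bank_account = [
    {"documentValue": "DE89 3704 0044 0532 0130 00", "page": 2},
    {"documentValue": "DE89 3704 0044 0532 0130 00", "page": 0},
    {"documentValue": "NL91 ABNA 0417 1643 00", "page": 1},
]

best = {}
for item in bank_account:
    dv = item["documentValue"]
    # Keep, per documentValue, the entry with the smallest page number.
    if dv not in best or item["page"] < best[dv]["page"]:
        best[dv] = item

print(list(best.values()))
# [{'documentValue': 'DE89 3704 0044 0532 0130 00', 'page': 0},
#  {'documentValue': 'NL91 ABNA 0417 1643 00', 'page': 1}]
```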
@@ -150,6 +166,11 @@ async def process_partner_invoice(params, aggregated_data, document_type_code):
         post_process_bundeskasse(aggregated_data)
         return

+    if "bankAccount" in aggregated_data:
+        aggregated_data["bankAccount"] = select_unique_bank_account(
+            aggregated_data["bankAccount"]
+        )
+
     line_items = aggregated_data.get("lineItem", [])
     # Add debug logging
     logger.info(f"Processing partnerInvoice with {len(line_items)} line items")
@@ -285,11 +306,14 @@ def if_reverse_charge_sentence(sentence: str, params):
         return False

     # Check if the sentence is similar to any of the reverse charge sentences
-    _, is_reverse_charge = get_fuzzy_match_score(
-        sentence, reverse_charge_sentences, threshold
+    match, _ = get_fuzzy_match_score(
+        sentence, list(reverse_charge_sentences.keys()), threshold
     )

-    return is_reverse_charge
+    if match:
+        return reverse_charge_sentences[match]
+
+    return False


 def find_matching_lineitem(new_lineitem: str, kvp_dict: dict, threshold=90):
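
Note: `if_reverse_charge_sentence` previously returned a bare boolean from the fuzzy match; it now looks up the best-matching key in the `reverse_charge_sentences` mapping and returns that key's associated value, falling back to `False` when nothing clears the threshold. A rough sketch of the same lookup using rapidfuzz directly; the sentence catalogue, mapped values, and threshold below are invented for illustration, and the repo's own `get_fuzzy_match_score` helper is not reproduced here.

```python
from rapidfuzz import fuzz, process, utils

# Illustrative mapping: fuzzy-matchable sentence -> value returned on a hit.
reverse_charge_sentences = {
    "VAT reverse charge applies": "reverseCharge",
    "Steuerschuldnerschaft des Leistungsempfängers": "reverseCharge",
}


def if_reverse_charge_sentence(sentence: str, threshold: int = 85):
    # extractOne returns (choice, score, index), or None when no choice
    # reaches score_cutoff.
    hit = process.extractOne(
        sentence,
        list(reverse_charge_sentences.keys()),
        scorer=fuzz.token_set_ratio,
        processor=utils.default_process,
        score_cutoff=threshold,
    )
    if hit:
        return reverse_charge_sentences[hit[0]]
    return False


print(if_reverse_charge_sentence("Reverse charge: VAT applies"))  # "reverseCharge"
print(if_reverse_charge_sentence("Total amount due"))             # False
```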