PyPI - data-science-document-ai - Versions diffs - 1.53.0__py3-none-any.whl → 1.55.0__py3-none-any.whl - Mend

data-science-document-ai 1.53.0-py3-none-any.whl → 1.55.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

{data_science_document_ai-1.53.0.dist-info → data_science_document_ai-1.55.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: data-science-document-ai
-Version: 1.53.0
+Version: 1.55.0
 Summary: "Document AI repo for data science"
 Author: Naomi Nguyen
 Author-email: naomi.nguyen@forto.com

{data_science_document_ai-1.53.0.dist-info → data_science_document_ai-1.55.0.dist-info}/RECORD RENAMED Viewed

@@ -1,4 +1,4 @@
-src/constants.py,sha256=k5bBnJN-kmXiAtIAlz6Kg6fDyR9n0DuIudCZ9ZHO_Jw,3528
+src/constants.py,sha256=H43Az9AtoBKfcq9yY4TQQJY8DfdILV5kXy8EMtRaWYA,3583
 src/constants_sandbox.py,sha256=Iu6HdjCoNSmOX0AwoL9qUQkhq_ZnIN5U9e-Q2UfNuGc,547
 src/docai.py,sha256=dHuR0ehVjUi1CnoNvdp_yxJtpU_HFXqAZ61ywdz7BEo,5655
 src/docai_processor_config.yaml,sha256=4yKKZPvFCA-3S56jDYSqMGKXGFND-768OiU2seRiAzE,604
@@ -6,11 +6,11 @@ src/excel_processing.py,sha256=TRgAzSHvL1WKbUgjHtpXL701bPhiWGH7kk3S6e1UPaA,3074
 src/io.py,sha256=rYjXVLlriEacw1uNuPIYhg12bXNu48Qs9GYMY2YcVTE,5563
 src/llm.py,sha256=a7UYA4ITUNjzct_2fHgM-bma_XWc28VC0FV71g9tnUI,7137
 src/log_setup.py,sha256=RhHnpXqcl-ii4EJzRt47CF2R-Q3YPF68tepg_Kg7tkw,2895
-src/pdf_processing.py,sha256=ER-gwh_YfJ-bMqh3nI8K89CZPAdPNnwjLmJ-5cnn1Rk,20469
+src/pdf_processing.py,sha256=81fS2xL36n9QgB7DpXe7SCS-Lyz11cFDgccYMK3ZVkk,20026
 src/postprocessing/common.py,sha256=dagAg0hZGuZc03bXdfOolxekewMEVUfz917IGCiAtWI,26118
 src/postprocessing/postprocess_booking_confirmation.py,sha256=nK32eDiBNbauyQz0oCa9eraysku8aqzrcoRFoWVumDU,4827
 src/postprocessing/postprocess_commercial_invoice.py,sha256=3I8ijluTZcOs_sMnFZxfkAPle0UFQ239EMuvZfDZVPg,1028
-src/postprocessing/postprocess_partner_invoice.py,sha256=V9ANqlSBBO2_YEyfyCms7vjhUi9pjlGeRfUhMQQ4F6c,13507
+src/postprocessing/postprocess_partner_invoice.py,sha256=WuaTQK5D09dV_QNrh29ZoKX9IvQn2Ub-WnAMyRjCsvI,14240
 src/prompts/library/arrivalNotice/other/placeholders.json,sha256=1vzly1amgyKt3jr2JJQbb24kNZsnI289iduvoUo5dJU,3061
 src/prompts/library/arrivalNotice/other/prompt.txt,sha256=QNuU-BvMA8VbdupVNapad4O3WmCotH5cKNxImRMbKDk,2906
 src/prompts/library/bookingConfirmation/evergreen/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
@@ -34,7 +34,7 @@ src/prompts/library/commercialInvoice/other/prompt.txt,sha256=CJapcVrmcvynJUanET
 src/prompts/library/customsAssessment/other/placeholders.json,sha256=scIV--C9HNWAQbU9zEz3GT_FoAvJqbfuY85YUtt7t-Q,3850
 src/prompts/library/customsAssessment/other/prompt.txt,sha256=z3FuoHZ588Pz1WBJDW7ISAC3J6n7hPJCcS92CdHDTFw,2494
 src/prompts/library/customsInvoice/other/placeholders.json,sha256=BnWYtl4sPooTHb_EHRIlrPawBrfHI8_QVas8zytbqyY,12172
-src/prompts/library/customsInvoice/other/prompt.txt,sha256=1dR73TQZJAfO9dKl-h7VhiJkdli498IV4e5JgBlOoYw,9695
+src/prompts/library/customsInvoice/other/prompt.txt,sha256=MtDx5UIJYbOfBqfMELZposrZmLGiBpcZ8EE5PwRBRG8,9783
 src/prompts/library/deliveryOrder/other/placeholders.json,sha256=j-9F4V3yDg4610PPsOwU3oOj_S9vAvAB9Ix155WGIwc,3827
 src/prompts/library/deliveryOrder/other/prompt.txt,sha256=RD076vq0x0IjoEVQfh-G0u4nxITCpgKZGrwMlR9YAvk,2695
 src/prompts/library/draftMbl/other/placeholders.json,sha256=Gn8kQ8cMmrzRGLSFH7_8wO1_j2jxhqHd4zeivZP2SjU,4304
@@ -44,7 +44,7 @@ src/prompts/library/finalMbL/other/prompt.txt,sha256=cyeKjK94sepqXiLEeZKB4VpmT0-
 src/prompts/library/packingList/other/placeholders.json,sha256=cGUUvEFoi4Lm0BAiyD29KbNFbUgzO1s7eit_qK3F0ig,4478
 src/prompts/library/packingList/other/prompt.txt,sha256=6Q9d0KBG6YWmNtzFivvmtQmitaUE2jytfwwc5YwsUgQ,2872
 src/prompts/library/partnerInvoice/other/placeholders.json,sha256=NX6ADT4gxLpP90uoNCYDbmfBvROxxVWRKK0lRFy1n9s,10897
-src/prompts/library/partnerInvoice/other/prompt.txt,sha256=vMk-FBq9XkWiFiCf36t43DcIKNYh7IcGAsnfXq8vqio,8052
+src/prompts/library/partnerInvoice/other/prompt.txt,sha256=8dUZy516GwJ2EXVU5HJHBm0bbGMqxyqKacfCuSGtBxY,8067
 src/prompts/library/postprocessing/port_code/placeholders.json,sha256=2TiXf3zSzrglOMPtDOlCntIa5RSvyZQAKG2-IgrCY5A,22
 src/prompts/library/postprocessing/port_code/prompt_port_code.txt,sha256=--1wunSqEr2ox958lEhjO-0JFBfOLzA3qfKYIzG_Iok,884
 src/prompts/library/preprocessing/carrier/placeholders.json,sha256=tQeVDtvembhVqvel9vGoy4qcKp1hOvg-bLCgZRdQj0g,192
@@ -52,9 +52,9 @@ src/prompts/library/preprocessing/carrier/prompt.txt,sha256=NLvRZQCZ6aWC1yTr7Q93
 src/prompts/library/shippingInstruction/other/placeholders.json,sha256=eK4AeMfORkGMWVYcqH7NjB56Zb4swHTvcQD5UQbTryg,6374
 src/prompts/library/shippingInstruction/other/prompt.txt,sha256=CbrqlKMtB-sVY-8E460KP1KNmz169YVPMrH3-uEldPg,2135
 src/prompts/prompt_library.py,sha256=VJWHeXN-s501C2GiidIIvQQuZdU6T1R27hE2dKBiI40,2555
-src/setup.py,sha256=EHfAl3Pvb082dl_s6Tk93IjtE3vBmrW_fp2GW4955HQ,6952
+src/setup.py,sha256=yb0Pz1RI-uId5lEjgQrj1Pqo9FvwG9vs0HXRVbyST2M,7186
 src/tms.py,sha256=UXbIo1QE--hIX6NZi5Qyp2R_CP338syrY9pCTPrfgnE,1741
 src/utils.py,sha256=Ow5_Jals88o8mbZ1BoHfZpHZoCfig_UQb5aalH-mpWE,17278
-data_science_document_ai-1.53.0.dist-info/METADATA,sha256=s3O_vRgHuHypiv9A6R2NWiwexz4SYjjmuKARhcpwgCo,2152
-data_science_document_ai-1.53.0.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
-data_science_document_ai-1.53.0.dist-info/RECORD,,
+data_science_document_ai-1.55.0.dist-info/METADATA,sha256=PRbleO7DbAfZHyjX5H-9hs2e7c6k0h5SQ9PbPA2Q6IY,2152
+data_science_document_ai-1.55.0.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+data_science_document_ai-1.55.0.dist-info/RECORD,,

src/constants.py CHANGED Viewed

@@ -20,10 +20,11 @@ project_parameters = {
     # Fuzzy lookup
     "g_model_fuzzy_lookup_folder": "fuzzy_lookup",
     "item_code_lookup": "line_item_kvp_table.json",
+    "intermodal_partners": "intermodal_partners.json",
     "invoice_classification_lookup": "invoice_classification.json",
     "reverse_charge_sentence_lookup": "reverse_charge_sentences.json",
     # Fuzzy logic params
-    "fuzzy_threshold_item_code": 90,
+    "fuzzy_threshold_item_code": 92,
     "fuzzy_threshold_reverse_charge": 80,
     "fuzzy_threshold_invoice_classification": 70,
     # Chunking params

src/pdf_processing.py CHANGED Viewed

@@ -32,7 +32,6 @@ from src.postprocessing.postprocess_partner_invoice import (
 from src.prompts.prompt_library import prompt_library
 from src.utils import (
     extract_top_pages,
-    generate_schema_structure,
     get_pdf_page_count,
     get_processor_name,
     run_background_tasks,
@@ -250,7 +249,6 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
         prompt += "\nFor each field, provide the page number where the information was found. The page numbering starts from 0."
     tasks = []
-    semaphore = asyncio.Semaphore(50)
     # Process in chunks if number of pages exceeds threshold and Process all chunks concurrently
     for chunk in (
         split_pdf_into_chunks(file_content, chunk_size=params["chunk_size"])
@@ -258,8 +256,12 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
         else [file_content]
     ):
         tasks.append(
-            process_chunk_with_semaphore(
-                semaphore, chunk, prompt, response_schema, llm_client, input_doc_type
+            process_chunk_with_retry(
+                chunk,
+                prompt,
+                response_schema,
+                llm_client,
+                input_doc_type,
             )
         )
@@ -271,25 +273,6 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
         return llm_prediction_to_tuples(results[0], number_of_pages=number_of_pages)
-async def process_chunk_with_semaphore(
-    semaphore,
-    chunk_content,
-    prompt,
-    response_schema,
-    llm_client,
-    input_doc_type,
-):
-    """Process a chunk with a semaphore to limit concurrency."""
-    async with semaphore:
-        return await process_chunk_with_retry(
-            chunk_content,
-            prompt,
-            response_schema,
-            llm_client,
-            input_doc_type,
-        )
 async def process_chunk_with_retry(
     chunk_content, prompt, response_schema, llm_client, input_doc_type, retries=2
 ):

src/postprocessing/postprocess_partner_invoice.py CHANGED Viewed

@@ -188,15 +188,20 @@ async def process_partner_invoice(params, aggregated_data, document_type_code):
         reverse_charge_info["formattedValue"] = reverse_charge_value
         reverse_charge = aggregated_data.pop("reverseChargeSentence", None)
+    # Partner Name
+    partner_name = aggregated_data.get("vendorName", {}).get("documentValue", None)
     # Process everything in one go
-    processed_items = await process_line_items_batch(params, line_items, reverse_charge)
+    processed_items = await process_line_items_batch(
+        params, line_items, reverse_charge, partner_name
+    )
     # Update your main data structure
     aggregated_data["lineItem"] = processed_items
 async def process_line_items_batch(
-    params: dict, line_items: list[dict], reverse_charge=None
+    params: dict, line_items: list[dict], reverse_charge=None, partner_name=None
 ):
     """
     Processes all line items efficiently using a "Split-Apply-Combine" strategy.
@@ -234,23 +239,12 @@ async def process_line_items_batch(
     # Batch API Call for Embedding lookups
     if pending_line_items:
-        values_to_fetch = list(set(pending_line_items.values()))
-        logger.info(f"Mapping {len(values_to_fetch)} line items from Embedding API...")
-        # Await the batch response {"desc1": "code1", "desc2": "code2"}
-        api_results = await get_tms_mappings(
-            input_list=values_to_fetch, embedding_type="line_items"
-        )
+        code_map = await fetch_line_item_codes(pending_line_items, partner_name, params)
-        # Merge API results back into original list
         for index, desc in pending_line_items.items():
-            # Get result from API response, or None if API failed for that item
-            forto_code = api_results.get(desc)
-            # Update the original item
             line_items[index]["itemCode"] = {
                 "documentValue": desc,
-                "formattedValue": forto_code,  # Might be None if API failed
+                "formattedValue": code_map.get(desc),
                 "page": line_items[index]["lineItemDescription"].get("page"),
             }
@@ -344,12 +338,13 @@ def find_matching_lineitem(new_lineitem: str, kvp_dict: dict, threshold=90):
     return None
-async def associate_forto_item_code(line_item_data, params):
+async def associate_forto_item_code(line_item_data, params, partner_name=None):
     """
     Associates Forto item codes to a list of line item descriptions.
     Args:
         line_item_data (dict): A dictionary where keys are original descriptions and values are cleaned descriptions.
         params (dict): Parameters containing lookup data and thresholds.
+        partner_name (str, optional): The name of the partner for context in matching. Defaults to None.
     Returns:
         list: A list of dictionaries with 'description' and 'itemCode' keys.
@@ -371,14 +366,51 @@ async def associate_forto_item_code(line_item_data, params):
     # Batch API Call for Embedding lookups
     if pending_line_items:
-        api_results = await get_tms_mappings(
-            input_list=list(pending_line_items.values()),
-            embedding_type="line_items",
-        )
+        code_map = await fetch_line_item_codes(pending_line_items, partner_name, params)
-        # Merge API results back into original list
         for desc, f_desc in pending_line_items.items():
-            code = api_results.get(f_desc)
-            result.append({"description": desc, "itemCode": code})
+            result.append(
+                {
+                    "description": desc,
+                    "itemCode": code_map.get(f_desc),
+                }
+            )
+    return result
+async def fetch_line_item_codes(
+    pending_line_items: dict,
+    partner_name: str | None,
+    params: dict,
+):
+    """Returns: {original_description: mapped_code_or_None}"""
+    t_mode = (
+        find_matching_lineitem(
+            partner_name.upper(),
+            params["lookup_data"]["intermodal_partners"],
+            threshold=87,
+        )
+        if partner_name
+        else None
+    )
+    unique_descs = list(set(pending_line_items.values()))
+    logger.info(f"Mapping {len(unique_descs)} line items from Embedding API...")
+    # Build API input map
+    api_input_map = {
+        desc: f"{t_mode} - {desc}" if t_mode else desc for desc in unique_descs
+    }
+    api_results = await get_tms_mappings(
+        input_list=list(api_input_map.values()),
+        embedding_type="line_items",
+    )
+    # Normalize response back to original descriptions
+    result = {
+        original_desc: api_results.get(api_desc)
+        for original_desc, api_desc in api_input_map.items()
+    }
     return result

src/prompts/library/customsInvoice/other/prompt.txt CHANGED Viewed

@@ -48,7 +48,7 @@ Your role is to accurately extract specific entities from these invoices to supp
 - issueDate: The date the document was issued.
 - dueDate: The date by which the payment should be made. Do Not calculate dueDate based on issueDate or any other date. Extract it directly from the invoice.
-- lineItem: Details of each COGS and Customs line item on the invoice. Make sure to extract each amount and currency separately.
+- lineItem: Details of each COGS and Customs line item on the invoice from each page. Make sure to extract each amount and currency separately.
     - uniqueId: A unique id which associated with the lineItem as each line item can belong to a different shipment. Extract only if its available in the line item. Either a shipmentId starting with an S and followed by 6 or 8 numeric values or a mblNumber. If shipmentId or mblNumber does not exist, set it to containerNumber.
     - lineItemDescription: The name or description of the item. Usually, it will be a one line sentence.
     - unitPrice: Even if the quantity is not mentioned, you can still extract the unit price. Check the naming of the columns in a different languages, it can be "Unit Price", "Prezzo unitario", "Prix Unitaire", "Unitario", etc. Refer to "Prezzo unitario" field in the italian invoice example.
@@ -92,6 +92,7 @@ Your role is to accurately extract specific entities from these invoices to supp
 IMPORTANT NOTE:
 - Ensure all extracted values are directly from the document. Do not make assumptions or modifications.
+- Extract line items from each page if the invoice spans multiple pages.
 - Do not normalize or modify any entity values.
 - Pay attention to the line item details and paymentInformation, as they may vary significantly across different invoices.

src/prompts/library/partnerInvoice/other/prompt.txt CHANGED Viewed

@@ -46,7 +46,7 @@ Your role is to accurately extract specific entities from these invoices to supp
 - eta and etd: Few invoices contains same date for ARRIVED/DEPARTED or ETA/ETD. Extract it for both eta and etd.
-- lineItem: Details of each COGS and Customs line item on the invoice. Make sure to extract each amount and currency separately.
+- lineItem: Details of each COGS and Customs line item on the invoice from each page. Make sure to extract each amount and currency separately.
     - uniqueId: A unique id which associated with the lineItem as each line item can belong to a different shipment. Extract only if its available in the line item. Either a shipmentId starting with an S and followed by 6 or 8 numeric values or a mblNumber. If shipmentId or mblNumber does not exist, set it to containerNumber.
     - lineItemDescription: The name or description of the item. Usually, it will be a one line sentence.
     - unitPrice: Even if the quantity is not mentioned, you can still extract the unit price. Check the naming of the columns in a different languages, it can be "Unit Price", "Prezzo unitario", "Prix Unitaire", "Unitario", etc. Refer to "Prezzo unitario" field in the italian invoice example.

src/setup.py CHANGED Viewed

@@ -184,6 +184,9 @@ def setup_lookup_data(params):
     input_path_item_code = (
         f'{params["g_model_fuzzy_lookup_folder"]}/{params["item_code_lookup"]}'
     )
+    input_path_intermodal_partners = (
+        f'{params["g_model_fuzzy_lookup_folder"]}/{params["intermodal_partners"]}'
+    )
     input_path_invoice_classification = f'{params["g_model_fuzzy_lookup_folder"]}/{params["invoice_classification_lookup"]}'  # noqa: E501
     input_path_reverse_charge = f'{params["g_model_fuzzy_lookup_folder"]}/{params["reverse_charge_sentence_lookup"]}'
@@ -194,6 +197,9 @@ def setup_lookup_data(params):
         return json.loads(downloaded_data)
     data["item_code"] = download_json_from_bucket(input_path_item_code)
+    data["intermodal_partners"] = download_json_from_bucket(
+        input_path_intermodal_partners
+    )
     data["invoice_classification"] = download_json_from_bucket(
         input_path_invoice_classification
     )

{data_science_document_ai-1.53.0.dist-info → data_science_document_ai-1.55.0.dist-info}/WHEEL RENAMED Viewed

File without changes

data-science-document-ai 1.53.0__py3-none-any.whl → 1.55.0__py3-none-any.whl

data-science-document-ai 1.53.0-py3-none-any.whl → 1.55.0-py3-none-any.whl