PyPI - data-science-document-ai - Versions diffs - 1.42.1__tar.gz → 1.42.3__tar.gz - Mend

@@ -138,36 +138,7 @@ def update_recipient_and_vendor(aggregated_data, is_recipient_forto):
 def process_partner_invoice(params, aggregated_data, document_type_code):
     """Process the partner invoice data."""
-    # Post process containerNumber.
-    # TODO: Remove this block of code after migrating to LLM completely and update the placeholder in the prompt library
-    if "containerNumber" in aggregated_data and isinstance(
-        aggregated_data["containerNumber"], dict
-    ):
-        container_number = aggregated_data.get("containerNumber", {}).get(
-            "formattedValue", None
-        )
-        if container_number:
-            aggregated_data["containerNumber"] = (
-                [
-                    {
-                        "documentValue": aggregated_data.get("containerNumber", {}).get(
-                            "documentValue", ""
-                        ),
-                        "formattedValue": ctr_number,
-                    }
-                    for ctr_number in container_number
-                ]
-                if isinstance(container_number, list)
-                else [
-                    {
-                        "documentValue": aggregated_data.get("containerNumber", {}).get(
-                            "documentValue", ""
-                        ),
-                        "formattedValue": container_number,
-                    }
-                ]
-            )
+    # Post process bundeskasse invoices
     if document_type_code == "bundeskasse":
         post_process_bundeskasse(aggregated_data)
         return
@@ -197,9 +168,13 @@ def process_partner_invoice(params, aggregated_data, document_type_code):
                 params,
             )
+            # Add page number for the consistency
+            line_item["itemCode"]["page"] = line_item["lineItemDescription"]["page"]
         if reverse_charge:
             # Distribute reverseChargeSentence to all line items
             line_item["reverseChargeSentence"] = reverse_charge
+            line_item["reverseChargeSentence"]["page"] = reverse_charge["page"]
 def compute_score(args):

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: data-science-document-ai
-Version: 1.42.1
+Version: 1.42.3
 Summary: "Document AI repo for data science"
 Author: Naomi Nguyen
 Author-email: naomi.nguyen@forto.com

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "data-science-document-ai"
-version = "1.42.1"
+version = "1.42.3"
 description = "\"Document AI repo for data science\""
 authors = ["Naomi Nguyen <naomi.nguyen@forto.com>", "Kumar Rajendrababu <kumar.rajendrababu@forto.com>", "Igor Tonko <igor.tonko@forto.com>", "Osman Demirel <osman.demirel@forto.com>"]
 packages = [

@@ -20,7 +20,8 @@ async def extract_data_from_sheet(
     params, sheet_name, sheet, response_schema, doc_type=None
 ):
     logger.info(f"Processing sheet: {sheet_name}")
-    excel_content = pd.DataFrame(sheet.values)
+    excel_content = pd.DataFrame(sheet.values).dropna(how="all", axis=1)
     # Convert to Markdown format for the LLM model
     worksheet = (
         "This is from a excel. Pay attention to the cell position:\n"

@@ -456,6 +456,7 @@ async def format_label(entity_k, entity_value, document_type_code, params):
     elif "quantity" in entity_key:
         if document_type_code in ["partnerInvoice", "customsInvoice", "bundeskasse"]:
             # For partner invoice, quantity can be mentioned as whole number
+            # Apply decimal convertor for 46,45 --> 46.45 but not for 1.000 --> 1000
             formatted_value = decimal_convertor(
                 extract_number(entity_value), quantity=True
             )

@@ -52,7 +52,7 @@ Your role is to accurately extract specific entities from these invoices to supp
     - unitPrice: Even if the quantity is not mentioned, you can still extract the unit price. Check the naming of the columns in different languages; it can be "Unit Price", "Prezzo unitario", "Prix Unitaire", "Unitario", etc. Refer to the "Prezzo unitario" field in the Italian invoice example.
     - totalAmount: The total amount for the item. It can be in different currencies, so ensure to capture the currency as well for the totalAmountCurrency.
     - totalAmountEuro: A few line items contain a total amount in Euro. You can find it by looking for the term "Total EUR" or "Amount in Euro" in the line item, but it is always in the EURO / € currency. Sometimes, it can be the same as totalAmount if the line item is already in Euro.
-    - quantity: The quantity of the item or service provided in the line item.
+    - quantity: The quantity of the item or service provided in the line item. Pay attention to notations like 2x40HC: this means the quantity is 2 and the containerSize is 40HC, not a quantity of 240.
     - containerNumber: Container Number always starts with 4 letters and is followed by 7 digits (e.g., ABCD1234567).
 - hblNumber and mblNumber:

data-science-document-ai 1.42.1__tar.gz → 1.42.3__tar.gz

data-science-document-ai 1.42.1__tar.gz → 1.42.3__tar.gz