PyPI - data-science-document-ai - Versions diffs - 1.43.0__py3-none-any.whl → 1.43.2__py3-none-any.whl - Mend

data-science-document-ai 1.43.0__py3-none-any.whl → 1.43.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

{data_science_document_ai-1.43.0.dist-info → data_science_document_ai-1.43.2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: data-science-document-ai
-Version: 1.43.0
+Version: 1.43.2
 Summary: "Document AI repo for data science"
 Author: Naomi Nguyen
 Author-email: naomi.nguyen@forto.com

{data_science_document_ai-1.43.0.dist-info → data_science_document_ai-1.43.2.dist-info}/RECORD RENAMED Viewed

@@ -6,8 +6,8 @@ src/excel_processing.py,sha256=gzP7QFCp4-n0FTevhWmXm-2UoDF0w0y5v39gsby0IV8,3135
 src/io.py,sha256=tOJpMyI-mP1AaXKG4UFudH47MHWzjWBgVahFJUcjGfs,4749
 src/llm.py,sha256=OE4IEIqcM-hYK9U7e0x1rAfcqdpeo4iXPHBp64L5Qz0,8199
 src/log_setup.py,sha256=RhHnpXqcl-ii4EJzRt47CF2R-Q3YPF68tepg_Kg7tkw,2895
-src/pdf_processing.py,sha256=M0RUi20j481FcvMJ3xifavMNsPHqMFpbRo9w8hMpmL8,16970
-src/postprocessing/common.py,sha256=5W-u3lKbnPQRKT4h5EfegegMjSXOKik73X7kUx9ik0Y,21888
+src/pdf_processing.py,sha256=0lmeaKwruAxqhk7NeCC4GU6Zlp0rQAmi0lbjlNTNCDc,17039
+src/postprocessing/common.py,sha256=wvlYI1S75r0q5xp9Yll89nOVWtwDd7hV4Sf0MIButA0,22150
 src/postprocessing/postprocess_booking_confirmation.py,sha256=nK32eDiBNbauyQz0oCa9eraysku8aqzrcoRFoWVumDU,4827
 src/postprocessing/postprocess_commercial_invoice.py,sha256=3I8ijluTZcOs_sMnFZxfkAPle0UFQ239EMuvZfDZVPg,1028
 src/postprocessing/postprocess_partner_invoice.py,sha256=koGR7dN37FqJcepdzkrzNBHuBBUuCp_3CrteScASqyE,10590
@@ -52,6 +52,6 @@ src/prompts/prompt_library.py,sha256=jPxybNPPGH7mzonqtAOqmw5WcT-RtbGP0pvMqqP22hg
 src/setup.py,sha256=M-p5c8M9ejKcSZ9N86VtmtPc4TYLxe1_4_dxf6jpfVc,7262
 src/tms.py,sha256=UXbIo1QE--hIX6NZi5Qyp2R_CP338syrY9pCTPrfgnE,1741
 src/utils.py,sha256=cTF2A12jugKjXxGlNXEZQtfgcsIoaTtaU7zhVOOvXXA,16634
-data_science_document_ai-1.43.0.dist-info/METADATA,sha256=0vBy9AYJgIzbR84yLUo-SgSd2ocGFXfPGsI3swfwars,2152
-data_science_document_ai-1.43.0.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
-data_science_document_ai-1.43.0.dist-info/RECORD,,
+data_science_document_ai-1.43.2.dist-info/METADATA,sha256=4FTsGLX2lW2bIDgXV0wRwUcKKvkMl3ZfbQokcRdTFY0,2152
+data_science_document_ai-1.43.2.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+data_science_document_ai-1.43.2.dist-info/RECORD,,

src/pdf_processing.py CHANGED Viewed

@@ -393,6 +393,7 @@ async def data_extraction_manual_flow(
     meta,
     processor_client,
     schema_client,
+    use_default_logging=False,
 ):
     """
     Process a PDF file and extract data from it.
@@ -480,7 +481,9 @@ async def data_extraction_manual_flow(
     logger.info(f"Time taken to process the document: {round(elapsed_time, 4)} seconds")
     # Schedule background tasks without using FastAPI's BackgroundTasks
-    if os.getenv("CLUSTER") != "ode":  # skip data export to bigquery in ODE environment
+    if (
+        os.getenv("CLUSTER") != "ode"
+    ) & use_default_logging:  # skip data export to bigquery in ODE environment
         asyncio.create_task(
             run_background_tasks(
                 params,

src/postprocessing/common.py CHANGED Viewed

@@ -319,6 +319,11 @@ def remove_unwanted_patterns(lineitem: str):
     # Remove "HIGH CUBE"
     lineitem = lineitem.replace("HIGH CUBE", "")
+    # Remove container size e.g., 20FT, 40HC, etc.
+    lineitem = re.sub(
+        r"\b(20|22|40|45)(FT|HC|DC|HD|GP|OT|RF|FR|TK|DV)?\b", "", lineitem
+    ).strip()
     return lineitem
@@ -349,18 +354,21 @@ def clean_item_description(lineitem: str, remove_numbers: bool = True):
     # Remove the currency codes
     lineitem = re.sub(currency_codes_pattern, "", lineitem, flags=re.IGNORECASE)
+    # remove other patterns
+    lineitem = remove_unwanted_patterns(lineitem)
     # Remove numbers from the line item
     if (
         remove_numbers
     ):  # Do not remove numbers for the reverse charge sentence as it contains Article number
         lineitem = re.sub(r"\d+", "", lineitem)
-    # remove other patterns
-    lineitem = remove_unwanted_patterns(lineitem)
     # remove special chars
     lineitem = re.sub(r"[^A-Za-z0-9\s]", " ", lineitem).strip()
+    # Remove x from lineitem like 10 x
+    lineitem = re.sub(r"\b[xX]\b", " ", lineitem).strip()
     return re.sub(r"\s{2,}", " ", lineitem).strip()

{data_science_document_ai-1.43.0.dist-info → data_science_document_ai-1.43.2.dist-info}/WHEEL RENAMED Viewed

File without changes

data-science-document-ai 1.43.0__py3-none-any.whl → 1.43.2__py3-none-any.whl

data-science-document-ai 1.43.0__py3-none-any.whl → 1.43.2__py3-none-any.whl