data-science-document-ai 1.43.0__py3-none-any.whl → 1.43.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_science_document_ai-1.43.0.dist-info → data_science_document_ai-1.43.2.dist-info}/METADATA +1 -1
- {data_science_document_ai-1.43.0.dist-info → data_science_document_ai-1.43.2.dist-info}/RECORD +5 -5
- src/pdf_processing.py +4 -1
- src/postprocessing/common.py +11 -3
- {data_science_document_ai-1.43.0.dist-info → data_science_document_ai-1.43.2.dist-info}/WHEEL +0 -0
{data_science_document_ai-1.43.0.dist-info → data_science_document_ai-1.43.2.dist-info}/RECORD
RENAMED
|
@@ -6,8 +6,8 @@ src/excel_processing.py,sha256=gzP7QFCp4-n0FTevhWmXm-2UoDF0w0y5v39gsby0IV8,3135
|
|
|
6
6
|
src/io.py,sha256=tOJpMyI-mP1AaXKG4UFudH47MHWzjWBgVahFJUcjGfs,4749
|
|
7
7
|
src/llm.py,sha256=OE4IEIqcM-hYK9U7e0x1rAfcqdpeo4iXPHBp64L5Qz0,8199
|
|
8
8
|
src/log_setup.py,sha256=RhHnpXqcl-ii4EJzRt47CF2R-Q3YPF68tepg_Kg7tkw,2895
|
|
9
|
-
src/pdf_processing.py,sha256=
|
|
10
|
-
src/postprocessing/common.py,sha256=
|
|
9
|
+
src/pdf_processing.py,sha256=0lmeaKwruAxqhk7NeCC4GU6Zlp0rQAmi0lbjlNTNCDc,17039
|
|
10
|
+
src/postprocessing/common.py,sha256=wvlYI1S75r0q5xp9Yll89nOVWtwDd7hV4Sf0MIButA0,22150
|
|
11
11
|
src/postprocessing/postprocess_booking_confirmation.py,sha256=nK32eDiBNbauyQz0oCa9eraysku8aqzrcoRFoWVumDU,4827
|
|
12
12
|
src/postprocessing/postprocess_commercial_invoice.py,sha256=3I8ijluTZcOs_sMnFZxfkAPle0UFQ239EMuvZfDZVPg,1028
|
|
13
13
|
src/postprocessing/postprocess_partner_invoice.py,sha256=koGR7dN37FqJcepdzkrzNBHuBBUuCp_3CrteScASqyE,10590
|
|
@@ -52,6 +52,6 @@ src/prompts/prompt_library.py,sha256=jPxybNPPGH7mzonqtAOqmw5WcT-RtbGP0pvMqqP22hg
|
|
|
52
52
|
src/setup.py,sha256=M-p5c8M9ejKcSZ9N86VtmtPc4TYLxe1_4_dxf6jpfVc,7262
|
|
53
53
|
src/tms.py,sha256=UXbIo1QE--hIX6NZi5Qyp2R_CP338syrY9pCTPrfgnE,1741
|
|
54
54
|
src/utils.py,sha256=cTF2A12jugKjXxGlNXEZQtfgcsIoaTtaU7zhVOOvXXA,16634
|
|
55
|
-
data_science_document_ai-1.43.
|
|
56
|
-
data_science_document_ai-1.43.
|
|
57
|
-
data_science_document_ai-1.43.
|
|
55
|
+
data_science_document_ai-1.43.2.dist-info/METADATA,sha256=4FTsGLX2lW2bIDgXV0wRwUcKKvkMl3ZfbQokcRdTFY0,2152
|
|
56
|
+
data_science_document_ai-1.43.2.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
|
|
57
|
+
data_science_document_ai-1.43.2.dist-info/RECORD,,
|
src/pdf_processing.py
CHANGED
|
@@ -393,6 +393,7 @@ async def data_extraction_manual_flow(
|
|
|
393
393
|
meta,
|
|
394
394
|
processor_client,
|
|
395
395
|
schema_client,
|
|
396
|
+
use_default_logging=False,
|
|
396
397
|
):
|
|
397
398
|
"""
|
|
398
399
|
Process a PDF file and extract data from it.
|
|
@@ -480,7 +481,9 @@ async def data_extraction_manual_flow(
|
|
|
480
481
|
logger.info(f"Time taken to process the document: {round(elapsed_time, 4)} seconds")
|
|
481
482
|
|
|
482
483
|
# Schedule background tasks without using FastAPI's BackgroundTasks
|
|
483
|
-
if
|
|
484
|
+
if (
|
|
485
|
+
os.getenv("CLUSTER") != "ode"
|
|
486
|
+
) & use_default_logging: # skip data export to bigquery in ODE environment
|
|
484
487
|
asyncio.create_task(
|
|
485
488
|
run_background_tasks(
|
|
486
489
|
params,
|
src/postprocessing/common.py
CHANGED
|
@@ -319,6 +319,11 @@ def remove_unwanted_patterns(lineitem: str):
|
|
|
319
319
|
# Remove "HIGH CUBE"
|
|
320
320
|
lineitem = lineitem.replace("HIGH CUBE", "")
|
|
321
321
|
|
|
322
|
+
# Remove container size e.g., 20FT, 40HC, etc.
|
|
323
|
+
lineitem = re.sub(
|
|
324
|
+
r"\b(20|22|40|45)(FT|HC|DC|HD|GP|OT|RF|FR|TK|DV)?\b", "", lineitem
|
|
325
|
+
).strip()
|
|
326
|
+
|
|
322
327
|
return lineitem
|
|
323
328
|
|
|
324
329
|
|
|
@@ -349,18 +354,21 @@ def clean_item_description(lineitem: str, remove_numbers: bool = True):
|
|
|
349
354
|
# Remove the currency codes
|
|
350
355
|
lineitem = re.sub(currency_codes_pattern, "", lineitem, flags=re.IGNORECASE)
|
|
351
356
|
|
|
357
|
+
# remove other patterns
|
|
358
|
+
lineitem = remove_unwanted_patterns(lineitem)
|
|
359
|
+
|
|
352
360
|
# Remove numbers from the line item
|
|
353
361
|
if (
|
|
354
362
|
remove_numbers
|
|
355
363
|
): # Do not remove numbers for the reverse charge sentence as it contains Article number
|
|
356
364
|
lineitem = re.sub(r"\d+", "", lineitem)
|
|
357
365
|
|
|
358
|
-
# remove other patterns
|
|
359
|
-
lineitem = remove_unwanted_patterns(lineitem)
|
|
360
|
-
|
|
361
366
|
# remove special chars
|
|
362
367
|
lineitem = re.sub(r"[^A-Za-z0-9\s]", " ", lineitem).strip()
|
|
363
368
|
|
|
369
|
+
# Remove x from lineitem like 10 x
|
|
370
|
+
lineitem = re.sub(r"\b[xX]\b", " ", lineitem).strip()
|
|
371
|
+
|
|
364
372
|
return re.sub(r"\s{2,}", " ", lineitem).strip()
|
|
365
373
|
|
|
366
374
|
|
{data_science_document_ai-1.43.0.dist-info → data_science_document_ai-1.43.2.dist-info}/WHEEL
RENAMED
|
File without changes
|