data-science-document-ai 1.43.0__py3-none-any.whl → 1.43.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-science-document-ai
3
- Version: 1.43.0
3
+ Version: 1.43.2
4
4
  Summary: "Document AI repo for data science"
5
5
  Author: Naomi Nguyen
6
6
  Author-email: naomi.nguyen@forto.com
@@ -6,8 +6,8 @@ src/excel_processing.py,sha256=gzP7QFCp4-n0FTevhWmXm-2UoDF0w0y5v39gsby0IV8,3135
6
6
  src/io.py,sha256=tOJpMyI-mP1AaXKG4UFudH47MHWzjWBgVahFJUcjGfs,4749
7
7
  src/llm.py,sha256=OE4IEIqcM-hYK9U7e0x1rAfcqdpeo4iXPHBp64L5Qz0,8199
8
8
  src/log_setup.py,sha256=RhHnpXqcl-ii4EJzRt47CF2R-Q3YPF68tepg_Kg7tkw,2895
9
- src/pdf_processing.py,sha256=M0RUi20j481FcvMJ3xifavMNsPHqMFpbRo9w8hMpmL8,16970
10
- src/postprocessing/common.py,sha256=5W-u3lKbnPQRKT4h5EfegegMjSXOKik73X7kUx9ik0Y,21888
9
+ src/pdf_processing.py,sha256=0lmeaKwruAxqhk7NeCC4GU6Zlp0rQAmi0lbjlNTNCDc,17039
10
+ src/postprocessing/common.py,sha256=wvlYI1S75r0q5xp9Yll89nOVWtwDd7hV4Sf0MIButA0,22150
11
11
  src/postprocessing/postprocess_booking_confirmation.py,sha256=nK32eDiBNbauyQz0oCa9eraysku8aqzrcoRFoWVumDU,4827
12
12
  src/postprocessing/postprocess_commercial_invoice.py,sha256=3I8ijluTZcOs_sMnFZxfkAPle0UFQ239EMuvZfDZVPg,1028
13
13
  src/postprocessing/postprocess_partner_invoice.py,sha256=koGR7dN37FqJcepdzkrzNBHuBBUuCp_3CrteScASqyE,10590
@@ -52,6 +52,6 @@ src/prompts/prompt_library.py,sha256=jPxybNPPGH7mzonqtAOqmw5WcT-RtbGP0pvMqqP22hg
52
52
  src/setup.py,sha256=M-p5c8M9ejKcSZ9N86VtmtPc4TYLxe1_4_dxf6jpfVc,7262
53
53
  src/tms.py,sha256=UXbIo1QE--hIX6NZi5Qyp2R_CP338syrY9pCTPrfgnE,1741
54
54
  src/utils.py,sha256=cTF2A12jugKjXxGlNXEZQtfgcsIoaTtaU7zhVOOvXXA,16634
55
- data_science_document_ai-1.43.0.dist-info/METADATA,sha256=0vBy9AYJgIzbR84yLUo-SgSd2ocGFXfPGsI3swfwars,2152
56
- data_science_document_ai-1.43.0.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
57
- data_science_document_ai-1.43.0.dist-info/RECORD,,
55
+ data_science_document_ai-1.43.2.dist-info/METADATA,sha256=4FTsGLX2lW2bIDgXV0wRwUcKKvkMl3ZfbQokcRdTFY0,2152
56
+ data_science_document_ai-1.43.2.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
57
+ data_science_document_ai-1.43.2.dist-info/RECORD,,
src/pdf_processing.py CHANGED
@@ -393,6 +393,7 @@ async def data_extraction_manual_flow(
393
393
  meta,
394
394
  processor_client,
395
395
  schema_client,
396
+ use_default_logging=False,
396
397
  ):
397
398
  """
398
399
  Process a PDF file and extract data from it.
@@ -480,7 +481,9 @@ async def data_extraction_manual_flow(
480
481
  logger.info(f"Time taken to process the document: {round(elapsed_time, 4)} seconds")
481
482
 
482
483
  # Schedule background tasks without using FastAPI's BackgroundTasks
483
- if os.getenv("CLUSTER") != "ode": # skip data export to bigquery in ODE environment
484
+ if (
485
+ os.getenv("CLUSTER") != "ode"
486
+ ) & use_default_logging: # export data to bigquery only outside the ODE environment and when use_default_logging is enabled
484
487
  asyncio.create_task(
485
488
  run_background_tasks(
486
489
  params,
@@ -319,6 +319,11 @@ def remove_unwanted_patterns(lineitem: str):
319
319
  # Remove "HIGH CUBE"
320
320
  lineitem = lineitem.replace("HIGH CUBE", "")
321
321
 
322
+ # Remove container size codes, e.g. 20FT, 40HC. The type suffix is optional, so a standalone 20, 22, 40 or 45 is removed as well.
323
+ lineitem = re.sub(
324
+ r"\b(20|22|40|45)(FT|HC|DC|HD|GP|OT|RF|FR|TK|DV)?\b", "", lineitem
325
+ ).strip()
326
+
322
327
  return lineitem
323
328
 
324
329
 
@@ -349,18 +354,21 @@ def clean_item_description(lineitem: str, remove_numbers: bool = True):
349
354
  # Remove the currency codes
350
355
  lineitem = re.sub(currency_codes_pattern, "", lineitem, flags=re.IGNORECASE)
351
356
 
357
+ # remove other patterns
358
+ lineitem = remove_unwanted_patterns(lineitem)
359
+
352
360
  # Remove numbers from the line item
353
361
  if (
354
362
  remove_numbers
355
363
  ): # Do not remove numbers for the reverse charge sentence as it contains Article number
356
364
  lineitem = re.sub(r"\d+", "", lineitem)
357
365
 
358
- # remove other patterns
359
- lineitem = remove_unwanted_patterns(lineitem)
360
-
361
366
  # remove special chars
362
367
  lineitem = re.sub(r"[^A-Za-z0-9\s]", " ", lineitem).strip()
363
368
 
369
+ # Remove a standalone "x" or "X" token, e.g. the multiplier left over from a quantity such as "10 x"
370
+ lineitem = re.sub(r"\b[xX]\b", " ", lineitem).strip()
371
+
364
372
  return re.sub(r"\s{2,}", " ", lineitem).strip()
365
373
 
366
374