data-science-document-ai 1.42.5__py3-none-any.whl → 1.43.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_science_document_ai-1.42.5.dist-info → data_science_document_ai-1.43.1.dist-info}/METADATA +2 -2
- {data_science_document_ai-1.42.5.dist-info → data_science_document_ai-1.43.1.dist-info}/RECORD +7 -7
- src/excel_processing.py +12 -1
- src/pdf_processing.py +15 -0
- src/postprocessing/common.py +11 -3
- src/utils.py +30 -8
- {data_science_document_ai-1.42.5.dist-info → data_science_document_ai-1.43.1.dist-info}/WHEEL +0 -0
{data_science_document_ai-1.42.5.dist-info → data_science_document_ai-1.43.1.dist-info}/METADATA
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: data-science-document-ai
|
|
3
|
-
Version: 1.42.5
|
|
3
|
+
Version: 1.43.1
|
|
4
4
|
Summary: "Document AI repo for data science"
|
|
5
5
|
Author: Naomi Nguyen
|
|
6
6
|
Author-email: naomi.nguyen@forto.com
|
|
@@ -38,7 +38,7 @@ Requires-Dist: pdf2image (>=1.17.0,<2.0.0)
|
|
|
38
38
|
Requires-Dist: pgzip (>=0.3.5,<0.4.0)
|
|
39
39
|
Requires-Dist: pyarrow (==16.1.0)
|
|
40
40
|
Requires-Dist: pymupdf (>=1.23.26,<2.0.0)
|
|
41
|
-
Requires-Dist:
|
|
41
|
+
Requires-Dist: pypdf (>=6.1.2,<7.0.0)
|
|
42
42
|
Requires-Dist: python-dotenv (>=1.0.0,<2.0.0)
|
|
43
43
|
Requires-Dist: python-multipart (>=0.0.7,<0.0.8)
|
|
44
44
|
Requires-Dist: rapidfuzz (>=3.12.2,<4.0.0)
|
{data_science_document_ai-1.42.5.dist-info → data_science_document_ai-1.43.1.dist-info}/RECORD
RENAMED
|
@@ -2,12 +2,12 @@ src/constants.py,sha256=TF_UblovdXZnKIb1lnyJwUqQncJCbzBVihoelI6foSU,3579
|
|
|
2
2
|
src/constants_sandbox.py,sha256=Iu6HdjCoNSmOX0AwoL9qUQkhq_ZnIN5U9e-Q2UfNuGc,547
|
|
3
3
|
src/docai.py,sha256=dHuR0ehVjUi1CnoNvdp_yxJtpU_HFXqAZ61ywdz7BEo,5655
|
|
4
4
|
src/docai_processor_config.yaml,sha256=qOMmCIORpLQ_D-ytvejXxFvER0e0uGYuzPVdZBGv4Pc,2105
|
|
5
|
-
src/excel_processing.py,sha256=
|
|
5
|
+
src/excel_processing.py,sha256=gzP7QFCp4-n0FTevhWmXm-2UoDF0w0y5v39gsby0IV8,3135
|
|
6
6
|
src/io.py,sha256=tOJpMyI-mP1AaXKG4UFudH47MHWzjWBgVahFJUcjGfs,4749
|
|
7
7
|
src/llm.py,sha256=OE4IEIqcM-hYK9U7e0x1rAfcqdpeo4iXPHBp64L5Qz0,8199
|
|
8
8
|
src/log_setup.py,sha256=RhHnpXqcl-ii4EJzRt47CF2R-Q3YPF68tepg_Kg7tkw,2895
|
|
9
|
-
src/pdf_processing.py,sha256=
|
|
10
|
-
src/postprocessing/common.py,sha256=
|
|
9
|
+
src/pdf_processing.py,sha256=M0RUi20j481FcvMJ3xifavMNsPHqMFpbRo9w8hMpmL8,16970
|
|
10
|
+
src/postprocessing/common.py,sha256=wvlYI1S75r0q5xp9Yll89nOVWtwDd7hV4Sf0MIButA0,22150
|
|
11
11
|
src/postprocessing/postprocess_booking_confirmation.py,sha256=nK32eDiBNbauyQz0oCa9eraysku8aqzrcoRFoWVumDU,4827
|
|
12
12
|
src/postprocessing/postprocess_commercial_invoice.py,sha256=3I8ijluTZcOs_sMnFZxfkAPle0UFQ239EMuvZfDZVPg,1028
|
|
13
13
|
src/postprocessing/postprocess_partner_invoice.py,sha256=koGR7dN37FqJcepdzkrzNBHuBBUuCp_3CrteScASqyE,10590
|
|
@@ -51,7 +51,7 @@ src/prompts/library/shippingInstruction/other/prompt.txt,sha256=dT2e-dPuvuz0rVYp
|
|
|
51
51
|
src/prompts/prompt_library.py,sha256=jPxybNPPGH7mzonqtAOqmw5WcT-RtbGP0pvMqqP22hg,2760
|
|
52
52
|
src/setup.py,sha256=M-p5c8M9ejKcSZ9N86VtmtPc4TYLxe1_4_dxf6jpfVc,7262
|
|
53
53
|
src/tms.py,sha256=UXbIo1QE--hIX6NZi5Qyp2R_CP338syrY9pCTPrfgnE,1741
|
|
54
|
-
src/utils.py,sha256=
|
|
55
|
-
data_science_document_ai-1.
|
|
56
|
-
data_science_document_ai-1.42.5.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
|
|
57
|
-
data_science_document_ai-1.42.5.dist-info/RECORD,,
|
|
54
|
+
src/utils.py,sha256=cTF2A12jugKjXxGlNXEZQtfgcsIoaTtaU7zhVOOvXXA,16634
|
|
55
|
+
data_science_document_ai-1.43.1.dist-info/METADATA,sha256=qHuRdJZSrKPDOQvynxf69ssK3jjM-pnBmGyheEPt0Xw,2152
|
|
56
|
+
data_science_document_ai-1.43.1.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
|
|
57
|
+
data_science_document_ai-1.43.1.dist-info/RECORD,,
|
src/excel_processing.py
CHANGED
|
@@ -2,6 +2,8 @@
|
|
|
2
2
|
# flake8: noqa: E402
|
|
3
3
|
import logging
|
|
4
4
|
|
|
5
|
+
from ddtrace import tracer
|
|
6
|
+
|
|
5
7
|
from src.postprocessing.common import llm_prediction_to_tuples
|
|
6
8
|
|
|
7
9
|
logger = logging.getLogger(__name__)
|
|
@@ -13,7 +15,7 @@ import numpy as np
|
|
|
13
15
|
import pandas as pd
|
|
14
16
|
|
|
15
17
|
from src.llm import prompt_excel_extraction
|
|
16
|
-
from src.utils import generate_schema_structure, get_excel_sheets
|
|
18
|
+
from src.utils import estimate_page_count, generate_schema_structure, get_excel_sheets
|
|
17
19
|
|
|
18
20
|
|
|
19
21
|
async def extract_data_from_sheet(
|
|
@@ -70,6 +72,15 @@ async def extract_data_from_excel(
|
|
|
70
72
|
# Load the Excel file and get ONLY the "visible" sheet names
|
|
71
73
|
sheets, workbook = get_excel_sheets(file_content, mime_type)
|
|
72
74
|
|
|
75
|
+
# Track the estimated page count (summed over the visible sheets) in dd-trace
|
|
76
|
+
span = tracer.current_span()
|
|
77
|
+
if span:
|
|
78
|
+
estimated_page_counts = [
|
|
79
|
+
estimate_page_count(workbook[sheet]) for sheet in sheets
|
|
80
|
+
]
|
|
81
|
+
est_page_count = sum(estimated_page_counts)
|
|
82
|
+
span.set_metric("est_page_count", est_page_count)
|
|
83
|
+
|
|
73
84
|
# Excel files may contain multiple sheets. Extract data from each sheet
|
|
74
85
|
sheet_extract_tasks = [
|
|
75
86
|
extract_data_from_sheet(
|
src/pdf_processing.py
CHANGED
|
@@ -9,6 +9,7 @@ logger = logging.getLogger(__name__)
|
|
|
9
9
|
import asyncio
|
|
10
10
|
from collections import defaultdict
|
|
11
11
|
|
|
12
|
+
from ddtrace import tracer
|
|
12
13
|
from fastapi import HTTPException
|
|
13
14
|
from google.cloud.documentai_v1 import Document as docaiv1_document
|
|
14
15
|
|
|
@@ -32,6 +33,7 @@ from src.prompts.prompt_library import prompt_library
|
|
|
32
33
|
from src.utils import (
|
|
33
34
|
extract_top_pages,
|
|
34
35
|
generate_schema_structure,
|
|
36
|
+
get_pdf_page_count,
|
|
35
37
|
get_processor_name,
|
|
36
38
|
run_background_tasks,
|
|
37
39
|
transform_schema_strings,
|
|
@@ -411,6 +413,7 @@ async def data_extraction_manual_flow(
|
|
|
411
413
|
"""
|
|
412
414
|
# Get the start time for processing
|
|
413
415
|
start_time = asyncio.get_event_loop().time()
|
|
416
|
+
page_count = None
|
|
414
417
|
# Validate the file type
|
|
415
418
|
if mime_type == "application/pdf":
|
|
416
419
|
# Enable Doc Ai only for certain document types.
|
|
@@ -434,6 +437,7 @@ async def data_extraction_manual_flow(
|
|
|
434
437
|
if_use_llm=if_use_llm,
|
|
435
438
|
isBetaTest=False,
|
|
436
439
|
)
|
|
440
|
+
page_count = get_pdf_page_count(file_content)
|
|
437
441
|
|
|
438
442
|
elif "excel" in mime_type or "spreadsheet" in mime_type:
|
|
439
443
|
# Extract data from the Excel file
|
|
@@ -444,6 +448,16 @@ async def data_extraction_manual_flow(
|
|
|
444
448
|
mime_type=mime_type,
|
|
445
449
|
)
|
|
446
450
|
|
|
451
|
+
# Get the estimated page count from the dd-trace span (set in extract_data_from_excel)
|
|
452
|
+
# Note: we use the span metric instead of len(extracted_data) because
|
|
453
|
+
# some sheets may fail extraction and not appear in extracted_data
|
|
454
|
+
span = tracer.current_span()
|
|
455
|
+
page_count = span.get_metric("est_page_count") if span else len(extracted_data)
|
|
456
|
+
if page_count > 100:
|
|
457
|
+
logger.warning(
|
|
458
|
+
f"Check logic. Count of sheets in excel file is weirdly large: {page_count}"
|
|
459
|
+
)
|
|
460
|
+
|
|
447
461
|
else:
|
|
448
462
|
raise HTTPException(
|
|
449
463
|
status_code=400,
|
|
@@ -477,6 +491,7 @@ async def data_extraction_manual_flow(
|
|
|
477
491
|
processor_version,
|
|
478
492
|
mime_type,
|
|
479
493
|
elapsed_time,
|
|
494
|
+
page_count,
|
|
480
495
|
)
|
|
481
496
|
)
|
|
482
497
|
return result
|
src/postprocessing/common.py
CHANGED
|
@@ -319,6 +319,11 @@ def remove_unwanted_patterns(lineitem: str):
|
|
|
319
319
|
# Remove "HIGH CUBE"
|
|
320
320
|
lineitem = lineitem.replace("HIGH CUBE", "")
|
|
321
321
|
|
|
322
|
+
# Remove container size e.g., 20FT, 40HC, etc.
|
|
323
|
+
lineitem = re.sub(
|
|
324
|
+
r"\b(20|22|40|45)(FT|HC|DC|HD|GP|OT|RF|FR|TK|DV)?\b", "", lineitem
|
|
325
|
+
).strip()
|
|
326
|
+
|
|
322
327
|
return lineitem
|
|
323
328
|
|
|
324
329
|
|
|
@@ -349,18 +354,21 @@ def clean_item_description(lineitem: str, remove_numbers: bool = True):
|
|
|
349
354
|
# Remove the currency codes
|
|
350
355
|
lineitem = re.sub(currency_codes_pattern, "", lineitem, flags=re.IGNORECASE)
|
|
351
356
|
|
|
357
|
+
# remove other patterns
|
|
358
|
+
lineitem = remove_unwanted_patterns(lineitem)
|
|
359
|
+
|
|
352
360
|
# Remove numbers from the line item
|
|
353
361
|
if (
|
|
354
362
|
remove_numbers
|
|
355
363
|
): # Do not remove numbers for the reverse charge sentence as it contains Article number
|
|
356
364
|
lineitem = re.sub(r"\d+", "", lineitem)
|
|
357
365
|
|
|
358
|
-
# remove other patterns
|
|
359
|
-
lineitem = remove_unwanted_patterns(lineitem)
|
|
360
|
-
|
|
361
366
|
# remove special chars
|
|
362
367
|
lineitem = re.sub(r"[^A-Za-z0-9\s]", " ", lineitem).strip()
|
|
363
368
|
|
|
369
|
+
# Remove x from lineitem like 10 x
|
|
370
|
+
lineitem = re.sub(r"\b[xX]\b", " ", lineitem).strip()
|
|
371
|
+
|
|
364
372
|
return re.sub(r"\s{2,}", " ", lineitem).strip()
|
|
365
373
|
|
|
366
374
|
|
src/utils.py
CHANGED
|
@@ -8,15 +8,29 @@ import pickle
|
|
|
8
8
|
from datetime import datetime
|
|
9
9
|
from typing import Literal
|
|
10
10
|
|
|
11
|
+
import numpy as np
|
|
11
12
|
import openpyxl
|
|
12
13
|
import pandas as pd
|
|
13
14
|
import requests
|
|
14
15
|
from google.cloud import documentai_v1beta3 as docu_ai_beta
|
|
15
|
-
from
|
|
16
|
+
from pypdf import PdfReader, PdfWriter
|
|
16
17
|
|
|
17
18
|
from src.io import get_storage_client, logger
|
|
18
19
|
|
|
19
20
|
|
|
21
|
+
def get_pdf_page_count(pdf_bytes):
    """Count the pages of a PDF held in memory.

    Args:
        pdf_bytes (bytes): Raw content of the PDF document.

    Returns:
        int: Length of the reader's ``pages`` collection.
    """
    # PdfReader expects a file-like object, so wrap the raw bytes in a buffer.
    pdf_stream = io.BytesIO(pdf_bytes)
    return len(PdfReader(pdf_stream).pages)
|
|
32
|
+
|
|
33
|
+
|
|
20
34
|
def bq_logs(data_to_insert, params):
|
|
21
35
|
"""Insert logs into Google BigQuery.
|
|
22
36
|
|
|
@@ -164,6 +178,7 @@ async def run_background_tasks(
|
|
|
164
178
|
processor_version,
|
|
165
179
|
mime_type,
|
|
166
180
|
elapsed_time=None,
|
|
181
|
+
page_count=None,
|
|
167
182
|
):
|
|
168
183
|
"""
|
|
169
184
|
Run background tasks asynchronously.
|
|
@@ -177,6 +192,7 @@ async def run_background_tasks(
|
|
|
177
192
|
processor_version: The processor version used to extract the data.
|
|
178
193
|
mime_type: The MIME type of the document.
|
|
179
194
|
elapsed_time: The time taken to process the document.
|
|
195
|
+
page_count (int, optional): The number of pages in the document.
|
|
180
196
|
|
|
181
197
|
Returns:
|
|
182
198
|
None
|
|
@@ -185,13 +201,8 @@ async def run_background_tasks(
|
|
|
185
201
|
|
|
186
202
|
await loop.run_in_executor(None, store_json_in_gcs, params, doc_id, store_data)
|
|
187
203
|
|
|
188
|
-
#
|
|
189
|
-
page_count
|
|
190
|
-
# calculate the number of pages processed for PDFs
|
|
191
|
-
try:
|
|
192
|
-
if mime_type == "application/pdf":
|
|
193
|
-
page_count = len(json.loads(store_data.encode("utf-8"))["pages"])
|
|
194
|
-
except AttributeError:
|
|
204
|
+
# Use the passed page_count or default to 0 if not provided
|
|
205
|
+
if page_count is None:
|
|
195
206
|
page_count = 0
|
|
196
207
|
|
|
197
208
|
# Log the request in BigQuery
|
|
@@ -472,3 +483,14 @@ def transform_schema_strings(schema):
|
|
|
472
483
|
# Base case: for non-dict/list values (e.g., None, bool, str)
|
|
473
484
|
else:
|
|
474
485
|
return schema
|
|
486
|
+
|
|
487
|
+
|
|
488
|
+
def estimate_page_count(sheet, cells_per_page=500):
    """Estimate how many pages a sheet would occupy.

    By default a page is assumed to be 10 columns x 50 rows, i.e. 500 cells.

    Args:
        sheet: Object describing a table. It must expose either ``shape``
            (indexable as ``(rows, columns)``, e.g. a pandas DataFrame) or
            ``max_row`` and ``max_column`` (e.g. an openpyxl worksheet).
            ``shape`` takes precedence when both are present.
        cells_per_page (int, optional): Number of cells counted as one page.
            Defaults to 500.

    Returns:
        int: Cell count divided by ``cells_per_page``, rounded up. Returns 0
        when the size of ``sheet`` cannot be determined, so the result can
        always be summed across sheets.

    Raises:
        ValueError: If ``cells_per_page`` is not positive.
    """
    if cells_per_page <= 0:
        raise ValueError("cells_per_page must be a positive integer")

    if hasattr(sheet, "shape"):
        n_rows, n_cols = sheet.shape[0], sheet.shape[1]
    elif hasattr(sheet, "max_row"):
        n_rows, n_cols = sheet.max_row, sheet.max_column
    else:
        # Unknown object: contribute nothing instead of returning None, which
        # would make callers that sum the per-sheet estimates raise TypeError.
        return 0

    cell_count = int(n_rows) * int(n_cols)
    # Integer ceiling division keeps the result a plain int (no NumPy float).
    return int(-(-cell_count // cells_per_page))
|
{data_science_document_ai-1.42.5.dist-info → data_science_document_ai-1.43.1.dist-info}/WHEEL
RENAMED
|
File without changes
|