data-science-document-ai 1.42.5__tar.gz → 1.43.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/PKG-INFO +2 -2
  2. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/pyproject.toml +2 -2
  3. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/excel_processing.py +12 -1
  4. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/pdf_processing.py +15 -0
  5. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/utils.py +30 -8
  6. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/constants.py +0 -0
  7. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/constants_sandbox.py +0 -0
  8. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/docai.py +0 -0
  9. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/docai_processor_config.yaml +0 -0
  10. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/io.py +0 -0
  11. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/llm.py +0 -0
  12. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/log_setup.py +0 -0
  13. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/postprocessing/common.py +0 -0
  14. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/postprocessing/postprocess_booking_confirmation.py +0 -0
  15. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/postprocessing/postprocess_commercial_invoice.py +0 -0
  16. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/postprocessing/postprocess_partner_invoice.py +0 -0
  17. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +0 -0
  18. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/bookingConfirmation/evergreen/prompt.txt +0 -0
  19. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +0 -0
  20. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +0 -0
  21. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/bookingConfirmation/maersk/placeholders.json +0 -0
  22. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/bookingConfirmation/maersk/prompt.txt +0 -0
  23. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/bookingConfirmation/msc/placeholders.json +0 -0
  24. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/bookingConfirmation/msc/prompt.txt +0 -0
  25. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/bookingConfirmation/oocl/placeholders.json +0 -0
  26. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/bookingConfirmation/oocl/prompt.txt +0 -0
  27. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/bookingConfirmation/other/placeholders.json +0 -0
  28. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/bookingConfirmation/other/prompt.txt +0 -0
  29. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/bookingConfirmation/yangming/placeholders.json +0 -0
  30. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/bookingConfirmation/yangming/prompt.txt +0 -0
  31. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/bundeskasse/other/placeholders.json +0 -0
  32. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/bundeskasse/other/prompt.txt +0 -0
  33. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/commercialInvoice/other/prompt.txt +0 -0
  34. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/customsAssessment/other/prompt.txt +0 -0
  35. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/customsInvoice/other/placeholders.json +0 -0
  36. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/customsInvoice/other/prompt.txt +0 -0
  37. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/deliveryOrder/other/placeholders.json +0 -0
  38. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/deliveryOrder/other/prompt.txt +0 -0
  39. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -0
  40. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/draftMbl/maersk/prompt.txt +0 -0
  41. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/draftMbl/other/placeholders.json +0 -0
  42. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/draftMbl/other/prompt.txt +0 -0
  43. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -0
  44. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/finalMbL/maersk/prompt.txt +0 -0
  45. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/finalMbL/other/prompt.txt +0 -0
  46. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/packingList/other/prompt.txt +0 -0
  47. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/partnerInvoice/other/placeholders.json +0 -0
  48. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/partnerInvoice/other/prompt.txt +0 -0
  49. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/postprocessing/port_code/placeholders.json +0 -0
  50. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/postprocessing/port_code/prompt_port_code.txt +0 -0
  51. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/preprocessing/carrier/placeholders.json +0 -0
  52. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/preprocessing/carrier/prompt.txt +0 -0
  53. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/shippingInstruction/other/prompt.txt +0 -0
  54. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/prompt_library.py +0 -0
  55. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/setup.py +0 -0
  56. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/tms.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-science-document-ai
3
- Version: 1.42.5
3
+ Version: 1.43.0
4
4
  Summary: "Document AI repo for data science"
5
5
  Author: Naomi Nguyen
6
6
  Author-email: naomi.nguyen@forto.com
@@ -38,7 +38,7 @@ Requires-Dist: pdf2image (>=1.17.0,<2.0.0)
38
38
  Requires-Dist: pgzip (>=0.3.5,<0.4.0)
39
39
  Requires-Dist: pyarrow (==16.1.0)
40
40
  Requires-Dist: pymupdf (>=1.23.26,<2.0.0)
41
- Requires-Dist: pypdf2 (>=3.0.1,<4.0.0)
41
+ Requires-Dist: pypdf (>=6.1.2,<7.0.0)
42
42
  Requires-Dist: python-dotenv (>=1.0.0,<2.0.0)
43
43
  Requires-Dist: python-multipart (>=0.0.7,<0.0.8)
44
44
  Requires-Dist: rapidfuzz (>=3.12.2,<4.0.0)
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "data-science-document-ai"
3
- version = "1.42.5"
3
+ version = "1.43.0"
4
4
  description = "\"Document AI repo for data science\""
5
5
  authors = ["Naomi Nguyen <naomi.nguyen@forto.com>", "Kumar Rajendrababu <kumar.rajendrababu@forto.com>", "Igor Tonko <igor.tonko@forto.com>", "Osman Demirel <osman.demirel@forto.com>"]
6
6
  packages = [
@@ -48,7 +48,7 @@ rapidfuzz = "^3.12.2"
48
48
  fuzzywuzzy = "^0.18.0"
49
49
  nltk = "^3.9.1"
50
50
  pgzip = "^0.3.5"
51
- pypdf2 = "^3.0.1"
51
+ pypdf = "^6.1.2"
52
52
 
53
53
  [tool.poetry.dev-dependencies]
54
54
  jupyter = "^1.0.0"
@@ -2,6 +2,8 @@
2
2
  # flake8: noqa: E402
3
3
  import logging
4
4
 
5
+ from ddtrace import tracer
6
+
5
7
  from src.postprocessing.common import llm_prediction_to_tuples
6
8
 
7
9
  logger = logging.getLogger(__name__)
@@ -13,7 +15,7 @@ import numpy as np
13
15
  import pandas as pd
14
16
 
15
17
  from src.llm import prompt_excel_extraction
16
- from src.utils import generate_schema_structure, get_excel_sheets
18
+ from src.utils import estimate_page_count, generate_schema_structure, get_excel_sheets
17
19
 
18
20
 
19
21
  async def extract_data_from_sheet(
@@ -70,6 +72,15 @@ async def extract_data_from_excel(
70
72
  # Load the Excel file and get ONLY the "visible" sheet names
71
73
  sheets, workbook = get_excel_sheets(file_content, mime_type)
72
74
 
75
+ # Track the estimated page count of the workbook in dd-trace
76
+ span = tracer.current_span()
77
+ if span:
78
+ estimated_page_counts = [
79
+ estimate_page_count(workbook[sheet]) for sheet in sheets
80
+ ]
81
+ est_page_count = sum(estimated_page_counts)
82
+ span.set_metric("est_page_count", est_page_count)
83
+
73
84
  # Excel files may contain multiple sheets. Extract data from each sheet
74
85
  sheet_extract_tasks = [
75
86
  extract_data_from_sheet(
@@ -9,6 +9,7 @@ logger = logging.getLogger(__name__)
9
9
  import asyncio
10
10
  from collections import defaultdict
11
11
 
12
+ from ddtrace import tracer
12
13
  from fastapi import HTTPException
13
14
  from google.cloud.documentai_v1 import Document as docaiv1_document
14
15
 
@@ -32,6 +33,7 @@ from src.prompts.prompt_library import prompt_library
32
33
  from src.utils import (
33
34
  extract_top_pages,
34
35
  generate_schema_structure,
36
+ get_pdf_page_count,
35
37
  get_processor_name,
36
38
  run_background_tasks,
37
39
  transform_schema_strings,
@@ -411,6 +413,7 @@ async def data_extraction_manual_flow(
411
413
  """
412
414
  # Get the start time for processing
413
415
  start_time = asyncio.get_event_loop().time()
416
+ page_count = None
414
417
  # Validate the file type
415
418
  if mime_type == "application/pdf":
416
419
  # Enable Doc Ai only for certain document types.
@@ -434,6 +437,7 @@ async def data_extraction_manual_flow(
434
437
  if_use_llm=if_use_llm,
435
438
  isBetaTest=False,
436
439
  )
440
+ page_count = get_pdf_page_count(file_content)
437
441
 
438
442
  elif "excel" in mime_type or "spreadsheet" in mime_type:
439
443
  # Extract data from the Excel file
@@ -444,6 +448,16 @@ async def data_extraction_manual_flow(
444
448
  mime_type=mime_type,
445
449
  )
446
450
 
451
+ # Get estimated page count from dd-trace span (set in extract_data_from_excel)
452
+ # Note: we use the span metric instead of len(extracted_data) because
453
+ # some sheets may fail extraction and not appear in extracted_data
454
+ span = tracer.current_span()
455
+ page_count = span.get_metric("est_page_count") if span else len(extracted_data)
456
+ if page_count > 100:
457
+ logger.warning(
458
+ f"Check logic. Count of sheets in excel file is weirdly large: {page_count}"
459
+ )
460
+
447
461
  else:
448
462
  raise HTTPException(
449
463
  status_code=400,
@@ -477,6 +491,7 @@ async def data_extraction_manual_flow(
477
491
  processor_version,
478
492
  mime_type,
479
493
  elapsed_time,
494
+ page_count,
480
495
  )
481
496
  )
482
497
  return result
@@ -8,15 +8,29 @@ import pickle
8
8
  from datetime import datetime
9
9
  from typing import Literal
10
10
 
11
+ import numpy as np
11
12
  import openpyxl
12
13
  import pandas as pd
13
14
  import requests
14
15
  from google.cloud import documentai_v1beta3 as docu_ai_beta
15
- from PyPDF2 import PdfReader, PdfWriter
16
+ from pypdf import PdfReader, PdfWriter
16
17
 
17
18
  from src.io import get_storage_client, logger
18
19
 
19
20
 
21
+ def get_pdf_page_count(pdf_bytes):
22
+ """Get the number of pages in a PDF document efficiently.
23
+
24
+ Args:
25
+ pdf_bytes (bytes): The PDF content as bytes.
26
+
27
+ Returns:
28
+ int: The number of pages in the PDF.
29
+ """
30
+ reader = PdfReader(io.BytesIO(pdf_bytes))
31
+ return len(reader.pages)
32
+
33
+
20
34
  def bq_logs(data_to_insert, params):
21
35
  """Insert logs into Google BigQuery.
22
36
 
@@ -164,6 +178,7 @@ async def run_background_tasks(
164
178
  processor_version,
165
179
  mime_type,
166
180
  elapsed_time=None,
181
+ page_count=None,
167
182
  ):
168
183
  """
169
184
  Run background tasks asynchronously.
@@ -177,6 +192,7 @@ async def run_background_tasks(
177
192
  processor_version: The processor version used to extract the data.
178
193
  mime_type: The MIME type of the document.
179
194
  elapsed_time: The time taken to process the document.
195
+ page_count (int, optional): The number of pages in the document.
180
196
 
181
197
  Returns:
182
198
  None
@@ -185,13 +201,8 @@ async def run_background_tasks(
185
201
 
186
202
  await loop.run_in_executor(None, store_json_in_gcs, params, doc_id, store_data)
187
203
 
188
- # Keep the page count as 1 for Excel files.
189
- page_count = 1
190
- # calculate the number of pages processed for PDFs
191
- try:
192
- if mime_type == "application/pdf":
193
- page_count = len(json.loads(store_data.encode("utf-8"))["pages"])
194
- except AttributeError:
204
+ # Use the passed page_count or default to 0 if not provided
205
+ if page_count is None:
195
206
  page_count = 0
196
207
 
197
208
  # Log the request in BigQuery
@@ -472,3 +483,14 @@ def transform_schema_strings(schema):
472
483
  # Base case: for non-dict/list values (e.g., None, bool, str)
473
484
  else:
474
485
  return schema
486
+
487
+
488
+ def estimate_page_count(sheet):
489
+ """Assuming a page is 10 columns x 50 rows."""
490
+ if hasattr(sheet, "shape"):
491
+ pg_cnt = sheet.shape[0] * sheet.shape[1]
492
+ elif hasattr(sheet, "max_row"):
493
+ pg_cnt = sheet.max_column * sheet.max_row
494
+ else:
495
+ return None
496
+ return np.ceil(pg_cnt / 500)