data-science-document-ai 1.42.5__tar.gz → 1.43.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/PKG-INFO +2 -2
  2. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/pyproject.toml +2 -2
  3. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/excel_processing.py +12 -1
  4. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/pdf_processing.py +15 -0
  5. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/utils.py +30 -8
  6. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/constants.py +0 -0
  7. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/constants_sandbox.py +0 -0
  8. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/docai.py +0 -0
  9. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/docai_processor_config.yaml +0 -0
  10. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/io.py +0 -0
  11. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/llm.py +0 -0
  12. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/log_setup.py +0 -0
  13. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/postprocessing/common.py +0 -0
  14. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/postprocessing/postprocess_booking_confirmation.py +0 -0
  15. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/postprocessing/postprocess_commercial_invoice.py +0 -0
  16. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/postprocessing/postprocess_partner_invoice.py +0 -0
  17. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +0 -0
  18. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/bookingConfirmation/evergreen/prompt.txt +0 -0
  19. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +0 -0
  20. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +0 -0
  21. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/bookingConfirmation/maersk/placeholders.json +0 -0
  22. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/bookingConfirmation/maersk/prompt.txt +0 -0
  23. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/bookingConfirmation/msc/placeholders.json +0 -0
  24. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/bookingConfirmation/msc/prompt.txt +0 -0
  25. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/bookingConfirmation/oocl/placeholders.json +0 -0
  26. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/bookingConfirmation/oocl/prompt.txt +0 -0
  27. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/bookingConfirmation/other/placeholders.json +0 -0
  28. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/bookingConfirmation/other/prompt.txt +0 -0
  29. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/bookingConfirmation/yangming/placeholders.json +0 -0
  30. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/bookingConfirmation/yangming/prompt.txt +0 -0
  31. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/bundeskasse/other/placeholders.json +0 -0
  32. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/bundeskasse/other/prompt.txt +0 -0
  33. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/commercialInvoice/other/prompt.txt +0 -0
  34. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/customsAssessment/other/prompt.txt +0 -0
  35. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/customsInvoice/other/placeholders.json +0 -0
  36. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/customsInvoice/other/prompt.txt +0 -0
  37. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/deliveryOrder/other/placeholders.json +0 -0
  38. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/deliveryOrder/other/prompt.txt +0 -0
  39. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -0
  40. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/draftMbl/maersk/prompt.txt +0 -0
  41. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/draftMbl/other/placeholders.json +0 -0
  42. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/draftMbl/other/prompt.txt +0 -0
  43. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -0
  44. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/finalMbL/maersk/prompt.txt +0 -0
  45. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/finalMbL/other/prompt.txt +0 -0
  46. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/packingList/other/prompt.txt +0 -0
  47. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/partnerInvoice/other/placeholders.json +0 -0
  48. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/partnerInvoice/other/prompt.txt +0 -0
  49. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/postprocessing/port_code/placeholders.json +0 -0
  50. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/postprocessing/port_code/prompt_port_code.txt +0 -0
  51. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/preprocessing/carrier/placeholders.json +0 -0
  52. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/preprocessing/carrier/prompt.txt +0 -0
  53. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/library/shippingInstruction/other/prompt.txt +0 -0
  54. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/prompts/prompt_library.py +0 -0
  55. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/setup.py +0 -0
  56. {data_science_document_ai-1.42.5 → data_science_document_ai-1.43.0}/src/tms.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-science-document-ai
3
- Version: 1.42.5
3
+ Version: 1.43.0
4
4
  Summary: "Document AI repo for data science"
5
5
  Author: Naomi Nguyen
6
6
  Author-email: naomi.nguyen@forto.com
@@ -38,7 +38,7 @@ Requires-Dist: pdf2image (>=1.17.0,<2.0.0)
38
38
  Requires-Dist: pgzip (>=0.3.5,<0.4.0)
39
39
  Requires-Dist: pyarrow (==16.1.0)
40
40
  Requires-Dist: pymupdf (>=1.23.26,<2.0.0)
41
- Requires-Dist: pypdf2 (>=3.0.1,<4.0.0)
41
+ Requires-Dist: pypdf (>=6.1.2,<7.0.0)
42
42
  Requires-Dist: python-dotenv (>=1.0.0,<2.0.0)
43
43
  Requires-Dist: python-multipart (>=0.0.7,<0.0.8)
44
44
  Requires-Dist: rapidfuzz (>=3.12.2,<4.0.0)
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "data-science-document-ai"
3
- version = "1.42.5"
3
+ version = "1.43.0"
4
4
  description = "\"Document AI repo for data science\""
5
5
  authors = ["Naomi Nguyen <naomi.nguyen@forto.com>", "Kumar Rajendrababu <kumar.rajendrababu@forto.com>", "Igor Tonko <igor.tonko@forto.com>", "Osman Demirel <osman.demirel@forto.com>"]
6
6
  packages = [
@@ -48,7 +48,7 @@ rapidfuzz = "^3.12.2"
48
48
  fuzzywuzzy = "^0.18.0"
49
49
  nltk = "^3.9.1"
50
50
  pgzip = "^0.3.5"
51
- pypdf2 = "^3.0.1"
51
+ pypdf = "^6.1.2"
52
52
 
53
53
  [tool.poetry.dev-dependencies]
54
54
  jupyter = "^1.0.0"
@@ -2,6 +2,8 @@
2
2
  # flake8: noqa: E402
3
3
  import logging
4
4
 
5
+ from ddtrace import tracer
6
+
5
7
  from src.postprocessing.common import llm_prediction_to_tuples
6
8
 
7
9
  logger = logging.getLogger(__name__)
@@ -13,7 +15,7 @@ import numpy as np
13
15
  import pandas as pd
14
16
 
15
17
  from src.llm import prompt_excel_extraction
16
- from src.utils import generate_schema_structure, get_excel_sheets
18
+ from src.utils import estimate_page_count, generate_schema_structure, get_excel_sheets
17
19
 
18
20
 
19
21
  async def extract_data_from_sheet(
@@ -70,6 +72,15 @@ async def extract_data_from_excel(
70
72
  # Load the Excel file and get ONLY the "visible" sheet names
71
73
  sheets, workbook = get_excel_sheets(file_content, mime_type)
72
74
 
75
+ # Track the estimated page count of the workbook in dd-trace
76
+ span = tracer.current_span()
77
+ if span:
78
+ estimated_page_counts = [
79
+ estimate_page_count(workbook[sheet]) for sheet in sheets
80
+ ]
81
+ est_page_count = sum(estimated_page_counts)
82
+ span.set_metric("est_page_count", est_page_count)
83
+
73
84
  # Excel files may contain multiple sheets. Extract data from each sheet
74
85
  sheet_extract_tasks = [
75
86
  extract_data_from_sheet(
@@ -9,6 +9,7 @@ logger = logging.getLogger(__name__)
9
9
  import asyncio
10
10
  from collections import defaultdict
11
11
 
12
+ from ddtrace import tracer
12
13
  from fastapi import HTTPException
13
14
  from google.cloud.documentai_v1 import Document as docaiv1_document
14
15
 
@@ -32,6 +33,7 @@ from src.prompts.prompt_library import prompt_library
32
33
  from src.utils import (
33
34
  extract_top_pages,
34
35
  generate_schema_structure,
36
+ get_pdf_page_count,
35
37
  get_processor_name,
36
38
  run_background_tasks,
37
39
  transform_schema_strings,
@@ -411,6 +413,7 @@ async def data_extraction_manual_flow(
411
413
  """
412
414
  # Get the start time for processing
413
415
  start_time = asyncio.get_event_loop().time()
416
+ page_count = None
414
417
  # Validate the file type
415
418
  if mime_type == "application/pdf":
416
419
  # Enable Doc Ai only for certain document types.
@@ -434,6 +437,7 @@ async def data_extraction_manual_flow(
434
437
  if_use_llm=if_use_llm,
435
438
  isBetaTest=False,
436
439
  )
440
+ page_count = get_pdf_page_count(file_content)
437
441
 
438
442
  elif "excel" in mime_type or "spreadsheet" in mime_type:
439
443
  # Extract data from the Excel file
@@ -444,6 +448,16 @@ async def data_extraction_manual_flow(
444
448
  mime_type=mime_type,
445
449
  )
446
450
 
451
+ # Get estimated page count from dd-trace span (set in extract_data_from_excel)
452
+ # Note: we use the span metric instead of len(extracted_data) because
453
+ # some sheets may fail extraction and not appear in extracted_data
454
+ span = tracer.current_span()
455
+ page_count = span.get_metric("est_page_count") if span else len(extracted_data)
456
+ if page_count > 100:
457
+ logger.warning(
458
+ f"Check logic. Count of sheets in excel file is weirdly large: {page_count}"
459
+ )
460
+
447
461
  else:
448
462
  raise HTTPException(
449
463
  status_code=400,
@@ -477,6 +491,7 @@ async def data_extraction_manual_flow(
477
491
  processor_version,
478
492
  mime_type,
479
493
  elapsed_time,
494
+ page_count,
480
495
  )
481
496
  )
482
497
  return result
@@ -8,15 +8,29 @@ import pickle
8
8
  from datetime import datetime
9
9
  from typing import Literal
10
10
 
11
+ import numpy as np
11
12
  import openpyxl
12
13
  import pandas as pd
13
14
  import requests
14
15
  from google.cloud import documentai_v1beta3 as docu_ai_beta
15
- from PyPDF2 import PdfReader, PdfWriter
16
+ from pypdf import PdfReader, PdfWriter
16
17
 
17
18
  from src.io import get_storage_client, logger
18
19
 
19
20
 
21
+ def get_pdf_page_count(pdf_bytes):
22
+ """Get the number of pages in a PDF document efficiently.
23
+
24
+ Args:
25
+ pdf_bytes (bytes): The PDF content as bytes.
26
+
27
+ Returns:
28
+ int: The number of pages in the PDF.
29
+ """
30
+ reader = PdfReader(io.BytesIO(pdf_bytes))
31
+ return len(reader.pages)
32
+
33
+
20
34
  def bq_logs(data_to_insert, params):
21
35
  """Insert logs into Google BigQuery.
22
36
 
@@ -164,6 +178,7 @@ async def run_background_tasks(
164
178
  processor_version,
165
179
  mime_type,
166
180
  elapsed_time=None,
181
+ page_count=None,
167
182
  ):
168
183
  """
169
184
  Run background tasks asynchronously.
@@ -177,6 +192,7 @@ async def run_background_tasks(
177
192
  processor_version: The processor version used to extract the data.
178
193
  mime_type: The MIME type of the document.
179
194
  elapsed_time: The time taken to process the document.
195
+ page_count (int, optional): The number of pages in the document.
180
196
 
181
197
  Returns:
182
198
  None
@@ -185,13 +201,8 @@ async def run_background_tasks(
185
201
 
186
202
  await loop.run_in_executor(None, store_json_in_gcs, params, doc_id, store_data)
187
203
 
188
- # Keep the page count as 1 for Excel files.
189
- page_count = 1
190
- # calculate the number of pages processed for PDFs
191
- try:
192
- if mime_type == "application/pdf":
193
- page_count = len(json.loads(store_data.encode("utf-8"))["pages"])
194
- except AttributeError:
204
+ # Use the passed page_count or default to 0 if not provided
205
+ if page_count is None:
195
206
  page_count = 0
196
207
 
197
208
  # Log the request in BigQuery
@@ -472,3 +483,14 @@ def transform_schema_strings(schema):
472
483
  # Base case: for non-dict/list values (e.g., None, bool, str)
473
484
  else:
474
485
  return schema
486
+
487
+
488
+ def estimate_page_count(sheet):
489
+ """Assuming a page is 10 columns x 50 rows."""
490
+ if hasattr(sheet, "shape"):
491
+ pg_cnt = sheet.shape[0] * sheet.shape[1]
492
+ elif hasattr(sheet, "max_row"):
493
+ pg_cnt = sheet.max_column * sheet.max_row
494
+ else:
495
+ return None
496
+ return np.ceil(pg_cnt / 500)