data-science-document-ai 1.41.0__tar.gz → 1.43.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/PKG-INFO +2 -2
  2. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/pyproject.toml +2 -2
  3. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/constants.py +0 -2
  4. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/docai.py +14 -5
  5. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/docai_processor_config.yaml +0 -14
  6. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/excel_processing.py +40 -12
  7. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/io.py +46 -1
  8. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/llm.py +10 -3
  9. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/pdf_processing.py +91 -29
  10. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/postprocessing/common.py +75 -29
  11. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/postprocessing/postprocess_partner_invoice.py +5 -30
  12. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/bundeskasse/other/placeholders.json +1 -1
  13. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/bundeskasse/other/prompt.txt +2 -2
  14. data_science_document_ai-1.43.6/src/prompts/library/commercialInvoice/other/placeholders.json +125 -0
  15. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/commercialInvoice/other/prompt.txt +1 -1
  16. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/customsInvoice/other/placeholders.json +1 -1
  17. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/customsInvoice/other/prompt.txt +3 -3
  18. data_science_document_ai-1.43.6/src/prompts/library/packingList/other/placeholders.json +98 -0
  19. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/partnerInvoice/other/placeholders.json +39 -12
  20. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/partnerInvoice/other/prompt.txt +4 -17
  21. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/prompt_library.py +0 -4
  22. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/setup.py +5 -1
  23. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/utils.py +72 -39
  24. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/constants_sandbox.py +0 -0
  25. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/log_setup.py +0 -0
  26. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/postprocessing/postprocess_booking_confirmation.py +0 -0
  27. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/postprocessing/postprocess_commercial_invoice.py +0 -0
  28. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +0 -0
  29. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/bookingConfirmation/evergreen/prompt.txt +0 -0
  30. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +0 -0
  31. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +0 -0
  32. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/bookingConfirmation/maersk/placeholders.json +0 -0
  33. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/bookingConfirmation/maersk/prompt.txt +0 -0
  34. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/bookingConfirmation/msc/placeholders.json +0 -0
  35. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/bookingConfirmation/msc/prompt.txt +0 -0
  36. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/bookingConfirmation/oocl/placeholders.json +0 -0
  37. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/bookingConfirmation/oocl/prompt.txt +0 -0
  38. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/bookingConfirmation/other/placeholders.json +0 -0
  39. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/bookingConfirmation/other/prompt.txt +0 -0
  40. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/bookingConfirmation/yangming/placeholders.json +0 -0
  41. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/bookingConfirmation/yangming/prompt.txt +0 -0
  42. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/customsAssessment/other/prompt.txt +0 -0
  43. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/deliveryOrder/other/placeholders.json +0 -0
  44. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/deliveryOrder/other/prompt.txt +0 -0
  45. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -0
  46. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/draftMbl/maersk/prompt.txt +0 -0
  47. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/draftMbl/other/placeholders.json +0 -0
  48. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/draftMbl/other/prompt.txt +0 -0
  49. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -0
  50. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/finalMbL/maersk/prompt.txt +0 -0
  51. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/finalMbL/other/prompt.txt +0 -0
  52. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/packingList/other/prompt.txt +0 -0
  53. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/postprocessing/port_code/placeholders.json +0 -0
  54. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/postprocessing/port_code/prompt_port_code.txt +0 -0
  55. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/preprocessing/carrier/placeholders.json +0 -0
  56. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/preprocessing/carrier/prompt.txt +0 -0
  57. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/shippingInstruction/other/prompt.txt +0 -0
  58. {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/tms.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-science-document-ai
3
- Version: 1.41.0
3
+ Version: 1.43.6
4
4
  Summary: "Document AI repo for data science"
5
5
  Author: Naomi Nguyen
6
6
  Author-email: naomi.nguyen@forto.com
@@ -38,7 +38,7 @@ Requires-Dist: pdf2image (>=1.17.0,<2.0.0)
38
38
  Requires-Dist: pgzip (>=0.3.5,<0.4.0)
39
39
  Requires-Dist: pyarrow (==16.1.0)
40
40
  Requires-Dist: pymupdf (>=1.23.26,<2.0.0)
41
- Requires-Dist: pypdf2 (>=3.0.1,<4.0.0)
41
+ Requires-Dist: pypdf (>=6.1.2,<7.0.0)
42
42
  Requires-Dist: python-dotenv (>=1.0.0,<2.0.0)
43
43
  Requires-Dist: python-multipart (>=0.0.7,<0.0.8)
44
44
  Requires-Dist: rapidfuzz (>=3.12.2,<4.0.0)
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "data-science-document-ai"
3
- version = "1.41.0"
3
+ version = "1.43.6"
4
4
  description = "\"Document AI repo for data science\""
5
5
  authors = ["Naomi Nguyen <naomi.nguyen@forto.com>", "Kumar Rajendrababu <kumar.rajendrababu@forto.com>", "Igor Tonko <igor.tonko@forto.com>", "Osman Demirel <osman.demirel@forto.com>"]
6
6
  packages = [
@@ -48,7 +48,7 @@ rapidfuzz = "^3.12.2"
48
48
  fuzzywuzzy = "^0.18.0"
49
49
  nltk = "^3.9.1"
50
50
  pgzip = "^0.3.5"
51
- pypdf2 = "^3.0.1"
51
+ pypdf = "^6.1.2"
52
52
 
53
53
  [tool.poetry.dev-dependencies]
54
54
  jupyter = "^1.0.0"
@@ -50,8 +50,6 @@ project_parameters = {
50
50
  "model_selector": {
51
51
  "stable": {
52
52
  "bookingConfirmation": 1,
53
- "packingList": 0,
54
- "commercialInvoice": 0,
55
53
  "finalMbL": 0,
56
54
  "draftMbl": 0,
57
55
  "arrivalNotice": 0,
@@ -3,11 +3,16 @@ import re
3
3
 
4
4
  from google.cloud import documentai
5
5
 
6
- from src.io import delete_folder_from_bucket, logger, upload_pdf_to_bucket
6
+ from src.io import (
7
+ delete_folder_from_bucket,
8
+ get_gcp_labels,
9
+ logger,
10
+ upload_pdf_to_bucket,
11
+ )
7
12
  from src.utils import cache_on_disk
8
13
 
9
14
 
10
- async def _process_pdf_w_docai(image_content, client, processor_name):
15
+ async def _process_pdf_w_docai(image_content, client, processor_name, doc_type=None):
11
16
  """Process the PDF using Document AI.
12
17
 
13
18
  Args:
@@ -15,6 +20,7 @@ async def _process_pdf_w_docai(image_content, client, processor_name):
15
20
  client: The Document AI client.
16
21
  processor_name (str): The name of the processor to be used.
17
22
  e.g.: projects/{project_id}/locations/{location}/processor/{processor_id}
23
+ doc_type (str, optional): Document type for cost tracking labels.
18
24
 
19
25
  Returns:
20
26
  The processed document.
@@ -24,10 +30,11 @@ async def _process_pdf_w_docai(image_content, client, processor_name):
24
30
  content=image_content, mime_type="application/pdf"
25
31
  )
26
32
 
27
- # Configure the process request
33
+ # Configure the process request with labels for cost tracking
28
34
  request = documentai.ProcessRequest(
29
35
  name=processor_name,
30
36
  raw_document=raw_document, # field_mask=field_mask
37
+ labels=get_gcp_labels(doc_type=doc_type),
31
38
  )
32
39
  result = await cache_on_disk(client.process_document, request=request)
33
40
 
@@ -35,7 +42,7 @@ async def _process_pdf_w_docai(image_content, client, processor_name):
35
42
 
36
43
 
37
44
  async def _batch_process_pdf_w_docai(
38
- params, image_content, client, processor_name, timeout=1200
45
+ params, image_content, client, processor_name, timeout=1200, doc_type=None
39
46
  ):
40
47
  """Process the PDF using Document AI Batch Process API.
41
48
 
@@ -45,6 +52,7 @@ async def _batch_process_pdf_w_docai(
45
52
  processor_name (str): The name of the processor to be used.
46
53
  e.g.: projects/{project_id}/locations/{location}/processor/{processor_id}
47
54
  timeout (int, optional): The timeout in seconds. Defaults to 1200.
55
+ doc_type (str, optional): Document type for cost tracking labels.
48
56
 
49
57
  Returns:
50
58
  The processed document.
@@ -72,11 +80,12 @@ async def _batch_process_pdf_w_docai(
72
80
  # Where to write results
73
81
  output_config = documentai.DocumentOutputConfig(gcs_output_config=gcs_output_config)
74
82
 
75
- # The full resource name of the processor
83
+ # The full resource name of the processor with labels for cost tracking
76
84
  request = documentai.BatchProcessRequest(
77
85
  name=processor_name,
78
86
  input_documents=input_config,
79
87
  document_output_config=output_config,
88
+ labels=get_gcp_labels(doc_type=doc_type),
80
89
  )
81
90
 
82
91
  # BatchProcess returns a Long Running Operation (LRO)
@@ -13,20 +13,6 @@ model_config:
13
13
  author: "igor.tonko@forto.com"
14
14
  created_date: ""
15
15
 
16
- packingList:
17
- - id: "d967005bd9d45aeb"
18
- details:
19
- display_name: "doc_cap_packingList"
20
- author: "kumar.rajendrababu@forto.com"
21
- created_date: ""
22
-
23
- commercialInvoice:
24
- - id: "7d37236207f75758"
25
- details:
26
- display_name: "doc_cap_commercialInvoice"
27
- author: "kumar.rajendrababu@forto.com"
28
- created_date: ""
29
-
30
16
  finalMbL:
31
17
  - id: "1eda2f22d64b1b89"
32
18
  details:
@@ -2,23 +2,28 @@
2
2
  # flake8: noqa: E402
3
3
  import logging
4
4
 
5
+ from ddtrace import tracer
6
+
5
7
  from src.postprocessing.common import llm_prediction_to_tuples
6
8
 
7
9
  logger = logging.getLogger(__name__)
8
10
 
9
11
  import asyncio
10
- import json
11
12
 
12
13
  import numpy as np
13
14
  import pandas as pd
14
15
 
15
16
  from src.llm import prompt_excel_extraction
16
- from src.utils import generate_schema_structure, get_excel_sheets
17
+ from src.prompts.prompt_library import prompt_library
18
+ from src.utils import estimate_page_count, generate_schema_structure, get_excel_sheets
17
19
 
18
20
 
19
- async def extract_data_from_sheet(params, sheet_name, sheet, response_schema):
21
+ async def extract_data_from_sheet(
22
+ llm_client, sheet_name, sheet, response_schema, doc_type=None
23
+ ):
20
24
  logger.info(f"Processing sheet: {sheet_name}")
21
- excel_content = pd.DataFrame(sheet.values)
25
+ excel_content = pd.DataFrame(sheet.values).dropna(how="all", axis=1)
26
+
22
27
  # Convert to Markdown format for the LLM model
23
28
  worksheet = (
24
29
  "This is from a excel. Pay attention to the cell position:\n"
@@ -29,9 +34,10 @@ async def extract_data_from_sheet(params, sheet_name, sheet, response_schema):
29
34
  prompt_docai = prompt_excel_extraction(worksheet)
30
35
 
31
36
  try:
32
- result = await params["LlmClient"].get_unified_json_genai(
37
+ result = await llm_client.get_unified_json_genai(
33
38
  prompt_docai,
34
39
  response_schema=response_schema,
40
+ doc_type=doc_type,
35
41
  )
36
42
  except Exception as e:
37
43
  result = {}
@@ -45,6 +51,7 @@ async def extract_data_from_excel(
45
51
  input_doc_type,
46
52
  file_content,
47
53
  mime_type,
54
+ llm_client,
48
55
  ):
49
56
  """Extract data from the Excel file.
50
57
 
@@ -53,6 +60,7 @@ async def extract_data_from_excel(
53
60
  input_doc_type (str): The type of the document.
54
61
  file_content (bytes): The content of the Excel file to process.
55
62
  mime_type (str): The MIME type of the file.
63
+ llm_client: The LLM client to use for data extraction.
56
64
 
57
65
  Returns:
58
66
  formatted_data (list): A list of dictionaries containing the extracted data.
@@ -61,22 +69,42 @@ async def extract_data_from_excel(
61
69
 
62
70
  """
63
71
  # Generate the response structure
64
- response_schema = generate_schema_structure(params, input_doc_type)
72
+ response_schema = (
73
+ prompt_library.library[input_doc_type]["other"]["placeholders"]
74
+ if input_doc_type
75
+ in [
76
+ "partnerInvoice",
77
+ "customsInvoice",
78
+ "bundeskasse",
79
+ "commercialInvoice",
80
+ "packingList",
81
+ ]
82
+ else generate_schema_structure(params, input_doc_type)
83
+ )
65
84
 
66
85
  # Load the Excel file and get ONLY the "visible" sheet names
67
86
  sheets, workbook = get_excel_sheets(file_content, mime_type)
68
87
 
88
+ # Track the number of sheets in dd-trace
89
+ span = tracer.current_span()
90
+ if span:
91
+ estimated_page_counts = [
92
+ estimate_page_count(workbook[sheet]) for sheet in sheets
93
+ ]
94
+ est_page_count = sum(estimated_page_counts)
95
+ span.set_metric("est_page_count", est_page_count)
96
+
69
97
  # Excel files may contain multiple sheets. Extract data from each sheet
70
98
  sheet_extract_tasks = [
71
99
  extract_data_from_sheet(
72
- params, sheet_name, workbook[sheet_name], response_schema
100
+ llm_client,
101
+ sheet_name,
102
+ workbook[sheet_name],
103
+ response_schema,
104
+ doc_type=input_doc_type,
73
105
  )
74
106
  for sheet_name in sheets
75
107
  ]
76
108
  extracted_data = {k: v for k, v in await asyncio.gather(*sheet_extract_tasks)}
77
109
 
78
- # Convert LLM prediction dictionary to tuples of (value, page_number).
79
- extracted_data = llm_prediction_to_tuples(extracted_data)
80
- stored_data = json.dumps(extracted_data)
81
-
82
- return extracted_data, stored_data, params["gemini_params"]["model_id"]
110
+ return extracted_data, extracted_data, llm_client.model_id
@@ -12,13 +12,55 @@ from pathlib import Path
12
12
  from google.cloud import bigquery, storage
13
13
 
14
14
 
15
+ def get_gcp_labels(**extra_labels):
16
+ """Generate standardized GCP labels for cost tracking.
17
+
18
+ Args:
19
+ **extra_labels: Additional custom labels
20
+
21
+ Returns:
22
+ dict: Labels dictionary with keys normalized (lowercase, hyphens, max 63 chars)
23
+ """
24
+ project_name = os.getenv("PROJECT_NAME")
25
+
26
+ # If not set, detect once and cache it
27
+ if not project_name:
28
+ # Try pyproject.toml first
29
+ try:
30
+ import toml
31
+
32
+ pyproject_path = Path(__file__).parent.parent / "pyproject.toml"
33
+ if pyproject_path.exists():
34
+ config = toml.load(pyproject_path)
35
+ project_name = config.get("tool", {}).get("poetry", {}).get("name")
36
+ except Exception:
37
+ pass
38
+
39
+ # Fallback to unknown
40
+ if not project_name:
41
+ project_name = "unknown"
42
+
43
+ # Cache it
44
+ os.environ["PROJECT_NAME"] = project_name
45
+
46
+ labels = {
47
+ "ds-project-name": project_name.lower(),
48
+ "ds-env": os.getenv("CLUSTER", "local").lower(),
49
+ }
50
+
51
+ # Add any extra labels
52
+ labels.update({k.lower(): str(v).lower() for k, v in extra_labels.items()})
53
+
54
+ return labels
55
+
56
+
15
57
  def get_bq_client(params):
16
58
  """Get Google BigQuery client."""
17
59
  bq_client = bigquery.Client(project=params["g_ai_project_name"])
18
60
  job_config = bigquery.QueryJobConfig(
19
61
  allow_large_results=True,
20
62
  # flatten_results=True,
21
- labels={"project-name": params["project_name"]},
63
+ labels=get_gcp_labels(),
22
64
  )
23
65
  return bq_client, job_config
24
66
 
@@ -112,3 +154,6 @@ def download_dir_from_bucket(bucket, directory_cloud, directory_local) -> bool:
112
154
  Path(directory).mkdir(parents=True, exist_ok=True)
113
155
  blob.download_to_filename(directory_local / Path(blob.name))
114
156
  return result
157
+
158
+
159
+ # type: ignore
@@ -15,6 +15,7 @@ from vertexai.generative_models import (
15
15
  Part,
16
16
  )
17
17
 
18
+ from src.io import get_gcp_labels
18
19
  from src.utils import cache_on_disk
19
20
 
20
21
 
@@ -69,6 +70,7 @@ class LlmClient:
69
70
  document: str = None,
70
71
  response_schema: dict = None,
71
72
  response_mime_type: str = "application/json",
73
+ doc_type: str = None,
72
74
  ):
73
75
  """Ask the Gemini model a question.
74
76
 
@@ -76,6 +78,7 @@ class LlmClient:
76
78
  prompt (str): The prompt to send to the model.
77
79
  document (str, optional): An optional document to provide context.
78
80
  response_schema (dict, optional): Defines a specific response schema for the model.
81
+ doc_type (str, optional): Document type for cost tracking labels.
79
82
 
80
83
  Returns:
81
84
  str: The response from the model.
@@ -96,12 +99,13 @@ class LlmClient:
96
99
  # Prepare inputs for the model
97
100
  inputs = [document, prompt] if document else prompt
98
101
 
99
- # Generate the response
102
+ # Generate the response with labels for cost tracking
100
103
  model_response = await cache_on_disk(
101
104
  self.geminy_client.generate_content_async,
102
105
  contents=inputs,
103
106
  generation_config=config,
104
107
  safety_settings=self.safety_config,
108
+ labels=get_gcp_labels(doc_type=doc_type),
105
109
  )
106
110
 
107
111
  response_text = model_response.text
@@ -113,7 +117,7 @@ class LlmClient:
113
117
  return "{}"
114
118
 
115
119
  async def get_unified_json_genai(
116
- self, prompt, document=None, response_schema=None, model="gemini"
120
+ self, prompt, document=None, response_schema=None, model="gemini", doc_type=None
117
121
  ):
118
122
  """Send a prompt to a Google Cloud AI Platform model and returns the generated json.
119
123
 
@@ -122,6 +126,7 @@ class LlmClient:
122
126
  document: Content of the PDF document
123
127
  response_schema: The schema to use for the response
124
128
  model (str): The model to use for the response ["gemini" or "chatGPT"]. Default is "gemini".
129
+ doc_type (str, optional): Document type for cost tracking labels.
125
130
 
126
131
  Returns:
127
132
  dict: The generated json from the model.
@@ -131,7 +136,9 @@ class LlmClient:
131
136
  response = await self.ask_chatgpt(prompt, document, response_schema)
132
137
  else:
133
138
  # Default to Gemini
134
- response = await self.ask_gemini(prompt, document, response_schema)
139
+ response = await self.ask_gemini(
140
+ prompt, document, response_schema, doc_type=doc_type
141
+ )
135
142
 
136
143
  try:
137
144
  return json.loads(response)
@@ -9,12 +9,17 @@ logger = logging.getLogger(__name__)
9
9
  import asyncio
10
10
  from collections import defaultdict
11
11
 
12
+ from ddtrace import tracer
12
13
  from fastapi import HTTPException
13
14
  from google.cloud.documentai_v1 import Document as docaiv1_document
14
15
 
15
16
  from src.docai import _batch_process_pdf_w_docai, _process_pdf_w_docai
16
17
  from src.excel_processing import extract_data_from_excel
17
- from src.postprocessing.common import format_all_entities, remove_none_values, llm_prediction_to_tuples
18
+ from src.postprocessing.common import (
19
+ format_all_entities,
20
+ llm_prediction_to_tuples,
21
+ remove_none_values,
22
+ )
18
23
  from src.postprocessing.postprocess_booking_confirmation import (
19
24
  postprocess_booking_confirmation,
20
25
  )
@@ -28,14 +33,17 @@ from src.prompts.prompt_library import prompt_library
28
33
  from src.utils import (
29
34
  extract_top_pages,
30
35
  generate_schema_structure,
36
+ get_pdf_page_count,
31
37
  get_processor_name,
32
38
  run_background_tasks,
39
+ transform_schema_strings,
33
40
  validate_based_on_schema,
34
- transform_schema_strings
35
41
  )
36
42
 
37
43
 
38
- async def process_file_w_docai(params, image_content, client, processor_name):
44
+ async def process_file_w_docai(
45
+ params, image_content, client, processor_name, doc_type=None
46
+ ):
39
47
  """
40
48
  Process a file using Document AI.
41
49
 
@@ -44,6 +52,7 @@ async def process_file_w_docai(params, image_content, client, processor_name):
44
52
  image_content (bytes): The file to be processed. It can be bytes object.
45
53
  client: The Document AI client.
46
54
  processor_name (str): The name of the processor to be used.
55
+ doc_type (str, optional): Document type for cost tracking labels.
47
56
 
48
57
  Returns:
49
58
  The processed document.
@@ -55,7 +64,9 @@ async def process_file_w_docai(params, image_content, client, processor_name):
55
64
 
56
65
  try:
57
66
  logger.info("Processing document...")
58
- result = await _process_pdf_w_docai(image_content, client, processor_name)
67
+ result = await _process_pdf_w_docai(
68
+ image_content, client, processor_name, doc_type=doc_type
69
+ )
59
70
  except Exception as e:
60
71
  if e.reason == "PAGE_LIMIT_EXCEEDED":
61
72
  logger.warning(
@@ -64,7 +75,7 @@ async def process_file_w_docai(params, image_content, client, processor_name):
64
75
  # Process the document in batch method (offline processing)
65
76
  try:
66
77
  result = await _batch_process_pdf_w_docai(
67
- params, image_content, client, processor_name
78
+ params, image_content, client, processor_name, doc_type=doc_type
68
79
  )
69
80
  except Exception as batch_e:
70
81
  logger.error(f"Error processing document {batch_e}.")
@@ -94,7 +105,7 @@ async def extract_data_from_pdf_w_docai(
94
105
  )
95
106
 
96
107
  result = await process_file_w_docai(
97
- params, file_content, processor_client, processor_name
108
+ params, file_content, processor_client, processor_name, doc_type=input_doc_type
98
109
  )
99
110
 
100
111
  # Create an entity object to store the result in gcs
@@ -105,16 +116,22 @@ async def extract_data_from_pdf_w_docai(
105
116
  # Extract entities from the result
106
117
  for entity in result.entities:
107
118
  value = (
108
- {child.type_: (child.mention_text,
109
- child.page_anchor.page_refs[0].page
110
- if hasattr(child.page_anchor.page_refs[0], "page")
111
- else 0)
112
- for child in entity.properties}
119
+ {
120
+ child.type_: (
121
+ child.mention_text,
122
+ child.page_anchor.page_refs[0].page
123
+ if hasattr(child.page_anchor.page_refs[0], "page")
124
+ else 0,
125
+ )
126
+ for child in entity.properties
127
+ }
113
128
  if entity.properties
114
- else (entity.mention_text,
115
- entity.page_anchor.page_refs[0].page
116
- if hasattr(entity.page_anchor.page_refs[0], "page")
117
- else 0)
129
+ else (
130
+ entity.mention_text,
131
+ entity.page_anchor.page_refs[0].page
132
+ if hasattr(entity.page_anchor.page_refs[0], "page")
133
+ else 0,
134
+ )
118
135
  )
119
136
  aggregated_data[entity.type_].append(value)
120
137
 
@@ -145,7 +162,9 @@ async def extract_data_from_pdf_w_docai(
145
162
  return aggregated_data, result_for_store, processor_version
146
163
 
147
164
 
148
- async def identify_carrier(document, llm_client, prompt, response_schema):
165
+ async def identify_carrier(
166
+ document, llm_client, prompt, response_schema, doc_type=None
167
+ ):
149
168
  """Identify the carrier from the Booking Confirmation document."""
150
169
 
151
170
  result = await llm_client.ask_gemini(
@@ -153,6 +172,7 @@ async def identify_carrier(document, llm_client, prompt, response_schema):
153
172
  document=document,
154
173
  response_schema=response_schema,
155
174
  response_mime_type="text/x.enum",
175
+ doc_type=doc_type,
156
176
  )
157
177
 
158
178
  if result:
@@ -180,6 +200,7 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
180
200
  if input_doc_type == "bundeskasse"
181
201
  else file_content
182
202
  )
203
+ number_of_pages = get_pdf_page_count(file_content)
183
204
 
184
205
  # convert file_content to required document
185
206
  document = llm_client.prepare_document_for_gemini(file_content)
@@ -187,7 +208,14 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
187
208
  # get the schema placeholder from the Doc AI and generate the response structure
188
209
  response_schema = (
189
210
  prompt_library.library[input_doc_type]["other"]["placeholders"]
190
- if input_doc_type in ["partnerInvoice", "customsInvoice", "bundeskasse"]
211
+ if input_doc_type
212
+ in [
213
+ "partnerInvoice",
214
+ "customsInvoice",
215
+ "bundeskasse",
216
+ "commercialInvoice",
217
+ "packingList",
218
+ ]
191
219
  else generate_schema_structure(params, input_doc_type)
192
220
  )
193
221
 
@@ -209,7 +237,11 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
209
237
 
210
238
  # identify carrier for customized prompting
211
239
  carrier = await identify_carrier(
212
- document, llm_client, carrier_prompt, carrier_schema
240
+ document,
241
+ llm_client,
242
+ carrier_prompt,
243
+ carrier_schema,
244
+ doc_type=input_doc_type,
213
245
  )
214
246
 
215
247
  if input_doc_type == "bookingConfirmation":
@@ -224,12 +256,22 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
224
256
  # get the related prompt from predefined prompt library
225
257
  prompt = prompt_library.library[input_doc_type][carrier]["prompt"]
226
258
 
259
+ # Update schema to extract value-page_number pairs
260
+ if number_of_pages > 1:
261
+ response_schema = transform_schema_strings(response_schema)
262
+
263
+ # Update the prompt to instruct LLM to include page numbers
264
+ prompt += "\nFor each field, provide the page number where the information was found. The page numbering starts from 0."
265
+
227
266
  # generate the result with LLM (gemini)
228
267
  result = await llm_client.get_unified_json_genai(
229
- prompt=prompt, document=document, response_schema=response_schema
268
+ prompt=prompt,
269
+ document=document,
270
+ response_schema=response_schema,
271
+ doc_type=input_doc_type,
230
272
  )
231
273
 
232
- result = llm_prediction_to_tuples(result)
274
+ result = llm_prediction_to_tuples(result, number_of_pages)
233
275
 
234
276
  return result
235
277
  return {}
@@ -309,15 +351,9 @@ async def extract_data_by_doctype(
309
351
  processor_client,
310
352
  if_use_docai,
311
353
  if_use_llm,
354
+ llm_client,
312
355
  isBetaTest=False,
313
356
  ):
314
- # Select LLM client (Using 2.5 Flash model for Bundeskasse)
315
- llm_client = (
316
- params["LlmClient_Flash"]
317
- if input_doc_type == "bundeskasse"
318
- else params["LlmClient"]
319
- )
320
-
321
357
  async def extract_w_docai():
322
358
  return await extract_data_from_pdf_w_docai(
323
359
  params=params,
@@ -366,6 +402,7 @@ async def data_extraction_manual_flow(
366
402
  meta,
367
403
  processor_client,
368
404
  schema_client,
405
+ use_default_logging=False,
369
406
  ):
370
407
  """
371
408
  Process a PDF file and extract data from it.
@@ -386,6 +423,15 @@ async def data_extraction_manual_flow(
386
423
  """
387
424
  # Get the start time for processing
388
425
  start_time = asyncio.get_event_loop().time()
426
+
427
+ # Select LLM client (Using 2.5 Pro model only for PI and customsInvoice)
428
+ llm_client = (
429
+ params["LlmClient_Flash"]
430
+ if meta.documentTypeCode not in ["customsInvoice", "partnerInvoice"]
431
+ else params["LlmClient"]
432
+ )
433
+
434
+ page_count = None
389
435
  # Validate the file type
390
436
  if mime_type == "application/pdf":
391
437
  # Enable Doc Ai only for certain document types.
@@ -407,8 +453,10 @@ async def data_extraction_manual_flow(
407
453
  processor_client,
408
454
  if_use_docai=if_use_docai,
409
455
  if_use_llm=if_use_llm,
456
+ llm_client=llm_client,
410
457
  isBetaTest=False,
411
458
  )
459
+ page_count = get_pdf_page_count(file_content)
412
460
 
413
461
  elif "excel" in mime_type or "spreadsheet" in mime_type:
414
462
  # Extract data from the Excel file
@@ -417,8 +465,19 @@ async def data_extraction_manual_flow(
417
465
  input_doc_type=meta.documentTypeCode,
418
466
  file_content=file_content,
419
467
  mime_type=mime_type,
468
+ llm_client=llm_client,
420
469
  )
421
470
 
471
+ # Get sheet count from dd-trace span (set in extract_data_from_excel)
472
+ # Note: we use the span metric instead of len(extracted_data) because
473
+ # some sheets may fail extraction and not appear in extracted_data
474
+ span = tracer.current_span()
475
+ page_count = span.get_metric("est_page_count") if span else len(extracted_data)
476
+ if page_count > 100:
477
+ logger.warning(
478
+ f"Check logic. Count of sheets in excel file is weirdly large: {page_count}"
479
+ )
480
+
422
481
  else:
423
482
  raise HTTPException(
424
483
  status_code=400,
@@ -426,7 +485,7 @@ async def data_extraction_manual_flow(
426
485
  )
427
486
  # Create the result dictionary with the extracted data
428
487
  extracted_data = await format_all_entities(
429
- extracted_data, meta.documentTypeCode, params
488
+ extracted_data, meta.documentTypeCode, params, mime_type
430
489
  )
431
490
  result = {
432
491
  "id": meta.id,
@@ -441,7 +500,9 @@ async def data_extraction_manual_flow(
441
500
  logger.info(f"Time taken to process the document: {round(elapsed_time, 4)} seconds")
442
501
 
443
502
  # Schedule background tasks without using FastAPI's BackgroundTasks
444
- if os.getenv("CLUSTER") != "ode": # skip data export to bigquery in ODE environment
503
+ if (
504
+ os.getenv("CLUSTER") != "ode"
505
+ ) & use_default_logging: # skip data export to bigquery in ODE environment
445
506
  asyncio.create_task(
446
507
  run_background_tasks(
447
508
  params,
@@ -452,6 +513,7 @@ async def data_extraction_manual_flow(
452
513
  processor_version,
453
514
  mime_type,
454
515
  elapsed_time,
516
+ page_count,
455
517
  )
456
518
  )
457
519
  return result