data-science-document-ai 1.41.0__tar.gz → 1.45.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/PKG-INFO +2 -2
  2. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/pyproject.toml +2 -2
  3. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/constants.py +4 -3
  4. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/docai.py +14 -5
  5. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/docai_processor_config.yaml +0 -14
  6. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/excel_processing.py +40 -13
  7. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/io.py +69 -1
  8. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/llm.py +10 -3
  9. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/pdf_processing.py +197 -58
  10. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/postprocessing/common.py +192 -39
  11. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/postprocessing/postprocess_partner_invoice.py +123 -82
  12. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/prompts/library/bundeskasse/other/placeholders.json +1 -1
  13. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/prompts/library/bundeskasse/other/prompt.txt +2 -2
  14. data_science_document_ai-1.45.2/src/prompts/library/commercialInvoice/other/placeholders.json +125 -0
  15. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/prompts/library/commercialInvoice/other/prompt.txt +1 -1
  16. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/prompts/library/customsInvoice/other/placeholders.json +1 -1
  17. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/prompts/library/customsInvoice/other/prompt.txt +4 -4
  18. data_science_document_ai-1.45.2/src/prompts/library/packingList/other/placeholders.json +98 -0
  19. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/prompts/library/partnerInvoice/other/placeholders.json +39 -12
  20. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/prompts/library/partnerInvoice/other/prompt.txt +7 -18
  21. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/prompts/prompt_library.py +0 -4
  22. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/setup.py +5 -1
  23. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/utils.py +127 -72
  24. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/constants_sandbox.py +0 -0
  25. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/log_setup.py +0 -0
  26. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/postprocessing/postprocess_booking_confirmation.py +0 -0
  27. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/postprocessing/postprocess_commercial_invoice.py +0 -0
  28. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +0 -0
  29. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/prompts/library/bookingConfirmation/evergreen/prompt.txt +0 -0
  30. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +0 -0
  31. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +0 -0
  32. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/prompts/library/bookingConfirmation/maersk/placeholders.json +0 -0
  33. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/prompts/library/bookingConfirmation/maersk/prompt.txt +0 -0
  34. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/prompts/library/bookingConfirmation/msc/placeholders.json +0 -0
  35. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/prompts/library/bookingConfirmation/msc/prompt.txt +0 -0
  36. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/prompts/library/bookingConfirmation/oocl/placeholders.json +0 -0
  37. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/prompts/library/bookingConfirmation/oocl/prompt.txt +0 -0
  38. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/prompts/library/bookingConfirmation/other/placeholders.json +0 -0
  39. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/prompts/library/bookingConfirmation/other/prompt.txt +0 -0
  40. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/prompts/library/bookingConfirmation/yangming/placeholders.json +0 -0
  41. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/prompts/library/bookingConfirmation/yangming/prompt.txt +0 -0
  42. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/prompts/library/customsAssessment/other/prompt.txt +0 -0
  43. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/prompts/library/deliveryOrder/other/placeholders.json +0 -0
  44. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/prompts/library/deliveryOrder/other/prompt.txt +0 -0
  45. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -0
  46. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/prompts/library/draftMbl/maersk/prompt.txt +0 -0
  47. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/prompts/library/draftMbl/other/placeholders.json +0 -0
  48. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/prompts/library/draftMbl/other/prompt.txt +0 -0
  49. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -0
  50. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/prompts/library/finalMbL/maersk/prompt.txt +0 -0
  51. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/prompts/library/finalMbL/other/prompt.txt +0 -0
  52. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/prompts/library/packingList/other/prompt.txt +0 -0
  53. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/prompts/library/postprocessing/port_code/placeholders.json +0 -0
  54. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/prompts/library/postprocessing/port_code/prompt_port_code.txt +0 -0
  55. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/prompts/library/preprocessing/carrier/placeholders.json +0 -0
  56. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/prompts/library/preprocessing/carrier/prompt.txt +0 -0
  57. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/prompts/library/shippingInstruction/other/prompt.txt +0 -0
  58. {data_science_document_ai-1.41.0 → data_science_document_ai-1.45.2}/src/tms.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-science-document-ai
3
- Version: 1.41.0
3
+ Version: 1.45.2
4
4
  Summary: "Document AI repo for data science"
5
5
  Author: Naomi Nguyen
6
6
  Author-email: naomi.nguyen@forto.com
@@ -38,7 +38,7 @@ Requires-Dist: pdf2image (>=1.17.0,<2.0.0)
38
38
  Requires-Dist: pgzip (>=0.3.5,<0.4.0)
39
39
  Requires-Dist: pyarrow (==16.1.0)
40
40
  Requires-Dist: pymupdf (>=1.23.26,<2.0.0)
41
- Requires-Dist: pypdf2 (>=3.0.1,<4.0.0)
41
+ Requires-Dist: pypdf (>=6.1.2,<7.0.0)
42
42
  Requires-Dist: python-dotenv (>=1.0.0,<2.0.0)
43
43
  Requires-Dist: python-multipart (>=0.0.7,<0.0.8)
44
44
  Requires-Dist: rapidfuzz (>=3.12.2,<4.0.0)
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "data-science-document-ai"
3
- version = "1.41.0"
3
+ version = "1.45.2"
4
4
  description = "\"Document AI repo for data science\""
5
5
  authors = ["Naomi Nguyen <naomi.nguyen@forto.com>", "Kumar Rajendrababu <kumar.rajendrababu@forto.com>", "Igor Tonko <igor.tonko@forto.com>", "Osman Demirel <osman.demirel@forto.com>"]
6
6
  packages = [
@@ -48,7 +48,7 @@ rapidfuzz = "^3.12.2"
48
48
  fuzzywuzzy = "^0.18.0"
49
49
  nltk = "^3.9.1"
50
50
  pgzip = "^0.3.5"
51
- pypdf2 = "^3.0.1"
51
+ pypdf = "^6.1.2"
52
52
 
53
53
  [tool.poetry.dev-dependencies]
54
54
  jupyter = "^1.0.0"
@@ -23,9 +23,12 @@ project_parameters = {
23
23
  "invoice_classification_lookup": "invoice_classification.json",
24
24
  "reverse_charge_sentence_lookup": "reverse_charge_sentences.json",
25
25
  # Fuzzy logic params
26
- "fuzzy_threshold_item_code": 70,
26
+ "fuzzy_threshold_item_code": 90,
27
27
  "fuzzy_threshold_reverse_charge": 80,
28
28
  "fuzzy_threshold_invoice_classification": 70,
29
+ # Chunking params
30
+ "chunk_size": 1, # page (do not change this without changing the page number logic)
31
+ "chunk_after": 10, # pages
29
32
  # Big Query
30
33
  "g_ai_gbq_db_schema": "document_ai",
31
34
  "g_ai_gbq_db_table_out": "document_ai_api_calls_v1",
@@ -50,8 +53,6 @@ project_parameters = {
50
53
  "model_selector": {
51
54
  "stable": {
52
55
  "bookingConfirmation": 1,
53
- "packingList": 0,
54
- "commercialInvoice": 0,
55
56
  "finalMbL": 0,
56
57
  "draftMbl": 0,
57
58
  "arrivalNotice": 0,
@@ -3,11 +3,16 @@ import re
3
3
 
4
4
  from google.cloud import documentai
5
5
 
6
- from src.io import delete_folder_from_bucket, logger, upload_pdf_to_bucket
6
+ from src.io import (
7
+ delete_folder_from_bucket,
8
+ get_gcp_labels,
9
+ logger,
10
+ upload_pdf_to_bucket,
11
+ )
7
12
  from src.utils import cache_on_disk
8
13
 
9
14
 
10
- async def _process_pdf_w_docai(image_content, client, processor_name):
15
+ async def _process_pdf_w_docai(image_content, client, processor_name, doc_type=None):
11
16
  """Process the PDF using Document AI.
12
17
 
13
18
  Args:
@@ -15,6 +20,7 @@ async def _process_pdf_w_docai(image_content, client, processor_name):
15
20
  client: The Document AI client.
16
21
  processor_name (str): The name of the processor to be used.
17
22
  e.g.: projects/{project_id}/locations/{location}/processor/{processor_id}
23
+ doc_type (str, optional): Document type for cost tracking labels.
18
24
 
19
25
  Returns:
20
26
  The processed document.
@@ -24,10 +30,11 @@ async def _process_pdf_w_docai(image_content, client, processor_name):
24
30
  content=image_content, mime_type="application/pdf"
25
31
  )
26
32
 
27
- # Configure the process request
33
+ # Configure the process request with labels for cost tracking
28
34
  request = documentai.ProcessRequest(
29
35
  name=processor_name,
30
36
  raw_document=raw_document, # field_mask=field_mask
37
+ labels=get_gcp_labels(doc_type=doc_type),
31
38
  )
32
39
  result = await cache_on_disk(client.process_document, request=request)
33
40
 
@@ -35,7 +42,7 @@ async def _process_pdf_w_docai(image_content, client, processor_name):
35
42
 
36
43
 
37
44
  async def _batch_process_pdf_w_docai(
38
- params, image_content, client, processor_name, timeout=1200
45
+ params, image_content, client, processor_name, timeout=1200, doc_type=None
39
46
  ):
40
47
  """Process the PDF using Document AI Batch Process API.
41
48
 
@@ -45,6 +52,7 @@ async def _batch_process_pdf_w_docai(
45
52
  processor_name (str): The name of the processor to be used.
46
53
  e.g.: projects/{project_id}/locations/{location}/processor/{processor_id}
47
54
  timeout (int, optional): The timeout in seconds. Defaults to 1200.
55
+ doc_type (str, optional): Document type for cost tracking labels.
48
56
 
49
57
  Returns:
50
58
  The processed document.
@@ -72,11 +80,12 @@ async def _batch_process_pdf_w_docai(
72
80
  # Where to write results
73
81
  output_config = documentai.DocumentOutputConfig(gcs_output_config=gcs_output_config)
74
82
 
75
- # The full resource name of the processor
83
+ # The full resource name of the processor with labels for cost tracking
76
84
  request = documentai.BatchProcessRequest(
77
85
  name=processor_name,
78
86
  input_documents=input_config,
79
87
  document_output_config=output_config,
88
+ labels=get_gcp_labels(doc_type=doc_type),
80
89
  )
81
90
 
82
91
  # BatchProcess returns a Long Running Operation (LRO)
@@ -13,20 +13,6 @@ model_config:
13
13
  author: "igor.tonko@forto.com"
14
14
  created_date: ""
15
15
 
16
- packingList:
17
- - id: "d967005bd9d45aeb"
18
- details:
19
- display_name: "doc_cap_packingList"
20
- author: "kumar.rajendrababu@forto.com"
21
- created_date: ""
22
-
23
- commercialInvoice:
24
- - id: "7d37236207f75758"
25
- details:
26
- display_name: "doc_cap_commercialInvoice"
27
- author: "kumar.rajendrababu@forto.com"
28
- created_date: ""
29
-
30
16
  finalMbL:
31
17
  - id: "1eda2f22d64b1b89"
32
18
  details:
@@ -2,23 +2,26 @@
2
2
  # flake8: noqa: E402
3
3
  import logging
4
4
 
5
- from src.postprocessing.common import llm_prediction_to_tuples
5
+ from ddtrace import tracer
6
6
 
7
7
  logger = logging.getLogger(__name__)
8
8
 
9
9
  import asyncio
10
- import json
11
10
 
12
11
  import numpy as np
13
12
  import pandas as pd
14
13
 
15
14
  from src.llm import prompt_excel_extraction
16
- from src.utils import generate_schema_structure, get_excel_sheets
15
+ from src.prompts.prompt_library import prompt_library
16
+ from src.utils import estimate_page_count, generate_schema_structure, get_excel_sheets
17
17
 
18
18
 
19
- async def extract_data_from_sheet(params, sheet_name, sheet, response_schema):
19
+ async def extract_data_from_sheet(
20
+ llm_client, sheet_name, sheet, response_schema, doc_type=None
21
+ ):
20
22
  logger.info(f"Processing sheet: {sheet_name}")
21
- excel_content = pd.DataFrame(sheet.values)
23
+ excel_content = pd.DataFrame(sheet.values).dropna(how="all", axis=1)
24
+
22
25
  # Convert to Markdown format for the LLM model
23
26
  worksheet = (
24
27
  "This is from a excel. Pay attention to the cell position:\n"
@@ -29,9 +32,10 @@ async def extract_data_from_sheet(params, sheet_name, sheet, response_schema):
29
32
  prompt_docai = prompt_excel_extraction(worksheet)
30
33
 
31
34
  try:
32
- result = await params["LlmClient"].get_unified_json_genai(
35
+ result = await llm_client.get_unified_json_genai(
33
36
  prompt_docai,
34
37
  response_schema=response_schema,
38
+ doc_type=doc_type,
35
39
  )
36
40
  except Exception as e:
37
41
  result = {}
@@ -45,6 +49,7 @@ async def extract_data_from_excel(
45
49
  input_doc_type,
46
50
  file_content,
47
51
  mime_type,
52
+ llm_client,
48
53
  ):
49
54
  """Extract data from the Excel file.
50
55
 
@@ -53,6 +58,7 @@ async def extract_data_from_excel(
53
58
  input_doc_type (str): The type of the document.
54
59
  file_content (bytes): The content of the Excel file to process.
55
60
  mime_type (str): The MIME type of the file.
61
+ llm_client: The LLM client to use for data extraction.
56
62
 
57
63
  Returns:
58
64
  formatted_data (list): A list of dictionaries containing the extracted data.
@@ -61,22 +67,43 @@ async def extract_data_from_excel(
61
67
 
62
68
  """
63
69
  # Generate the response structure
64
- response_schema = generate_schema_structure(params, input_doc_type)
70
+ response_schema = (
71
+ prompt_library.library[input_doc_type]["other"]["placeholders"]
72
+ if input_doc_type
73
+ in [
74
+ "partnerInvoice",
75
+ "customsInvoice",
76
+ "bundeskasse",
77
+ "commercialInvoice",
78
+ "packingList",
79
+ "bookingConfirmation",
80
+ ]
81
+ else generate_schema_structure(params, input_doc_type)
82
+ )
65
83
 
66
84
  # Load the Excel file and get ONLY the "visible" sheet names
67
85
  sheets, workbook = get_excel_sheets(file_content, mime_type)
68
86
 
87
+ # Track the number of sheets in dd-trace
88
+ span = tracer.current_span()
89
+ if span:
90
+ estimated_page_counts = [
91
+ estimate_page_count(workbook[sheet]) for sheet in sheets
92
+ ]
93
+ est_page_count = sum(estimated_page_counts)
94
+ span.set_metric("est_page_count", est_page_count)
95
+
69
96
  # Excel files may contain multiple sheets. Extract data from each sheet
70
97
  sheet_extract_tasks = [
71
98
  extract_data_from_sheet(
72
- params, sheet_name, workbook[sheet_name], response_schema
99
+ llm_client,
100
+ sheet_name,
101
+ workbook[sheet_name],
102
+ response_schema,
103
+ doc_type=input_doc_type,
73
104
  )
74
105
  for sheet_name in sheets
75
106
  ]
76
107
  extracted_data = {k: v for k, v in await asyncio.gather(*sheet_extract_tasks)}
77
108
 
78
- # Convert LLM prediction dictionary to tuples of (value, page_number).
79
- extracted_data = llm_prediction_to_tuples(extracted_data)
80
- stored_data = json.dumps(extracted_data)
81
-
82
- return extracted_data, stored_data, params["gemini_params"]["model_id"]
109
+ return extracted_data, extracted_data, llm_client.model_id
@@ -12,13 +12,55 @@ from pathlib import Path
12
12
  from google.cloud import bigquery, storage
13
13
 
14
14
 
15
+ def get_gcp_labels(**extra_labels):
16
+ """Generate standardized GCP labels for cost tracking.
17
+
18
+ Args:
19
+ **extra_labels: Additional custom labels
20
+
21
+ Returns:
22
+ dict: Labels dictionary with keys normalized (lowercase, hyphens, max 63 chars)
23
+ """
24
+ project_name = os.getenv("PROJECT_NAME")
25
+
26
+ # If not set, detect once and cache it
27
+ if not project_name:
28
+ # Try pyproject.toml first
29
+ try:
30
+ import toml
31
+
32
+ pyproject_path = Path(__file__).parent.parent / "pyproject.toml"
33
+ if pyproject_path.exists():
34
+ config = toml.load(pyproject_path)
35
+ project_name = config.get("tool", {}).get("poetry", {}).get("name")
36
+ except Exception:
37
+ pass
38
+
39
+ # Fallback to unknown
40
+ if not project_name:
41
+ project_name = "unknown"
42
+
43
+ # Cache it
44
+ os.environ["PROJECT_NAME"] = project_name
45
+
46
+ labels = {
47
+ "ds-project-name": project_name.lower(),
48
+ "ds-env": os.getenv("CLUSTER", "local").lower(),
49
+ }
50
+
51
+ # Add any extra labels
52
+ labels.update({k.lower(): str(v).lower() for k, v in extra_labels.items()})
53
+
54
+ return labels
55
+
56
+
15
57
  def get_bq_client(params):
16
58
  """Get Google BigQuery client."""
17
59
  bq_client = bigquery.Client(project=params["g_ai_project_name"])
18
60
  job_config = bigquery.QueryJobConfig(
19
61
  allow_large_results=True,
20
62
  # flatten_results=True,
21
- labels={"project-name": params["project_name"]},
63
+ labels=get_gcp_labels(),
22
64
  )
23
65
  return bq_client, job_config
24
66
 
@@ -112,3 +154,29 @@ def download_dir_from_bucket(bucket, directory_cloud, directory_local) -> bool:
112
154
  Path(directory).mkdir(parents=True, exist_ok=True)
113
155
  blob.download_to_filename(directory_local / Path(blob.name))
114
156
  return result
157
+
158
+
159
+ def bq_logs(data_to_insert, params):
160
+ """Insert logs into Google BigQuery.
161
+
162
+ Args:
163
+ data_to_insert (list): The data to insert into BigQuery.
164
+ params (dict): The parameters dictionary.
165
+ """
166
+ # Use the pre-initialized BigQuery client
167
+ bq_client = params["bq_client"]
168
+ # Get the table string
169
+ table_string = f"{params['g_ai_project_name']}.{params['g_ai_gbq_db_schema']}.{params['g_ai_gbq_db_table_out']}"
170
+
171
+ logger.info(f"Log table: {table_string}")
172
+ # Insert the rows into the table
173
+ insert_logs = bq_client.insert_rows_json(table_string, data_to_insert)
174
+
175
+ # Check if there were any errors inserting the rows
176
+ if not insert_logs:
177
+ logger.info("New rows have been added.")
178
+ else:
179
+ logger.info("Errors occurred while inserting rows: ", insert_logs)
180
+
181
+
182
+ # type: ignore
@@ -15,6 +15,7 @@ from vertexai.generative_models import (
15
15
  Part,
16
16
  )
17
17
 
18
+ from src.io import get_gcp_labels
18
19
  from src.utils import cache_on_disk
19
20
 
20
21
 
@@ -69,6 +70,7 @@ class LlmClient:
69
70
  document: str = None,
70
71
  response_schema: dict = None,
71
72
  response_mime_type: str = "application/json",
73
+ doc_type: str = None,
72
74
  ):
73
75
  """Ask the Gemini model a question.
74
76
 
@@ -76,6 +78,7 @@ class LlmClient:
76
78
  prompt (str): The prompt to send to the model.
77
79
  document (str, optional): An optional document to provide context.
78
80
  response_schema (dict, optional): Defines a specific response schema for the model.
81
+ doc_type (str, optional): Document type for cost tracking labels.
79
82
 
80
83
  Returns:
81
84
  str: The response from the model.
@@ -96,12 +99,13 @@ class LlmClient:
96
99
  # Prepare inputs for the model
97
100
  inputs = [document, prompt] if document else prompt
98
101
 
99
- # Generate the response
102
+ # Generate the response with labels for cost tracking
100
103
  model_response = await cache_on_disk(
101
104
  self.geminy_client.generate_content_async,
102
105
  contents=inputs,
103
106
  generation_config=config,
104
107
  safety_settings=self.safety_config,
108
+ labels=get_gcp_labels(doc_type=doc_type),
105
109
  )
106
110
 
107
111
  response_text = model_response.text
@@ -113,7 +117,7 @@ class LlmClient:
113
117
  return "{}"
114
118
 
115
119
  async def get_unified_json_genai(
116
- self, prompt, document=None, response_schema=None, model="gemini"
120
+ self, prompt, document=None, response_schema=None, model="gemini", doc_type=None
117
121
  ):
118
122
  """Send a prompt to a Google Cloud AI Platform model and returns the generated json.
119
123
 
@@ -122,6 +126,7 @@ class LlmClient:
122
126
  document: Content of the PDF document
123
127
  response_schema: The schema to use for the response
124
128
  model (str): The model to use for the response ["gemini" or "chatGPT"]. Default is "gemini".
129
+ doc_type (str, optional): Document type for cost tracking labels.
125
130
 
126
131
  Returns:
127
132
  dict: The generated json from the model.
@@ -131,7 +136,9 @@ class LlmClient:
131
136
  response = await self.ask_chatgpt(prompt, document, response_schema)
132
137
  else:
133
138
  # Default to Gemini
134
- response = await self.ask_gemini(prompt, document, response_schema)
139
+ response = await self.ask_gemini(
140
+ prompt, document, response_schema, doc_type=doc_type
141
+ )
135
142
 
136
143
  try:
137
144
  return json.loads(response)