data-science-document-ai 1.24.0__tar.gz → 1.43.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. {data_science_document_ai-1.24.0 → data_science_document_ai-1.43.6}/PKG-INFO +3 -2
  2. {data_science_document_ai-1.24.0 → data_science_document_ai-1.43.6}/pyproject.toml +2 -1
  3. {data_science_document_ai-1.24.0 → data_science_document_ai-1.43.6}/src/constants.py +26 -4
  4. {data_science_document_ai-1.24.0 → data_science_document_ai-1.43.6}/src/docai.py +14 -5
  5. {data_science_document_ai-1.24.0 → data_science_document_ai-1.43.6}/src/docai_processor_config.yaml +0 -22
  6. {data_science_document_ai-1.24.0 → data_science_document_ai-1.43.6}/src/excel_processing.py +42 -12
  7. {data_science_document_ai-1.24.0 → data_science_document_ai-1.43.6}/src/io.py +46 -1
  8. {data_science_document_ai-1.24.0 → data_science_document_ai-1.43.6}/src/llm.py +12 -5
  9. {data_science_document_ai-1.24.0 → data_science_document_ai-1.43.6}/src/pdf_processing.py +127 -48
  10. data_science_document_ai-1.43.6/src/postprocessing/common.py +691 -0
  11. data_science_document_ai-1.43.6/src/postprocessing/postprocess_partner_invoice.py +281 -0
  12. data_science_document_ai-1.43.6/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +32 -0
  13. {data_science_document_ai-1.24.0 → data_science_document_ai-1.43.6}/src/prompts/library/bookingConfirmation/evergreen/prompt.txt +1 -0
  14. data_science_document_ai-1.43.6/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +32 -0
  15. {data_science_document_ai-1.24.0 → data_science_document_ai-1.43.6}/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +1 -1
  16. data_science_document_ai-1.43.6/src/prompts/library/bookingConfirmation/maersk/placeholders.json +32 -0
  17. {data_science_document_ai-1.24.0 → data_science_document_ai-1.43.6}/src/prompts/library/bookingConfirmation/maersk/prompt.txt +1 -1
  18. data_science_document_ai-1.43.6/src/prompts/library/bookingConfirmation/msc/placeholders.json +32 -0
  19. {data_science_document_ai-1.24.0 → data_science_document_ai-1.43.6}/src/prompts/library/bookingConfirmation/msc/prompt.txt +1 -1
  20. data_science_document_ai-1.43.6/src/prompts/library/bookingConfirmation/oocl/placeholders.json +32 -0
  21. {data_science_document_ai-1.24.0 → data_science_document_ai-1.43.6}/src/prompts/library/bookingConfirmation/oocl/prompt.txt +3 -1
  22. data_science_document_ai-1.43.6/src/prompts/library/bookingConfirmation/other/placeholders.json +32 -0
  23. {data_science_document_ai-1.24.0 → data_science_document_ai-1.43.6}/src/prompts/library/bookingConfirmation/other/prompt.txt +1 -1
  24. data_science_document_ai-1.43.6/src/prompts/library/bookingConfirmation/yangming/placeholders.json +32 -0
  25. {data_science_document_ai-1.24.0 → data_science_document_ai-1.43.6}/src/prompts/library/bookingConfirmation/yangming/prompt.txt +1 -1
  26. data_science_document_ai-1.43.6/src/prompts/library/bundeskasse/other/placeholders.json +113 -0
  27. data_science_document_ai-1.43.6/src/prompts/library/bundeskasse/other/prompt.txt +46 -0
  28. data_science_document_ai-1.43.6/src/prompts/library/commercialInvoice/other/placeholders.json +125 -0
  29. {data_science_document_ai-1.24.0 → data_science_document_ai-1.43.6}/src/prompts/library/commercialInvoice/other/prompt.txt +2 -1
  30. {data_science_document_ai-1.24.0 → data_science_document_ai-1.43.6}/src/prompts/library/customsAssessment/other/prompt.txt +1 -1
  31. data_science_document_ai-1.43.6/src/prompts/library/customsInvoice/other/placeholders.json +205 -0
  32. data_science_document_ai-1.43.6/src/prompts/library/customsInvoice/other/prompt.txt +104 -0
  33. data_science_document_ai-1.43.6/src/prompts/library/deliveryOrder/other/placeholders.json +29 -0
  34. {data_science_document_ai-1.24.0 → data_science_document_ai-1.43.6}/src/prompts/library/deliveryOrder/other/prompt.txt +1 -1
  35. {data_science_document_ai-1.24.0/src/prompts/library/finalMbL → data_science_document_ai-1.43.6/src/prompts/library/draftMbl}/hapag-lloyd/prompt.txt +2 -1
  36. {data_science_document_ai-1.24.0/src/prompts/library/finalMbL → data_science_document_ai-1.43.6/src/prompts/library/draftMbl}/maersk/prompt.txt +2 -0
  37. {data_science_document_ai-1.24.0/src/prompts/library/finalMbL → data_science_document_ai-1.43.6/src/prompts/library/draftMbl}/other/prompt.txt +1 -1
  38. {data_science_document_ai-1.24.0/src/prompts/library/draftMbl → data_science_document_ai-1.43.6/src/prompts/library/finalMbL}/hapag-lloyd/prompt.txt +1 -1
  39. {data_science_document_ai-1.24.0/src/prompts/library/draftMbl → data_science_document_ai-1.43.6/src/prompts/library/finalMbL}/maersk/prompt.txt +2 -0
  40. {data_science_document_ai-1.24.0/src/prompts/library/draftMbl → data_science_document_ai-1.43.6/src/prompts/library/finalMbL}/other/prompt.txt +1 -1
  41. data_science_document_ai-1.43.6/src/prompts/library/packingList/other/placeholders.json +98 -0
  42. {data_science_document_ai-1.24.0 → data_science_document_ai-1.43.6}/src/prompts/library/packingList/other/prompt.txt +1 -1
  43. data_science_document_ai-1.43.6/src/prompts/library/partnerInvoice/other/placeholders.json +181 -0
  44. data_science_document_ai-1.43.6/src/prompts/library/partnerInvoice/other/prompt.txt +90 -0
  45. {data_science_document_ai-1.24.0 → data_science_document_ai-1.43.6}/src/prompts/library/shippingInstruction/other/prompt.txt +1 -0
  46. {data_science_document_ai-1.24.0 → data_science_document_ai-1.43.6}/src/setup.py +8 -46
  47. {data_science_document_ai-1.24.0 → data_science_document_ai-1.43.6}/src/utils.py +168 -13
  48. data_science_document_ai-1.24.0/src/postprocessing/common.py +0 -1177
  49. data_science_document_ai-1.24.0/src/postprocessing/postprocess_partner_invoice.py +0 -63
  50. data_science_document_ai-1.24.0/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +0 -32
  51. data_science_document_ai-1.24.0/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +0 -32
  52. data_science_document_ai-1.24.0/src/prompts/library/bookingConfirmation/maersk/placeholders.json +0 -32
  53. data_science_document_ai-1.24.0/src/prompts/library/bookingConfirmation/msc/placeholders.json +0 -32
  54. data_science_document_ai-1.24.0/src/prompts/library/bookingConfirmation/oocl/placeholders.json +0 -32
  55. data_science_document_ai-1.24.0/src/prompts/library/bookingConfirmation/other/placeholders.json +0 -32
  56. data_science_document_ai-1.24.0/src/prompts/library/bookingConfirmation/yangming/placeholders.json +0 -32
  57. data_science_document_ai-1.24.0/src/prompts/library/customsAssessment/other/placeholders.json +0 -19
  58. data_science_document_ai-1.24.0/src/prompts/library/deliveryOrder/other/placeholders.json +0 -31
  59. data_science_document_ai-1.24.0/src/prompts/library/finalMbL/other/placeholders.json +0 -80
  60. data_science_document_ai-1.24.0/src/prompts/library/partnerInvoice/other/placeholders.json +0 -105
  61. data_science_document_ai-1.24.0/src/prompts/library/partnerInvoice/other/prompt.txt +0 -53
  62. {data_science_document_ai-1.24.0 → data_science_document_ai-1.43.6}/src/constants_sandbox.py +0 -0
  63. {data_science_document_ai-1.24.0 → data_science_document_ai-1.43.6}/src/log_setup.py +0 -0
  64. {data_science_document_ai-1.24.0 → data_science_document_ai-1.43.6}/src/postprocessing/postprocess_booking_confirmation.py +0 -0
  65. {data_science_document_ai-1.24.0 → data_science_document_ai-1.43.6}/src/postprocessing/postprocess_commercial_invoice.py +0 -0
  66. {data_science_document_ai-1.24.0 → data_science_document_ai-1.43.6}/src/prompts/library/draftMbl/other/placeholders.json +0 -0
  67. {data_science_document_ai-1.24.0 → data_science_document_ai-1.43.6}/src/prompts/library/postprocessing/port_code/placeholders.json +0 -0
  68. {data_science_document_ai-1.24.0 → data_science_document_ai-1.43.6}/src/prompts/library/postprocessing/port_code/prompt_port_code.txt +0 -0
  69. {data_science_document_ai-1.24.0 → data_science_document_ai-1.43.6}/src/prompts/library/preprocessing/carrier/placeholders.json +0 -0
  70. {data_science_document_ai-1.24.0 → data_science_document_ai-1.43.6}/src/prompts/library/preprocessing/carrier/prompt.txt +0 -0
  71. {data_science_document_ai-1.24.0 → data_science_document_ai-1.43.6}/src/prompts/prompt_library.py +0 -0
  72. {data_science_document_ai-1.24.0 → data_science_document_ai-1.43.6}/src/tms.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
- Metadata-Version: 2.3
+ Metadata-Version: 2.4
  Name: data-science-document-ai
- Version: 1.24.0
+ Version: 1.43.6
  Summary: "Document AI repo for data science"
  Author: Naomi Nguyen
  Author-email: naomi.nguyen@forto.com
@@ -38,6 +38,7 @@ Requires-Dist: pdf2image (>=1.17.0,<2.0.0)
  Requires-Dist: pgzip (>=0.3.5,<0.4.0)
  Requires-Dist: pyarrow (==16.1.0)
  Requires-Dist: pymupdf (>=1.23.26,<2.0.0)
+ Requires-Dist: pypdf (>=6.1.2,<7.0.0)
  Requires-Dist: python-dotenv (>=1.0.0,<2.0.0)
  Requires-Dist: python-multipart (>=0.0.7,<0.0.8)
  Requires-Dist: rapidfuzz (>=3.12.2,<4.0.0)
pyproject.toml
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "data-science-document-ai"
- version = "1.24.0"
+ version = "1.43.6"
  description = "\"Document AI repo for data science\""
  authors = ["Naomi Nguyen <naomi.nguyen@forto.com>", "Kumar Rajendrababu <kumar.rajendrababu@forto.com>", "Igor Tonko <igor.tonko@forto.com>", "Osman Demirel <osman.demirel@forto.com>"]
  packages = [
@@ -48,6 +48,7 @@ rapidfuzz = "^3.12.2"
  fuzzywuzzy = "^0.18.0"
  nltk = "^3.9.1"
  pgzip = "^0.3.5"
+ pypdf = "^6.1.2"

  [tool.poetry.dev-dependencies]
  jupyter = "^1.0.0"
src/constants.py
@@ -33,11 +33,23 @@ project_parameters = {
      # models metadata (confidence),
      "g_model_data_folder": "models",
      "local_model_data_folder": "data",
+     "released_doc_types": {
+         "bookingConfirmation",
+         "packingList",
+         "commercialInvoice",
+         "finalMbL",
+         "draftMbl",
+         "arrivalNotice",
+         "shippingInstruction",
+         "customsAssessment",
+         "deliveryOrder",
+         "partnerInvoice",
+         "customsInvoice",
+         "bundeskasse",
+     },
      "model_selector": {
          "stable": {
              "bookingConfirmation": 1,
-             "packingList": 0,
-             "commercialInvoice": 0,
              "finalMbL": 0,
              "draftMbl": 0,
              "arrivalNotice": 0,
@@ -55,11 +67,19 @@ project_parameters = {
      # LLM model parameters
      "gemini_params": {
          "temperature": 0,
-         "maxOutputTokens": 8000,
+         "maxOutputTokens": 65536,
+         "top_p": 0.8,
+         "top_k": 40,
+         "seed": 42,
+         "model_id": "gemini-2.5-pro",
+     },
+     "gemini_flash_params": {
+         "temperature": 0,
+         "maxOutputTokens": 65536,
          "top_p": 0.8,
          "top_k": 40,
          "seed": 42,
-         "model_id": "gemini-2.0-flash",
+         "model_id": "gemini-2.5-flash",
      },
      # Key to combine the LLM results with the Doc Ai results
      "key_to_combine": {
@@ -71,6 +91,8 @@ project_parameters = {
          "commercialInvoice": ["skus"],
          "shippingInstruction": ["containers"],
          "partnerInvoice": ["lineItem"],
+         "customsInvoice": ["lineItem"],
+         "bundeskasse": ["lineItem"],
      },
  }

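The new "released_doc_types" set reads like a feature gate for the extraction service. A minimal sketch of how such a gate could be used, assuming the set is checked before any model is invoked; the validate_doc_type helper below is illustrative and not part of the package:

    # Illustrative only: "released_doc_types" comes from the constants.py hunk
    # above; this helper and its name are assumptions, not package code.
    from src.constants import project_parameters

    def validate_doc_type(doc_type: str) -> None:
        released = project_parameters["released_doc_types"]
        if doc_type not in released:
            raise ValueError(f"Unsupported document type: {doc_type}")

    validate_doc_type("bundeskasse")  # passes: newly released in 1.43.6
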
src/docai.py
@@ -3,11 +3,16 @@ import re

  from google.cloud import documentai

- from src.io import delete_folder_from_bucket, logger, upload_pdf_to_bucket
+ from src.io import (
+     delete_folder_from_bucket,
+     get_gcp_labels,
+     logger,
+     upload_pdf_to_bucket,
+ )
  from src.utils import cache_on_disk


- async def _process_pdf_w_docai(image_content, client, processor_name):
+ async def _process_pdf_w_docai(image_content, client, processor_name, doc_type=None):
      """Process the PDF using Document AI.

      Args:
@@ -15,6 +20,7 @@ async def _process_pdf_w_docai(image_content, client, processor_name):
          client: The Document AI client.
          processor_name (str): The name of the processor to be used.
              e.g.: projects/{project_id}/locations/{location}/processor/{processor_id}
+         doc_type (str, optional): Document type for cost tracking labels.

      Returns:
          The processed document.
@@ -24,10 +30,11 @@ async def _process_pdf_w_docai(image_content, client, processor_name):
          content=image_content, mime_type="application/pdf"
      )

-     # Configure the process request
+     # Configure the process request with labels for cost tracking
      request = documentai.ProcessRequest(
          name=processor_name,
          raw_document=raw_document, # field_mask=field_mask
+         labels=get_gcp_labels(doc_type=doc_type),
      )
      result = await cache_on_disk(client.process_document, request=request)

@@ -35,7 +42,7 @@


  async def _batch_process_pdf_w_docai(
-     params, image_content, client, processor_name, timeout=1200
+     params, image_content, client, processor_name, timeout=1200, doc_type=None
  ):
      """Process the PDF using Document AI Batch Process API.

@@ -45,6 +52,7 @@ async def _batch_process_pdf_w_docai(
          processor_name (str): The name of the processor to be used.
              e.g.: projects/{project_id}/locations/{location}/processor/{processor_id}
          timeout (int, optional): The timeout in seconds. Defaults to 1200.
+         doc_type (str, optional): Document type for cost tracking labels.

      Returns:
          The processed document.
@@ -72,11 +80,12 @@
      # Where to write results
      output_config = documentai.DocumentOutputConfig(gcs_output_config=gcs_output_config)

-     # The full resource name of the processor
+     # The full resource name of the processor with labels for cost tracking
      request = documentai.BatchProcessRequest(
          name=processor_name,
          input_documents=input_config,
          document_output_config=output_config,
+         labels=get_gcp_labels(doc_type=doc_type),
      )

      # BatchProcess returns a Long Running Operation (LRO)
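Both the synchronous and batch Document AI paths now attach cost-tracking labels. A minimal sketch of the new call shape, assuming the standard async client from google-cloud-documentai; the processor path is a placeholder following the format in the docstring above:

    from google.cloud import documentai

    from src.docai import _process_pdf_w_docai

    async def run(pdf_bytes: bytes):
        client = documentai.DocumentProcessorServiceAsyncClient()
        name = "projects/my-project/locations/eu/processors/abc123"  # placeholder
        # doc_type flows into get_gcp_labels(...) and onto the request.
        return await _process_pdf_w_docai(
            pdf_bytes, client, name, doc_type="partnerInvoice"
        )
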
src/docai_processor_config.yaml
@@ -13,20 +13,6 @@ model_config:
            author: "igor.tonko@forto.com"
            created_date: ""

-     packingList:
-       - id: "d967005bd9d45aeb"
-         details:
-           display_name: "doc_cap_packingList"
-           author: "kumar.rajendrababu@forto.com"
-           created_date: ""
-
-     commercialInvoice:
-       - id: "7d37236207f75758"
-         details:
-           display_name: "doc_cap_commercialInvoice"
-           author: "kumar.rajendrababu@forto.com"
-           created_date: ""
-
      finalMbL:
        - id: "1eda2f22d64b1b89"
          details:
@@ -48,7 +34,6 @@ model_config:
            author: "kumar.rajendrababu@forto.com"
            created_date: ""

-
      arrivalNotice:
        - id: "748b2e2b9161dcf3"
          details:
@@ -70,13 +55,6 @@ model_config:
            author: "igor.tonko@forto.com"
            created_date: ""

-     partnerInvoice:
-       - id: "17d103181e745a05"
-         details:
-           display_name: "doc_cap_partnerInvoice"
-           author: "osman.demirel@forto.com"
-           created_date: ""
-
    beta:
      bookingConfirmation:
        - id: "3c280b11bdb3ed89"
src/excel_processing.py
@@ -2,21 +2,28 @@
  # flake8: noqa: E402
  import logging

+ from ddtrace import tracer
+
+ from src.postprocessing.common import llm_prediction_to_tuples
+
  logger = logging.getLogger(__name__)

  import asyncio
- import json

  import numpy as np
  import pandas as pd

  from src.llm import prompt_excel_extraction
- from src.utils import generate_schema_structure, get_excel_sheets
+ from src.prompts.prompt_library import prompt_library
+ from src.utils import estimate_page_count, generate_schema_structure, get_excel_sheets


- async def extract_data_from_sheet(params, sheet_name, sheet, response_schema):
+ async def extract_data_from_sheet(
+     llm_client, sheet_name, sheet, response_schema, doc_type=None
+ ):
      logger.info(f"Processing sheet: {sheet_name}")
-     excel_content = pd.DataFrame(sheet.values)
+     excel_content = pd.DataFrame(sheet.values).dropna(how="all", axis=1)
+
      # Convert to Markdown format for the LLM model
      worksheet = (
          "This is from a excel. Pay attention to the cell position:\n"
@@ -27,9 +34,10 @@ async def extract_data_from_sheet(params, sheet_name, sheet, response_schema):
      prompt_docai = prompt_excel_extraction(worksheet)

      try:
-         result = await params["LlmClient"].get_unified_json_genai(
+         result = await llm_client.get_unified_json_genai(
              prompt_docai,
              response_schema=response_schema,
+             doc_type=doc_type,
          )
      except Exception as e:
          result = {}
@@ -42,8 +50,8 @@ async def extract_data_from_excel(
      params,
      input_doc_type,
      file_content,
-     schema_client,
      mime_type,
+     llm_client,
  ):
      """Extract data from the Excel file.

@@ -51,8 +59,8 @@
          params (dict): Parameters for the data extraction process.
          input_doc_type (str): The type of the document.
          file_content (bytes): The content of the Excel file to process.
-         schema_client (DocumentSchemaClient): Client for the Document AI schema.
          mime_type (str): The MIME type of the file.
+         llm_client: The LLM client to use for data extraction.

      Returns:
          formatted_data (list): A list of dictionaries containing the extracted data.
@@ -61,20 +69,42 @@

      """
      # Generate the response structure
-     response_schema = generate_schema_structure(params, input_doc_type)
+     response_schema = (
+         prompt_library.library[input_doc_type]["other"]["placeholders"]
+         if input_doc_type
+         in [
+             "partnerInvoice",
+             "customsInvoice",
+             "bundeskasse",
+             "commercialInvoice",
+             "packingList",
+         ]
+         else generate_schema_structure(params, input_doc_type)
+     )

      # Load the Excel file and get ONLY the "visible" sheet names
      sheets, workbook = get_excel_sheets(file_content, mime_type)

+     # Track the number of sheets in dd-trace
+     span = tracer.current_span()
+     if span:
+         estimated_page_counts = [
+             estimate_page_count(workbook[sheet]) for sheet in sheets
+         ]
+         est_page_count = sum(estimated_page_counts)
+         span.set_metric("est_page_count", est_page_count)
+
      # Excel files may contain multiple sheets. Extract data from each sheet
      sheet_extract_tasks = [
          extract_data_from_sheet(
-             params, sheet_name, workbook[sheet_name], response_schema
+             llm_client,
+             sheet_name,
+             workbook[sheet_name],
+             response_schema,
+             doc_type=input_doc_type,
          )
          for sheet_name in sheets
      ]
      extracted_data = {k: v for k, v in await asyncio.gather(*sheet_extract_tasks)}

-     stored_data = json.dumps(extracted_data)
-
-     return extracted_data, stored_data, params["gemini_params"]["model_id"]
+     return extracted_data, extracted_data, llm_client.model_id
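The Excel path now takes the LLM client directly and fans out one task per visible sheet via asyncio.gather. A hedged usage sketch of the new signature; params, llm_client, and the file bytes are stand-ins for the application's real objects:

    from src.excel_processing import extract_data_from_excel

    async def run(params: dict, llm_client, file_bytes: bytes):
        # New argument order: schema_client is gone, llm_client comes last.
        extracted, stored, model_id = await extract_data_from_excel(
            params,
            "partnerInvoice",
            file_bytes,
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            llm_client,
        )
        # extracted and stored are now the same dict (no json.dumps step).
        return extracted, model_id
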
src/io.py
@@ -12,13 +12,55 @@ from pathlib import Path
  from google.cloud import bigquery, storage


+ def get_gcp_labels(**extra_labels):
+     """Generate standardized GCP labels for cost tracking.
+
+     Args:
+         **extra_labels: Additional custom labels
+
+     Returns:
+         dict: Labels dictionary with keys normalized (lowercase, hyphens, max 63 chars)
+     """
+     project_name = os.getenv("PROJECT_NAME")
+
+     # If not set, detect once and cache it
+     if not project_name:
+         # Try pyproject.toml first
+         try:
+             import toml
+
+             pyproject_path = Path(__file__).parent.parent / "pyproject.toml"
+             if pyproject_path.exists():
+                 config = toml.load(pyproject_path)
+                 project_name = config.get("tool", {}).get("poetry", {}).get("name")
+         except Exception:
+             pass
+
+         # Fallback to unknown
+         if not project_name:
+             project_name = "unknown"
+
+         # Cache it
+         os.environ["PROJECT_NAME"] = project_name
+
+     labels = {
+         "ds-project-name": project_name.lower(),
+         "ds-env": os.getenv("CLUSTER", "local").lower(),
+     }
+
+     # Add any extra labels
+     labels.update({k.lower(): str(v).lower() for k, v in extra_labels.items()})
+
+     return labels
+
+
  def get_bq_client(params):
      """Get Google BigQuery client."""
      bq_client = bigquery.Client(project=params["g_ai_project_name"])
      job_config = bigquery.QueryJobConfig(
          allow_large_results=True,
          # flatten_results=True,
-         labels={"project-name": params["project_name"]},
+         labels=get_gcp_labels(),
      )
      return bq_client, job_config

@@ -112,3 +154,6 @@ def download_dir_from_bucket(bucket, directory_cloud, directory_local) -> bool:
          Path(directory).mkdir(parents=True, exist_ok=True)
          blob.download_to_filename(directory_local / Path(blob.name))
      return result
+
+
+ # type: ignore
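From the definition above, the helper's output is easy to pin down. A worked example, assuming PROJECT_NAME and CLUSTER are set in the environment:

    from src.io import get_gcp_labels

    # With PROJECT_NAME=data-science-document-ai and CLUSTER=production:
    labels = get_gcp_labels(doc_type="partnerInvoice")
    # {'ds-project-name': 'data-science-document-ai',
    #  'ds-env': 'production',
    #  'doc_type': 'partnerinvoice'}  # extra label values are lowercased too
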
src/llm.py
@@ -15,6 +15,7 @@ from vertexai.generative_models import (
      Part,
  )

+ from src.io import get_gcp_labels
  from src.utils import cache_on_disk


@@ -28,12 +29,12 @@ class LlmClient:
          # Initialize the model parameters
          self.model_params = {
              "temperature": parameters.get("temperature", 0),
-             "max_output_tokens": parameters.get("maxOutputTokens", 8000),
+             "max_output_tokens": parameters.get("maxOutputTokens", 65536),
              "top_p": parameters.get("top_p", 0.8),
              "top_k": parameters.get("top_k", 40),
              "seed": parameters.get("seed", 42),
          }
-         self.model_id = parameters.get("model_id", "gemini-1.5-pro-001")
+         self.model_id = parameters.get("model_id", "gemini-2.5-flash")
          # Initialize the safety configuration
          self.safety_config = {
              HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
@@ -69,6 +70,7 @@ class LlmClient:
          document: str = None,
          response_schema: dict = None,
          response_mime_type: str = "application/json",
+         doc_type: str = None,
      ):
          """Ask the Gemini model a question.

@@ -76,6 +78,7 @@
              prompt (str): The prompt to send to the model.
              document (str, optional): An optional document to provide context.
              response_schema (dict, optional): Defines a specific response schema for the model.
+             doc_type (str, optional): Document type for cost tracking labels.

          Returns:
              str: The response from the model.
@@ -96,12 +99,13 @@
          # Prepare inputs for the model
          inputs = [document, prompt] if document else prompt

-         # Generate the response
+         # Generate the response with labels for cost tracking
          model_response = await cache_on_disk(
              self.geminy_client.generate_content_async,
              contents=inputs,
              generation_config=config,
              safety_settings=self.safety_config,
+             labels=get_gcp_labels(doc_type=doc_type),
          )

          response_text = model_response.text
@@ -113,7 +117,7 @@
              return "{}"

      async def get_unified_json_genai(
-         self, prompt, document=None, response_schema=None, model="gemini"
+         self, prompt, document=None, response_schema=None, model="gemini", doc_type=None
      ):
          """Send a prompt to a Google Cloud AI Platform model and returns the generated json.

@@ -122,6 +126,7 @@
              document: Content of the PDF document
              response_schema: The schema to use for the response
              model (str): The model to use for the response ["gemini" or "chatGPT"]. Default is "gemini".
+             doc_type (str, optional): Document type for cost tracking labels.

          Returns:
              dict: The generated json from the model.
@@ -131,7 +136,9 @@
              response = await self.ask_chatgpt(prompt, document, response_schema)
          else:
              # Default to Gemini
-             response = await self.ask_gemini(
+             response = await self.ask_gemini(
+                 prompt, document, response_schema, doc_type=doc_type
+             )

          try:
              return json.loads(response)
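Taken together, doc_type now threads from the public JSON helper through ask_gemini into the labels on the Vertex AI call. A hedged end-to-end sketch; the constructor shape is inferred from the parameters.get(...) calls above and is an assumption:

    import asyncio

    from src.constants import project_parameters
    from src.llm import LlmClient

    async def run():
        # Assumed ctor: LlmClient takes the gemini_params dict from constants.py.
        client = LlmClient(project_parameters["gemini_params"])
        return await client.get_unified_json_genai(
            "Extract the invoice number as JSON.",
            response_schema={
                "type": "OBJECT",
                "properties": {"invoiceNumber": {"type": "STRING"}},
            },
            doc_type="partnerInvoice",  # ends up in the request labels
        )

    result = asyncio.run(run())  # dict parsed from the model's JSON response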