data-science-document-ai 1.41.0__py3-none-any.whl → 1.42.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-science-document-ai
3
- Version: 1.41.0
3
+ Version: 1.42.1
4
4
  Summary: "Document AI repo for data science"
5
5
  Author: Naomi Nguyen
6
6
  Author-email: naomi.nguyen@forto.com
@@ -1,13 +1,13 @@
1
1
  src/constants.py,sha256=TF_UblovdXZnKIb1lnyJwUqQncJCbzBVihoelI6foSU,3579
2
2
  src/constants_sandbox.py,sha256=Iu6HdjCoNSmOX0AwoL9qUQkhq_ZnIN5U9e-Q2UfNuGc,547
3
- src/docai.py,sha256=AepGdF3ZuSGkujLpewX393FgOBMy-e4sEudiGKho5EA,5280
3
+ src/docai.py,sha256=dHuR0ehVjUi1CnoNvdp_yxJtpU_HFXqAZ61ywdz7BEo,5655
4
4
  src/docai_processor_config.yaml,sha256=qOMmCIORpLQ_D-ytvejXxFvER0e0uGYuzPVdZBGv4Pc,2105
5
- src/excel_processing.py,sha256=wArdSxwxdgyj5WVgVTaWsVSmF7z5zK6rq-bUKGENmo4,2660
6
- src/io.py,sha256=IXz4wWqiHa9mnHNgtrC6X9M2lItYp9eu6rHCThUIh5c,3585
7
- src/llm.py,sha256=aEK3rL8XvY7CakvkOJQmcHpEKwZRd8PPrLrzHiO-GFk,7827
5
+ src/excel_processing.py,sha256=jBL6h5T3fJ4uM_rFiV8c0yWAy8Tt3V3RFtBBqb8ztfo,2744
6
+ src/io.py,sha256=tOJpMyI-mP1AaXKG4UFudH47MHWzjWBgVahFJUcjGfs,4749
7
+ src/llm.py,sha256=OE4IEIqcM-hYK9U7e0x1rAfcqdpeo4iXPHBp64L5Qz0,8199
8
8
  src/log_setup.py,sha256=RhHnpXqcl-ii4EJzRt47CF2R-Q3YPF68tepg_Kg7tkw,2895
9
- src/pdf_processing.py,sha256=GNHQl_ryyVOHu3FK39XzPJzOCrn01NNW3E2HO43Ot_c,15836
10
- src/postprocessing/common.py,sha256=Vj_NohcgWZRCzipnPGeM-rg11wdDJ-wwCR12QeE6qOY,21451
9
+ src/pdf_processing.py,sha256=dxsYvNnONAjzS-T7K5aSo89rz7QcdW3ZDfeuFyeCeII,16294
10
+ src/postprocessing/common.py,sha256=lc95nGvy-KrFFQyX2X3ABMjrx1xVYDjuTBgeAXQTcuU,21570
11
11
  src/postprocessing/postprocess_booking_confirmation.py,sha256=nK32eDiBNbauyQz0oCa9eraysku8aqzrcoRFoWVumDU,4827
12
12
  src/postprocessing/postprocess_commercial_invoice.py,sha256=3I8ijluTZcOs_sMnFZxfkAPle0UFQ239EMuvZfDZVPg,1028
13
13
  src/postprocessing/postprocess_partner_invoice.py,sha256=cM4te4qjOI_bXyrF8Zhb6X7eNf5aMKoRaPCFfqFv-98,11538
@@ -49,9 +49,9 @@ src/prompts/library/preprocessing/carrier/placeholders.json,sha256=1UmrQNqBEsjLI
49
49
  src/prompts/library/preprocessing/carrier/prompt.txt,sha256=NLvRZQCZ6aWC1yTr7Q93jK5z7Vi_b4HBaiFYYnIsO-w,134
50
50
  src/prompts/library/shippingInstruction/other/prompt.txt,sha256=dT2e-dPuvuz0rVYpwmok_1dWQ2Oa8Qy9NGZ6CCLOUI4,1468
51
51
  src/prompts/prompt_library.py,sha256=jPxybNPPGH7mzonqtAOqmw5WcT-RtbGP0pvMqqP22hg,2760
52
- src/setup.py,sha256=kPSZosrICfaGZeDaajr40Ha7Ok4XK4fo_uq35Omiwr0,7128
52
+ src/setup.py,sha256=M-p5c8M9ejKcSZ9N86VtmtPc4TYLxe1_4_dxf6jpfVc,7262
53
53
  src/tms.py,sha256=UXbIo1QE--hIX6NZi5Qyp2R_CP338syrY9pCTPrfgnE,1741
54
- src/utils.py,sha256=SIEThJlaXWGoWV7236iNoAlabCPNge5oTBpDywTxJw0,15968
55
- data_science_document_ai-1.41.0.dist-info/METADATA,sha256=MmKbqDbe9voabVucTrE-GoM192GMqFgD09_KNvp6Wsg,2153
56
- data_science_document_ai-1.41.0.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
57
- data_science_document_ai-1.41.0.dist-info/RECORD,,
54
+ src/utils.py,sha256=nU69zR3TB7IZmCc19DD8H27Riek8GJAldmhJjCSwNEE,16090
55
+ data_science_document_ai-1.42.1.dist-info/METADATA,sha256=nsGhuml2YNlNF7s7aRUJPpY8psKss8wiLcIavpVInjs,2153
56
+ data_science_document_ai-1.42.1.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
57
+ data_science_document_ai-1.42.1.dist-info/RECORD,,
src/docai.py CHANGED
@@ -3,11 +3,16 @@ import re
3
3
 
4
4
  from google.cloud import documentai
5
5
 
6
- from src.io import delete_folder_from_bucket, logger, upload_pdf_to_bucket
6
+ from src.io import (
7
+ delete_folder_from_bucket,
8
+ get_gcp_labels,
9
+ logger,
10
+ upload_pdf_to_bucket,
11
+ )
7
12
  from src.utils import cache_on_disk
8
13
 
9
14
 
10
- async def _process_pdf_w_docai(image_content, client, processor_name):
15
+ async def _process_pdf_w_docai(image_content, client, processor_name, doc_type=None):
11
16
  """Process the PDF using Document AI.
12
17
 
13
18
  Args:
@@ -15,6 +20,7 @@ async def _process_pdf_w_docai(image_content, client, processor_name):
15
20
  client: The Document AI client.
16
21
  processor_name (str): The name of the processor to be used.
17
22
  e.g.: projects/{project_id}/locations/{location}/processor/{processor_id}
23
+ doc_type (str, optional): Document type for cost tracking labels.
18
24
 
19
25
  Returns:
20
26
  The processed document.
@@ -24,10 +30,11 @@ async def _process_pdf_w_docai(image_content, client, processor_name):
24
30
  content=image_content, mime_type="application/pdf"
25
31
  )
26
32
 
27
- # Configure the process request
33
+ # Configure the process request with labels for cost tracking
28
34
  request = documentai.ProcessRequest(
29
35
  name=processor_name,
30
36
  raw_document=raw_document, # field_mask=field_mask
37
+ labels=get_gcp_labels(doc_type=doc_type),
31
38
  )
32
39
  result = await cache_on_disk(client.process_document, request=request)
33
40
 
@@ -35,7 +42,7 @@ async def _process_pdf_w_docai(image_content, client, processor_name):
35
42
 
36
43
 
37
44
  async def _batch_process_pdf_w_docai(
38
- params, image_content, client, processor_name, timeout=1200
45
+ params, image_content, client, processor_name, timeout=1200, doc_type=None
39
46
  ):
40
47
  """Process the PDF using Document AI Batch Process API.
41
48
 
@@ -45,6 +52,7 @@ async def _batch_process_pdf_w_docai(
45
52
  processor_name (str): The name of the processor to be used.
46
53
  e.g.: projects/{project_id}/locations/{location}/processor/{processor_id}
47
54
  timeout (int, optional): The timeout in seconds. Defaults to 1200.
55
+ doc_type (str, optional): Document type for cost tracking labels.
48
56
 
49
57
  Returns:
50
58
  The processed document.
@@ -72,11 +80,12 @@ async def _batch_process_pdf_w_docai(
72
80
  # Where to write results
73
81
  output_config = documentai.DocumentOutputConfig(gcs_output_config=gcs_output_config)
74
82
 
75
- # The full resource name of the processor
83
+ # The full resource name of the processor with labels for cost tracking
76
84
  request = documentai.BatchProcessRequest(
77
85
  name=processor_name,
78
86
  input_documents=input_config,
79
87
  document_output_config=output_config,
88
+ labels=get_gcp_labels(doc_type=doc_type),
80
89
  )
81
90
 
82
91
  # BatchProcess returns a Long Running Operation (LRO)
src/excel_processing.py CHANGED
@@ -16,7 +16,9 @@ from src.llm import prompt_excel_extraction
16
16
  from src.utils import generate_schema_structure, get_excel_sheets
17
17
 
18
18
 
19
- async def extract_data_from_sheet(params, sheet_name, sheet, response_schema):
19
+ async def extract_data_from_sheet(
20
+ params, sheet_name, sheet, response_schema, doc_type=None
21
+ ):
20
22
  logger.info(f"Processing sheet: {sheet_name}")
21
23
  excel_content = pd.DataFrame(sheet.values)
22
24
  # Convert to Markdown format for the LLM model
@@ -32,6 +34,7 @@ async def extract_data_from_sheet(params, sheet_name, sheet, response_schema):
32
34
  result = await params["LlmClient"].get_unified_json_genai(
33
35
  prompt_docai,
34
36
  response_schema=response_schema,
37
+ doc_type=doc_type,
35
38
  )
36
39
  except Exception as e:
37
40
  result = {}
@@ -69,7 +72,11 @@ async def extract_data_from_excel(
69
72
  # Excel files may contain multiple sheets. Extract data from each sheet
70
73
  sheet_extract_tasks = [
71
74
  extract_data_from_sheet(
72
- params, sheet_name, workbook[sheet_name], response_schema
75
+ params,
76
+ sheet_name,
77
+ workbook[sheet_name],
78
+ response_schema,
79
+ doc_type=input_doc_type,
73
80
  )
74
81
  for sheet_name in sheets
75
82
  ]
@@ -77,6 +84,5 @@ async def extract_data_from_excel(
77
84
 
78
85
  # Convert LLM prediction dictionary to tuples of (value, page_number).
79
86
  extracted_data = llm_prediction_to_tuples(extracted_data)
80
- stored_data = json.dumps(extracted_data)
81
87
 
82
- return extracted_data, stored_data, params["gemini_params"]["model_id"]
88
+ return extracted_data, extracted_data, params["gemini_params"]["model_id"]
src/io.py CHANGED
@@ -12,13 +12,55 @@ from pathlib import Path
12
12
  from google.cloud import bigquery, storage
13
13
 
14
14
 
15
+ def get_gcp_labels(**extra_labels):
16
+ """Generate standardized GCP labels for cost tracking.
17
+
18
+ Args:
19
+ **extra_labels: Additional custom labels
20
+
21
+ Returns:
22
+ dict: Labels dictionary with keys normalized (lowercase, hyphens, max 63 chars)
23
+ """
24
+ project_name = os.getenv("PROJECT_NAME")
25
+
26
+ # If not set, detect once and cache it
27
+ if not project_name:
28
+ # Try pyproject.toml first
29
+ try:
30
+ import toml
31
+
32
+ pyproject_path = Path(__file__).parent.parent / "pyproject.toml"
33
+ if pyproject_path.exists():
34
+ config = toml.load(pyproject_path)
35
+ project_name = config.get("tool", {}).get("poetry", {}).get("name")
36
+ except Exception:
37
+ pass
38
+
39
+ # Fallback to unknown
40
+ if not project_name:
41
+ project_name = "unknown"
42
+
43
+ # Cache it
44
+ os.environ["PROJECT_NAME"] = project_name
45
+
46
+ labels = {
47
+ "ds-project-name": project_name.lower(),
48
+ "ds-env": os.getenv("CLUSTER", "local").lower(),
49
+ }
50
+
51
+ # Add any extra labels
52
+ labels.update({k.lower(): str(v).lower() for k, v in extra_labels.items()})
53
+
54
+ return labels
55
+
56
+
15
57
  def get_bq_client(params):
16
58
  """Get Google BigQuery client."""
17
59
  bq_client = bigquery.Client(project=params["g_ai_project_name"])
18
60
  job_config = bigquery.QueryJobConfig(
19
61
  allow_large_results=True,
20
62
  # flatten_results=True,
21
- labels={"project-name": params["project_name"]},
63
+ labels=get_gcp_labels(),
22
64
  )
23
65
  return bq_client, job_config
24
66
 
@@ -112,3 +154,6 @@ def download_dir_from_bucket(bucket, directory_cloud, directory_local) -> bool:
112
154
  Path(directory).mkdir(parents=True, exist_ok=True)
113
155
  blob.download_to_filename(directory_local / Path(blob.name))
114
156
  return result
157
+
158
+
159
+ # type: ignore
src/llm.py CHANGED
@@ -15,6 +15,7 @@ from vertexai.generative_models import (
15
15
  Part,
16
16
  )
17
17
 
18
+ from src.io import get_gcp_labels
18
19
  from src.utils import cache_on_disk
19
20
 
20
21
 
@@ -69,6 +70,7 @@ class LlmClient:
69
70
  document: str = None,
70
71
  response_schema: dict = None,
71
72
  response_mime_type: str = "application/json",
73
+ doc_type: str = None,
72
74
  ):
73
75
  """Ask the Gemini model a question.
74
76
 
@@ -76,6 +78,7 @@ class LlmClient:
76
78
  prompt (str): The prompt to send to the model.
77
79
  document (str, optional): An optional document to provide context.
78
80
  response_schema (dict, optional): Defines a specific response schema for the model.
81
+ doc_type (str, optional): Document type for cost tracking labels.
79
82
 
80
83
  Returns:
81
84
  str: The response from the model.
@@ -96,12 +99,13 @@ class LlmClient:
96
99
  # Prepare inputs for the model
97
100
  inputs = [document, prompt] if document else prompt
98
101
 
99
- # Generate the response
102
+ # Generate the response with labels for cost tracking
100
103
  model_response = await cache_on_disk(
101
104
  self.geminy_client.generate_content_async,
102
105
  contents=inputs,
103
106
  generation_config=config,
104
107
  safety_settings=self.safety_config,
108
+ labels=get_gcp_labels(doc_type=doc_type),
105
109
  )
106
110
 
107
111
  response_text = model_response.text
@@ -113,7 +117,7 @@ class LlmClient:
113
117
  return "{}"
114
118
 
115
119
  async def get_unified_json_genai(
116
- self, prompt, document=None, response_schema=None, model="gemini"
120
+ self, prompt, document=None, response_schema=None, model="gemini", doc_type=None
117
121
  ):
118
122
  """Send a prompt to a Google Cloud AI Platform model and returns the generated json.
119
123
 
@@ -122,6 +126,7 @@ class LlmClient:
122
126
  document: Content of the PDF document
123
127
  response_schema: The schema to use for the response
124
128
  model (str): The model to use for the response ["gemini" or "chatGPT"]. Default is "gemini".
129
+ doc_type (str, optional): Document type for cost tracking labels.
125
130
 
126
131
  Returns:
127
132
  dict: The generated json from the model.
@@ -131,7 +136,9 @@ class LlmClient:
131
136
  response = await self.ask_chatgpt(prompt, document, response_schema)
132
137
  else:
133
138
  # Default to Gemini
134
- response = await self.ask_gemini(prompt, document, response_schema)
139
+ response = await self.ask_gemini(
140
+ prompt, document, response_schema, doc_type=doc_type
141
+ )
135
142
 
136
143
  try:
137
144
  return json.loads(response)
src/pdf_processing.py CHANGED
@@ -14,7 +14,11 @@ from google.cloud.documentai_v1 import Document as docaiv1_document
14
14
 
15
15
  from src.docai import _batch_process_pdf_w_docai, _process_pdf_w_docai
16
16
  from src.excel_processing import extract_data_from_excel
17
- from src.postprocessing.common import format_all_entities, remove_none_values, llm_prediction_to_tuples
17
+ from src.postprocessing.common import (
18
+ format_all_entities,
19
+ llm_prediction_to_tuples,
20
+ remove_none_values,
21
+ )
18
22
  from src.postprocessing.postprocess_booking_confirmation import (
19
23
  postprocess_booking_confirmation,
20
24
  )
@@ -30,12 +34,14 @@ from src.utils import (
30
34
  generate_schema_structure,
31
35
  get_processor_name,
32
36
  run_background_tasks,
37
+ transform_schema_strings,
33
38
  validate_based_on_schema,
34
- transform_schema_strings
35
39
  )
36
40
 
37
41
 
38
- async def process_file_w_docai(params, image_content, client, processor_name):
42
+ async def process_file_w_docai(
43
+ params, image_content, client, processor_name, doc_type=None
44
+ ):
39
45
  """
40
46
  Process a file using Document AI.
41
47
 
@@ -44,6 +50,7 @@ async def process_file_w_docai(params, image_content, client, processor_name):
44
50
  image_content (bytes): The file to be processed. It can be bytes object.
45
51
  client: The Document AI client.
46
52
  processor_name (str): The name of the processor to be used.
53
+ doc_type (str, optional): Document type for cost tracking labels.
47
54
 
48
55
  Returns:
49
56
  The processed document.
@@ -55,7 +62,9 @@ async def process_file_w_docai(params, image_content, client, processor_name):
55
62
 
56
63
  try:
57
64
  logger.info("Processing document...")
58
- result = await _process_pdf_w_docai(image_content, client, processor_name)
65
+ result = await _process_pdf_w_docai(
66
+ image_content, client, processor_name, doc_type=doc_type
67
+ )
59
68
  except Exception as e:
60
69
  if e.reason == "PAGE_LIMIT_EXCEEDED":
61
70
  logger.warning(
@@ -64,7 +73,7 @@ async def process_file_w_docai(params, image_content, client, processor_name):
64
73
  # Process the document in batch method (offline processing)
65
74
  try:
66
75
  result = await _batch_process_pdf_w_docai(
67
- params, image_content, client, processor_name
76
+ params, image_content, client, processor_name, doc_type=doc_type
68
77
  )
69
78
  except Exception as batch_e:
70
79
  logger.error(f"Error processing document {batch_e}.")
@@ -94,7 +103,7 @@ async def extract_data_from_pdf_w_docai(
94
103
  )
95
104
 
96
105
  result = await process_file_w_docai(
97
- params, file_content, processor_client, processor_name
106
+ params, file_content, processor_client, processor_name, doc_type=input_doc_type
98
107
  )
99
108
 
100
109
  # Create an entity object to store the result in gcs
@@ -105,16 +114,22 @@ async def extract_data_from_pdf_w_docai(
105
114
  # Extract entities from the result
106
115
  for entity in result.entities:
107
116
  value = (
108
- {child.type_: (child.mention_text,
109
- child.page_anchor.page_refs[0].page
110
- if hasattr(child.page_anchor.page_refs[0], "page")
111
- else 0)
112
- for child in entity.properties}
117
+ {
118
+ child.type_: (
119
+ child.mention_text,
120
+ child.page_anchor.page_refs[0].page
121
+ if hasattr(child.page_anchor.page_refs[0], "page")
122
+ else 0,
123
+ )
124
+ for child in entity.properties
125
+ }
113
126
  if entity.properties
114
- else (entity.mention_text,
115
- entity.page_anchor.page_refs[0].page
116
- if hasattr(entity.page_anchor.page_refs[0], "page")
117
- else 0)
127
+ else (
128
+ entity.mention_text,
129
+ entity.page_anchor.page_refs[0].page
130
+ if hasattr(entity.page_anchor.page_refs[0], "page")
131
+ else 0,
132
+ )
118
133
  )
119
134
  aggregated_data[entity.type_].append(value)
120
135
 
@@ -145,7 +160,9 @@ async def extract_data_from_pdf_w_docai(
145
160
  return aggregated_data, result_for_store, processor_version
146
161
 
147
162
 
148
- async def identify_carrier(document, llm_client, prompt, response_schema):
163
+ async def identify_carrier(
164
+ document, llm_client, prompt, response_schema, doc_type=None
165
+ ):
149
166
  """Identify the carrier from the Booking Confirmation document."""
150
167
 
151
168
  result = await llm_client.ask_gemini(
@@ -153,6 +170,7 @@ async def identify_carrier(document, llm_client, prompt, response_schema):
153
170
  document=document,
154
171
  response_schema=response_schema,
155
172
  response_mime_type="text/x.enum",
173
+ doc_type=doc_type,
156
174
  )
157
175
 
158
176
  if result:
@@ -209,7 +227,11 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
209
227
 
210
228
  # identify carrier for customized prompting
211
229
  carrier = await identify_carrier(
212
- document, llm_client, carrier_prompt, carrier_schema
230
+ document,
231
+ llm_client,
232
+ carrier_prompt,
233
+ carrier_schema,
234
+ doc_type=input_doc_type,
213
235
  )
214
236
 
215
237
  if input_doc_type == "bookingConfirmation":
@@ -226,7 +248,10 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
226
248
 
227
249
  # generate the result with LLM (gemini)
228
250
  result = await llm_client.get_unified_json_genai(
229
- prompt=prompt, document=document, response_schema=response_schema
251
+ prompt=prompt,
252
+ document=document,
253
+ response_schema=response_schema,
254
+ doc_type=input_doc_type,
230
255
  )
231
256
 
232
257
  result = llm_prediction_to_tuples(result)
@@ -389,7 +389,9 @@ async def format_label(entity_k, entity_value, document_type_code, params):
389
389
  formatted_value = None
390
390
 
391
391
  if entity_key.startswith("port"):
392
- formatted_value = await get_port_code_ai(entity_value, llm_client)
392
+ formatted_value = await get_port_code_ai(
393
+ entity_value, llm_client, doc_type=document_type_code
394
+ )
393
395
 
394
396
  elif (entity_key == "containertype") or (entity_key == "containersize"):
395
397
  formatted_value = get_tms_mappings(entity_value, "container_types")
@@ -484,14 +486,14 @@ async def format_label(entity_k, entity_value, document_type_code, params):
484
486
  return entity_k, result
485
487
 
486
488
 
487
- async def get_port_code_ai(port: str, llm_client):
489
+ async def get_port_code_ai(port: str, llm_client, doc_type=None):
488
490
  """Get port code using AI model."""
489
- port_llm = await get_port_code_llm(port, llm_client)
491
+ port_llm = await get_port_code_llm(port, llm_client, doc_type=doc_type)
490
492
 
491
493
  return get_tms_mappings(port, "ports", port_llm)
492
494
 
493
495
 
494
- async def get_port_code_llm(port: str, llm_client):
496
+ async def get_port_code_llm(port: str, llm_client, doc_type=None):
495
497
  if (
496
498
  "postprocessing" in prompt_library.library.keys()
497
499
  and "port_code" in prompt_library.library["postprocessing"].keys()
@@ -518,7 +520,7 @@ async def get_port_code_llm(port: str, llm_client):
518
520
  }
519
521
 
520
522
  response = await llm_client.get_unified_json_genai(
521
- prompt, response_schema=response_schema, model="chatgpt"
523
+ prompt, response_schema=response_schema, model="chatgpt", doc_type=doc_type
522
524
  )
523
525
  try:
524
526
  mapped_port = response["port"]
src/setup.py CHANGED
@@ -18,7 +18,7 @@ from src.constants import project_parameters
18
18
  from src.constants_sandbox import project_parameters_sandbox
19
19
 
20
20
  # Parent repos are imported without .
21
- from src.io import download_dir_from_bucket, get_storage_client, logger
21
+ from src.io import download_dir_from_bucket, get_bq_client, get_storage_client, logger
22
22
  from src.llm import LlmClient
23
23
 
24
24
 
@@ -118,6 +118,10 @@ def setup_params(args=None):
118
118
 
119
119
  params = setup_docai_client_and_path(params)
120
120
 
121
+ # Set up BigQuery client for logging
122
+ bq_client, _ = get_bq_client(params)
123
+ params["bq_client"] = bq_client
124
+
121
125
  # Set up Vertex AI for text embeddings
122
126
  setup_vertexai(params)
123
127
 
src/utils.py CHANGED
@@ -14,7 +14,7 @@ import requests
14
14
  from google.cloud import documentai_v1beta3 as docu_ai_beta
15
15
  from PyPDF2 import PdfReader, PdfWriter
16
16
 
17
- from src.io import get_bq_client, get_storage_client, logger
17
+ from src.io import get_storage_client, logger
18
18
 
19
19
 
20
20
  def bq_logs(data_to_insert, params):
@@ -24,8 +24,8 @@ def bq_logs(data_to_insert, params):
24
24
  data_to_insert (list): The data to insert into BigQuery.
25
25
  params (dict): The parameters dictionary.
26
26
  """
27
- # Get the BigQuery client
28
- bq_client, config = get_bq_client(params)
27
+ # Use the pre-initialized BigQuery client
28
+ bq_client = params["bq_client"]
29
29
  # Get the table string
30
30
  table_string = f"{params['g_ai_project_name']}.{params['g_ai_gbq_db_schema']}.{params['g_ai_gbq_db_table_out']}"
31
31
 
@@ -139,7 +139,12 @@ def store_json_in_gcs(
139
139
  bucket = storage_client.bucket(params.get("doc_ai_bucket_name"))
140
140
  full_object_name = folder_path + document_id
141
141
  blob = bucket.blob(full_object_name)
142
- blob.upload_from_string(json_data, content_type="application/json")
142
+
143
+ # Convert dict to JSON string if needed
144
+ json_string = (
145
+ json.dumps(json_data) if isinstance(json_data, dict) else json_data
146
+ )
147
+ blob.upload_from_string(json_string, content_type="application/json")
143
148
 
144
149
  logger.info(
145
150
  f"JSON object stored successfully in gs://{params.get('doc_ai_bucket_name')}/{full_object_name}" # noqa
@@ -435,15 +440,13 @@ def transform_schema_strings(schema):
435
440
  new_schema = {
436
441
  "type": "OBJECT",
437
442
  "properties": {
438
- "value": {
439
- "type": "STRING"
440
- },
443
+ "value": {"type": "STRING"},
441
444
  "page_number": {
442
445
  "type": "STRING",
443
- "description": "Number of a page where the value was found in the document starting from 0."
444
- }
446
+ "description": "Number of a page where the value was found in the document starting from 0.",
447
+ },
445
448
  },
446
- "required": []
449
+ "required": [],
447
450
  }
448
451
 
449
452
  # Preserve original properties like nullable and description on the new 'value' key
@@ -468,4 +471,4 @@ def transform_schema_strings(schema):
468
471
 
469
472
  # Base case: for non-dict/list values (e.g., None, bool, str)
470
473
  else:
471
- return schema
474
+ return schema