data-science-document-ai 1.13.0__py3-none-any.whl → 1.56.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. {data_science_document_ai-1.13.0.dist-info → data_science_document_ai-1.56.1.dist-info}/METADATA +7 -2
  2. data_science_document_ai-1.56.1.dist-info/RECORD +60 -0
  3. {data_science_document_ai-1.13.0.dist-info → data_science_document_ai-1.56.1.dist-info}/WHEEL +1 -1
  4. src/constants.py +42 -12
  5. src/constants_sandbox.py +2 -22
  6. src/docai.py +18 -7
  7. src/docai_processor_config.yaml +0 -64
  8. src/excel_processing.py +34 -15
  9. src/io.py +74 -6
  10. src/llm.py +12 -34
  11. src/pdf_processing.py +228 -78
  12. src/postprocessing/common.py +495 -618
  13. src/postprocessing/postprocess_partner_invoice.py +383 -27
  14. src/prompts/library/arrivalNotice/other/placeholders.json +70 -0
  15. src/prompts/library/arrivalNotice/other/prompt.txt +40 -0
  16. src/prompts/library/bookingConfirmation/evergreen/placeholders.json +17 -17
  17. src/prompts/library/bookingConfirmation/evergreen/prompt.txt +1 -0
  18. src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +18 -18
  19. src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +1 -1
  20. src/prompts/library/bookingConfirmation/maersk/placeholders.json +17 -17
  21. src/prompts/library/bookingConfirmation/maersk/prompt.txt +1 -1
  22. src/prompts/library/bookingConfirmation/msc/placeholders.json +17 -17
  23. src/prompts/library/bookingConfirmation/msc/prompt.txt +1 -1
  24. src/prompts/library/bookingConfirmation/oocl/placeholders.json +17 -17
  25. src/prompts/library/bookingConfirmation/oocl/prompt.txt +3 -1
  26. src/prompts/library/bookingConfirmation/other/placeholders.json +17 -17
  27. src/prompts/library/bookingConfirmation/other/prompt.txt +1 -1
  28. src/prompts/library/bookingConfirmation/yangming/placeholders.json +17 -17
  29. src/prompts/library/bookingConfirmation/yangming/prompt.txt +1 -1
  30. src/prompts/library/bundeskasse/other/placeholders.json +113 -0
  31. src/prompts/library/bundeskasse/other/prompt.txt +48 -0
  32. src/prompts/library/commercialInvoice/other/placeholders.json +125 -0
  33. src/prompts/library/commercialInvoice/other/prompt.txt +2 -1
  34. src/prompts/library/customsAssessment/other/placeholders.json +67 -16
  35. src/prompts/library/customsAssessment/other/prompt.txt +24 -37
  36. src/prompts/library/customsInvoice/other/placeholders.json +205 -0
  37. src/prompts/library/customsInvoice/other/prompt.txt +105 -0
  38. src/prompts/library/deliveryOrder/other/placeholders.json +79 -28
  39. src/prompts/library/deliveryOrder/other/prompt.txt +26 -40
  40. src/prompts/library/draftMbl/other/placeholders.json +33 -33
  41. src/prompts/library/draftMbl/other/prompt.txt +34 -44
  42. src/prompts/library/finalMbL/other/placeholders.json +34 -34
  43. src/prompts/library/finalMbL/other/prompt.txt +34 -44
  44. src/prompts/library/packingList/other/placeholders.json +98 -0
  45. src/prompts/library/packingList/other/prompt.txt +1 -1
  46. src/prompts/library/partnerInvoice/other/placeholders.json +165 -45
  47. src/prompts/library/partnerInvoice/other/prompt.txt +82 -44
  48. src/prompts/library/preprocessing/carrier/placeholders.json +0 -16
  49. src/prompts/library/shippingInstruction/other/placeholders.json +115 -0
  50. src/prompts/library/shippingInstruction/other/prompt.txt +28 -15
  51. src/setup.py +73 -63
  52. src/utils.py +207 -30
  53. data_science_document_ai-1.13.0.dist-info/RECORD +0 -55
  54. src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -44
  55. src/prompts/library/draftMbl/maersk/prompt.txt +0 -17
  56. src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -44
  57. src/prompts/library/finalMbL/maersk/prompt.txt +0 -17
src/llm.py CHANGED
@@ -15,6 +15,7 @@ from vertexai.generative_models import (
     Part,
 )
 
+from src.io import get_gcp_labels
 from src.utils import cache_on_disk
 
 
@@ -28,12 +29,12 @@ class LlmClient:
         # Initialize the model parameters
         self.model_params = {
             "temperature": parameters.get("temperature", 0),
-            "max_output_tokens": parameters.get("maxOutputTokens", 8000),
+            "max_output_tokens": parameters.get("maxOutputTokens", 65536),
             "top_p": parameters.get("top_p", 0.8),
             "top_k": parameters.get("top_k", 40),
             "seed": parameters.get("seed", 42),
         }
-        self.model_id = parameters.get("model_id", "gemini-1.5-pro-001")
+        self.model_id = parameters.get("model_id", "gemini-2.5-flash")
         # Initialize the safety configuration
         self.safety_config = {
             HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
@@ -69,6 +70,7 @@ class LlmClient:
         document: str = None,
         response_schema: dict = None,
         response_mime_type: str = "application/json",
+        doc_type: str = None,
     ):
         """Ask the Gemini model a question.
 
@@ -76,6 +78,7 @@ class LlmClient:
             prompt (str): The prompt to send to the model.
             document (str, optional): An optional document to provide context.
             response_schema (dict, optional): Defines a specific response schema for the model.
+            doc_type (str, optional): Document type for cost tracking labels.
 
         Returns:
             str: The response from the model.
@@ -96,12 +99,13 @@ class LlmClient:
         # Prepare inputs for the model
         inputs = [document, prompt] if document else prompt
 
-        # Generate the response
+        # Generate the response with labels for cost tracking
         model_response = await cache_on_disk(
             self.geminy_client.generate_content_async,
             contents=inputs,
             generation_config=config,
             safety_settings=self.safety_config,
+            labels=get_gcp_labels(doc_type=doc_type),
         )
 
         response_text = model_response.text
@@ -113,7 +117,7 @@ class LlmClient:
         return "{}"
 
     async def get_unified_json_genai(
-        self, prompt, document=None, response_schema=None, model="gemini"
+        self, prompt, document=None, response_schema=None, model="gemini", doc_type=None
     ):
         """Send a prompt to a Google Cloud AI Platform model and returns the generated json.
 
@@ -122,6 +126,7 @@ class LlmClient:
             document: Content of the PDF document
             response_schema: The schema to use for the response
             model (str): The model to use for the response ["gemini" or "chatGPT"]. Default is "gemini".
+            doc_type (str, optional): Document type for cost tracking labels.
 
         Returns:
             dict: The generated json from the model.
@@ -131,7 +136,9 @@ class LlmClient:
             response = await self.ask_chatgpt(prompt, document, response_schema)
         else:
             # Default to Gemini
-            response = await self.ask_gemini(prompt, document, response_schema)
+            response = await self.ask_gemini(
+                prompt, document, response_schema, doc_type=doc_type
+            )
 
         try:
             return json.loads(response)
@@ -194,33 +201,4 @@ class LlmClient:
         return response
 
 
-    def prompt_excel_extraction(excel_structured_text):
-        """Write a prompt to extract data from Excel files.
-
-        Args:
-            excel_structured_text (str): The structured text of the Excel file.
-
-        Returns:
-            prompt str: The prompt for common json.
-        """
-        prompt = f"""{excel_structured_text}
-
-        Task: Fill in the following dictionary from the information in the given in the above excel data.
-
-        Instructions:
-        - Do not change the keys of the following dictionary.
-        - The values should be filled in as per the schema provided below.
-        - If an entity contains a 'display_name', consider its properties as child data points in the below format.
-        {{'data-field': {{
-            'child-data-field': 'type -occurrence_type- description',
-            }}
-        }}
-        - The entity with 'display_name' can be extracted multiple times. Please pay attention to the occurrence_type.
-        - Ensure the schema reflects the hierarchical relationship.
-        - Use the data field description to understand the context of the data.
-
-        """
-        return prompt
-
-
 # pylint: enable=all
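The thread running through this file's changes is cost attribution: `doc_type` now flows from every public entry point down to `generate_content_async`, which is called with `labels=get_gcp_labels(doc_type=doc_type)`. The body of `get_gcp_labels` (imported from `src/io.py`) is not part of this diff; the sketch below is a hypothetical illustration of such a helper, assuming it returns GCP-compliant label key/value pairs.

    # Hypothetical sketch only -- the real get_gcp_labels lives in src/io.py
    # and its implementation is not shown in this diff.
    def get_gcp_labels(doc_type: str = None) -> dict:
        """Build request labels so Vertex AI usage can be attributed per document type."""
        labels = {"service": "data-science-document-ai"}
        if doc_type:
            # GCP label values must be lowercase letters, digits, '-' or '_'
            labels["doc_type"] = doc_type.lower()
        return labels

Callers then pass the document type through the public API, e.g. `await llm_client.get_unified_json_genai(prompt=prompt, document=document, doc_type="partnerInvoice")`, so that per-document-type spend can be broken out in billing reports filtered by label.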
src/pdf_processing.py CHANGED
@@ -2,19 +2,24 @@
 # flake8: noqa: E402
 
 import logging
+import os
 
 logger = logging.getLogger(__name__)
 
 import asyncio
-import random
 from collections import defaultdict
 
+from ddtrace import tracer
 from fastapi import HTTPException
 from google.cloud.documentai_v1 import Document as docaiv1_document
 
 from src.docai import _batch_process_pdf_w_docai, _process_pdf_w_docai
 from src.excel_processing import extract_data_from_excel
-from src.postprocessing.common import format_all_entities, remove_none_values
+from src.postprocessing.common import (
+    format_all_entities,
+    llm_prediction_to_tuples,
+    remove_none_values,
+)
 from src.postprocessing.postprocess_booking_confirmation import (
     postprocess_booking_confirmation,
 )
@@ -26,14 +31,19 @@ from src.postprocessing.postprocess_partner_invoice import (
 )
 from src.prompts.prompt_library import prompt_library
 from src.utils import (
-    generate_schema_structure,
+    extract_top_pages,
+    get_pdf_page_count,
     get_processor_name,
     run_background_tasks,
+    split_pdf_into_chunks,
+    transform_schema_strings,
     validate_based_on_schema,
 )
 
 
-async def process_file_w_docai(params, image_content, client, processor_name):
+async def process_file_w_docai(
+    params, image_content, client, processor_name, doc_type=None
+):
     """
     Process a file using Document AI.
 
@@ -42,6 +52,7 @@ async def process_file_w_docai(params, image_content, client, processor_name):
         image_content (bytes): The file to be processed. It can be bytes object.
         client: The Document AI client.
         processor_name (str): The name of the processor to be used.
+        doc_type (str, optional): Document type for cost tracking labels.
 
     Returns:
         The processed document.
@@ -53,7 +64,9 @@
 
     try:
         logger.info("Processing document...")
-        result = await _process_pdf_w_docai(image_content, client, processor_name)
+        result = await _process_pdf_w_docai(
+            image_content, client, processor_name, doc_type=doc_type
+        )
     except Exception as e:
         if e.reason == "PAGE_LIMIT_EXCEEDED":
             logger.warning(
@@ -62,7 +75,7 @@
             # Process the document in batch method (offline processing)
             try:
                 result = await _batch_process_pdf_w_docai(
-                    params, image_content, client, processor_name
+                    params, image_content, client, processor_name, doc_type=doc_type
                 )
             except Exception as batch_e:
                 logger.error(f"Error processing document {batch_e}.")
@@ -92,7 +105,7 @@ async def extract_data_from_pdf_w_docai(
     )
 
     result = await process_file_w_docai(
-        params, file_content, processor_client, processor_name
+        params, file_content, processor_client, processor_name, doc_type=input_doc_type
     )
 
     # Create an entity object to store the result in gcs
@@ -103,9 +116,22 @@
     # Extract entities from the result
     for entity in result.entities:
         value = (
-            {child.type_: child.mention_text for child in entity.properties}
+            {
+                child.type_: (
+                    child.mention_text,
+                    child.page_anchor.page_refs[0].page
+                    if hasattr(child.page_anchor.page_refs[0], "page")
+                    else 0,
+                )
+                for child in entity.properties
+            }
             if entity.properties
-            else entity.mention_text
+            else (
+                entity.mention_text,
+                entity.page_anchor.page_refs[0].page
+                if hasattr(entity.page_anchor.page_refs[0], "page")
+                else 0,
+            )
         )
         aggregated_data[entity.type_].append(value)
 
@@ -121,7 +147,7 @@
     ):
         aggregated_data = postprocess_booking_confirmation(aggregated_data)
         logger.info("Transport Legs assembled successfully")
-    elif input_doc_type == "partnerInvoice":
+    elif input_doc_type in ["partnerInvoice", "customsInvoice"]:
         aggregated_data = postprocessing_partner_invoice(aggregated_data)
         logger.info("Partner Invoice naming changed successfully")
 
@@ -136,7 +162,9 @@
     return aggregated_data, result_for_store, processor_version
 
 
-async def identify_carrier(document, llm_client, prompt, response_schema):
+async def identify_carrier(
+    document, llm_client, prompt, response_schema, doc_type=None
+):
     """Identify the carrier from the Booking Confirmation document."""
 
     result = await llm_client.ask_gemini(
@@ -144,92 +172,183 @@
         document=document,
         response_schema=response_schema,
         response_mime_type="text/x.enum",
+        doc_type=doc_type,
     )
 
     if result:
-        result = result.lower()
+        result = result.strip().lower()
     else:
         result = "other"
     return result
 
 
-async def process_file_w_llm(
-    params, file_content, input_doc_type, schema_client, llm_client
-):
+async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
     """Process a document using a language model (gemini) to extract structured data.
 
     Args:
         params (dict): The project parameters.
         file_content (str): The content of the file to be processed.
         input_doc_type (str): The type of document, used to select the appropriate prompt from the prompt library.
-        schema_client (object): The schema client object.
         llm_client: The LLM client object.
 
     Returns:
         result (dict): The structured data extracted from the document, formatted as JSON.
     """
-    # convert file_content to required document
-    document = llm_client.prepare_document_for_gemini(file_content)
+    # Bundeskasse invoices contains all the required information in the first 3 pages.
+    if input_doc_type == "bundeskasse":
+        file_content = extract_top_pages(file_content, num_pages=5)
+
+    number_of_pages = get_pdf_page_count(file_content)
+    logger.info(f"processing {input_doc_type} with {number_of_pages} pages...")
 
-    # get the schema placeholder from the Doc AI and generate the response structure
-    response_schema = generate_schema_structure(params, input_doc_type)
+    # get the schema placeholder
+    response_schema = prompt_library.library[input_doc_type]["other"]["placeholders"]
 
     carrier = "other"
-    if (
-        "preprocessing" in prompt_library.library.keys()
-        and "carrier" in prompt_library.library["preprocessing"].keys()
-        and input_doc_type
-        in prompt_library.library["preprocessing"]["carrier"]["placeholders"].keys()
-    ):
-        carrier_schema = prompt_library.library["preprocessing"]["carrier"][
-            "placeholders"
-        ][input_doc_type]
+    carrier_schema = (
+        prompt_library.library.get("preprocessing", {})
+        .get("carrier", {})
+        .get("placeholders", {})
+        .get(input_doc_type)
+    )
 
+    if carrier_schema:
         carrier_prompt = prompt_library.library["preprocessing"]["carrier"]["prompt"]
         carrier_prompt = carrier_prompt.replace(
             "DOCUMENT_TYPE_PLACEHOLDER", input_doc_type
         )
 
+        # convert file_content to required document
+        document = llm_client.prepare_document_for_gemini(file_content)
+
         # identify carrier for customized prompting
         carrier = await identify_carrier(
-            document, llm_client, carrier_prompt, carrier_schema
+            document,
+            llm_client,
+            carrier_prompt,
+            carrier_schema,
+            doc_type=input_doc_type,
        )
 
-    # TODO: Remove the below line after the BC schema is updated in the Doc AI model
-    if input_doc_type == "bookingConfirmation":
-        response_schema = prompt_library.library[input_doc_type][carrier.lower()][
-            "placeholders"
-        ]
-    # There is one more additional field in partnerInvoice
-    # the reverseChargeSentence is added on later so its not available in Doc Ai schema.
-    elif input_doc_type == "partnerInvoice":
-        response_schema = prompt_library.library[input_doc_type][carrier.lower()][
-            "placeholders"
-        ]
-
+    # Select prompt
     if (
-        input_doc_type in prompt_library.library.keys()
-        and carrier.lower() in prompt_library.library[input_doc_type].keys()
+        input_doc_type not in prompt_library.library
+        or carrier not in prompt_library.library[input_doc_type]
     ):
-        # get the related prompt from predefined prompt library
-        prompt = prompt_library.library[input_doc_type][carrier.lower()]["prompt"]
+        return {}
+
+    # get the related prompt from predefined prompt library
+    prompt = prompt_library.library[input_doc_type][carrier]["prompt"]
+
+    # Add page-number extraction for moderately large docs
+    use_chunking = number_of_pages >= params["chunk_after"]
+
+    # Update schema and prompt to extract value-page_number pairs
+    if not use_chunking and number_of_pages > 1:
+        response_schema = transform_schema_strings(response_schema)
+        prompt += "\nFor each field, provide the page number where the information was found. The page numbering starts from 0."
 
-        # generate the result with LLM (gemini)
-        result = await llm_client.get_unified_json_genai(
-            prompt=prompt, document=document, response_schema=response_schema
+    tasks = []
+    # Process in chunks if number of pages exceeds threshold and Process all chunks concurrently
+    for chunk in (
+        split_pdf_into_chunks(file_content, chunk_size=params["chunk_size"])
+        if use_chunking
+        else [file_content]
+    ):
+        tasks.append(
+            process_chunk_with_retry(
+                chunk,
+                prompt,
+                response_schema,
+                llm_client,
+                input_doc_type,
+            )
         )
-        return result
-    return {}
+
+    results = await asyncio.gather(*tasks, return_exceptions=True)
+
+    if use_chunking:
+        return merge_llm_results(results, response_schema)
+    else:
+        return llm_prediction_to_tuples(results[0], number_of_pages=number_of_pages)
+
+
+async def process_chunk_with_retry(
+    chunk_content, prompt, response_schema, llm_client, input_doc_type, retries=2
+):
+    """Process a chunk with retries in case of failure."""
+    for attempt in range(1, retries + 1):
+        try:
+            return await process_chunk(
+                chunk_content=chunk_content,
+                prompt=prompt,
+                response_schema=response_schema,
+                llm_client=llm_client,
+                input_doc_type=input_doc_type,
+            )
+        except Exception as e:
+            logger.error(f"Chunk failed on attempt {attempt}: {e}")
+            if attempt == retries:
+                raise
+            await asyncio.sleep(1)  # small backoff
 
 
-async def extract_data_from_pdf_w_llm(
-    params, input_doc_type, file_content, schema_client, llm_client
+async def process_chunk(
+    chunk_content, prompt, response_schema, llm_client, input_doc_type
 ):
+    """Process a chunk with Gemini."""
+    document = llm_client.prepare_document_for_gemini(chunk_content)
+    return await llm_client.get_unified_json_genai(
+        prompt=prompt,
+        document=document,
+        response_schema=response_schema,
+        doc_type=input_doc_type,
+    )
+
+
+def merge_llm_results(results, response_schema):
+    """Merge LLM results from multiple chunks."""
+    merged = {}
+    for i, result in enumerate(results):
+        if not isinstance(result, dict):
+            continue
+        # Add page number to all values coming from this chunk
+        result = llm_prediction_to_tuples(result, number_of_pages=1, page_number=i)
+
+        # Merge the result into the final merged dictionary
+        for key, value in result.items():
+            field_type = (
+                response_schema["properties"].get(key, {}).get("type", "").upper()
+            )
+
+            if key not in merged:
+                if field_type == "ARRAY":
+                    # append the values as a list
+                    merged[key] = (
+                        value if isinstance(value, list) else ([value] if value else [])
+                    )
+                else:
+                    merged[key] = value
+                continue
+
+            if field_type == "ARRAY":
+                # append list contents across chunks
+                if isinstance(value, list):
+                    merged[key].extend(value)
+                else:
+                    merged[key].append(value)
+
+            # take first non-null value only
+            if merged[key] in (None, "", [], {}):
+                merged[key] = value
+
+    return merged
+
+
+async def extract_data_from_pdf_w_llm(params, input_doc_type, file_content, llm_client):
     """Extract data from the PDF file."""
     # Process the document using LLM
-    result = await process_file_w_llm(
-        params, file_content, input_doc_type, schema_client, llm_client
-    )
+    result = await process_file_w_llm(params, file_content, input_doc_type, llm_client)
 
     # Add currency from the amount field
     if input_doc_type in ["commercialInvoice"]:
@@ -277,8 +396,8 @@ def combine_llm_results_w_doc_ai(
     for key in keys_to_combine:
         if key in llm.keys():
             # Merge the list of dictionaries
+            # If the length of the LLM list is less than the Doc AI result, replace with the LLM list
             if len(llm[key]) < len(result[key]):
-                # If the length of the LLM list is less than the Doc AI result, replace with the LLM list
                 result[key] = llm[key]
             else:
                 # If the length of the LLM list is greater than or equal to the Doc AI result,
@@ -298,14 +417,11 @@ async def extract_data_by_doctype(
     file_content,
     input_doc_type,
     processor_client,
-    schema_client,
     if_use_docai,
     if_use_llm,
+    llm_client,
     isBetaTest=False,
 ):
-    # Select LLM client
-    llm_client = params["LlmClient"]
-
     async def extract_w_docai():
         return await extract_data_from_pdf_w_docai(
             params=params,
@@ -320,7 +436,6 @@
             params=params,
             input_doc_type=input_doc_type,
             file_content=file_content,
-            schema_client=schema_client,
             llm_client=llm_client,
         )
 
@@ -355,7 +470,7 @@
     meta,
     processor_client,
     schema_client,
-    embed_manager,
+    use_default_logging=False,
 ):
     """
     Process a PDF file and extract data from it.
@@ -367,7 +482,6 @@
         meta (DocumentMeta): Metadata associated with the document.
         processor_client (DocumentProcessorClient): Client for the Document AI processor.
         schema_client (DocumentSchemaClient): Client for the Document AI schema.
-        embed_manager (EmbeddingsManager): Manager for embeddings.
 
     Returns:
         dict: A dictionary containing the processed document information.
@@ -375,9 +489,23 @@
     Raises:
         Refer to reasons in 400 error response examples.
     """
+    # Get the start time for processing
+    start_time = asyncio.get_event_loop().time()
+
+    # Select LLM client (Using 2.5 Pro model only for PI and customsInvoice)
+    llm_client = (
+        params["LlmClient_Flash"]
+        if meta.documentTypeCode not in ["customsInvoice", "partnerInvoice"]
+        else params["LlmClient"]
+    )
+
+    page_count = None
     # Validate the file type
     if mime_type == "application/pdf":
-        if_use_docai = True
+        # Enable Doc Ai only for certain document types.
+        if_use_docai = (
+            True if meta.documentTypeCode in params["model_config"]["stable"] else False
+        )
         if_use_llm = (
            True if meta.documentTypeCode in params["key_to_combine"].keys() else False
        )
@@ -391,11 +519,12 @@
             file_content,
             meta.documentTypeCode,
             processor_client,
-            schema_client,
             if_use_docai=if_use_docai,
             if_use_llm=if_use_llm,
+            llm_client=llm_client,
             isBetaTest=False,
         )
+        page_count = get_pdf_page_count(file_content)
 
     elif "excel" in mime_type or "spreadsheet" in mime_type:
         # Extract data from the Excel file
@@ -403,10 +532,20 @@
             params=params,
             input_doc_type=meta.documentTypeCode,
             file_content=file_content,
-            schema_client=schema_client,
             mime_type=mime_type,
+            llm_client=llm_client,
         )
 
+        # Get sheet count from dd-trace span (set in extract_data_from_excel)
+        # Note: we use the span metric instead of len(extracted_data) because
+        # some sheets may fail extraction and not appear in extracted_data
+        span = tracer.current_span()
+        page_count = span.get_metric("est_page_count") if span else len(extracted_data)
+        if page_count > 100:
+            logger.warning(
+                f"Check logic. Count of sheets in excel file is weirdly large: {page_count}"
+            )
+
     else:
         raise HTTPException(
             status_code=400,
@@ -414,7 +553,7 @@
     )
     # Create the result dictionary with the extracted data
     extracted_data = await format_all_entities(
-        extracted_data, embed_manager, meta.documentTypeCode, params["LlmClient"]
+        extracted_data, meta.documentTypeCode, params, mime_type
     )
     result = {
         "id": meta.id,
@@ -422,16 +561,27 @@
         "data": extracted_data,
         "processor_version": processor_version,
     }
+
+    # Log the time taken for processing
+    end_time = asyncio.get_event_loop().time()
+    elapsed_time = end_time - start_time
+    logger.info(f"Time taken to process the document: {round(elapsed_time, 4)} seconds")
+
     # Schedule background tasks without using FastAPI's BackgroundTasks
-    asyncio.create_task(
-        run_background_tasks(
-            params,
-            meta.id,
-            meta.documentTypeCode,
-            extracted_data,
-            store_data,
-            processor_version,
-            mime_type,
+    if (
+        os.getenv("CLUSTER") != "ode"
+    ) & use_default_logging:  # skip data export to bigquery in ODE environment
+        asyncio.create_task(
+            run_background_tasks(
+                params,
+                meta.id,
+                meta.documentTypeCode,
+                extracted_data,
+                store_data,
+                processor_version,
+                mime_type,
+                elapsed_time,
+                page_count,
+            )
        )
-    )
     return result
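Taken together, `process_file_w_llm` now fans large PDFs out into chunk-sized Gemini calls and `merge_llm_results` folds the per-chunk JSON back into one prediction: ARRAY fields are concatenated across chunks, while scalar fields keep the first non-empty value. A hedged usage sketch of that merge behavior follows; the schema and chunk outputs are invented for illustration, and `llm_prediction_to_tuples` (from `src/postprocessing/common.py`, body not shown in this diff) is assumed to tag each leaf value with the chunk index as its page number.

    # Illustrative only: this schema and these chunk predictions are made up.
    response_schema = {
        "properties": {
            "invoiceNumber": {"type": "STRING"},
            "lineItems": {"type": "ARRAY"},
        }
    }

    # Raw JSON as if returned by two successive PDF chunks:
    chunk_results = [
        {"invoiceNumber": "INV-001", "lineItems": [{"amount": "10.00"}]},
        {"invoiceNumber": None, "lineItems": [{"amount": "5.50"}]},
    ]

    merged = merge_llm_results(chunk_results, response_schema)
    # Expected shape, assuming llm_prediction_to_tuples wraps leaf values as
    # (value, page_number) pairs using the chunk index as the page:
    #   "invoiceNumber" -> ("INV-001", 0)   scalar: first non-empty value wins
    #   "lineItems"     -> items from both chunks, tagged with pages 0 and 1

Note that failed chunks surface as exceptions in the `asyncio.gather(..., return_exceptions=True)` result list and are skipped by the `isinstance(result, dict)` guard, so a single bad chunk degrades the extraction rather than failing the whole document.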