data-science-document-ai 1.40.3__py3-none-any.whl → 1.51.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. {data_science_document_ai-1.40.3.dist-info → data_science_document_ai-1.51.0.dist-info}/METADATA +2 -2
  2. data_science_document_ai-1.51.0.dist-info/RECORD +60 -0
  3. src/constants.py +6 -10
  4. src/docai.py +14 -5
  5. src/docai_processor_config.yaml +0 -56
  6. src/excel_processing.py +34 -13
  7. src/io.py +69 -1
  8. src/llm.py +10 -32
  9. src/pdf_processing.py +192 -54
  10. src/postprocessing/common.py +246 -44
  11. src/postprocessing/postprocess_partner_invoice.py +139 -85
  12. src/prompts/library/arrivalNotice/other/placeholders.json +70 -0
  13. src/prompts/library/arrivalNotice/other/prompt.txt +40 -0
  14. src/prompts/library/bookingConfirmation/evergreen/placeholders.json +17 -17
  15. src/prompts/library/bookingConfirmation/evergreen/prompt.txt +1 -0
  16. src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +18 -18
  17. src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +1 -1
  18. src/prompts/library/bookingConfirmation/maersk/placeholders.json +17 -17
  19. src/prompts/library/bookingConfirmation/maersk/prompt.txt +1 -1
  20. src/prompts/library/bookingConfirmation/msc/placeholders.json +17 -17
  21. src/prompts/library/bookingConfirmation/msc/prompt.txt +1 -1
  22. src/prompts/library/bookingConfirmation/oocl/placeholders.json +17 -17
  23. src/prompts/library/bookingConfirmation/oocl/prompt.txt +3 -1
  24. src/prompts/library/bookingConfirmation/other/placeholders.json +17 -17
  25. src/prompts/library/bookingConfirmation/other/prompt.txt +1 -1
  26. src/prompts/library/bookingConfirmation/yangming/placeholders.json +17 -17
  27. src/prompts/library/bookingConfirmation/yangming/prompt.txt +1 -1
  28. src/prompts/library/bundeskasse/other/placeholders.json +25 -25
  29. src/prompts/library/bundeskasse/other/prompt.txt +8 -6
  30. src/prompts/library/commercialInvoice/other/placeholders.json +125 -0
  31. src/prompts/library/commercialInvoice/other/prompt.txt +2 -1
  32. src/prompts/library/customsAssessment/other/placeholders.json +67 -16
  33. src/prompts/library/customsAssessment/other/prompt.txt +24 -37
  34. src/prompts/library/customsInvoice/other/placeholders.json +20 -20
  35. src/prompts/library/customsInvoice/other/prompt.txt +4 -4
  36. src/prompts/library/deliveryOrder/other/placeholders.json +79 -28
  37. src/prompts/library/deliveryOrder/other/prompt.txt +26 -40
  38. src/prompts/library/draftMbl/other/placeholders.json +33 -33
  39. src/prompts/library/draftMbl/other/prompt.txt +34 -44
  40. src/prompts/library/finalMbL/other/placeholders.json +34 -34
  41. src/prompts/library/finalMbL/other/prompt.txt +34 -44
  42. src/prompts/library/packingList/other/placeholders.json +98 -0
  43. src/prompts/library/packingList/other/prompt.txt +1 -1
  44. src/prompts/library/partnerInvoice/other/placeholders.json +2 -23
  45. src/prompts/library/partnerInvoice/other/prompt.txt +7 -18
  46. src/prompts/library/preprocessing/carrier/placeholders.json +0 -16
  47. src/prompts/library/shippingInstruction/other/placeholders.json +115 -0
  48. src/prompts/library/shippingInstruction/other/prompt.txt +28 -15
  49. src/setup.py +13 -16
  50. src/utils.py +157 -45
  51. data_science_document_ai-1.40.3.dist-info/RECORD +0 -59
  52. src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -44
  53. src/prompts/library/draftMbl/maersk/prompt.txt +0 -17
  54. src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -44
  55. src/prompts/library/finalMbL/maersk/prompt.txt +0 -17
  56. {data_science_document_ai-1.40.3.dist-info → data_science_document_ai-1.51.0.dist-info}/WHEEL +0 -0
src/pdf_processing.py CHANGED
@@ -9,12 +9,17 @@ logger = logging.getLogger(__name__)
9
9
  import asyncio
10
10
  from collections import defaultdict
11
11
 
12
+ from ddtrace import tracer
12
13
  from fastapi import HTTPException
13
14
  from google.cloud.documentai_v1 import Document as docaiv1_document
14
15
 
15
16
  from src.docai import _batch_process_pdf_w_docai, _process_pdf_w_docai
16
17
  from src.excel_processing import extract_data_from_excel
17
- from src.postprocessing.common import format_all_entities, remove_none_values
18
+ from src.postprocessing.common import (
19
+ format_all_entities,
20
+ llm_prediction_to_tuples,
21
+ remove_none_values,
22
+ )
18
23
  from src.postprocessing.postprocess_booking_confirmation import (
19
24
  postprocess_booking_confirmation,
20
25
  )
@@ -28,13 +33,18 @@ from src.prompts.prompt_library import prompt_library
28
33
  from src.utils import (
29
34
  extract_top_pages,
30
35
  generate_schema_structure,
36
+ get_pdf_page_count,
31
37
  get_processor_name,
32
38
  run_background_tasks,
39
+ split_pdf_into_chunks,
40
+ transform_schema_strings,
33
41
  validate_based_on_schema,
34
42
  )
35
43
 
36
44
 
37
- async def process_file_w_docai(params, image_content, client, processor_name):
45
+ async def process_file_w_docai(
46
+ params, image_content, client, processor_name, doc_type=None
47
+ ):
38
48
  """
39
49
  Process a file using Document AI.
40
50
 
@@ -43,6 +53,7 @@ async def process_file_w_docai(params, image_content, client, processor_name):
43
53
  image_content (bytes): The file to be processed. It can be bytes object.
44
54
  client: The Document AI client.
45
55
  processor_name (str): The name of the processor to be used.
56
+ doc_type (str, optional): Document type for cost tracking labels.
46
57
 
47
58
  Returns:
48
59
  The processed document.
@@ -54,7 +65,9 @@ async def process_file_w_docai(params, image_content, client, processor_name):
54
65
 
55
66
  try:
56
67
  logger.info("Processing document...")
57
- result = await _process_pdf_w_docai(image_content, client, processor_name)
68
+ result = await _process_pdf_w_docai(
69
+ image_content, client, processor_name, doc_type=doc_type
70
+ )
58
71
  except Exception as e:
59
72
  if e.reason == "PAGE_LIMIT_EXCEEDED":
60
73
  logger.warning(
@@ -63,7 +76,7 @@ async def process_file_w_docai(params, image_content, client, processor_name):
63
76
  # Process the document in batch method (offline processing)
64
77
  try:
65
78
  result = await _batch_process_pdf_w_docai(
66
- params, image_content, client, processor_name
79
+ params, image_content, client, processor_name, doc_type=doc_type
67
80
  )
68
81
  except Exception as batch_e:
69
82
  logger.error(f"Error processing document {batch_e}.")
@@ -93,7 +106,7 @@ async def extract_data_from_pdf_w_docai(
93
106
  )
94
107
 
95
108
  result = await process_file_w_docai(
96
- params, file_content, processor_client, processor_name
109
+ params, file_content, processor_client, processor_name, doc_type=input_doc_type
97
110
  )
98
111
 
99
112
  # Create an entity object to store the result in gcs
@@ -104,9 +117,22 @@ async def extract_data_from_pdf_w_docai(
104
117
  # Extract entities from the result
105
118
  for entity in result.entities:
106
119
  value = (
107
- {child.type_: child.mention_text for child in entity.properties}
120
+ {
121
+ child.type_: (
122
+ child.mention_text,
123
+ child.page_anchor.page_refs[0].page
124
+ if hasattr(child.page_anchor.page_refs[0], "page")
125
+ else 0,
126
+ )
127
+ for child in entity.properties
128
+ }
108
129
  if entity.properties
109
- else entity.mention_text
130
+ else (
131
+ entity.mention_text,
132
+ entity.page_anchor.page_refs[0].page
133
+ if hasattr(entity.page_anchor.page_refs[0], "page")
134
+ else 0,
135
+ )
110
136
  )
111
137
  aggregated_data[entity.type_].append(value)
112
138
 
@@ -137,7 +163,9 @@ async def extract_data_from_pdf_w_docai(
137
163
  return aggregated_data, result_for_store, processor_version
138
164
 
139
165
 
140
- async def identify_carrier(document, llm_client, prompt, response_schema):
166
+ async def identify_carrier(
167
+ document, llm_client, prompt, response_schema, doc_type=None
168
+ ):
141
169
  """Identify the carrier from the Booking Confirmation document."""
142
170
 
143
171
  result = await llm_client.ask_gemini(
@@ -145,6 +173,7 @@ async def identify_carrier(document, llm_client, prompt, response_schema):
145
173
  document=document,
146
174
  response_schema=response_schema,
147
175
  response_mime_type="text/x.enum",
176
+ doc_type=doc_type,
148
177
  )
149
178
 
150
179
  if result:
@@ -167,61 +196,150 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
167
196
  result (dict): The structured data extracted from the document, formatted as JSON.
168
197
  """
169
198
  # Bundeskasse invoices contains all the required information in the first 3 pages.
170
- file_content = (
171
- extract_top_pages(file_content, num_pages=5)
172
- if input_doc_type == "bundeskasse"
173
- else file_content
174
- )
199
+ if input_doc_type == "bundeskasse":
200
+ file_content = extract_top_pages(file_content, num_pages=5)
175
201
 
176
- # convert file_content to required document
177
- document = llm_client.prepare_document_for_gemini(file_content)
202
+ number_of_pages = get_pdf_page_count(file_content)
203
+ logger.info(f"processing {input_doc_type} with {number_of_pages} pages...")
178
204
 
179
- # get the schema placeholder from the Doc AI and generate the response structure
180
- response_schema = (
181
- prompt_library.library[input_doc_type]["other"]["placeholders"]
182
- if input_doc_type in ["partnerInvoice", "customsInvoice", "bundeskasse"]
183
- else generate_schema_structure(params, input_doc_type)
184
- )
205
+ # get the schema placeholder
206
+ response_schema = prompt_library.library[input_doc_type]["other"]["placeholders"]
185
207
 
186
208
  carrier = "other"
187
- if (
188
- "preprocessing" in prompt_library.library.keys()
189
- and "carrier" in prompt_library.library["preprocessing"].keys()
190
- and input_doc_type
191
- in prompt_library.library["preprocessing"]["carrier"]["placeholders"].keys()
192
- ):
193
- carrier_schema = prompt_library.library["preprocessing"]["carrier"][
194
- "placeholders"
195
- ][input_doc_type]
209
+ carrier_schema = (
210
+ prompt_library.library.get("preprocessing", {})
211
+ .get("carrier", {})
212
+ .get("placeholders", {})
213
+ .get(input_doc_type)
214
+ )
196
215
 
216
+ if carrier_schema:
197
217
  carrier_prompt = prompt_library.library["preprocessing"]["carrier"]["prompt"]
198
218
  carrier_prompt = carrier_prompt.replace(
199
219
  "DOCUMENT_TYPE_PLACEHOLDER", input_doc_type
200
220
  )
201
221
 
222
+ # convert file_content to required document
223
+ document = llm_client.prepare_document_for_gemini(file_content)
224
+
202
225
  # identify carrier for customized prompting
203
226
  carrier = await identify_carrier(
204
- document, llm_client, carrier_prompt, carrier_schema
227
+ document,
228
+ llm_client,
229
+ carrier_prompt,
230
+ carrier_schema,
231
+ doc_type=input_doc_type,
205
232
  )
206
233
 
207
- if input_doc_type == "bookingConfirmation":
208
- response_schema = prompt_library.library[input_doc_type][carrier][
209
- "placeholders"
210
- ]
211
-
234
+ # Select prompt
212
235
  if (
213
- input_doc_type in prompt_library.library.keys()
214
- and carrier in prompt_library.library[input_doc_type].keys()
236
+ input_doc_type not in prompt_library.library
237
+ or carrier not in prompt_library.library[input_doc_type]
215
238
  ):
216
- # get the related prompt from predefined prompt library
217
- prompt = prompt_library.library[input_doc_type][carrier]["prompt"]
239
+ return {}
240
+
241
+ # get the related prompt from predefined prompt library
242
+ prompt = prompt_library.library[input_doc_type][carrier]["prompt"]
243
+
244
+ # Add page-number extraction for moderately large docs
245
+ use_chunking = number_of_pages >= params["chunk_after"]
218
246
 
219
- # generate the result with LLM (gemini)
220
- result = await llm_client.get_unified_json_genai(
221
- prompt=prompt, document=document, response_schema=response_schema
247
+ # Update schema and prompt to extract value-page_number pairs
248
+ if not use_chunking and number_of_pages > 1:
249
+ response_schema = transform_schema_strings(response_schema)
250
+ prompt += "\nFor each field, provide the page number where the information was found. The page numbering starts from 0."
251
+
252
+ tasks = []
253
+ # Process in chunks if number of pages exceeds threshold and Process all chunks concurrently
254
+ for chunk in (
255
+ split_pdf_into_chunks(file_content, chunk_size=params["chunk_size"])
256
+ if use_chunking
257
+ else [file_content]
258
+ ):
259
+ tasks.append(
260
+ process_chunk_with_retry(
261
+ chunk, prompt, response_schema, llm_client, input_doc_type
262
+ )
222
263
  )
223
- return result
224
- return {}
264
+
265
+ results = await asyncio.gather(*tasks, return_exceptions=True)
266
+
267
+ if use_chunking:
268
+ return merge_llm_results(results, response_schema)
269
+ else:
270
+ return llm_prediction_to_tuples(results[0], number_of_pages=number_of_pages)
271
+
272
+
273
+ async def process_chunk_with_retry(
274
+ chunk_content, prompt, response_schema, llm_client, input_doc_type, retries=2
275
+ ):
276
+ """Process a chunk with retries in case of failure."""
277
+ for attempt in range(1, retries + 1):
278
+ try:
279
+ return await process_chunk(
280
+ chunk_content=chunk_content,
281
+ prompt=prompt,
282
+ response_schema=response_schema,
283
+ llm_client=llm_client,
284
+ input_doc_type=input_doc_type,
285
+ )
286
+ except Exception as e:
287
+ logger.error(f"Chunk failed on attempt {attempt}: {e}")
288
+ if attempt == retries:
289
+ raise
290
+ await asyncio.sleep(1) # small backoff
291
+
292
+
293
+ async def process_chunk(
294
+ chunk_content, prompt, response_schema, llm_client, input_doc_type
295
+ ):
296
+ """Process a chunk with Gemini."""
297
+ document = llm_client.prepare_document_for_gemini(chunk_content)
298
+ return await llm_client.get_unified_json_genai(
299
+ prompt=prompt,
300
+ document=document,
301
+ response_schema=response_schema,
302
+ doc_type=input_doc_type,
303
+ )
304
+
305
+
306
+ def merge_llm_results(results, response_schema):
307
+ """Merge LLM results from multiple chunks."""
308
+ merged = {}
309
+ for i, result in enumerate(results):
310
+ if not isinstance(result, dict):
311
+ continue
312
+ # Add page number to all values coming from this chunk
313
+ result = llm_prediction_to_tuples(result, number_of_pages=1, page_number=i)
314
+
315
+ # Merge the result into the final merged dictionary
316
+ for key, value in result.items():
317
+ field_type = (
318
+ response_schema["properties"].get(key, {}).get("type", "").upper()
319
+ )
320
+
321
+ if key not in merged:
322
+ if field_type == "ARRAY":
323
+ # append the values as a list
324
+ merged[key] = (
325
+ value if isinstance(value, list) else ([value] if value else [])
326
+ )
327
+ else:
328
+ merged[key] = value
329
+ continue
330
+
331
+ if field_type == "ARRAY":
332
+ # append list contents across chunks
333
+ if isinstance(value, list):
334
+ merged[key].extend(value)
335
+ else:
336
+ merged[key].append(value)
337
+
338
+ # take first non-null value only
339
+ if merged[key] in (None, "", [], {}):
340
+ merged[key] = value
341
+
342
+ return merged
225
343
 
226
344
 
227
345
  async def extract_data_from_pdf_w_llm(params, input_doc_type, file_content, llm_client):
@@ -298,15 +416,9 @@ async def extract_data_by_doctype(
298
416
  processor_client,
299
417
  if_use_docai,
300
418
  if_use_llm,
419
+ llm_client,
301
420
  isBetaTest=False,
302
421
  ):
303
- # Select LLM client (Using 2.5 Flash model for Bundeskasse)
304
- llm_client = (
305
- params["LlmClient_Flash"]
306
- if input_doc_type == "bundeskasse"
307
- else params["LlmClient"]
308
- )
309
-
310
422
  async def extract_w_docai():
311
423
  return await extract_data_from_pdf_w_docai(
312
424
  params=params,
@@ -355,6 +467,7 @@ async def data_extraction_manual_flow(
355
467
  meta,
356
468
  processor_client,
357
469
  schema_client,
470
+ use_default_logging=False,
358
471
  ):
359
472
  """
360
473
  Process a PDF file and extract data from it.
@@ -375,6 +488,15 @@ async def data_extraction_manual_flow(
375
488
  """
376
489
  # Get the start time for processing
377
490
  start_time = asyncio.get_event_loop().time()
491
+
492
+ # Select LLM client (Using 2.5 Pro model only for PI and customsInvoice)
493
+ llm_client = (
494
+ params["LlmClient_Flash"]
495
+ if meta.documentTypeCode not in ["customsInvoice", "partnerInvoice"]
496
+ else params["LlmClient"]
497
+ )
498
+
499
+ page_count = None
378
500
  # Validate the file type
379
501
  if mime_type == "application/pdf":
380
502
  # Enable Doc Ai only for certain document types.
@@ -396,8 +518,10 @@ async def data_extraction_manual_flow(
396
518
  processor_client,
397
519
  if_use_docai=if_use_docai,
398
520
  if_use_llm=if_use_llm,
521
+ llm_client=llm_client,
399
522
  isBetaTest=False,
400
523
  )
524
+ page_count = get_pdf_page_count(file_content)
401
525
 
402
526
  elif "excel" in mime_type or "spreadsheet" in mime_type:
403
527
  # Extract data from the Excel file
@@ -406,8 +530,19 @@ async def data_extraction_manual_flow(
406
530
  input_doc_type=meta.documentTypeCode,
407
531
  file_content=file_content,
408
532
  mime_type=mime_type,
533
+ llm_client=llm_client,
409
534
  )
410
535
 
536
+ # Get sheet count from dd-trace span (set in extract_data_from_excel)
537
+ # Note: we use the span metric instead of len(extracted_data) because
538
+ # some sheets may fail extraction and not appear in extracted_data
539
+ span = tracer.current_span()
540
+ page_count = span.get_metric("est_page_count") if span else len(extracted_data)
541
+ if page_count > 100:
542
+ logger.warning(
543
+ f"Check logic. Count of sheets in excel file is weirdly large: {page_count}"
544
+ )
545
+
411
546
  else:
412
547
  raise HTTPException(
413
548
  status_code=400,
@@ -415,7 +550,7 @@ async def data_extraction_manual_flow(
415
550
  )
416
551
  # Create the result dictionary with the extracted data
417
552
  extracted_data = await format_all_entities(
418
- extracted_data, meta.documentTypeCode, params
553
+ extracted_data, meta.documentTypeCode, params, mime_type
419
554
  )
420
555
  result = {
421
556
  "id": meta.id,
@@ -430,7 +565,9 @@ async def data_extraction_manual_flow(
430
565
  logger.info(f"Time taken to process the document: {round(elapsed_time, 4)} seconds")
431
566
 
432
567
  # Schedule background tasks without using FastAPI's BackgroundTasks
433
- if os.getenv("CLUSTER") != "ode": # skip data export to bigquery in ODE environment
568
+ if (
569
+ os.getenv("CLUSTER") != "ode"
570
+ ) & use_default_logging: # skip data export to bigquery in ODE environment
434
571
  asyncio.create_task(
435
572
  run_background_tasks(
436
573
  params,
@@ -441,6 +578,7 @@ async def data_extraction_manual_flow(
441
578
  processor_version,
442
579
  mime_type,
443
580
  elapsed_time,
581
+ page_count,
444
582
  )
445
583
  )
446
584
  return result