data-science-document-ai 1.43.6__tar.gz → 1.44.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/PKG-INFO +1 -1
  2. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/pyproject.toml +1 -1
  3. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/constants.py +3 -0
  4. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/excel_processing.py +1 -2
  5. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/io.py +23 -0
  6. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/pdf_processing.py +116 -40
  7. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/postprocessing/common.py +20 -15
  8. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/postprocessing/postprocess_partner_invoice.py +98 -35
  9. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/prompts/library/partnerInvoice/other/prompt.txt +2 -1
  10. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/utils.py +57 -45
  11. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/constants_sandbox.py +0 -0
  12. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/docai.py +0 -0
  13. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/docai_processor_config.yaml +0 -0
  14. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/llm.py +0 -0
  15. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/log_setup.py +0 -0
  16. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/postprocessing/postprocess_booking_confirmation.py +0 -0
  17. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/postprocessing/postprocess_commercial_invoice.py +0 -0
  18. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +0 -0
  19. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/prompts/library/bookingConfirmation/evergreen/prompt.txt +0 -0
  20. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +0 -0
  21. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +0 -0
  22. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/prompts/library/bookingConfirmation/maersk/placeholders.json +0 -0
  23. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/prompts/library/bookingConfirmation/maersk/prompt.txt +0 -0
  24. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/prompts/library/bookingConfirmation/msc/placeholders.json +0 -0
  25. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/prompts/library/bookingConfirmation/msc/prompt.txt +0 -0
  26. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/prompts/library/bookingConfirmation/oocl/placeholders.json +0 -0
  27. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/prompts/library/bookingConfirmation/oocl/prompt.txt +0 -0
  28. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/prompts/library/bookingConfirmation/other/placeholders.json +0 -0
  29. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/prompts/library/bookingConfirmation/other/prompt.txt +0 -0
  30. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/prompts/library/bookingConfirmation/yangming/placeholders.json +0 -0
  31. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/prompts/library/bookingConfirmation/yangming/prompt.txt +0 -0
  32. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/prompts/library/bundeskasse/other/placeholders.json +0 -0
  33. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/prompts/library/bundeskasse/other/prompt.txt +0 -0
  34. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/prompts/library/commercialInvoice/other/placeholders.json +0 -0
  35. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/prompts/library/commercialInvoice/other/prompt.txt +0 -0
  36. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/prompts/library/customsAssessment/other/prompt.txt +0 -0
  37. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/prompts/library/customsInvoice/other/placeholders.json +0 -0
  38. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/prompts/library/customsInvoice/other/prompt.txt +0 -0
  39. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/prompts/library/deliveryOrder/other/placeholders.json +0 -0
  40. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/prompts/library/deliveryOrder/other/prompt.txt +0 -0
  41. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -0
  42. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/prompts/library/draftMbl/maersk/prompt.txt +0 -0
  43. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/prompts/library/draftMbl/other/placeholders.json +0 -0
  44. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/prompts/library/draftMbl/other/prompt.txt +0 -0
  45. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -0
  46. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/prompts/library/finalMbL/maersk/prompt.txt +0 -0
  47. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/prompts/library/finalMbL/other/prompt.txt +0 -0
  48. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/prompts/library/packingList/other/placeholders.json +0 -0
  49. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/prompts/library/packingList/other/prompt.txt +0 -0
  50. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/prompts/library/partnerInvoice/other/placeholders.json +0 -0
  51. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/prompts/library/postprocessing/port_code/placeholders.json +0 -0
  52. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/prompts/library/postprocessing/port_code/prompt_port_code.txt +0 -0
  53. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/prompts/library/preprocessing/carrier/placeholders.json +0 -0
  54. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/prompts/library/preprocessing/carrier/prompt.txt +0 -0
  55. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/prompts/library/shippingInstruction/other/prompt.txt +0 -0
  56. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/prompts/prompt_library.py +0 -0
  57. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/setup.py +0 -0
  58. {data_science_document_ai-1.43.6 → data_science_document_ai-1.44.0}/src/tms.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: data-science-document-ai
-Version: 1.43.6
+Version: 1.44.0
 Summary: "Document AI repo for data science"
 Author: Naomi Nguyen
 Author-email: naomi.nguyen@forto.com
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "data-science-document-ai"
-version = "1.43.6"
+version = "1.44.0"
 description = "\"Document AI repo for data science\""
 authors = ["Naomi Nguyen <naomi.nguyen@forto.com>", "Kumar Rajendrababu <kumar.rajendrababu@forto.com>", "Igor Tonko <igor.tonko@forto.com>", "Osman Demirel <osman.demirel@forto.com>"]
 packages = [
@@ -26,6 +26,9 @@ project_parameters = {
     "fuzzy_threshold_item_code": 70,
     "fuzzy_threshold_reverse_charge": 80,
     "fuzzy_threshold_invoice_classification": 70,
+    # Chunking params
+    "chunk_size": 1,  # page (do not change this without changing the page number logic)
+    "chunk_after": 10,  # pages
     # Big Query
     "g_ai_gbq_db_schema": "document_ai",
     "g_ai_gbq_db_table_out": "document_ai_api_calls_v1",
@@ -4,8 +4,6 @@ import logging
 
 from ddtrace import tracer
 
-from src.postprocessing.common import llm_prediction_to_tuples
-
 logger = logging.getLogger(__name__)
 
 import asyncio
@@ -78,6 +76,7 @@ async def extract_data_from_excel(
             "bundeskasse",
             "commercialInvoice",
             "packingList",
+            "bookingConfirmation",
         ]
         else generate_schema_structure(params, input_doc_type)
     )
@@ -156,4 +156,27 @@ def download_dir_from_bucket(bucket, directory_cloud, directory_local) -> bool:
     return result
 
 
+def bq_logs(data_to_insert, params):
+    """Insert logs into Google BigQuery.
+
+    Args:
+        data_to_insert (list): The data to insert into BigQuery.
+        params (dict): The parameters dictionary.
+    """
+    # Use the pre-initialized BigQuery client
+    bq_client = params["bq_client"]
+    # Get the table string
+    table_string = f"{params['g_ai_project_name']}.{params['g_ai_gbq_db_schema']}.{params['g_ai_gbq_db_table_out']}"
+
+    logger.info(f"Log table: {table_string}")
+    # Insert the rows into the table
+    insert_logs = bq_client.insert_rows_json(table_string, data_to_insert)
+
+    # Check if there were any errors inserting the rows
+    if not insert_logs:
+        logger.info("New rows have been added.")
+    else:
+        logger.info("Errors occurred while inserting rows: ", insert_logs)
+
+
 # type: ignore
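A minimal usage sketch for the relocated bq_logs helper, assuming a params dict assembled at application startup (the BigQuery key names come from src/constants.py; the project id and row fields are hypothetical):

    from google.cloud import bigquery

    from src.io import bq_logs

    params = {
        "bq_client": bigquery.Client(),
        "g_ai_project_name": "my-gcp-project",  # hypothetical project id
        "g_ai_gbq_db_schema": "document_ai",
        "g_ai_gbq_db_table_out": "document_ai_api_calls_v1",
    }
    # insert_rows_json expects a list of JSON-serializable row dicts
    bq_logs([{"doc_type": "partnerInvoice", "status": "ok"}], params)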
@@ -36,6 +36,7 @@ from src.utils import (
     get_pdf_page_count,
     get_processor_name,
     run_background_tasks,
+    split_pdf_into_chunks,
     transform_schema_strings,
     validate_based_on_schema,
 )
@@ -195,15 +196,10 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
         result (dict): The structured data extracted from the document, formatted as JSON.
     """
     # Bundeskasse invoices contains all the required information in the first 3 pages.
-    file_content = (
-        extract_top_pages(file_content, num_pages=5)
-        if input_doc_type == "bundeskasse"
-        else file_content
-    )
-    number_of_pages = get_pdf_page_count(file_content)
+    if input_doc_type == "bundeskasse":
+        file_content = extract_top_pages(file_content, num_pages=5)
 
-    # convert file_content to required document
-    document = llm_client.prepare_document_for_gemini(file_content)
+    number_of_pages = get_pdf_page_count(file_content)
 
     # get the schema placeholder from the Doc AI and generate the response structure
     response_schema = (
@@ -215,26 +211,28 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
             "bundeskasse",
             "commercialInvoice",
             "packingList",
+            "bookingConfirmation",
         ]
         else generate_schema_structure(params, input_doc_type)
     )
 
     carrier = "other"
-    if (
-        "preprocessing" in prompt_library.library.keys()
-        and "carrier" in prompt_library.library["preprocessing"].keys()
-        and input_doc_type
-        in prompt_library.library["preprocessing"]["carrier"]["placeholders"].keys()
-    ):
-        carrier_schema = prompt_library.library["preprocessing"]["carrier"][
-            "placeholders"
-        ][input_doc_type]
+    carrier_schema = (
+        prompt_library.library.get("preprocessing", {})
+        .get("carrier", {})
+        .get("placeholders", {})
+        .get(input_doc_type)
+    )
 
+    if carrier_schema:
         carrier_prompt = prompt_library.library["preprocessing"]["carrier"]["prompt"]
         carrier_prompt = carrier_prompt.replace(
             "DOCUMENT_TYPE_PLACEHOLDER", input_doc_type
         )
 
+        # convert file_content to required document
+        document = llm_client.prepare_document_for_gemini(file_content)
+
         # identify carrier for customized prompting
         carrier = await identify_carrier(
             document,
@@ -244,37 +242,115 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client)
             doc_type=input_doc_type,
         )
 
-    if input_doc_type == "bookingConfirmation":
-        response_schema = prompt_library.library[input_doc_type][carrier][
-            "placeholders"
-        ]
-
+    # Select prompt
     if (
-        input_doc_type in prompt_library.library.keys()
-        and carrier in prompt_library.library[input_doc_type].keys()
+        input_doc_type not in prompt_library.library
+        or carrier not in prompt_library.library[input_doc_type]
     ):
-        # get the related prompt from predefined prompt library
-        prompt = prompt_library.library[input_doc_type][carrier]["prompt"]
+        return {}
 
-        # Update schema to extract value-page_number pairs
-        if number_of_pages > 1:
-            response_schema = transform_schema_strings(response_schema)
+    # get the related prompt from predefined prompt library
+    prompt = prompt_library.library[input_doc_type][carrier]["prompt"]
 
-        # Update the prompt to instruct LLM to include page numbers
-        prompt += "\nFor each field, provide the page number where the information was found. The page numbering starts from 0."
+    # Add page-number extraction for moderately large docs
+    use_chunking = number_of_pages >= params["chunk_after"]
 
-        # generate the result with LLM (gemini)
-        result = await llm_client.get_unified_json_genai(
-            prompt=prompt,
-            document=document,
-            response_schema=response_schema,
-            doc_type=input_doc_type,
+    # Update schema and prompt to extract value-page_number pairs
+    if not use_chunking and number_of_pages > 1:
+        response_schema = transform_schema_strings(response_schema)
+        prompt += "\nFor each field, provide the page number where the information was found. The page numbering starts from 0."
+
+    tasks = []
+    # Process in chunks if number of pages exceeds threshold and Process all chunks concurrently
+    for chunk in (
+        split_pdf_into_chunks(file_content, chunk_size=params["chunk_size"])
+        if use_chunking
+        else [file_content]
+    ):
+        tasks.append(
+            process_chunk_with_retry(
+                chunk, prompt, response_schema, llm_client, input_doc_type
+            )
         )
 
-        result = llm_prediction_to_tuples(result, number_of_pages)
+    results = await asyncio.gather(*tasks, return_exceptions=True)
 
-        return result
-    return {}
+    if use_chunking:
+        return merge_llm_results(results, response_schema)
+    else:
+        return llm_prediction_to_tuples(results[0], number_of_pages=number_of_pages)
+
+
+async def process_chunk_with_retry(
+    chunk_content, prompt, response_schema, llm_client, input_doc_type, retries=2
+):
+    """Process a chunk with retries in case of failure."""
+    for attempt in range(1, retries + 1):
+        try:
+            return await process_chunk(
+                chunk_content=chunk_content,
+                prompt=prompt,
+                response_schema=response_schema,
+                llm_client=llm_client,
+                input_doc_type=input_doc_type,
+            )
+        except Exception as e:
+            logger.error(f"Chunk failed on attempt {attempt}: {e}")
+            if attempt == retries:
+                raise
+            await asyncio.sleep(1)  # small backoff
+
+
+async def process_chunk(
+    chunk_content, prompt, response_schema, llm_client, input_doc_type
+):
+    """Process a chunk with Gemini."""
+    document = llm_client.prepare_document_for_gemini(chunk_content)
+    return await llm_client.get_unified_json_genai(
+        prompt=prompt,
+        document=document,
+        response_schema=response_schema,
+        doc_type=input_doc_type,
+    )
+
+
+def merge_llm_results(results, response_schema):
+    """Merge LLM results from multiple chunks."""
+    merged = {}
+    for i, result in enumerate(results):
+        if not isinstance(result, dict):
+            continue
+        # Add page number to all values coming from this chunk
+        result = llm_prediction_to_tuples(result, number_of_pages=1, page_number=i)
+
+        # Merge the result into the final merged dictionary
+        for key, value in result.items():
+            field_type = (
+                response_schema["properties"].get(key, {}).get("type", "").upper()
+            )
+
+            if key not in merged:
+                if field_type == "ARRAY":
+                    # append the values as a list
+                    merged[key] = (
+                        value if isinstance(value, list) else ([value] if value else [])
+                    )
+                else:
+                    merged[key] = value
+                continue
+
+            if field_type == "ARRAY":
+                # append list contents across chunks
+                if isinstance(value, list):
+                    merged[key].extend(value)
+                else:
+                    merged[key].append(value)
+
+            # take first non-null value only
+            if merged[key] in (None, "", [], {}):
+                merged[key] = value
+
+    return merged
 
 
 async def extract_data_from_pdf_w_llm(params, input_doc_type, file_content, llm_client):
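To illustrate the new chunked path: with the constants added above (chunk_after = 10, chunk_size = 1), documents of ten or more pages are split into single-page chunks, each chunk is sent to Gemini separately, and merge_llm_results recombines the per-chunk outputs. A small worked example, assuming two single-page chunks and an illustrative schema (field names and values are hypothetical): each chunk's values are first paired with the chunk index as their page number, then ARRAY fields are concatenated while scalar fields keep the first non-empty value.

    results = [
        {"invoiceNumber": "INV-1", "lineItem": [{"desc": "Freight"}]},  # chunk / page 0
        {"invoiceNumber": None, "lineItem": [{"desc": "Customs"}]},     # chunk / page 1
    ]
    response_schema = {
        "properties": {"invoiceNumber": {"type": "STRING"}, "lineItem": {"type": "ARRAY"}}
    }
    merge_llm_results(results, response_schema)
    # -> {"invoiceNumber": ("INV-1", 0),
    #     "lineItem": [{"desc": ("Freight", 0)}, {"desc": ("Customs", 1)}]}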
@@ -405,13 +405,13 @@ async def format_label(entity_k, entity_value, document_type_code, params, mime_
         )
 
     elif (entity_key == "containertype") or (entity_key == "containersize"):
-        formatted_value = get_tms_mappings(entity_value, "container_types")
+        formatted_value = await get_tms_mappings(entity_value, "container_types")
 
     elif check_formatting_rule(entity_k, document_type_code, "terminal"):
-        formatted_value = get_tms_mappings(entity_value, "terminals")
+        formatted_value = await get_tms_mappings(entity_value, "terminals")
 
     elif check_formatting_rule(entity_k, document_type_code, "depot"):
-        formatted_value = get_tms_mappings(entity_value, "depots")
+        formatted_value = await get_tms_mappings(entity_value, "depots")
 
     elif entity_key.startswith(("eta", "etd", "duedate", "issuedate", "servicedate")):
         try:
@@ -507,7 +507,7 @@ async def get_port_code_ai(port: str, llm_client, doc_type=None):
     """Get port code using AI model."""
     port_llm = await get_port_code_llm(port, llm_client, doc_type=doc_type)
 
-    return get_tms_mappings(port, "ports", port_llm)
+    return await get_tms_mappings(port, "ports", port_llm)
 
 
 async def get_port_code_llm(port: str, llm_client, doc_type=None):
@@ -619,7 +619,7 @@ async def format_all_entities(result, document_type_code, params, mime_type):
 
     # Process partner invoice on lineitem mapping and reverse charge sentence
     if document_type_code in ["partnerInvoice", "bundeskasse"]:
-        process_partner_invoice(params, aggregated_data, document_type_code)
+        await process_partner_invoice(params, aggregated_data, document_type_code)
 
     logger.info("Data Extraction completed successfully")
     return aggregated_data
@@ -651,41 +651,46 @@ def remove_stop_words(lineitem: str):
     )
 
 
-def llm_prediction_to_tuples(llm_prediction, number_of_pages=-1):
+def llm_prediction_to_tuples(llm_prediction, number_of_pages=-1, page_number=None):
     """Convert LLM prediction dictionary to tuples of (value, page_number)."""
-
     # If only 1 page, simply pair each value with page number 0
     if number_of_pages == 1:
+        effective_page = 0 if page_number is None else page_number
        if isinstance(llm_prediction, dict):
             return {
-                k: llm_prediction_to_tuples(v, number_of_pages)
+                k: llm_prediction_to_tuples(
+                    v, number_of_pages, page_number=effective_page
+                )
                 for k, v in llm_prediction.items()
             }
         elif isinstance(llm_prediction, list):
             return [
-                llm_prediction_to_tuples(v, number_of_pages) for v in llm_prediction
+                llm_prediction_to_tuples(v, number_of_pages, page_number=effective_page)
+                for v in llm_prediction
             ]
         else:
-            return (llm_prediction, 0) if llm_prediction else None
+            return (llm_prediction, effective_page) if llm_prediction else None
 
     # logic for multi-page predictions
     if isinstance(llm_prediction, dict):
         if "page_number" in llm_prediction.keys() and "value" in llm_prediction.keys():
             if llm_prediction["value"]:
                 try:
-                    page_number = int(llm_prediction["page_number"])
+                    _page_number = int(llm_prediction["page_number"])
                 except:  # noqa: E722
-                    page_number = -1
-                return (llm_prediction["value"], page_number)
+                    _page_number = -1
+                return (llm_prediction["value"], _page_number)
             return None
 
         for key, value in llm_prediction.items():
             llm_prediction[key] = llm_prediction_to_tuples(
-                llm_prediction.get(key, value), number_of_pages
+                llm_prediction.get(key, value), number_of_pages, page_number
             )
 
     elif isinstance(llm_prediction, list):
         for i, item in enumerate(llm_prediction):
-            llm_prediction[i] = llm_prediction_to_tuples(item, number_of_pages)
+            llm_prediction[i] = llm_prediction_to_tuples(
+                item, number_of_pages, page_number
+            )
 
     return llm_prediction
@@ -136,7 +136,7 @@ def update_recipient_and_vendor(aggregated_data, is_recipient_forto):
     ] = "Dasbachstraße 15, 54292 Trier, Germany"
 
 
-def process_partner_invoice(params, aggregated_data, document_type_code):
+async def process_partner_invoice(params, aggregated_data, document_type_code):
     """Process the partner invoice data."""
     # Post process bundeskasse invoices
     if document_type_code == "bundeskasse":
@@ -160,21 +160,75 @@ def process_partner_invoice(params, aggregated_data, document_type_code):
         reverse_charge_info["formattedValue"] = reverse_charge_value
     reverse_charge = aggregated_data.pop("reverseChargeSentence", None)
 
-    # Process each line item
-    for line_item in line_items:
-        if line_item.get("lineItemDescription", None) is not None:
-            line_item["itemCode"] = associate_forto_item_code(
-                line_item["lineItemDescription"]["formattedValue"],
-                params,
-            )
+    # Process everything in one go
+    processed_items = await process_line_items_batch(params, line_items, reverse_charge)
+
+    # Update your main data structure
+    aggregated_data["lineItem"] = processed_items
 
-            # Add page number for the consistency
-            line_item["itemCode"]["page"] = line_item["lineItemDescription"]["page"]
 
-            if reverse_charge:
-                # Distribute reverseChargeSentence to all line items
-                line_item["reverseChargeSentence"] = reverse_charge
-                line_item["reverseChargeSentence"]["page"] = reverse_charge["page"]
+async def process_line_items_batch(
+    params: dict, line_items: list[dict], reverse_charge=None
+):
+    """
+    Processes all line items efficiently using a "Split-Apply-Combine" strategy.
+    """
+    # To store items that need external API lookup
+    pending_line_items = {}
+
+    # Check Fuzzy Matching
+    for i, item in enumerate(line_items):
+        description_obj = item.get("lineItemDescription")
+
+        if not description_obj or not description_obj.get("formattedValue"):
+            continue
+        # Get the formatted description text
+        desc = description_obj["formattedValue"]
+
+        # Find Fuzzy Match
+        matched_code = find_matching_lineitem(
+            desc,
+            params["lookup_data"]["item_code"],
+            params["fuzzy_threshold_item_code"],
+        )
+
+        if matched_code:
+            # Set the code to the line item
+            item["itemCode"] = {
+                "documentValue": desc,
+                "formattedValue": matched_code,
+                "page": description_obj.get("page"),
+            }
+        else:
+            # Store for batch API call
+            pending_line_items[i] = desc
+
+    # Batch API Call for Embedding lookups
+    if pending_line_items:
+        values_to_fetch = list(set(pending_line_items.values()))
+        logger.info(f"Mapping {len(values_to_fetch)} line items from Embedding API...")
+
+        # Await the batch response {"desc1": "code1", "desc2": "code2"}
+        api_results = await get_tms_mappings(
+            input_list=values_to_fetch, embedding_type="line_items"
+        )
+
+        # Merge API results back into original list
+        for index, desc in pending_line_items.items():
+            # Get result from API response, or None if API failed for that item
+            forto_code = api_results.get(desc)
+
+            # Update the original item
+            line_items[index]["itemCode"] = {
+                "documentValue": desc,
+                "formattedValue": forto_code,  # Might be None if API failed
+                "page": line_items[index]["lineItemDescription"].get("page"),
+            }
+
+    # Add reverse charge here if exists
+    if reverse_charge:
+        [item.update({"reverseChargeSentence": reverse_charge}) for item in line_items]
+    return line_items
 
 
 def compute_score(args):
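A sketch of the expected input and output shape for process_line_items_batch, assuming it is awaited inside an async caller with a fully populated params dict (descriptions and pages are hypothetical):

    line_items = [
        {"lineItemDescription": {"formattedValue": "Ocean freight", "page": 0}},
        {"lineItemDescription": {"formattedValue": "Customs clearance", "page": 1}},
    ]
    # inside an async function
    processed = await process_line_items_batch(params, line_items, reverse_charge=None)
    # each item now also carries an "itemCode" dict, e.g.
    # {"documentValue": "Ocean freight", "formattedValue": "<Forto item code>", "page": 0}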
@@ -250,32 +304,41 @@ def find_matching_lineitem(new_lineitem: str, kvp_dict: dict, threshold=90):
     return kvp_dict.get(best_match, None)
 
 
-def associate_forto_item_code(input_string, params):
+async def associate_forto_item_code(line_item_data, params):
     """
-    Finds a match for the input string using fuzzy matching first, then embedding fallback.
-
-    1. Tries to find a fuzzy match for input_string against the keys in
-       mapping_data using RapidFuzz, requiring a score >= fuzzy_threshold.
-    2. If found, returns the corresponding value from mapping_data.
-    3. If not found above threshold, calls the embedding_fallback function.
-
+    Associates Forto item codes to a list of line item descriptions.
     Args:
-        input_string: The string to find a match for.
-        params: Parameters containing the lookup data and fuzzy threshold.
+        line_item_data (dict): A dictionary where keys are original descriptions and values are cleaned descriptions.
+        params (dict): Parameters containing lookup data and thresholds.
 
     Returns:
-        The matched value (from fuzzy match or embedding), or None if no match found.
+        list: A list of dictionaries with 'description' and 'itemCode' keys.
     """
-    # Get the Forto item code using fuzzy matching
-    forto_item_code = find_matching_lineitem(
-        new_lineitem=input_string,
-        kvp_dict=params["lookup_data"]["item_code"],  # TODO: Parse the KVP dictionary
-        threshold=params["fuzzy_threshold_item_code"],
-    )
 
-    if forto_item_code is None:
-        # 2. Fallback to embedding function if no good fuzzy match
-        forto_item_code = get_tms_mappings(input_string, "line_items")
+    result = []
+    pending_line_items = {}
+    for desc, f_desc in line_item_data.items():
+        # Get the Forto item code using fuzzy matching
+        code = find_matching_lineitem(
+            new_lineitem=f_desc,
+            kvp_dict=params["lookup_data"]["item_code"],
+            threshold=params["fuzzy_threshold_item_code"],
+        )
+        if code:
+            result.append({"description": desc, "itemCode": code})
+        else:
+            pending_line_items[desc] = f_desc
+
+    # Batch API Call for Embedding lookups
+    if pending_line_items:
+        api_results = await get_tms_mappings(
+            input_list=list(pending_line_items.values()),
+            embedding_type="line_items",
+        )
+
+        # Merge API results back into original list
+        for desc, f_desc in pending_line_items.items():
+            code = api_results.get(f_desc)
+            result.append({"description": desc, "itemCode": code})
 
-    result = {"documentValue": input_string, "formattedValue": forto_item_code}
     return result
@@ -53,7 +53,7 @@ Your role is to accurately extract specific entities from these invoices to supp
 - totalAmount: The total amount for the item. It can be in different currencies, so ensure to capture the currency as well for the totalAmountCurrency.
 - totalAmountEuro: Few line items contains a total amount in Euro. You can find it by looking for the term "Total EUR" or "Amount in Euro" in the line item but it's always in the EURO / € currency. Sometimes, it can be same as totalAmount if the line item is already in Euro.
 - quantity: The quantity of the item or service provided in the line item. Pay attention to 2x40HC. It means, quantity is 2 and containerSize is 40HC but not 240.
-- containerNumber: Container Number always starts with 4 letters and is followed by 7 digits (e.g., ABCD1234567).
+- containerNumber: Container Number always starts with 4 letters and is followed by 7 digits (e.g., ABCD1234567, XALU 8593678).
 
 - hblNumber and mblNumber:
   - The Master Bill of Lading number. Commonly known as "Bill of Lading Number", "BILL OF LADING NO.", "BL Number", "BL No.", "B/L No.", "BL-Nr.", "B/L", or "HBL No.".
@@ -81,6 +81,7 @@ Your role is to accurately extract specific entities from these invoices to supp
 
 IMPORTANT NOTE:
 - Ensure all extracted values are directly from the document. Do not make assumptions, modifications or calculations.
+- Do not split the quantity into different line items. e.g., if quantity is 2 or 2 CTR or 2 BIL, do not create 2 separate line items with quantity 1 each.
 - Do not normalize or modify any entity values.
 - Pay attention to the line item details and paymentInformation, as they may vary significantly across different invoices.
 
@@ -6,16 +6,16 @@ import json
 import os
 import pickle
 from datetime import datetime
-from typing import Literal
+from typing import Any, Dict, List, Literal, Optional
 
+import httpx
 import numpy as np
 import openpyxl
 import pandas as pd
-import requests
 from google.cloud import documentai_v1beta3 as docu_ai_beta
 from pypdf import PdfReader, PdfWriter
 
-from src.io import get_storage_client, logger
+from src.io import bq_logs, get_storage_client, logger
 
 
 def get_pdf_page_count(pdf_bytes):
@@ -31,29 +31,6 @@ def get_pdf_page_count(pdf_bytes):
     return len(reader.pages)
 
 
-def bq_logs(data_to_insert, params):
-    """Insert logs into Google BigQuery.
-
-    Args:
-        data_to_insert (list): The data to insert into BigQuery.
-        params (dict): The parameters dictionary.
-    """
-    # Use the pre-initialized BigQuery client
-    bq_client = params["bq_client"]
-    # Get the table string
-    table_string = f"{params['g_ai_project_name']}.{params['g_ai_gbq_db_schema']}.{params['g_ai_gbq_db_table_out']}"
-
-    logger.info(f"Log table: {table_string}")
-    # Insert the rows into the table
-    insert_logs = bq_client.insert_rows_json(table_string, data_to_insert)
-
-    # Check if there were any errors inserting the rows
-    if not insert_logs:
-        logger.info("New rows have been added.")
-    else:
-        logger.info("Errors occurred while inserting rows: ", insert_logs)
-
-
 async def get_data_set_schema_from_docai(
     schema_client, project_id=None, location=None, processor_id=None, name=None
 ):
@@ -383,9 +360,9 @@ def extract_top_pages(pdf_bytes, num_pages=4):
     return output.getvalue()
 
 
-def get_tms_mappings(
-    input_list: list[str], embedding_type: str, llm_ports: list[str] = None
-):
+async def get_tms_mappings(
+    input_list: List[str], embedding_type: str, llm_ports: Optional[List[str]] = None
+) -> Dict[str, Any]:
     """Get TMS mappings for the given values.
 
     Args:
@@ -395,39 +372,56 @@
         llm_ports (list[str], optional): List of LLM ports to use. Defaults to None.
 
     Returns:
-        dict: A dictionary with the mapping results.
+        dict or string: A dictionary or a string with the mapping results.
     """
-    # To test the API locally, port-forward the embedding service in the sandbox to 8080:80
-    # If you want to launch uvicorn from the tms-embedding repo, then use --port 8080 in the config file
     base_url = (
         "http://0.0.0.0:8080/"
         if os.getenv("CLUSTER") is None
        else "http://tms-mappings.api.svc.cluster.local./"
     )
 
+    # Ensure clean inputs
+    if not input_list:
+        return {}
+
     # Ensure input_list is a list
     if not isinstance(input_list, list):
         input_list = [input_list]
 
     # Always send a dict with named keys
     payload = {embedding_type: input_list}
+
     if llm_ports:
         payload["llm_ports"] = llm_ports if isinstance(llm_ports, list) else [llm_ports]
 
     # Make the POST request to the TMS mappings API
-    url = f"{base_url}/{embedding_type}"
-    response = requests.post(url=url, json=payload)
-
-    if response.status_code != 200:
-        logger.error(
-            f"Error from TMS mappings API: {response.status_code} - {response.text}"
-        )
-
-    formatted_values = (
-        response.json().get("response", {}).get("data", {}).get(input_list[0], None)
-    )
-
-    return formatted_values
+    url = f"{base_url}{embedding_type}"
+
+    # Use a timeout so the code doesn't hang forever
+    timeout = httpx.Timeout(60.0, connect=10.0)
+
+    async with httpx.AsyncClient(timeout=timeout) as client:
+        try:
+            response = await client.post(url, json=payload)
+            response.raise_for_status()
+
+            # Structure expected: {"response": {"data": {"desc1": "code1", "desc2": "code2"}}}
+            if embedding_type == "line_items":
+                # For line_items, return the full data mapping
+                return response.json().get("response", {}).get("data", {})
+            else:
+                return (
+                    response.json()
+                    .get("response", {})
+                    .get("data", {})
+                    .get(input_list[0], None)
+                )
+
+        except httpx.HTTPStatusError as exc:
+            logger.error(
+                f"Error from TMS mappings API: {exc.response.status_code} - {exc.response.text}"
+            )
+            return {}
 
 
 def transform_schema_strings(schema):
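A usage sketch of the now-async helper, assuming the tms-mappings service is reachable. Note the two return shapes: a full description-to-code dict for line_items, a single mapped value otherwise, and {} when the API responds with an error status (the codes shown are hypothetical):

    # inside an async function
    codes = await get_tms_mappings(["Ocean freight", "Customs clearance"], "line_items")
    # -> {"Ocean freight": "FRT001", "Customs clearance": "CUS001"}

    container = await get_tms_mappings("40HC", "container_types")
    # -> the single mapped value for "40HC" (None if unmapped, {} on HTTP error)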
@@ -502,3 +496,21 @@ def estimate_page_count(sheet):
     else:
         return None
     return np.ceil(pg_cnt / 500)
+
+
+def split_pdf_into_chunks(file_content: bytes, chunk_size: int = 1):
+    """Split PDF into smaller page chunks."""
+    pdf = PdfReader(io.BytesIO(file_content))
+    total_pages = len(pdf.pages)
+
+    # TODO: update the chunk_size based on doc length. However, it breaks the page number extraction logic.
+    for i in range(0, total_pages, chunk_size):
+        writer = PdfWriter()
+        for j in range(i, min(i + chunk_size, total_pages)):
+            writer.add_page(pdf.pages[j])
+
+        buffer = io.BytesIO()
+        writer.write(buffer)
+        buffer.seek(0)
+
+        yield buffer.getvalue()
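A usage sketch for the new generator, assuming a PDF already loaded as bytes (the file name is hypothetical):

    with open("invoice.pdf", "rb") as f:
        pdf_bytes = f.read()

    for page_number, chunk in enumerate(split_pdf_into_chunks(pdf_bytes, chunk_size=1)):
        # each chunk is a standalone single-page PDF, returned as bytes
        print(page_number, len(chunk))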