data-science-document-ai 1.52.1__tar.gz → 1.54.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/PKG-INFO +1 -1
  2. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/pyproject.toml +1 -1
  3. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/constants.py +2 -1
  4. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/pdf_processing.py +5 -2
  5. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/postprocessing/postprocess_partner_invoice.py +76 -23
  6. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/setup.py +6 -0
  7. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/constants_sandbox.py +0 -0
  8. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/docai.py +0 -0
  9. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/docai_processor_config.yaml +0 -0
  10. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/excel_processing.py +0 -0
  11. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/io.py +0 -0
  12. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/llm.py +0 -0
  13. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/log_setup.py +0 -0
  14. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/postprocessing/common.py +0 -0
  15. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/postprocessing/postprocess_booking_confirmation.py +0 -0
  16. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/postprocessing/postprocess_commercial_invoice.py +0 -0
  17. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/arrivalNotice/other/placeholders.json +0 -0
  18. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/arrivalNotice/other/prompt.txt +0 -0
  19. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +0 -0
  20. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/bookingConfirmation/evergreen/prompt.txt +0 -0
  21. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +0 -0
  22. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +0 -0
  23. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/bookingConfirmation/maersk/placeholders.json +0 -0
  24. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/bookingConfirmation/maersk/prompt.txt +0 -0
  25. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/bookingConfirmation/msc/placeholders.json +0 -0
  26. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/bookingConfirmation/msc/prompt.txt +0 -0
  27. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/bookingConfirmation/oocl/placeholders.json +0 -0
  28. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/bookingConfirmation/oocl/prompt.txt +0 -0
  29. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/bookingConfirmation/other/placeholders.json +0 -0
  30. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/bookingConfirmation/other/prompt.txt +0 -0
  31. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/bookingConfirmation/yangming/placeholders.json +0 -0
  32. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/bookingConfirmation/yangming/prompt.txt +0 -0
  33. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/bundeskasse/other/placeholders.json +0 -0
  34. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/bundeskasse/other/prompt.txt +0 -0
  35. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/commercialInvoice/other/placeholders.json +0 -0
  36. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/commercialInvoice/other/prompt.txt +0 -0
  37. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/customsAssessment/other/placeholders.json +0 -0
  38. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/customsAssessment/other/prompt.txt +0 -0
  39. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/customsInvoice/other/placeholders.json +0 -0
  40. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/customsInvoice/other/prompt.txt +0 -0
  41. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/deliveryOrder/other/placeholders.json +0 -0
  42. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/deliveryOrder/other/prompt.txt +0 -0
  43. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/draftMbl/other/placeholders.json +0 -0
  44. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/draftMbl/other/prompt.txt +0 -0
  45. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/finalMbL/other/placeholders.json +0 -0
  46. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/finalMbL/other/prompt.txt +0 -0
  47. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/packingList/other/placeholders.json +0 -0
  48. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/packingList/other/prompt.txt +0 -0
  49. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/partnerInvoice/other/placeholders.json +0 -0
  50. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/partnerInvoice/other/prompt.txt +0 -0
  51. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/postprocessing/port_code/placeholders.json +0 -0
  52. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/postprocessing/port_code/prompt_port_code.txt +0 -0
  53. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/preprocessing/carrier/placeholders.json +0 -0
  54. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/preprocessing/carrier/prompt.txt +0 -0
  55. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/shippingInstruction/other/placeholders.json +0 -0
  56. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/shippingInstruction/other/prompt.txt +0 -0
  57. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/prompt_library.py +0 -0
  58. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/tms.py +0 -0
  59. {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-science-document-ai
3
- Version: 1.52.1
3
+ Version: 1.54.0
4
4
  Summary: "Document AI repo for data science"
5
5
  Author: Naomi Nguyen
6
6
  Author-email: naomi.nguyen@forto.com
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "data-science-document-ai"
3
- version = "1.52.1"
3
+ version = "1.54.0"
4
4
  description = "\"Document AI repo for data science\""
5
5
  authors = ["Naomi Nguyen <naomi.nguyen@forto.com>", "Kumar Rajendrababu <kumar.rajendrababu@forto.com>", "Igor Tonko <igor.tonko@forto.com>", "Osman Demirel <osman.demirel@forto.com>"]
6
6
  packages = [
@@ -20,10 +20,11 @@ project_parameters = {
20
20
  # Fuzzy lookup
21
21
  "g_model_fuzzy_lookup_folder": "fuzzy_lookup",
22
22
  "item_code_lookup": "line_item_kvp_table.json",
23
+ "intermodal_partners": "intermodal_partners.json",
23
24
  "invoice_classification_lookup": "invoice_classification.json",
24
25
  "reverse_charge_sentence_lookup": "reverse_charge_sentences.json",
25
26
  # Fuzzy logic params
26
- "fuzzy_threshold_item_code": 90,
27
+ "fuzzy_threshold_item_code": 92,
27
28
  "fuzzy_threshold_reverse_charge": 80,
28
29
  "fuzzy_threshold_invoice_classification": 70,
29
30
  # Chunking params
@@ -32,7 +32,6 @@ from src.postprocessing.postprocess_partner_invoice import (
32
32
  from src.prompts.prompt_library import prompt_library
33
33
  from src.utils import (
34
34
  extract_top_pages,
35
- generate_schema_structure,
36
35
  get_pdf_page_count,
37
36
  get_processor_name,
38
37
  run_background_tasks,
@@ -258,7 +257,11 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
258
257
  ):
259
258
  tasks.append(
260
259
  process_chunk_with_retry(
261
- chunk, prompt, response_schema, llm_client, input_doc_type
260
+ chunk,
261
+ prompt,
262
+ response_schema,
263
+ llm_client,
264
+ input_doc_type,
262
265
  )
263
266
  )
264
267
 
@@ -1,4 +1,6 @@
1
1
  """This module contains the postprocessing functions for the partner invoice."""
2
+ from collections import defaultdict
3
+
2
4
  from rapidfuzz import fuzz, process
3
5
 
4
6
  from src.io import logger
@@ -143,6 +145,20 @@ def update_recipient_and_vendor(aggregated_data, is_recipient_forto):
143
145
  ] = "Dasbachstraße 15, 54292 Trier, Germany"
144
146
 
145
147
 
148
+ def select_unique_bank_account(bank_account):
149
+ # Select the unique bank account if multiple are present
150
+ if isinstance(bank_account, list) and bank_account:
151
+ best = defaultdict(lambda: None)
152
+
153
+ for item in bank_account:
154
+ dv = item["documentValue"]
155
+ if best[dv] is None or item["page"] < best[dv]["page"]:
156
+ best[dv] = item
157
+
158
+ unique = list(best.values())
159
+ return unique
160
+
161
+
146
162
  async def process_partner_invoice(params, aggregated_data, document_type_code):
147
163
  """Process the partner invoice data."""
148
164
  # Post process bundeskasse invoices
@@ -150,6 +166,11 @@ async def process_partner_invoice(params, aggregated_data, document_type_code):
150
166
  post_process_bundeskasse(aggregated_data)
151
167
  return
152
168
 
169
+ if "bankAccount" in aggregated_data:
170
+ aggregated_data["bankAccount"] = select_unique_bank_account(
171
+ aggregated_data["bankAccount"]
172
+ )
173
+
153
174
  line_items = aggregated_data.get("lineItem", [])
154
175
  # Add debug logging
155
176
  logger.info(f"Processing partnerInvoice with {len(line_items)} line items")
@@ -167,15 +188,20 @@ async def process_partner_invoice(params, aggregated_data, document_type_code):
167
188
  reverse_charge_info["formattedValue"] = reverse_charge_value
168
189
  reverse_charge = aggregated_data.pop("reverseChargeSentence", None)
169
190
 
191
+ # Partner Name
192
+ partner_name = aggregated_data.get("vendorName", {}).get("documentValue", None)
193
+
170
194
  # Process everything in one go
171
- processed_items = await process_line_items_batch(params, line_items, reverse_charge)
195
+ processed_items = await process_line_items_batch(
196
+ params, line_items, reverse_charge, partner_name
197
+ )
172
198
 
173
199
  # Update your main data structure
174
200
  aggregated_data["lineItem"] = processed_items
175
201
 
176
202
 
177
203
  async def process_line_items_batch(
178
- params: dict, line_items: list[dict], reverse_charge=None
204
+ params: dict, line_items: list[dict], reverse_charge=None, partner_name=None
179
205
  ):
180
206
  """
181
207
  Processes all line items efficiently using a "Split-Apply-Combine" strategy.
@@ -213,23 +239,12 @@ async def process_line_items_batch(
213
239
 
214
240
  # Batch API Call for Embedding lookups
215
241
  if pending_line_items:
216
- values_to_fetch = list(set(pending_line_items.values()))
217
- logger.info(f"Mapping {len(values_to_fetch)} line items from Embedding API...")
218
-
219
- # Await the batch response {"desc1": "code1", "desc2": "code2"}
220
- api_results = await get_tms_mappings(
221
- input_list=values_to_fetch, embedding_type="line_items"
222
- )
242
+ code_map = await fetch_line_item_codes(pending_line_items, partner_name, params)
223
243
 
224
- # Merge API results back into original list
225
244
  for index, desc in pending_line_items.items():
226
- # Get result from API response, or None if API failed for that item
227
- forto_code = api_results.get(desc)
228
-
229
- # Update the original item
230
245
  line_items[index]["itemCode"] = {
231
246
  "documentValue": desc,
232
- "formattedValue": forto_code, # Might be None if API failed
247
+ "formattedValue": code_map.get(desc),
233
248
  "page": line_items[index]["lineItemDescription"].get("page"),
234
249
  }
235
250
 
@@ -323,12 +338,13 @@ def find_matching_lineitem(new_lineitem: str, kvp_dict: dict, threshold=90):
323
338
  return None
324
339
 
325
340
 
326
- async def associate_forto_item_code(line_item_data, params):
341
+ async def associate_forto_item_code(line_item_data, params, partner_name=None):
327
342
  """
328
343
  Associates Forto item codes to a list of line item descriptions.
329
344
  Args:
330
345
  line_item_data (dict): A dictionary where keys are original descriptions and values are cleaned descriptions.
331
346
  params (dict): Parameters containing lookup data and thresholds.
347
+ partner_name (str, optional): The name of the partner for context in matching. Defaults to None.
332
348
 
333
349
  Returns:
334
350
  list: A list of dictionaries with 'description' and 'itemCode' keys.
@@ -350,14 +366,51 @@ async def associate_forto_item_code(line_item_data, params):
350
366
 
351
367
  # Batch API Call for Embedding lookups
352
368
  if pending_line_items:
353
- api_results = await get_tms_mappings(
354
- input_list=list(pending_line_items.values()),
355
- embedding_type="line_items",
356
- )
369
+ code_map = await fetch_line_item_codes(pending_line_items, partner_name, params)
357
370
 
358
- # Merge API results back into original list
359
371
  for desc, f_desc in pending_line_items.items():
360
- code = api_results.get(f_desc)
361
- result.append({"description": desc, "itemCode": code})
372
+ result.append(
373
+ {
374
+ "description": desc,
375
+ "itemCode": code_map.get(f_desc),
376
+ }
377
+ )
378
+
379
+ return result
380
+
381
+
382
+ async def fetch_line_item_codes(
383
+ pending_line_items: dict,
384
+ partner_name: str | None,
385
+ params: dict,
386
+ ):
387
+ """Returns: {original_description: mapped_code_or_None}"""
388
+ t_mode = (
389
+ find_matching_lineitem(
390
+ partner_name.upper(),
391
+ params["lookup_data"]["intermodal_partners"],
392
+ threshold=87,
393
+ )
394
+ if partner_name
395
+ else None
396
+ )
397
+
398
+ unique_descs = list(set(pending_line_items.values()))
399
+ logger.info(f"Mapping {len(unique_descs)} line items from Embedding API...")
400
+
401
+ # Build API input map
402
+ api_input_map = {
403
+ desc: f"{t_mode} - {desc}" if t_mode else desc for desc in unique_descs
404
+ }
362
405
 
406
+ api_results = await get_tms_mappings(
407
+ input_list=list(api_input_map.values()),
408
+ embedding_type="line_items",
409
+ )
410
+
411
+ # Normalize response back to original descriptions
412
+ result = {
413
+ original_desc: api_results.get(api_desc)
414
+ for original_desc, api_desc in api_input_map.items()
415
+ }
363
416
  return result
@@ -184,6 +184,9 @@ def setup_lookup_data(params):
184
184
  input_path_item_code = (
185
185
  f'{params["g_model_fuzzy_lookup_folder"]}/{params["item_code_lookup"]}'
186
186
  )
187
+ input_path_intermodal_partners = (
188
+ f'{params["g_model_fuzzy_lookup_folder"]}/{params["intermodal_partners"]}'
189
+ )
187
190
  input_path_invoice_classification = f'{params["g_model_fuzzy_lookup_folder"]}/{params["invoice_classification_lookup"]}' # noqa: E501
188
191
  input_path_reverse_charge = f'{params["g_model_fuzzy_lookup_folder"]}/{params["reverse_charge_sentence_lookup"]}'
189
192
 
@@ -194,6 +197,9 @@ def setup_lookup_data(params):
194
197
  return json.loads(downloaded_data)
195
198
 
196
199
  data["item_code"] = download_json_from_bucket(input_path_item_code)
200
+ data["intermodal_partners"] = download_json_from_bucket(
201
+ input_path_intermodal_partners
202
+ )
197
203
  data["invoice_classification"] = download_json_from_bucket(
198
204
  input_path_invoice_classification
199
205
  )