data-science-document-ai 2.1.0__tar.gz → 2.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/PKG-INFO +1 -1
  2. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/pyproject.toml +1 -1
  3. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/pdf_processing.py +1 -0
  4. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/postprocessing/common.py +41 -0
  5. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/constants.py +0 -0
  6. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/constants_sandbox.py +0 -0
  7. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/docai.py +0 -0
  8. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/docai_processor_config.yaml +0 -0
  9. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/excel_processing.py +0 -0
  10. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/io.py +0 -0
  11. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/llm.py +0 -0
  12. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/log_setup.py +0 -0
  13. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/postprocessing/postprocess_booking_confirmation.py +0 -0
  14. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/postprocessing/postprocess_commercial_invoice.py +0 -0
  15. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/postprocessing/postprocess_partner_invoice.py +0 -0
  16. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/arrivalNotice/other/placeholders.json +0 -0
  17. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/arrivalNotice/other/prompt.txt +0 -0
  18. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/cma-cgm/placeholders.json +0 -0
  19. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/cma-cgm/prompt.txt +0 -0
  20. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/cosco/placeholders.json +0 -0
  21. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/cosco/prompt.txt +0 -0
  22. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +0 -0
  23. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/evergreen/prompt.txt +0 -0
  24. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +0 -0
  25. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +0 -0
  26. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/hmm/placeholders.json +0 -0
  27. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/hmm/prompt.txt +0 -0
  28. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/maersk/placeholders.json +0 -0
  29. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/maersk/prompt.txt +0 -0
  30. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/msc/placeholders.json +0 -0
  31. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/msc/prompt.txt +0 -0
  32. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/one/placeholders.json +0 -0
  33. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/one/prompt.txt +0 -0
  34. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/oocl/placeholders.json +0 -0
  35. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/oocl/prompt.txt +0 -0
  36. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/other/placeholders.json +0 -0
  37. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/other/prompt.txt +0 -0
  38. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/system.txt +0 -0
  39. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/yangming/placeholders.json +0 -0
  40. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/yangming/prompt.txt +0 -0
  41. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bundeskasse/other/placeholders.json +0 -0
  42. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bundeskasse/other/prompt.txt +0 -0
  43. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/commercialInvoice/other/placeholders.json +0 -0
  44. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/commercialInvoice/other/prompt.txt +0 -0
  45. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/customsAssessment/other/placeholders.json +0 -0
  46. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/customsAssessment/other/prompt.txt +0 -0
  47. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/customsInvoice/other/placeholders.json +0 -0
  48. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/customsInvoice/other/prompt.txt +0 -0
  49. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/deliveryOrder/other/placeholders.json +0 -0
  50. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/deliveryOrder/other/prompt.txt +0 -0
  51. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/draftMbl/other/placeholders.json +0 -0
  52. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/draftMbl/other/prompt.txt +0 -0
  53. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/finalMbL/other/placeholders.json +0 -0
  54. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/finalMbL/other/prompt.txt +0 -0
  55. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/packingList/other/placeholders.json +0 -0
  56. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/packingList/other/prompt.txt +0 -0
  57. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/partnerInvoice/other/placeholders.json +0 -0
  58. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/partnerInvoice/other/prompt.txt +0 -0
  59. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/postprocessing/port_code/placeholders.json +0 -0
  60. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/postprocessing/port_code/prompt_port_code.txt +0 -0
  61. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/preprocessing/carrier/placeholders.json +0 -0
  62. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/preprocessing/carrier/prompt.txt +0 -0
  63. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/shippingInstruction/other/placeholders.json +0 -0
  64. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/shippingInstruction/other/prompt.txt +0 -0
  65. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/prompt_library.py +0 -0
  66. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/setup.py +0 -0
  67. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/tms.py +0 -0
  68. {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-science-document-ai
3
- Version: 2.1.0
3
+ Version: 2.2.0
4
4
  Summary: "Document AI repo for data science"
5
5
  Author: Naomi Nguyen
6
6
  Author-email: naomi.nguyen@forto.com
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "data-science-document-ai"
3
- version = "2.1.0"
3
+ version = "2.2.0"
4
4
  description = "\"Document AI repo for data science\""
5
5
  authors = ["Naomi Nguyen <naomi.nguyen@forto.com>", "Kumar Rajendrababu <kumar.rajendrababu@forto.com>", "Igor Tonko <igor.tonko@forto.com>", "Osman Demirel <osman.demirel@forto.com>"]
6
6
  packages = [
@@ -261,6 +261,7 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
261
261
  if use_chunking
262
262
  else [file_content]
263
263
  ):
264
+ logger.info(f"chunking the document into {number_of_pages} pages....")
264
265
  tasks.append(
265
266
  process_chunk_with_retry(
266
267
  chunk, prompt, response_schema, llm_client, input_doc_type, retries=3
@@ -727,6 +727,9 @@ async def format_all_entities(result, document_type_code, params, mime_type):
727
727
  # intact — the customs-pool filter there depends on it.
728
728
  if document_type_code in ["partnerInvoice", "customsInvoice", "bundeskasse"]:
729
729
  await process_partner_invoice(params, aggregated_data, document_type_code)
730
+ # Resolve the vendor to its legal-entity code so document-capture
731
+ # receives the code directly instead of a (possibly ambiguous) name.
732
+ await set_vendor_legal_entity_code(aggregated_data)
730
733
 
731
734
  if document_type_code in ["bookingConfirmation"]:
732
735
  aggregated_data["legalEntity"] = await get_legal_entity(
@@ -769,6 +772,44 @@ async def get_legal_entity(name, address=None):
769
772
  }
770
773
 
771
774
 
775
+ async def set_vendor_legal_entity_code(aggregated_data):
776
+ """Overwrite the invoice vendor's ``formattedValue`` with its legal-entity code.
777
+
778
+ Invoices identify the vendor by a legal-entity code (e.g. ``LE33894``) so two
779
+ partners that share a name stay distinct downstream. The human-readable name
780
+ is preserved in ``documentValue`` (kept searchable in document-capture); only
781
+ ``formattedValue`` is replaced, and only when a code is resolved — otherwise
782
+ the name is left untouched so the field degrades gracefully.
783
+
784
+ Only the name is sent to the lookup: adding the address reduces mapping
785
+ accuracy.
786
+
787
+ Args:
788
+ aggregated_data (dict): The formatted extraction result. Mutated in place.
789
+ """
790
+ vendor = aggregated_data.get("vendorName")
791
+ if not isinstance(vendor, dict):
792
+ return
793
+
794
+ # formattedValue holds the canonical name post-formatting; fall back to the
795
+ # raw documentValue when it is missing.
796
+ formatted_name = vendor.get("formattedValue")
797
+ document_name = vendor.get("documentValue")
798
+ name = formatted_name or document_name
799
+ if not name:
800
+ return
801
+
802
+ legal_entity = await get_legal_entity(name)
803
+ code = legal_entity.get("formattedValue")
804
+ if code:
805
+ # Retain the human-readable name in documentValue before formattedValue
806
+ # is replaced by the code, so the original name is never lost (e.g. when
807
+ # only formattedValue was populated).
808
+ if not document_name:
809
+ vendor["documentValue"] = name
810
+ vendor["formattedValue"] = code
811
+
812
+
772
813
  def process_location_nodes(obj):
773
814
  """Add terminal and depot codes to the extracted data."""
774
815