data-science-document-ai 2.1.0__tar.gz → 2.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/PKG-INFO +1 -1
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/pyproject.toml +1 -1
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/pdf_processing.py +1 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/postprocessing/common.py +41 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/constants.py +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/constants_sandbox.py +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/docai.py +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/docai_processor_config.yaml +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/excel_processing.py +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/io.py +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/llm.py +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/log_setup.py +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/postprocessing/postprocess_booking_confirmation.py +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/postprocessing/postprocess_commercial_invoice.py +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/postprocessing/postprocess_partner_invoice.py +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/arrivalNotice/other/placeholders.json +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/arrivalNotice/other/prompt.txt +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/cma-cgm/placeholders.json +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/cma-cgm/prompt.txt +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/cosco/placeholders.json +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/cosco/prompt.txt +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/evergreen/prompt.txt +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/hmm/placeholders.json +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/hmm/prompt.txt +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/maersk/placeholders.json +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/maersk/prompt.txt +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/msc/placeholders.json +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/msc/prompt.txt +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/one/placeholders.json +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/one/prompt.txt +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/oocl/placeholders.json +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/oocl/prompt.txt +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/other/placeholders.json +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/other/prompt.txt +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/system.txt +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/yangming/placeholders.json +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bookingConfirmation/yangming/prompt.txt +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bundeskasse/other/placeholders.json +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/bundeskasse/other/prompt.txt +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/commercialInvoice/other/placeholders.json +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/commercialInvoice/other/prompt.txt +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/customsAssessment/other/placeholders.json +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/customsAssessment/other/prompt.txt +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/customsInvoice/other/placeholders.json +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/customsInvoice/other/prompt.txt +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/deliveryOrder/other/placeholders.json +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/deliveryOrder/other/prompt.txt +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/draftMbl/other/placeholders.json +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/draftMbl/other/prompt.txt +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/finalMbL/other/placeholders.json +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/finalMbL/other/prompt.txt +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/packingList/other/placeholders.json +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/packingList/other/prompt.txt +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/partnerInvoice/other/placeholders.json +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/partnerInvoice/other/prompt.txt +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/postprocessing/port_code/placeholders.json +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/postprocessing/port_code/prompt_port_code.txt +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/preprocessing/carrier/placeholders.json +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/preprocessing/carrier/prompt.txt +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/shippingInstruction/other/placeholders.json +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/library/shippingInstruction/other/prompt.txt +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/prompt_library.py +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/setup.py +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/tms.py +0 -0
- {data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/utils.py +0 -0
|
{data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/pyproject.toml
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "data-science-document-ai"
|
|
3
|
-
version = "2.1.0"
|
|
3
|
+
version = "2.2.0"
|
|
4
4
|
description = "\"Document AI repo for data science\""
|
|
5
5
|
authors = ["Naomi Nguyen <naomi.nguyen@forto.com>", "Kumar Rajendrababu <kumar.rajendrababu@forto.com>", "Igor Tonko <igor.tonko@forto.com>", "Osman Demirel <osman.demirel@forto.com>"]
|
|
6
6
|
packages = [
|
|
{data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/pdf_processing.py
@@ -261,6 +261,7 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
|
|
|
261
261
|
if use_chunking
|
|
262
262
|
else [file_content]
|
|
263
263
|
):
|
|
264
|
+
logger.info(f"chunking the document into {number_of_pages} pages....")
|
|
264
265
|
tasks.append(
|
|
265
266
|
process_chunk_with_retry(
|
|
266
267
|
chunk, prompt, response_schema, llm_client, input_doc_type, retries=3
|
{data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/postprocessing/common.py
RENAMED
|
@@ -727,6 +727,9 @@ async def format_all_entities(result, document_type_code, params, mime_type):
|
|
|
727
727
|
# intact — the customs-pool filter there depends on it.
|
|
728
728
|
if document_type_code in ["partnerInvoice", "customsInvoice", "bundeskasse"]:
|
|
729
729
|
await process_partner_invoice(params, aggregated_data, document_type_code)
|
|
730
|
+
# Resolve the vendor to its legal-entity code so document-capture
|
|
731
|
+
# receives the code directly instead of a (possibly ambiguous) name.
|
|
732
|
+
await set_vendor_legal_entity_code(aggregated_data)
|
|
730
733
|
|
|
731
734
|
if document_type_code in ["bookingConfirmation"]:
|
|
732
735
|
aggregated_data["legalEntity"] = await get_legal_entity(
|
|
@@ -769,6 +772,44 @@ async def get_legal_entity(name, address=None):
|
|
|
769
772
|
}
|
|
770
773
|
|
|
771
774
|
|
|
775
|
+
async def set_vendor_legal_entity_code(aggregated_data):
|
|
776
|
+
"""Overwrite the invoice vendor's ``formattedValue`` with its legal-entity code.
|
|
777
|
+
|
|
778
|
+
Invoices identify the vendor by a legal-entity code (e.g. ``LE33894``) so two
|
|
779
|
+
partners that share a name stay distinct downstream. The human-readable name
|
|
780
|
+
is preserved in ``documentValue`` (kept searchable in document-capture); only
|
|
781
|
+
``formattedValue`` is replaced, and only when a code is resolved — otherwise
|
|
782
|
+
the name is left untouched so the field degrades gracefully.
|
|
783
|
+
|
|
784
|
+
Only the name is sent to the lookup: adding the address reduces mapping
|
|
785
|
+
accuracy.
|
|
786
|
+
|
|
787
|
+
Args:
|
|
788
|
+
aggregated_data (dict): The formatted extraction result. Mutated in place.
|
|
789
|
+
"""
|
|
790
|
+
vendor = aggregated_data.get("vendorName")
|
|
791
|
+
if not isinstance(vendor, dict):
|
|
792
|
+
return
|
|
793
|
+
|
|
794
|
+
# formattedValue holds the canonical name post-formatting; fall back to the
|
|
795
|
+
# raw documentValue when it is missing.
|
|
796
|
+
formatted_name = vendor.get("formattedValue")
|
|
797
|
+
document_name = vendor.get("documentValue")
|
|
798
|
+
name = formatted_name or document_name
|
|
799
|
+
if not name:
|
|
800
|
+
return
|
|
801
|
+
|
|
802
|
+
legal_entity = await get_legal_entity(name)
|
|
803
|
+
code = legal_entity.get("formattedValue")
|
|
804
|
+
if code:
|
|
805
|
+
# Retain the human-readable name in documentValue before formattedValue
|
|
806
|
+
# is replaced by the code, so the original name is never lost (e.g. when
|
|
807
|
+
# only formattedValue was populated).
|
|
808
|
+
if not document_name:
|
|
809
|
+
vendor["documentValue"] = name
|
|
810
|
+
vendor["formattedValue"] = code
|
|
811
|
+
|
|
812
|
+
|
|
772
813
|
def process_location_nodes(obj):
|
|
773
814
|
"""Add terminal and depot codes to the extracted data."""
|
|
774
815
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/docai_processor_config.yaml
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_science_document_ai-2.1.0 → data_science_document_ai-2.2.0}/src/prompts/prompt_library.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|