data-science-document-ai 1.43.5__tar.gz → 1.43.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/PKG-INFO +1 -1
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/pyproject.toml +1 -1
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/partnerInvoice/other/prompt.txt +2 -1
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/utils.py +34 -23
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/constants.py +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/constants_sandbox.py +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/docai.py +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/docai_processor_config.yaml +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/excel_processing.py +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/io.py +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/llm.py +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/log_setup.py +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/pdf_processing.py +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/postprocessing/common.py +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/postprocessing/postprocess_booking_confirmation.py +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/postprocessing/postprocess_commercial_invoice.py +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/postprocessing/postprocess_partner_invoice.py +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/bookingConfirmation/evergreen/prompt.txt +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/bookingConfirmation/maersk/placeholders.json +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/bookingConfirmation/maersk/prompt.txt +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/bookingConfirmation/msc/placeholders.json +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/bookingConfirmation/msc/prompt.txt +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/bookingConfirmation/oocl/placeholders.json +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/bookingConfirmation/oocl/prompt.txt +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/bookingConfirmation/other/placeholders.json +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/bookingConfirmation/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/bookingConfirmation/yangming/placeholders.json +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/bookingConfirmation/yangming/prompt.txt +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/bundeskasse/other/placeholders.json +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/bundeskasse/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/commercialInvoice/other/placeholders.json +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/commercialInvoice/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/customsAssessment/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/customsInvoice/other/placeholders.json +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/customsInvoice/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/deliveryOrder/other/placeholders.json +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/deliveryOrder/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/draftMbl/maersk/prompt.txt +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/draftMbl/other/placeholders.json +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/draftMbl/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/finalMbL/maersk/prompt.txt +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/finalMbL/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/packingList/other/placeholders.json +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/packingList/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/partnerInvoice/other/placeholders.json +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/postprocessing/port_code/placeholders.json +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/postprocessing/port_code/prompt_port_code.txt +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/preprocessing/carrier/placeholders.json +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/preprocessing/carrier/prompt.txt +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/shippingInstruction/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/prompt_library.py +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/setup.py +0 -0
- {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/tms.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "data-science-document-ai"
|
|
3
|
-
version = "1.43.5"
|
|
3
|
+
version = "1.43.7"
|
|
4
4
|
description = "\"Document AI repo for data science\""
|
|
5
5
|
authors = ["Naomi Nguyen <naomi.nguyen@forto.com>", "Kumar Rajendrababu <kumar.rajendrababu@forto.com>", "Igor Tonko <igor.tonko@forto.com>", "Osman Demirel <osman.demirel@forto.com>"]
|
|
6
6
|
packages = [
|
|
@@ -53,7 +53,7 @@ Your role is to accurately extract specific entities from these invoices to supp
|
|
|
53
53
|
- totalAmount: The total amount for the item. It can be in different currencies, so ensure to capture the currency as well for the totalAmountCurrency.
|
|
54
54
|
- totalAmountEuro: Few line items contains a total amount in Euro. You can find it by looking for the term "Total EUR" or "Amount in Euro" in the line item but it's always in the EURO / € currency. Sometimes, it can be same as totalAmount if the line item is already in Euro.
|
|
55
55
|
- quantity: The quantity of the item or service provided in the line item. Pay attention to 2x40HC. It means, quantity is 2 and containerSize is 40HC but not 240.
|
|
56
|
-
- containerNumber: Container Number always starts with 4 letters and is followed by 7 digits (e.g., ABCD1234567).
|
|
56
|
+
- containerNumber: Container Number always starts with 4 letters and is followed by 7 digits (e.g., ABCD1234567, XALU 8593678).
|
|
57
57
|
|
|
58
58
|
- hblNumber and mblNumber:
|
|
59
59
|
- The Master Bill of Lading number. Commonly known as "Bill of Lading Number", "BILL OF LADING NO.", "BL Number", "BL No.", "B/L No.", "BL-Nr.", "B/L", or "HBL No.".
|
|
@@ -81,6 +81,7 @@ Your role is to accurately extract specific entities from these invoices to supp
|
|
|
81
81
|
|
|
82
82
|
IMPORTANT NOTE:
|
|
83
83
|
- Ensure all extracted values are directly from the document. Do not make assumptions, modifications or calculations.
|
|
84
|
+
- Do not split the quantity into different line items. e.g., if quantity is 2 or 2 CTR or 2 BIL, do not create 2 separate line items with quantity 1 each.
|
|
84
85
|
- Do not normalize or modify any entity values.
|
|
85
86
|
- Pay attention to the line item details and paymentInformation, as they may vary significantly across different invoices.
|
|
86
87
|
|
|
@@ -443,12 +443,23 @@ def transform_schema_strings(schema):
|
|
|
443
443
|
Returns:
|
|
444
444
|
dict: The transformed schema dictionary.
|
|
445
445
|
"""
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
446
|
+
if not isinstance(schema, dict):
|
|
447
|
+
return schema
|
|
448
|
+
|
|
449
|
+
schema_type = schema.get("type")
|
|
450
|
+
if not schema_type:
|
|
451
|
+
return schema
|
|
452
|
+
|
|
453
|
+
# Base case: STRING → OBJECT (only if not already transformed)
|
|
454
|
+
if schema_type.upper() == "STRING":
|
|
455
|
+
return {
|
|
449
456
|
"type": "OBJECT",
|
|
450
457
|
"properties": {
|
|
451
|
-
"value": {
|
|
458
|
+
"value": {
|
|
459
|
+
"type": "STRING",
|
|
460
|
+
"nullable": schema.get("nullable", False),
|
|
461
|
+
"description": schema.get("description", ""),
|
|
462
|
+
},
|
|
452
463
|
"page_number": {
|
|
453
464
|
"type": "STRING",
|
|
454
465
|
"description": "Number of a page where the value was found in the document starting from 0.",
|
|
@@ -457,29 +468,29 @@ def transform_schema_strings(schema):
|
|
|
457
468
|
"required": [],
|
|
458
469
|
}
|
|
459
470
|
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
471
|
+
# Skip already transformed OBJECT (has both 'value' & 'page_number')
|
|
472
|
+
if (
|
|
473
|
+
schema_type.upper() == "OBJECT"
|
|
474
|
+
and "properties" in schema
|
|
475
|
+
and {"value", "page_number"}.issubset(schema["properties"].keys())
|
|
476
|
+
):
|
|
477
|
+
return schema
|
|
465
478
|
|
|
479
|
+
# Recursive case for OBJECT
|
|
480
|
+
if schema_type.upper() == "OBJECT" and "properties" in schema:
|
|
481
|
+
new_schema = schema.copy()
|
|
482
|
+
new_schema["properties"] = {
|
|
483
|
+
k: transform_schema_strings(v) for k, v in schema["properties"].items()
|
|
484
|
+
}
|
|
466
485
|
return new_schema
|
|
467
486
|
|
|
468
|
-
# Recursive case
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
return transformed_schema
|
|
474
|
-
|
|
475
|
-
# Recursive case: if the schema is a list
|
|
476
|
-
elif isinstance(schema, dict) and schema.get("type").upper() == "ARRAY":
|
|
477
|
-
schema["items"] = transform_schema_strings(schema["items"])
|
|
478
|
-
return schema
|
|
487
|
+
# Recursive case for ARRAY
|
|
488
|
+
if schema_type.upper() == "ARRAY" and "items" in schema:
|
|
489
|
+
new_schema = schema.copy()
|
|
490
|
+
new_schema["items"] = transform_schema_strings(schema["items"])
|
|
491
|
+
return new_schema
|
|
479
492
|
|
|
480
|
-
|
|
481
|
-
else:
|
|
482
|
-
return schema
|
|
493
|
+
return schema
|
|
483
494
|
|
|
484
495
|
|
|
485
496
|
def estimate_page_count(sheet):
|
|
File without changes
|
{data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/constants_sandbox.py
RENAMED
|
File without changes
|
|
File without changes
|
{data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/docai_processor_config.yaml
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/postprocessing/common.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/prompt_library.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|