data-science-document-ai 1.45.0__tar.gz → 1.45.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/PKG-INFO +1 -1
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/pyproject.toml +1 -1
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/postprocessing/common.py +10 -4
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/prompts/library/customsInvoice/other/prompt.txt +1 -1
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/prompts/library/partnerInvoice/other/prompt.txt +2 -1
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/constants.py +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/constants_sandbox.py +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/docai.py +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/docai_processor_config.yaml +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/excel_processing.py +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/io.py +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/llm.py +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/log_setup.py +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/pdf_processing.py +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/postprocessing/postprocess_booking_confirmation.py +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/postprocessing/postprocess_commercial_invoice.py +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/postprocessing/postprocess_partner_invoice.py +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/prompts/library/bookingConfirmation/evergreen/prompt.txt +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/prompts/library/bookingConfirmation/maersk/placeholders.json +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/prompts/library/bookingConfirmation/maersk/prompt.txt +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/prompts/library/bookingConfirmation/msc/placeholders.json +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/prompts/library/bookingConfirmation/msc/prompt.txt +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/prompts/library/bookingConfirmation/oocl/placeholders.json +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/prompts/library/bookingConfirmation/oocl/prompt.txt +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/prompts/library/bookingConfirmation/other/placeholders.json +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/prompts/library/bookingConfirmation/other/prompt.txt +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/prompts/library/bookingConfirmation/yangming/placeholders.json +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/prompts/library/bookingConfirmation/yangming/prompt.txt +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/prompts/library/bundeskasse/other/placeholders.json +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/prompts/library/bundeskasse/other/prompt.txt +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/prompts/library/commercialInvoice/other/placeholders.json +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/prompts/library/commercialInvoice/other/prompt.txt +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/prompts/library/customsAssessment/other/prompt.txt +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/prompts/library/customsInvoice/other/placeholders.json +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/prompts/library/deliveryOrder/other/placeholders.json +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/prompts/library/deliveryOrder/other/prompt.txt +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/prompts/library/draftMbl/maersk/prompt.txt +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/prompts/library/draftMbl/other/placeholders.json +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/prompts/library/draftMbl/other/prompt.txt +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/prompts/library/finalMbL/maersk/prompt.txt +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/prompts/library/finalMbL/other/prompt.txt +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/prompts/library/packingList/other/placeholders.json +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/prompts/library/packingList/other/prompt.txt +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/prompts/library/partnerInvoice/other/placeholders.json +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/prompts/library/postprocessing/port_code/placeholders.json +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/prompts/library/postprocessing/port_code/prompt_port_code.txt +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/prompts/library/preprocessing/carrier/placeholders.json +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/prompts/library/preprocessing/carrier/prompt.txt +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/prompts/library/shippingInstruction/other/prompt.txt +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/prompts/prompt_library.py +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/setup.py +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/tms.py +0 -0
- {data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "data-science-document-ai"
|
|
3
|
-
version = "1.45.0"
|
|
3
|
+
version = "1.45.1"
|
|
4
4
|
description = "\"Document AI repo for data science\""
|
|
5
5
|
authors = ["Naomi Nguyen <naomi.nguyen@forto.com>", "Kumar Rajendrababu <kumar.rajendrababu@forto.com>", "Igor Tonko <igor.tonko@forto.com>", "Osman Demirel <osman.demirel@forto.com>"]
|
|
6
6
|
packages = [
|
{data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/postprocessing/common.py
RENAMED
|
@@ -134,8 +134,11 @@ def extract_number(data_field_value):
|
|
|
134
134
|
formatted_value: string
|
|
135
135
|
|
|
136
136
|
"""
|
|
137
|
+
# Remove container size pattern like 20FT, 40HC, etc from 1 x 40HC
|
|
138
|
+
value = remove_unwanted_patterns(data_field_value)
|
|
139
|
+
|
|
137
140
|
formatted_value = ""
|
|
138
|
-
for c in data_field_value:
|
|
141
|
+
for c in value:
|
|
139
142
|
if c.isnumeric() or c in [",", ".", "-"]:
|
|
140
143
|
formatted_value += c
|
|
141
144
|
|
|
@@ -320,9 +323,12 @@ def remove_unwanted_patterns(lineitem: str):
|
|
|
320
323
|
lineitem = lineitem.replace("HIGH CUBE", "")
|
|
321
324
|
|
|
322
325
|
# Remove container size e.g., 20FT, 40HC, etc.
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
+
pattern = [
|
|
327
|
+
f"{s}{t}"
|
|
328
|
+
for s in ("20|22|40|45".split("|"))
|
|
329
|
+
for t in ("FT|HC|DC|HD|GP|OT|RF|FR|TK|DV".split("|"))
|
|
330
|
+
]
|
|
331
|
+
lineitem = re.sub(r"|".join(pattern), "", lineitem, flags=re.IGNORECASE).strip()
|
|
326
332
|
|
|
327
333
|
return lineitem
|
|
328
334
|
|
|
@@ -54,7 +54,7 @@ Your role is to accurately extract specific entities from these invoices to supp
|
|
|
54
54
|
- unitPrice: Even if the quantity is not mentioned, you can still extract the unit price. Check the naming of the columns in a different languages, it can be "Unit Price", "Prezzo unitario", "Prix Unitaire", "Unitario", etc. Refer to "Prezzo unitario" field in the italian invoice example.
|
|
55
55
|
- totalAmount: The total amount for the item. It can be in different currencies, so ensure to capture the currency as well for the totalAmountCurrency.
|
|
56
56
|
- totalAmountEuro: Few line items contains a total amount in Euro. You can find it by looking for the term "Total EUR" or "Amount in Euro" in the line item but it's always in the EURO / € currency. Sometimes, it can be same as totalAmount if the line item is already in Euro.
|
|
57
|
-
- quantity: The quantity of the item or service provided in the line item.
|
|
57
|
+
- quantity: The quantity of the item or service provided in the line item. Pay attention to 2 x 40HC or 2x40HC. It means, quantity is 2 and 40HC is containerSize but not 240.
|
|
58
58
|
- containerNumber: Container Number always starts with 4 letters and is followed by 7 digits (e.g., ABCD1234567).
|
|
59
59
|
|
|
60
60
|
- hblNumber and mblNumber:
|
|
@@ -52,7 +52,7 @@ Your role is to accurately extract specific entities from these invoices to supp
|
|
|
52
52
|
- unitPrice: Even if the quantity is not mentioned, you can still extract the unit price. Check the naming of the columns in a different languages, it can be "Unit Price", "Prezzo unitario", "Prix Unitaire", "Unitario", etc. Refer to "Prezzo unitario" field in the italian invoice example.
|
|
53
53
|
- totalAmount: The total amount for the item. It can be in different currencies, so ensure to capture the currency as well for the totalAmountCurrency.
|
|
54
54
|
- totalAmountEuro: Few line items contains a total amount in Euro. You can find it by looking for the term "Total EUR" or "Amount in Euro" in the line item but it's always in the EURO / € currency. Sometimes, it can be same as totalAmount if the line item is already in Euro.
|
|
55
|
-
- quantity: The quantity of the item or service provided in the line item. Pay attention to 2x40HC. It means, quantity is 2 and
|
|
55
|
+
- quantity: The quantity of the item or service provided in the line item. Pay attention to 2 x 40HC or 2x40HC. It means, quantity is 2 and 40HC is containerSize but not 240.
|
|
56
56
|
- containerNumber: Container Number always starts with 4 letters and is followed by 7 digits (e.g., ABCD1234567, XALU 8593678).
|
|
57
57
|
|
|
58
58
|
- hblNumber and mblNumber:
|
|
@@ -68,6 +68,7 @@ Your role is to accurately extract specific entities from these invoices to supp
|
|
|
68
68
|
- Example:
|
|
69
69
|
- "COSCO SHIPPING Lines Italy, Poland, or France S.R.L. – Genova Office – As Agent For COSCO SHIPPING Lines Co.,Ltd."
|
|
70
70
|
- vendorName: COSCO SHIPPING Lines Co.,Ltd.
|
|
71
|
+
- From Hapag-Lloyd invoices, look for "Ballindamm 25" address to extract the vendorAddress.
|
|
71
72
|
|
|
72
73
|
- agentName: Name of the agent. Agencies are offices authorized to act on behalf of a company. This details usually available including the branch name of the parent company name in the invoice.
|
|
73
74
|
- agentKeyWord:
|
|
File without changes
|
{data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/constants_sandbox.py
RENAMED
|
File without changes
|
|
File without changes
|
{data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/docai_processor_config.yaml
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_science_document_ai-1.45.0 → data_science_document_ai-1.45.1}/src/prompts/prompt_library.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|