data-science-document-ai 1.42.1__tar.gz → 1.42.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/PKG-INFO +1 -1
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/pyproject.toml +1 -1
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/excel_processing.py +2 -1
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/postprocessing/common.py +1 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/postprocessing/postprocess_partner_invoice.py +5 -30
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/prompts/library/partnerInvoice/other/prompt.txt +1 -1
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/constants.py +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/constants_sandbox.py +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/docai.py +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/docai_processor_config.yaml +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/io.py +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/llm.py +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/log_setup.py +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/pdf_processing.py +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/postprocessing/postprocess_booking_confirmation.py +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/postprocessing/postprocess_commercial_invoice.py +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/prompts/library/bookingConfirmation/evergreen/prompt.txt +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/prompts/library/bookingConfirmation/maersk/placeholders.json +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/prompts/library/bookingConfirmation/maersk/prompt.txt +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/prompts/library/bookingConfirmation/msc/placeholders.json +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/prompts/library/bookingConfirmation/msc/prompt.txt +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/prompts/library/bookingConfirmation/oocl/placeholders.json +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/prompts/library/bookingConfirmation/oocl/prompt.txt +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/prompts/library/bookingConfirmation/other/placeholders.json +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/prompts/library/bookingConfirmation/other/prompt.txt +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/prompts/library/bookingConfirmation/yangming/placeholders.json +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/prompts/library/bookingConfirmation/yangming/prompt.txt +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/prompts/library/bundeskasse/other/placeholders.json +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/prompts/library/bundeskasse/other/prompt.txt +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/prompts/library/commercialInvoice/other/prompt.txt +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/prompts/library/customsAssessment/other/prompt.txt +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/prompts/library/customsInvoice/other/placeholders.json +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/prompts/library/customsInvoice/other/prompt.txt +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/prompts/library/deliveryOrder/other/placeholders.json +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/prompts/library/deliveryOrder/other/prompt.txt +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/prompts/library/draftMbl/maersk/prompt.txt +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/prompts/library/draftMbl/other/placeholders.json +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/prompts/library/draftMbl/other/prompt.txt +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/prompts/library/finalMbL/maersk/prompt.txt +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/prompts/library/finalMbL/other/prompt.txt +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/prompts/library/packingList/other/prompt.txt +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/prompts/library/partnerInvoice/other/placeholders.json +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/prompts/library/postprocessing/port_code/placeholders.json +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/prompts/library/postprocessing/port_code/prompt_port_code.txt +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/prompts/library/preprocessing/carrier/placeholders.json +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/prompts/library/preprocessing/carrier/prompt.txt +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/prompts/library/shippingInstruction/other/prompt.txt +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/prompts/prompt_library.py +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/setup.py +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/tms.py +0 -0
- {data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "data-science-document-ai"
|
|
3
|
-
version = "1.42.1"
|
|
3
|
+
version = "1.42.3"
|
|
4
4
|
description = "\"Document AI repo for data science\""
|
|
5
5
|
authors = ["Naomi Nguyen <naomi.nguyen@forto.com>", "Kumar Rajendrababu <kumar.rajendrababu@forto.com>", "Igor Tonko <igor.tonko@forto.com>", "Osman Demirel <osman.demirel@forto.com>"]
|
|
6
6
|
packages = [
|
|
@@ -20,7 +20,8 @@ async def extract_data_from_sheet(
|
|
|
20
20
|
params, sheet_name, sheet, response_schema, doc_type=None
|
|
21
21
|
):
|
|
22
22
|
logger.info(f"Processing sheet: {sheet_name}")
|
|
23
|
-
excel_content = pd.DataFrame(sheet.values)
|
|
23
|
+
excel_content = pd.DataFrame(sheet.values).dropna(how="all", axis=1)
|
|
24
|
+
|
|
24
25
|
# Convert to Markdown format for the LLM model
|
|
25
26
|
worksheet = (
|
|
26
27
|
"This is from a excel. Pay attention to the cell position:\n"
|
{data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/postprocessing/common.py
RENAMED
|
@@ -456,6 +456,7 @@ async def format_label(entity_k, entity_value, document_type_code, params):
|
|
|
456
456
|
elif "quantity" in entity_key:
|
|
457
457
|
if document_type_code in ["partnerInvoice", "customsInvoice", "bundeskasse"]:
|
|
458
458
|
# For partner invoice, quantity can be mentioned as whole number
|
|
459
|
+
# Apply decimal convertor for 46,45 --> 46.45 but not for 1.000 --> 1000
|
|
459
460
|
formatted_value = decimal_convertor(
|
|
460
461
|
extract_number(entity_value), quantity=True
|
|
461
462
|
)
|
|
@@ -138,36 +138,7 @@ def update_recipient_and_vendor(aggregated_data, is_recipient_forto):
|
|
|
138
138
|
|
|
139
139
|
def process_partner_invoice(params, aggregated_data, document_type_code):
|
|
140
140
|
"""Process the partner invoice data."""
|
|
141
|
-
# Post process
|
|
142
|
-
# TODO: Remove this block of code after migrating to LLM completely and update the placeholder in the prompt library
|
|
143
|
-
if "containerNumber" in aggregated_data and isinstance(
|
|
144
|
-
aggregated_data["containerNumber"], dict
|
|
145
|
-
):
|
|
146
|
-
container_number = aggregated_data.get("containerNumber", {}).get(
|
|
147
|
-
"formattedValue", None
|
|
148
|
-
)
|
|
149
|
-
if container_number:
|
|
150
|
-
aggregated_data["containerNumber"] = (
|
|
151
|
-
[
|
|
152
|
-
{
|
|
153
|
-
"documentValue": aggregated_data.get("containerNumber", {}).get(
|
|
154
|
-
"documentValue", ""
|
|
155
|
-
),
|
|
156
|
-
"formattedValue": ctr_number,
|
|
157
|
-
}
|
|
158
|
-
for ctr_number in container_number
|
|
159
|
-
]
|
|
160
|
-
if isinstance(container_number, list)
|
|
161
|
-
else [
|
|
162
|
-
{
|
|
163
|
-
"documentValue": aggregated_data.get("containerNumber", {}).get(
|
|
164
|
-
"documentValue", ""
|
|
165
|
-
),
|
|
166
|
-
"formattedValue": container_number,
|
|
167
|
-
}
|
|
168
|
-
]
|
|
169
|
-
)
|
|
170
|
-
|
|
141
|
+
# Post process bundeskasse invoices
|
|
171
142
|
if document_type_code == "bundeskasse":
|
|
172
143
|
post_process_bundeskasse(aggregated_data)
|
|
173
144
|
return
|
|
@@ -197,9 +168,13 @@ def process_partner_invoice(params, aggregated_data, document_type_code):
|
|
|
197
168
|
params,
|
|
198
169
|
)
|
|
199
170
|
|
|
171
|
+
# Add page number for the consistency
|
|
172
|
+
line_item["itemCode"]["page"] = line_item["lineItemDescription"]["page"]
|
|
173
|
+
|
|
200
174
|
if reverse_charge:
|
|
201
175
|
# Distribute reverseChargeSentence to all line items
|
|
202
176
|
line_item["reverseChargeSentence"] = reverse_charge
|
|
177
|
+
line_item["reverseChargeSentence"]["page"] = reverse_charge["page"]
|
|
203
178
|
|
|
204
179
|
|
|
205
180
|
def compute_score(args):
|
|
@@ -52,7 +52,7 @@ Your role is to accurately extract specific entities from these invoices to supp
|
|
|
52
52
|
- unitPrice: Even if the quantity is not mentioned, you can still extract the unit price. Check the naming of the columns in a different languages, it can be "Unit Price", "Prezzo unitario", "Prix Unitaire", "Unitario", etc. Refer to "Prezzo unitario" field in the italian invoice example.
|
|
53
53
|
- totalAmount: The total amount for the item. It can be in different currencies, so ensure to capture the currency as well for the totalAmountCurrency.
|
|
54
54
|
- totalAmountEuro: Few line items contains a total amount in Euro. You can find it by looking for the term "Total EUR" or "Amount in Euro" in the line item but it's always in the EURO / € currency. Sometimes, it can be same as totalAmount if the line item is already in Euro.
|
|
55
|
-
- quantity: The quantity of the item or service provided in the line item.
|
|
55
|
+
- quantity: The quantity of the item or service provided in the line item. Pay attention to 2x40HC. It means, quantity is 2 and containerSize is 40HC but not 240.
|
|
56
56
|
- containerNumber: Container Number always starts with 4 letters and is followed by 7 digits (e.g., ABCD1234567).
|
|
57
57
|
|
|
58
58
|
- hblNumber and mblNumber:
|
|
File without changes
|
{data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/constants_sandbox.py
RENAMED
|
File without changes
|
|
File without changes
|
{data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/docai_processor_config.yaml
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_science_document_ai-1.42.1 → data_science_document_ai-1.42.3}/src/prompts/prompt_library.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|