data-science-document-ai 1.52.1__tar.gz → 1.54.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/PKG-INFO +1 -1
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/pyproject.toml +1 -1
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/constants.py +2 -1
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/pdf_processing.py +5 -2
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/postprocessing/postprocess_partner_invoice.py +76 -23
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/setup.py +6 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/constants_sandbox.py +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/docai.py +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/docai_processor_config.yaml +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/excel_processing.py +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/io.py +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/llm.py +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/log_setup.py +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/postprocessing/common.py +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/postprocessing/postprocess_booking_confirmation.py +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/postprocessing/postprocess_commercial_invoice.py +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/arrivalNotice/other/placeholders.json +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/arrivalNotice/other/prompt.txt +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/bookingConfirmation/evergreen/prompt.txt +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/bookingConfirmation/maersk/placeholders.json +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/bookingConfirmation/maersk/prompt.txt +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/bookingConfirmation/msc/placeholders.json +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/bookingConfirmation/msc/prompt.txt +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/bookingConfirmation/oocl/placeholders.json +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/bookingConfirmation/oocl/prompt.txt +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/bookingConfirmation/other/placeholders.json +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/bookingConfirmation/other/prompt.txt +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/bookingConfirmation/yangming/placeholders.json +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/bookingConfirmation/yangming/prompt.txt +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/bundeskasse/other/placeholders.json +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/bundeskasse/other/prompt.txt +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/commercialInvoice/other/placeholders.json +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/commercialInvoice/other/prompt.txt +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/customsAssessment/other/placeholders.json +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/customsAssessment/other/prompt.txt +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/customsInvoice/other/placeholders.json +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/customsInvoice/other/prompt.txt +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/deliveryOrder/other/placeholders.json +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/deliveryOrder/other/prompt.txt +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/draftMbl/other/placeholders.json +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/draftMbl/other/prompt.txt +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/finalMbL/other/placeholders.json +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/finalMbL/other/prompt.txt +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/packingList/other/placeholders.json +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/packingList/other/prompt.txt +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/partnerInvoice/other/placeholders.json +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/partnerInvoice/other/prompt.txt +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/postprocessing/port_code/placeholders.json +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/postprocessing/port_code/prompt_port_code.txt +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/preprocessing/carrier/placeholders.json +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/preprocessing/carrier/prompt.txt +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/shippingInstruction/other/placeholders.json +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/library/shippingInstruction/other/prompt.txt +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/prompt_library.py +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/tms.py +0 -0
- {data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "data-science-document-ai"
|
|
3
|
-
version = "1.52.1"
|
|
3
|
+
version = "1.54.0"
|
|
4
4
|
description = "\"Document AI repo for data science\""
|
|
5
5
|
authors = ["Naomi Nguyen <naomi.nguyen@forto.com>", "Kumar Rajendrababu <kumar.rajendrababu@forto.com>", "Igor Tonko <igor.tonko@forto.com>", "Osman Demirel <osman.demirel@forto.com>"]
|
|
6
6
|
packages = [
|
|
@@ -20,10 +20,11 @@ project_parameters = {
|
|
|
20
20
|
# Fuzzy lookup
|
|
21
21
|
"g_model_fuzzy_lookup_folder": "fuzzy_lookup",
|
|
22
22
|
"item_code_lookup": "line_item_kvp_table.json",
|
|
23
|
+
"intermodal_partners": "intermodal_partners.json",
|
|
23
24
|
"invoice_classification_lookup": "invoice_classification.json",
|
|
24
25
|
"reverse_charge_sentence_lookup": "reverse_charge_sentences.json",
|
|
25
26
|
# Fuzzy logic params
|
|
26
|
-
"fuzzy_threshold_item_code":
|
|
27
|
+
"fuzzy_threshold_item_code": 92,
|
|
27
28
|
"fuzzy_threshold_reverse_charge": 80,
|
|
28
29
|
"fuzzy_threshold_invoice_classification": 70,
|
|
29
30
|
# Chunking params
|
|
@@ -32,7 +32,6 @@ from src.postprocessing.postprocess_partner_invoice import (
|
|
|
32
32
|
from src.prompts.prompt_library import prompt_library
|
|
33
33
|
from src.utils import (
|
|
34
34
|
extract_top_pages,
|
|
35
|
-
generate_schema_structure,
|
|
36
35
|
get_pdf_page_count,
|
|
37
36
|
get_processor_name,
|
|
38
37
|
run_background_tasks,
|
|
@@ -258,7 +257,11 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
|
|
|
258
257
|
):
|
|
259
258
|
tasks.append(
|
|
260
259
|
process_chunk_with_retry(
|
|
261
|
-
chunk,
|
|
260
|
+
chunk,
|
|
261
|
+
prompt,
|
|
262
|
+
response_schema,
|
|
263
|
+
llm_client,
|
|
264
|
+
input_doc_type,
|
|
262
265
|
)
|
|
263
266
|
)
|
|
264
267
|
|
|
@@ -1,4 +1,6 @@
|
|
|
1
1
|
"""This module contains the postprocessing functions for the partner invoice."""
|
|
2
|
+
from collections import defaultdict
|
|
3
|
+
|
|
2
4
|
from rapidfuzz import fuzz, process
|
|
3
5
|
|
|
4
6
|
from src.io import logger
|
|
@@ -143,6 +145,20 @@ def update_recipient_and_vendor(aggregated_data, is_recipient_forto):
|
|
|
143
145
|
] = "Dasbachstraße 15, 54292 Trier, Germany"
|
|
144
146
|
|
|
145
147
|
|
|
148
|
+
def select_unique_bank_account(bank_account):
    """Collapse duplicate bank account entries, keeping the earliest page.

    For every distinct ``documentValue`` only the entry with the smallest
    ``page`` is kept. The result preserves the order in which each distinct
    ``documentValue`` first appears in the input.

    Args:
        bank_account: Extracted bank account field. Expected to be a list of
            dicts carrying ``documentValue`` and ``page`` keys.

    Returns:
        A de-duplicated list when ``bank_account`` is a non-empty list;
        otherwise ``bank_account`` itself, unchanged.
    """
    # The caller assigns the return value back onto the aggregated data, so
    # anything that is not a non-empty list (a single dict, None, []) must be
    # handed back untouched instead of being silently replaced by None.
    if not isinstance(bank_account, list) or not bank_account:
        return bank_account

    # documentValue -> (page rank, entry); a dict keeps first-seen key order.
    earliest = {}
    for entry in bank_account:
        page = entry.get("page")
        # An entry without a page number can never beat one that has a page.
        rank = float("inf") if page is None else page
        value = entry["documentValue"]
        # Strict "<" keeps the first entry when two share the same page.
        if value not in earliest or rank < earliest[value][0]:
            earliest[value] = (rank, entry)

    return [entry for _, entry in earliest.values()]
|
|
160
|
+
|
|
161
|
+
|
|
146
162
|
async def process_partner_invoice(params, aggregated_data, document_type_code):
|
|
147
163
|
"""Process the partner invoice data."""
|
|
148
164
|
# Post process bundeskasse invoices
|
|
@@ -150,6 +166,11 @@ async def process_partner_invoice(params, aggregated_data, document_type_code):
|
|
|
150
166
|
post_process_bundeskasse(aggregated_data)
|
|
151
167
|
return
|
|
152
168
|
|
|
169
|
+
if "bankAccount" in aggregated_data:
|
|
170
|
+
aggregated_data["bankAccount"] = select_unique_bank_account(
|
|
171
|
+
aggregated_data["bankAccount"]
|
|
172
|
+
)
|
|
173
|
+
|
|
153
174
|
line_items = aggregated_data.get("lineItem", [])
|
|
154
175
|
# Add debug logging
|
|
155
176
|
logger.info(f"Processing partnerInvoice with {len(line_items)} line items")
|
|
@@ -167,15 +188,20 @@ async def process_partner_invoice(params, aggregated_data, document_type_code):
|
|
|
167
188
|
reverse_charge_info["formattedValue"] = reverse_charge_value
|
|
168
189
|
reverse_charge = aggregated_data.pop("reverseChargeSentence", None)
|
|
169
190
|
|
|
191
|
+
# Partner Name
|
|
192
|
+
partner_name = aggregated_data.get("vendorName", {}).get("documentValue", None)
|
|
193
|
+
|
|
170
194
|
# Process everything in one go
|
|
171
|
-
processed_items = await process_line_items_batch(params, line_items, reverse_charge)
|
|
195
|
+
processed_items = await process_line_items_batch(
|
|
196
|
+
params, line_items, reverse_charge, partner_name
|
|
197
|
+
)
|
|
172
198
|
|
|
173
199
|
# Update your main data structure
|
|
174
200
|
aggregated_data["lineItem"] = processed_items
|
|
175
201
|
|
|
176
202
|
|
|
177
203
|
async def process_line_items_batch(
|
|
178
|
-
params: dict, line_items: list[dict], reverse_charge=None
|
|
204
|
+
params: dict, line_items: list[dict], reverse_charge=None, partner_name=None
|
|
179
205
|
):
|
|
180
206
|
"""
|
|
181
207
|
Processes all line items efficiently using a "Split-Apply-Combine" strategy.
|
|
@@ -213,23 +239,12 @@ async def process_line_items_batch(
|
|
|
213
239
|
|
|
214
240
|
# Batch API Call for Embedding lookups
|
|
215
241
|
if pending_line_items:
|
|
216
|
-
|
|
217
|
-
logger.info(f"Mapping {len(values_to_fetch)} line items from Embedding API...")
|
|
218
|
-
|
|
219
|
-
# Await the batch response {"desc1": "code1", "desc2": "code2"}
|
|
220
|
-
api_results = await get_tms_mappings(
|
|
221
|
-
input_list=values_to_fetch, embedding_type="line_items"
|
|
222
|
-
)
|
|
242
|
+
code_map = await fetch_line_item_codes(pending_line_items, partner_name, params)
|
|
223
243
|
|
|
224
|
-
# Merge API results back into original list
|
|
225
244
|
for index, desc in pending_line_items.items():
|
|
226
|
-
# Get result from API response, or None if API failed for that item
|
|
227
|
-
forto_code = api_results.get(desc)
|
|
228
|
-
|
|
229
|
-
# Update the original item
|
|
230
245
|
line_items[index]["itemCode"] = {
|
|
231
246
|
"documentValue": desc,
|
|
232
|
-
"formattedValue": forto_code,
|
|
247
|
+
"formattedValue": code_map.get(desc),
|
|
233
248
|
"page": line_items[index]["lineItemDescription"].get("page"),
|
|
234
249
|
}
|
|
235
250
|
|
|
@@ -323,12 +338,13 @@ def find_matching_lineitem(new_lineitem: str, kvp_dict: dict, threshold=90):
|
|
|
323
338
|
return None
|
|
324
339
|
|
|
325
340
|
|
|
326
|
-
async def associate_forto_item_code(line_item_data, params):
|
|
341
|
+
async def associate_forto_item_code(line_item_data, params, partner_name=None):
|
|
327
342
|
"""
|
|
328
343
|
Associates Forto item codes to a list of line item descriptions.
|
|
329
344
|
Args:
|
|
330
345
|
line_item_data (dict): A dictionary where keys are original descriptions and values are cleaned descriptions.
|
|
331
346
|
params (dict): Parameters containing lookup data and thresholds.
|
|
347
|
+
partner_name (str, optional): The name of the partner for context in matching. Defaults to None.
|
|
332
348
|
|
|
333
349
|
Returns:
|
|
334
350
|
list: A list of dictionaries with 'description' and 'itemCode' keys.
|
|
@@ -350,14 +366,51 @@ async def associate_forto_item_code(line_item_data, params):
|
|
|
350
366
|
|
|
351
367
|
# Batch API Call for Embedding lookups
|
|
352
368
|
if pending_line_items:
|
|
353
|
-
|
|
354
|
-
input_list=list(pending_line_items.values()),
|
|
355
|
-
embedding_type="line_items",
|
|
356
|
-
)
|
|
369
|
+
code_map = await fetch_line_item_codes(pending_line_items, partner_name, params)
|
|
357
370
|
|
|
358
|
-
# Merge API results back into original list
|
|
359
371
|
for desc, f_desc in pending_line_items.items():
|
|
360
|
-
|
|
361
|
-
|
|
372
|
+
result.append(
|
|
373
|
+
{
|
|
374
|
+
"description": desc,
|
|
375
|
+
"itemCode": code_map.get(f_desc),
|
|
376
|
+
}
|
|
377
|
+
)
|
|
378
|
+
|
|
379
|
+
return result
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
async def fetch_line_item_codes(
    pending_line_items: dict,
    partner_name: str | None,
    params: dict,
):
    """Map pending line item descriptions to codes via the embedding API.

    If ``partner_name`` is given, its upper-cased form is fuzzy-matched against
    the intermodal partners lookup. When that match yields a value, every
    description is sent to the API as ``"<match> - <description>"``; otherwise
    the descriptions are sent unchanged.

    Args:
        pending_line_items (dict): Mapping whose *values* are the descriptions
            to look up. The keys are not used here.
        partner_name (str | None): Partner name used for the intermodal
            lookup. Skipped when falsy.
        params (dict): Must provide ``params["lookup_data"]["intermodal_partners"]``.

    Returns:
        dict: ``{description: mapped_code}``; the value is None when the API
        response has no entry for the (possibly prefixed) description.
    """
    # NOTE(review): "t_mode" presumably stands for a transport mode stored in
    # the intermodal partners lookup - confirm against the lookup file.
    t_mode = None
    if partner_name:
        t_mode = find_matching_lineitem(
            partner_name.upper(),
            params["lookup_data"]["intermodal_partners"],
            threshold=87,
        )

    # dict.fromkeys de-duplicates while keeping first-seen order, so the API
    # input order is reproducible across runs (set() iteration order for
    # strings changes with hash randomization).
    unique_descs = list(dict.fromkeys(pending_line_items.values()))
    logger.info(f"Mapping {len(unique_descs)} line items from Embedding API...")

    # Original description -> text actually sent to the API.
    api_input_map = {
        desc: f"{t_mode} - {desc}" if t_mode else desc for desc in unique_descs
    }

    api_results = await get_tms_mappings(
        input_list=list(api_input_map.values()),
        embedding_type="line_items",
    )

    # Translate the API keys (possibly prefixed) back to the original
    # descriptions so callers can look up by what they passed in.
    return {
        original_desc: api_results.get(api_desc)
        for original_desc, api_desc in api_input_map.items()
    }
|
|
@@ -184,6 +184,9 @@ def setup_lookup_data(params):
|
|
|
184
184
|
input_path_item_code = (
|
|
185
185
|
f'{params["g_model_fuzzy_lookup_folder"]}/{params["item_code_lookup"]}'
|
|
186
186
|
)
|
|
187
|
+
input_path_intermodal_partners = (
|
|
188
|
+
f'{params["g_model_fuzzy_lookup_folder"]}/{params["intermodal_partners"]}'
|
|
189
|
+
)
|
|
187
190
|
input_path_invoice_classification = f'{params["g_model_fuzzy_lookup_folder"]}/{params["invoice_classification_lookup"]}' # noqa: E501
|
|
188
191
|
input_path_reverse_charge = f'{params["g_model_fuzzy_lookup_folder"]}/{params["reverse_charge_sentence_lookup"]}'
|
|
189
192
|
|
|
@@ -194,6 +197,9 @@ def setup_lookup_data(params):
|
|
|
194
197
|
return json.loads(downloaded_data)
|
|
195
198
|
|
|
196
199
|
data["item_code"] = download_json_from_bucket(input_path_item_code)
|
|
200
|
+
data["intermodal_partners"] = download_json_from_bucket(
|
|
201
|
+
input_path_intermodal_partners
|
|
202
|
+
)
|
|
197
203
|
data["invoice_classification"] = download_json_from_bucket(
|
|
198
204
|
input_path_invoice_classification
|
|
199
205
|
)
|
{data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/constants_sandbox.py
RENAMED
|
File without changes
|
|
File without changes
|
{data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/docai_processor_config.yaml
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/postprocessing/common.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_science_document_ai-1.52.1 → data_science_document_ai-1.54.0}/src/prompts/prompt_library.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|