data-science-document-ai 1.53.0__py3-none-any.whl → 1.54.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_science_document_ai-1.53.0.dist-info → data_science_document_ai-1.54.0.dist-info}/METADATA +1 -1
- {data_science_document_ai-1.53.0.dist-info → data_science_document_ai-1.54.0.dist-info}/RECORD +7 -7
- src/constants.py +2 -1
- src/pdf_processing.py +6 -23
- src/postprocessing/postprocess_partner_invoice.py +55 -23
- src/setup.py +6 -0
- {data_science_document_ai-1.53.0.dist-info → data_science_document_ai-1.54.0.dist-info}/WHEEL +0 -0
{data_science_document_ai-1.53.0.dist-info → data_science_document_ai-1.54.0.dist-info}/RECORD
RENAMED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
src/constants.py,sha256=
|
|
1
|
+
src/constants.py,sha256=H43Az9AtoBKfcq9yY4TQQJY8DfdILV5kXy8EMtRaWYA,3583
|
|
2
2
|
src/constants_sandbox.py,sha256=Iu6HdjCoNSmOX0AwoL9qUQkhq_ZnIN5U9e-Q2UfNuGc,547
|
|
3
3
|
src/docai.py,sha256=dHuR0ehVjUi1CnoNvdp_yxJtpU_HFXqAZ61ywdz7BEo,5655
|
|
4
4
|
src/docai_processor_config.yaml,sha256=4yKKZPvFCA-3S56jDYSqMGKXGFND-768OiU2seRiAzE,604
|
|
@@ -6,11 +6,11 @@ src/excel_processing.py,sha256=TRgAzSHvL1WKbUgjHtpXL701bPhiWGH7kk3S6e1UPaA,3074
|
|
|
6
6
|
src/io.py,sha256=rYjXVLlriEacw1uNuPIYhg12bXNu48Qs9GYMY2YcVTE,5563
|
|
7
7
|
src/llm.py,sha256=a7UYA4ITUNjzct_2fHgM-bma_XWc28VC0FV71g9tnUI,7137
|
|
8
8
|
src/log_setup.py,sha256=RhHnpXqcl-ii4EJzRt47CF2R-Q3YPF68tepg_Kg7tkw,2895
|
|
9
|
-
src/pdf_processing.py,sha256=
|
|
9
|
+
src/pdf_processing.py,sha256=81fS2xL36n9QgB7DpXe7SCS-Lyz11cFDgccYMK3ZVkk,20026
|
|
10
10
|
src/postprocessing/common.py,sha256=dagAg0hZGuZc03bXdfOolxekewMEVUfz917IGCiAtWI,26118
|
|
11
11
|
src/postprocessing/postprocess_booking_confirmation.py,sha256=nK32eDiBNbauyQz0oCa9eraysku8aqzrcoRFoWVumDU,4827
|
|
12
12
|
src/postprocessing/postprocess_commercial_invoice.py,sha256=3I8ijluTZcOs_sMnFZxfkAPle0UFQ239EMuvZfDZVPg,1028
|
|
13
|
-
src/postprocessing/postprocess_partner_invoice.py,sha256=
|
|
13
|
+
src/postprocessing/postprocess_partner_invoice.py,sha256=WuaTQK5D09dV_QNrh29ZoKX9IvQn2Ub-WnAMyRjCsvI,14240
|
|
14
14
|
src/prompts/library/arrivalNotice/other/placeholders.json,sha256=1vzly1amgyKt3jr2JJQbb24kNZsnI289iduvoUo5dJU,3061
|
|
15
15
|
src/prompts/library/arrivalNotice/other/prompt.txt,sha256=QNuU-BvMA8VbdupVNapad4O3WmCotH5cKNxImRMbKDk,2906
|
|
16
16
|
src/prompts/library/bookingConfirmation/evergreen/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
|
|
@@ -52,9 +52,9 @@ src/prompts/library/preprocessing/carrier/prompt.txt,sha256=NLvRZQCZ6aWC1yTr7Q93
|
|
|
52
52
|
src/prompts/library/shippingInstruction/other/placeholders.json,sha256=eK4AeMfORkGMWVYcqH7NjB56Zb4swHTvcQD5UQbTryg,6374
|
|
53
53
|
src/prompts/library/shippingInstruction/other/prompt.txt,sha256=CbrqlKMtB-sVY-8E460KP1KNmz169YVPMrH3-uEldPg,2135
|
|
54
54
|
src/prompts/prompt_library.py,sha256=VJWHeXN-s501C2GiidIIvQQuZdU6T1R27hE2dKBiI40,2555
|
|
55
|
-
src/setup.py,sha256=
|
|
55
|
+
src/setup.py,sha256=yb0Pz1RI-uId5lEjgQrj1Pqo9FvwG9vs0HXRVbyST2M,7186
|
|
56
56
|
src/tms.py,sha256=UXbIo1QE--hIX6NZi5Qyp2R_CP338syrY9pCTPrfgnE,1741
|
|
57
57
|
src/utils.py,sha256=Ow5_Jals88o8mbZ1BoHfZpHZoCfig_UQb5aalH-mpWE,17278
|
|
58
|
-
data_science_document_ai-1.
|
|
59
|
-
data_science_document_ai-1.
|
|
60
|
-
data_science_document_ai-1.
|
|
58
|
+
data_science_document_ai-1.54.0.dist-info/METADATA,sha256=EoZfH8hyvq8E4wa_xfpRNE6oWmoYTIQycLj8itwuMiY,2152
|
|
59
|
+
data_science_document_ai-1.54.0.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
|
|
60
|
+
data_science_document_ai-1.54.0.dist-info/RECORD,,
|
src/constants.py
CHANGED
|
@@ -20,10 +20,11 @@ project_parameters = {
|
|
|
20
20
|
# Fuzzy lookup
|
|
21
21
|
"g_model_fuzzy_lookup_folder": "fuzzy_lookup",
|
|
22
22
|
"item_code_lookup": "line_item_kvp_table.json",
|
|
23
|
+
"intermodal_partners": "intermodal_partners.json",
|
|
23
24
|
"invoice_classification_lookup": "invoice_classification.json",
|
|
24
25
|
"reverse_charge_sentence_lookup": "reverse_charge_sentences.json",
|
|
25
26
|
# Fuzzy logic params
|
|
26
|
-
"fuzzy_threshold_item_code":
|
|
27
|
+
"fuzzy_threshold_item_code": 92,
|
|
27
28
|
"fuzzy_threshold_reverse_charge": 80,
|
|
28
29
|
"fuzzy_threshold_invoice_classification": 70,
|
|
29
30
|
# Chunking params
|
src/pdf_processing.py
CHANGED
|
@@ -32,7 +32,6 @@ from src.postprocessing.postprocess_partner_invoice import (
|
|
|
32
32
|
from src.prompts.prompt_library import prompt_library
|
|
33
33
|
from src.utils import (
|
|
34
34
|
extract_top_pages,
|
|
35
|
-
generate_schema_structure,
|
|
36
35
|
get_pdf_page_count,
|
|
37
36
|
get_processor_name,
|
|
38
37
|
run_background_tasks,
|
|
@@ -250,7 +249,6 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
|
|
|
250
249
|
prompt += "\nFor each field, provide the page number where the information was found. The page numbering starts from 0."
|
|
251
250
|
|
|
252
251
|
tasks = []
|
|
253
|
-
semaphore = asyncio.Semaphore(50)
|
|
254
252
|
# Process in chunks if number of pages exceeds threshold and Process all chunks concurrently
|
|
255
253
|
for chunk in (
|
|
256
254
|
split_pdf_into_chunks(file_content, chunk_size=params["chunk_size"])
|
|
@@ -258,8 +256,12 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
|
|
|
258
256
|
else [file_content]
|
|
259
257
|
):
|
|
260
258
|
tasks.append(
|
|
261
|
-
|
|
262
|
-
|
|
259
|
+
process_chunk_with_retry(
|
|
260
|
+
chunk,
|
|
261
|
+
prompt,
|
|
262
|
+
response_schema,
|
|
263
|
+
llm_client,
|
|
264
|
+
input_doc_type,
|
|
263
265
|
)
|
|
264
266
|
)
|
|
265
267
|
|
|
@@ -271,25 +273,6 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
|
|
|
271
273
|
return llm_prediction_to_tuples(results[0], number_of_pages=number_of_pages)
|
|
272
274
|
|
|
273
275
|
|
|
274
|
-
async def process_chunk_with_semaphore(
|
|
275
|
-
semaphore,
|
|
276
|
-
chunk_content,
|
|
277
|
-
prompt,
|
|
278
|
-
response_schema,
|
|
279
|
-
llm_client,
|
|
280
|
-
input_doc_type,
|
|
281
|
-
):
|
|
282
|
-
"""Process a chunk with a semaphore to limit concurrency."""
|
|
283
|
-
async with semaphore:
|
|
284
|
-
return await process_chunk_with_retry(
|
|
285
|
-
chunk_content,
|
|
286
|
-
prompt,
|
|
287
|
-
response_schema,
|
|
288
|
-
llm_client,
|
|
289
|
-
input_doc_type,
|
|
290
|
-
)
|
|
291
|
-
|
|
292
|
-
|
|
293
276
|
async def process_chunk_with_retry(
|
|
294
277
|
chunk_content, prompt, response_schema, llm_client, input_doc_type, retries=2
|
|
295
278
|
):
|
src/postprocessing/postprocess_partner_invoice.py
CHANGED
|
@@ -188,15 +188,20 @@ async def process_partner_invoice(params, aggregated_data, document_type_code):
|
|
|
188
188
|
reverse_charge_info["formattedValue"] = reverse_charge_value
|
|
189
189
|
reverse_charge = aggregated_data.pop("reverseChargeSentence", None)
|
|
190
190
|
|
|
191
|
+
# Partner Name
|
|
192
|
+
partner_name = aggregated_data.get("vendorName", {}).get("documentValue", None)
|
|
193
|
+
|
|
191
194
|
# Process everything in one go
|
|
192
|
-
processed_items = await process_line_items_batch(
|
|
195
|
+
processed_items = await process_line_items_batch(
|
|
196
|
+
params, line_items, reverse_charge, partner_name
|
|
197
|
+
)
|
|
193
198
|
|
|
194
199
|
# Update your main data structure
|
|
195
200
|
aggregated_data["lineItem"] = processed_items
|
|
196
201
|
|
|
197
202
|
|
|
198
203
|
async def process_line_items_batch(
|
|
199
|
-
params: dict, line_items: list[dict], reverse_charge=None
|
|
204
|
+
params: dict, line_items: list[dict], reverse_charge=None, partner_name=None
|
|
200
205
|
):
|
|
201
206
|
"""
|
|
202
207
|
Processes all line items efficiently using a "Split-Apply-Combine" strategy.
|
|
@@ -234,23 +239,12 @@ async def process_line_items_batch(
|
|
|
234
239
|
|
|
235
240
|
# Batch API Call for Embedding lookups
|
|
236
241
|
if pending_line_items:
|
|
237
|
-
|
|
238
|
-
logger.info(f"Mapping {len(values_to_fetch)} line items from Embedding API...")
|
|
239
|
-
|
|
240
|
-
# Await the batch response {"desc1": "code1", "desc2": "code2"}
|
|
241
|
-
api_results = await get_tms_mappings(
|
|
242
|
-
input_list=values_to_fetch, embedding_type="line_items"
|
|
243
|
-
)
|
|
242
|
+
code_map = await fetch_line_item_codes(pending_line_items, partner_name, params)
|
|
244
243
|
|
|
245
|
-
# Merge API results back into original list
|
|
246
244
|
for index, desc in pending_line_items.items():
|
|
247
|
-
# Get result from API response, or None if API failed for that item
|
|
248
|
-
forto_code = api_results.get(desc)
|
|
249
|
-
|
|
250
|
-
# Update the original item
|
|
251
245
|
line_items[index]["itemCode"] = {
|
|
252
246
|
"documentValue": desc,
|
|
253
|
-
"formattedValue":
|
|
247
|
+
"formattedValue": code_map.get(desc),
|
|
254
248
|
"page": line_items[index]["lineItemDescription"].get("page"),
|
|
255
249
|
}
|
|
256
250
|
|
|
@@ -344,12 +338,13 @@ def find_matching_lineitem(new_lineitem: str, kvp_dict: dict, threshold=90):
|
|
|
344
338
|
return None
|
|
345
339
|
|
|
346
340
|
|
|
347
|
-
async def associate_forto_item_code(line_item_data, params):
|
|
341
|
+
async def associate_forto_item_code(line_item_data, params, partner_name=None):
|
|
348
342
|
"""
|
|
349
343
|
Associates Forto item codes to a list of line item descriptions.
|
|
350
344
|
Args:
|
|
351
345
|
line_item_data (dict): A dictionary where keys are original descriptions and values are cleaned descriptions.
|
|
352
346
|
params (dict): Parameters containing lookup data and thresholds.
|
|
347
|
+
partner_name (str, optional): The name of the partner for context in matching. Defaults to None.
|
|
353
348
|
|
|
354
349
|
Returns:
|
|
355
350
|
list: A list of dictionaries with 'description' and 'itemCode' keys.
|
|
@@ -371,14 +366,51 @@ async def associate_forto_item_code(line_item_data, params):
|
|
|
371
366
|
|
|
372
367
|
# Batch API Call for Embedding lookups
|
|
373
368
|
if pending_line_items:
|
|
374
|
-
|
|
375
|
-
input_list=list(pending_line_items.values()),
|
|
376
|
-
embedding_type="line_items",
|
|
377
|
-
)
|
|
369
|
+
code_map = await fetch_line_item_codes(pending_line_items, partner_name, params)
|
|
378
370
|
|
|
379
|
-
# Merge API results back into original list
|
|
380
371
|
for desc, f_desc in pending_line_items.items():
|
|
381
|
-
|
|
382
|
-
|
|
372
|
+
result.append(
|
|
373
|
+
{
|
|
374
|
+
"description": desc,
|
|
375
|
+
"itemCode": code_map.get(f_desc),
|
|
376
|
+
}
|
|
377
|
+
)
|
|
378
|
+
|
|
379
|
+
return result
|
|
380
|
+
|
|
383
381
|
|
|
382
|
+
async def fetch_line_item_codes(
|
|
383
|
+
pending_line_items: dict,
|
|
384
|
+
partner_name: str | None,
|
|
385
|
+
params: dict,
|
|
386
|
+
):
|
|
387
|
+
"""Returns: {original_description: mapped_code_or_None}"""
|
|
388
|
+
t_mode = (
|
|
389
|
+
find_matching_lineitem(
|
|
390
|
+
partner_name.upper(),
|
|
391
|
+
params["lookup_data"]["intermodal_partners"],
|
|
392
|
+
threshold=87,
|
|
393
|
+
)
|
|
394
|
+
if partner_name
|
|
395
|
+
else None
|
|
396
|
+
)
|
|
397
|
+
|
|
398
|
+
unique_descs = list(set(pending_line_items.values()))
|
|
399
|
+
logger.info(f"Mapping {len(unique_descs)} line items from Embedding API...")
|
|
400
|
+
|
|
401
|
+
# Build API input map
|
|
402
|
+
api_input_map = {
|
|
403
|
+
desc: f"{t_mode} - {desc}" if t_mode else desc for desc in unique_descs
|
|
404
|
+
}
|
|
405
|
+
|
|
406
|
+
api_results = await get_tms_mappings(
|
|
407
|
+
input_list=list(api_input_map.values()),
|
|
408
|
+
embedding_type="line_items",
|
|
409
|
+
)
|
|
410
|
+
|
|
411
|
+
# Normalize response back to original descriptions
|
|
412
|
+
result = {
|
|
413
|
+
original_desc: api_results.get(api_desc)
|
|
414
|
+
for original_desc, api_desc in api_input_map.items()
|
|
415
|
+
}
|
|
384
416
|
return result
|
src/setup.py
CHANGED
|
@@ -184,6 +184,9 @@ def setup_lookup_data(params):
|
|
|
184
184
|
input_path_item_code = (
|
|
185
185
|
f'{params["g_model_fuzzy_lookup_folder"]}/{params["item_code_lookup"]}'
|
|
186
186
|
)
|
|
187
|
+
input_path_intermodal_partners = (
|
|
188
|
+
f'{params["g_model_fuzzy_lookup_folder"]}/{params["intermodal_partners"]}'
|
|
189
|
+
)
|
|
187
190
|
input_path_invoice_classification = f'{params["g_model_fuzzy_lookup_folder"]}/{params["invoice_classification_lookup"]}' # noqa: E501
|
|
188
191
|
input_path_reverse_charge = f'{params["g_model_fuzzy_lookup_folder"]}/{params["reverse_charge_sentence_lookup"]}'
|
|
189
192
|
|
|
@@ -194,6 +197,9 @@ def setup_lookup_data(params):
|
|
|
194
197
|
return json.loads(downloaded_data)
|
|
195
198
|
|
|
196
199
|
data["item_code"] = download_json_from_bucket(input_path_item_code)
|
|
200
|
+
data["intermodal_partners"] = download_json_from_bucket(
|
|
201
|
+
input_path_intermodal_partners
|
|
202
|
+
)
|
|
197
203
|
data["invoice_classification"] = download_json_from_bucket(
|
|
198
204
|
input_path_invoice_classification
|
|
199
205
|
)
|
{data_science_document_ai-1.53.0.dist-info → data_science_document_ai-1.54.0.dist-info}/WHEEL
RENAMED
|
File without changes
|