data-science-document-ai 1.43.6__tar.gz → 1.45.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/PKG-INFO +1 -1
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/pyproject.toml +1 -1
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/constants.py +4 -1
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/excel_processing.py +1 -2
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/io.py +23 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/pdf_processing.py +117 -40
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/postprocessing/common.py +132 -25
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/postprocessing/postprocess_partner_invoice.py +121 -55
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/customsInvoice/other/prompt.txt +1 -1
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/partnerInvoice/other/prompt.txt +4 -2
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/utils.py +63 -41
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/constants_sandbox.py +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/docai.py +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/docai_processor_config.yaml +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/llm.py +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/log_setup.py +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/postprocessing/postprocess_booking_confirmation.py +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/postprocessing/postprocess_commercial_invoice.py +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/bookingConfirmation/evergreen/prompt.txt +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/bookingConfirmation/maersk/placeholders.json +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/bookingConfirmation/maersk/prompt.txt +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/bookingConfirmation/msc/placeholders.json +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/bookingConfirmation/msc/prompt.txt +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/bookingConfirmation/oocl/placeholders.json +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/bookingConfirmation/oocl/prompt.txt +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/bookingConfirmation/other/placeholders.json +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/bookingConfirmation/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/bookingConfirmation/yangming/placeholders.json +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/bookingConfirmation/yangming/prompt.txt +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/bundeskasse/other/placeholders.json +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/bundeskasse/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/commercialInvoice/other/placeholders.json +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/commercialInvoice/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/customsAssessment/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/customsInvoice/other/placeholders.json +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/deliveryOrder/other/placeholders.json +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/deliveryOrder/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/draftMbl/maersk/prompt.txt +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/draftMbl/other/placeholders.json +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/draftMbl/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/finalMbL/maersk/prompt.txt +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/finalMbL/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/packingList/other/placeholders.json +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/packingList/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/partnerInvoice/other/placeholders.json +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/postprocessing/port_code/placeholders.json +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/postprocessing/port_code/prompt_port_code.txt +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/preprocessing/carrier/placeholders.json +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/preprocessing/carrier/prompt.txt +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/shippingInstruction/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/prompt_library.py +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/setup.py +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/tms.py +0 -0
{data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/pyproject.toml
RENAMED
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "data-science-document-ai"
-version = "1.43.6"
+version = "1.45.2"
 description = "\"Document AI repo for data science\""
 authors = ["Naomi Nguyen <naomi.nguyen@forto.com>", "Kumar Rajendrababu <kumar.rajendrababu@forto.com>", "Igor Tonko <igor.tonko@forto.com>", "Osman Demirel <osman.demirel@forto.com>"]
 packages = [
{data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/constants.py
RENAMED
@@ -23,9 +23,12 @@ project_parameters = {
     "invoice_classification_lookup": "invoice_classification.json",
     "reverse_charge_sentence_lookup": "reverse_charge_sentences.json",
     # Fuzzy logic params
-    "fuzzy_threshold_item_code":
+    "fuzzy_threshold_item_code": 90,
     "fuzzy_threshold_reverse_charge": 80,
     "fuzzy_threshold_invoice_classification": 70,
+    # Chunking params
+    "chunk_size": 1,  # page (do not change this without changing the page number logic)
+    "chunk_after": 10,  # pages
     # Big Query
     "g_ai_gbq_db_schema": "document_ai",
     "g_ai_gbq_db_table_out": "document_ai_api_calls_v1",
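For orientation, the two new chunking parameters are consumed in src/pdf_processing.py (see the hunk further down): chunking only starts once a document reaches "chunk_after" pages, and each chunk then covers "chunk_size" page(s). A minimal sketch of that gate, using the values added here (the 12-page document is an invented example):

# Sketch only: mirrors the gate added to process_file_w_llm in the pdf_processing.py hunk below.
params = {"chunk_size": 1, "chunk_after": 10}

number_of_pages = 12                                      # example document length
use_chunking = number_of_pages >= params["chunk_after"]   # True, since 12 >= 10
# With chunk_size=1 a 12-page PDF is processed as 12 single-page chunks,
# so each chunk's index doubles as its page number when results are merged.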
{data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/excel_processing.py
RENAMED
@@ -4,8 +4,6 @@ import logging
 
 from ddtrace import tracer
 
-from src.postprocessing.common import llm_prediction_to_tuples
-
 logger = logging.getLogger(__name__)
 
 import asyncio
@@ -78,6 +76,7 @@ async def extract_data_from_excel(
             "bundeskasse",
             "commercialInvoice",
             "packingList",
+            "bookingConfirmation",
         ]
         else generate_schema_structure(params, input_doc_type)
     )
{data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/io.py
RENAMED
@@ -156,4 +156,27 @@ def download_dir_from_bucket(bucket, directory_cloud, directory_local) -> bool:
     return result
 
 
+def bq_logs(data_to_insert, params):
+    """Insert logs into Google BigQuery.
+
+    Args:
+        data_to_insert (list): The data to insert into BigQuery.
+        params (dict): The parameters dictionary.
+    """
+    # Use the pre-initialized BigQuery client
+    bq_client = params["bq_client"]
+    # Get the table string
+    table_string = f"{params['g_ai_project_name']}.{params['g_ai_gbq_db_schema']}.{params['g_ai_gbq_db_table_out']}"
+
+    logger.info(f"Log table: {table_string}")
+    # Insert the rows into the table
+    insert_logs = bq_client.insert_rows_json(table_string, data_to_insert)
+
+    # Check if there were any errors inserting the rows
+    if not insert_logs:
+        logger.info("New rows have been added.")
+    else:
+        logger.info("Errors occurred while inserting rows: ", insert_logs)
+
+
 # type: ignore
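A usage sketch for the relocated helper (moved here from src/utils.py, see the utils.py hunk further down). The function only requires the "bq_client", "g_ai_project_name", "g_ai_gbq_db_schema", and "g_ai_gbq_db_table_out" keys in params; the project id and the row payload shown are illustrative assumptions, not the package's actual logging schema:

# Hypothetical call site; the row payload and project id are assumptions.
from google.cloud import bigquery

from src.io import bq_logs

params = {
    "bq_client": bigquery.Client(),
    "g_ai_project_name": "my-project",              # assumed project id
    "g_ai_gbq_db_schema": "document_ai",
    "g_ai_gbq_db_table_out": "document_ai_api_calls_v1",
}
rows = [{"document_type": "partnerInvoice", "status": "ok"}]  # illustrative row
bq_logs(rows, params)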
{data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/pdf_processing.py
RENAMED
@@ -36,6 +36,7 @@ from src.utils import (
     get_pdf_page_count,
     get_processor_name,
     run_background_tasks,
+    split_pdf_into_chunks,
     transform_schema_strings,
     validate_based_on_schema,
 )
@@ -195,15 +196,11 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
         result (dict): The structured data extracted from the document, formatted as JSON.
     """
     # Bundeskasse invoices contains all the required information in the first 3 pages.
-    file_content = (
-        extract_top_pages(file_content, num_pages=5)
-        if input_doc_type == "bundeskasse"
-        else file_content
-    )
-    number_of_pages = get_pdf_page_count(file_content)
+    if input_doc_type == "bundeskasse":
+        file_content = extract_top_pages(file_content, num_pages=5)
 
-
-
+    number_of_pages = get_pdf_page_count(file_content)
+    logger.info(f"processing {input_doc_type} with {number_of_pages} pages...")
 
     # get the schema placeholder from the Doc AI and generate the response structure
     response_schema = (
@@ -215,26 +212,28 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
             "bundeskasse",
             "commercialInvoice",
             "packingList",
+            "bookingConfirmation",
         ]
         else generate_schema_structure(params, input_doc_type)
     )
 
     carrier = "other"
-
-
-
-
-
-    )
-    carrier_schema = prompt_library.library["preprocessing"]["carrier"][
-        "placeholders"
-    ][input_doc_type]
+    carrier_schema = (
+        prompt_library.library.get("preprocessing", {})
+        .get("carrier", {})
+        .get("placeholders", {})
+        .get(input_doc_type)
+    )
 
+    if carrier_schema:
         carrier_prompt = prompt_library.library["preprocessing"]["carrier"]["prompt"]
         carrier_prompt = carrier_prompt.replace(
             "DOCUMENT_TYPE_PLACEHOLDER", input_doc_type
         )
 
+        # convert file_content to required document
+        document = llm_client.prepare_document_for_gemini(file_content)
+
         # identify carrier for customized prompting
         carrier = await identify_carrier(
             document,
@@ -244,37 +243,115 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
         doc_type=input_doc_type,
     )
 
-
-    response_schema = prompt_library.library[input_doc_type][carrier][
-        "placeholders"
-    ]
-
+    # Select prompt
     if (
-        input_doc_type in prompt_library.library
-
+        input_doc_type not in prompt_library.library
+        or carrier not in prompt_library.library[input_doc_type]
     ):
-
-        prompt = prompt_library.library[input_doc_type][carrier]["prompt"]
+        return {}
 
-
-
-    response_schema = transform_schema_strings(response_schema)
+    # get the related prompt from predefined prompt library
+    prompt = prompt_library.library[input_doc_type][carrier]["prompt"]
 
-
-
+    # Add page-number extraction for moderately large docs
+    use_chunking = number_of_pages >= params["chunk_after"]
 
-
-
-
-
-
-
+    # Update schema and prompt to extract value-page_number pairs
+    if not use_chunking and number_of_pages > 1:
+        response_schema = transform_schema_strings(response_schema)
+        prompt += "\nFor each field, provide the page number where the information was found. The page numbering starts from 0."
+
+    tasks = []
+    # Process in chunks if number of pages exceeds threshold and Process all chunks concurrently
+    for chunk in (
+        split_pdf_into_chunks(file_content, chunk_size=params["chunk_size"])
+        if use_chunking
+        else [file_content]
+    ):
+        tasks.append(
+            process_chunk_with_retry(
+                chunk, prompt, response_schema, llm_client, input_doc_type
+            )
         )
 
-
+    results = await asyncio.gather(*tasks, return_exceptions=True)
 
-
-
+    if use_chunking:
+        return merge_llm_results(results, response_schema)
+    else:
+        return llm_prediction_to_tuples(results[0], number_of_pages=number_of_pages)
+
+
+async def process_chunk_with_retry(
+    chunk_content, prompt, response_schema, llm_client, input_doc_type, retries=2
+):
+    """Process a chunk with retries in case of failure."""
+    for attempt in range(1, retries + 1):
+        try:
+            return await process_chunk(
+                chunk_content=chunk_content,
+                prompt=prompt,
+                response_schema=response_schema,
+                llm_client=llm_client,
+                input_doc_type=input_doc_type,
+            )
+        except Exception as e:
+            logger.error(f"Chunk failed on attempt {attempt}: {e}")
+            if attempt == retries:
+                raise
+            await asyncio.sleep(1)  # small backoff
+
+
+async def process_chunk(
+    chunk_content, prompt, response_schema, llm_client, input_doc_type
+):
+    """Process a chunk with Gemini."""
+    document = llm_client.prepare_document_for_gemini(chunk_content)
+    return await llm_client.get_unified_json_genai(
+        prompt=prompt,
+        document=document,
+        response_schema=response_schema,
+        doc_type=input_doc_type,
+    )
+
+
+def merge_llm_results(results, response_schema):
+    """Merge LLM results from multiple chunks."""
+    merged = {}
+    for i, result in enumerate(results):
+        if not isinstance(result, dict):
+            continue
+        # Add page number to all values coming from this chunk
+        result = llm_prediction_to_tuples(result, number_of_pages=1, page_number=i)
+
+        # Merge the result into the final merged dictionary
+        for key, value in result.items():
+            field_type = (
+                response_schema["properties"].get(key, {}).get("type", "").upper()
+            )
+
+            if key not in merged:
+                if field_type == "ARRAY":
+                    # append the values as a list
+                    merged[key] = (
+                        value if isinstance(value, list) else ([value] if value else [])
+                    )
+                else:
+                    merged[key] = value
+                continue
+
+            if field_type == "ARRAY":
+                # append list contents across chunks
+                if isinstance(value, list):
+                    merged[key].extend(value)
+                else:
+                    merged[key].append(value)
+
+            # take first non-null value only
+            if merged[key] in (None, "", [], {}):
+                merged[key] = value
+
+    return merged
 
 
 async def extract_data_from_pdf_w_llm(params, input_doc_type, file_content, llm_client):
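To make the new merge behaviour concrete, here is a small sketch of what merge_llm_results does with two single-page chunk results. The field names, values, and schema are invented for illustration; the function itself is the one added in the hunk above:

# Illustrative only: invented invoice fields; merge_llm_results is defined in this diff.
from src.pdf_processing import merge_llm_results

# One raw LLM result per single-page chunk (chunk index == page number).
chunk_results = [
    {"invoiceNumber": "INV-1", "lineItem": [{"description": "Freight"}]},   # page 0
    {"invoiceNumber": None,    "lineItem": [{"description": "Customs"}]},   # page 1
]
schema = {"properties": {"invoiceNumber": {"type": "STRING"},
                         "lineItem": {"type": "ARRAY"}}}

merged = merge_llm_results(chunk_results, schema)
# invoiceNumber -> ("INV-1", 0): first non-null scalar wins, tagged with its page.
# lineItem      -> both entries kept, each leaf tagged with the page it came from.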
{data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/postprocessing/common.py
RENAMED
@@ -12,7 +12,7 @@ from src.constants import formatting_rules
 from src.io import logger
 from src.postprocessing.postprocess_partner_invoice import process_partner_invoice
 from src.prompts.prompt_library import prompt_library
-from src.utils import get_tms_mappings
+from src.utils import batch_fetch_all_mappings, get_tms_mappings
 
 tms_domain = os.environ["TMS_DOMAIN"]
 
@@ -134,8 +134,11 @@ def extract_number(data_field_value):
     formatted_value: string
 
     """
+    # Remove container size pattern like 20FT, 40HC, etc from 1 x 40HC
+    value = remove_unwanted_patterns(data_field_value)
+
     formatted_value = ""
-    for c in data_field_value:
+    for c in value:
         if c.isnumeric() or c in [",", ".", "-"]:
             formatted_value += c
 
@@ -320,9 +323,12 @@ def remove_unwanted_patterns(lineitem: str):
     lineitem = lineitem.replace("HIGH CUBE", "")
 
     # Remove container size e.g., 20FT, 40HC, etc.
-
-
-
+    pattern = [
+        f"{s}{t}"
+        for s in ("20|22|40|45".split("|"))
+        for t in ("FT|HC|DC|HD|GP|OT|RF|FR|TK|DV".split("|"))
+    ]
+    lineitem = re.sub(r"|".join(pattern), "", lineitem, flags=re.IGNORECASE).strip()
 
     return lineitem
 
@@ -372,18 +378,45 @@ def clean_item_description(lineitem: str, remove_numbers: bool = True):
     return re.sub(r"\s{2,}", " ", lineitem).strip()
 
 
-async def format_label(
+async def format_label(
+    entity_k,
+    entity_value,
+    document_type_code,
+    params,
+    mime_type,
+    container_map,
+    terminal_map,
+    depot_map,
+):
     llm_client = params["LlmClient"]
     if isinstance(entity_value, dict):  # if it's a nested entity
         format_tasks = [
-            format_label(
+            format_label(
+                sub_k,
+                sub_v,
+                document_type_code,
+                params,
+                mime_type,
+                container_map,
+                terminal_map,
+                depot_map,
+            )
             for sub_k, sub_v in entity_value.items()
         ]
         return entity_k, {k: v for k, v in await asyncio.gather(*format_tasks)}
     if isinstance(entity_value, list):
         format_tasks = await asyncio.gather(
             *[
-                format_label(
+                format_label(
+                    entity_k,
+                    sub_v,
+                    document_type_code,
+                    params,
+                    mime_type,
+                    container_map,
+                    terminal_map,
+                    depot_map,
+                )
                 for sub_v in entity_value
             ]
         )
@@ -405,13 +438,13 @@ async def format_label(entity_k, entity_value, document_type_code, params, mime_
     )
 
     elif (entity_key == "containertype") or (entity_key == "containersize"):
-        formatted_value =
+        formatted_value = container_map.get(entity_value)
 
     elif check_formatting_rule(entity_k, document_type_code, "terminal"):
-        formatted_value =
+        formatted_value = terminal_map.get(entity_value)
 
     elif check_formatting_rule(entity_k, document_type_code, "depot"):
-        formatted_value =
+        formatted_value = depot_map.get(entity_value)
 
     elif entity_key.startswith(("eta", "etd", "duedate", "issuedate", "servicedate")):
         try:
@@ -507,7 +540,8 @@ async def get_port_code_ai(port: str, llm_client, doc_type=None):
     """Get port code using AI model."""
     port_llm = await get_port_code_llm(port, llm_client, doc_type=doc_type)
 
-
+    result = await get_tms_mappings(port, "ports", port_llm)
+    return result.get(port, None)
 
 
 async def get_port_code_llm(port: str, llm_client, doc_type=None):
@@ -598,6 +632,74 @@ def decimal_convertor(value, quantity=False):
     return value
 
 
+async def collect_mapping_requests(entity_value, document_type_code):
+    """Collect all unique container types, terminals, and depots from the entity value."""
+    # Sets to store unique values
+    container_types = set()
+    terminals = set()
+    depots = set()
+
+    def walk(key, value):
+        key_lower = key.lower()
+
+        # nested dict
+        if isinstance(value, dict):
+            for k, v in value.items():
+                walk(k, v)
+
+        # list of values
+        elif isinstance(value, list):
+            for item in value:
+                walk(key, item)
+
+        # leaf node
+        else:
+            if key_lower in ("containertype", "containersize"):
+                # Take only "20DV" from ('20DV', 0) if it's a tuple
+                container_types.add(value[0]) if isinstance(
+                    value, tuple
+                ) else container_types.add(value)
+
+            elif check_formatting_rule(key, document_type_code, "terminal"):
+                terminals.add(value[0]) if isinstance(value, tuple) else terminals.add(
+                    value
+                )
+
+            elif check_formatting_rule(key, document_type_code, "depot"):
+                depots.add(value[0]) if isinstance(value, tuple) else depots.add(value)
+
+    walk("root", entity_value)
+
+    return container_types, terminals, depots
+
+
+async def format_all_labels(entity_data, document_type_code, params, mime_type):
+    """Format all labels in the entity data using cached mappings."""
+    # Collect all mapping values needed
+    container_req, terminal_req, depot_req = await collect_mapping_requests(
+        entity_data, document_type_code
+    )
+
+    # Batch fetch mappings
+    container_map, terminal_map, depot_map = await batch_fetch_all_mappings(
+        container_req, terminal_req, depot_req
+    )
+
+    # Format labels using cached mappings
+    _, result = await format_label(
+        "root",
+        entity_data,
+        document_type_code,
+        params,
+        mime_type,
+        container_map,
+        terminal_map,
+        depot_map,
+    )
+
+    return _, result
+
+
 async def format_all_entities(result, document_type_code, params, mime_type):
     """Format the entity values in the result dictionary."""
     # Since we treat `customsInvoice` same as `partnerInvoice`
@@ -613,13 +715,13 @@ async def format_all_entities(result, document_type_code, params, mime_type):
         return {}
 
     # Format all entities recursively
-    _, aggregated_data = await
-
+    _, aggregated_data = await format_all_labels(
+        result, document_type_code, params, mime_type
     )
 
     # Process partner invoice on lineitem mapping and reverse charge sentence
     if document_type_code in ["partnerInvoice", "bundeskasse"]:
-        process_partner_invoice(params, aggregated_data, document_type_code)
+        await process_partner_invoice(params, aggregated_data, document_type_code)
 
     logger.info("Data Extraction completed successfully")
     return aggregated_data
@@ -651,41 +753,46 @@ def remove_stop_words(lineitem: str):
     )
 
 
-def llm_prediction_to_tuples(llm_prediction, number_of_pages=-1):
+def llm_prediction_to_tuples(llm_prediction, number_of_pages=-1, page_number=None):
     """Convert LLM prediction dictionary to tuples of (value, page_number)."""
-
     # If only 1 page, simply pair each value with page number 0
     if number_of_pages == 1:
+        effective_page = 0 if page_number is None else page_number
         if isinstance(llm_prediction, dict):
             return {
-                k: llm_prediction_to_tuples(
+                k: llm_prediction_to_tuples(
+                    v, number_of_pages, page_number=effective_page
+                )
                 for k, v in llm_prediction.items()
             }
         elif isinstance(llm_prediction, list):
             return [
-                llm_prediction_to_tuples(v, number_of_pages)
+                llm_prediction_to_tuples(v, number_of_pages, page_number=effective_page)
+                for v in llm_prediction
            ]
        else:
-            return (llm_prediction,
+            return (llm_prediction, effective_page) if llm_prediction else None
 
    # logic for multi-page predictions
    if isinstance(llm_prediction, dict):
        if "page_number" in llm_prediction.keys() and "value" in llm_prediction.keys():
            if llm_prediction["value"]:
                try:
-
+                    _page_number = int(llm_prediction["page_number"])
                except:  # noqa: E722
-
-                return (llm_prediction["value"],
+                    _page_number = -1
+                return (llm_prediction["value"], _page_number)
            return None
 
        for key, value in llm_prediction.items():
            llm_prediction[key] = llm_prediction_to_tuples(
-                llm_prediction.get(key, value), number_of_pages
+                llm_prediction.get(key, value), number_of_pages, page_number
            )
 
    elif isinstance(llm_prediction, list):
        for i, item in enumerate(llm_prediction):
-            llm_prediction[i] = llm_prediction_to_tuples(
+            llm_prediction[i] = llm_prediction_to_tuples(
+                item, number_of_pages, page_number
+            )
 
    return llm_prediction
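The point of routing extract_number through remove_unwanted_patterns is easiest to see on a container-quantity string. A quick standalone illustration of the new pattern list (input string invented, construction taken from the hunk above):

import re

# Same pattern construction as the new remove_unwanted_patterns code above.
pattern = [
    f"{s}{t}"
    for s in "20|22|40|45".split("|")
    for t in "FT|HC|DC|HD|GP|OT|RF|FR|TK|DV".split("|")
]
cleaned = re.sub(r"|".join(pattern), "", "1 x 40HC", flags=re.IGNORECASE).strip()
print(cleaned)  # "1 x" -> extract_number now yields 1 instead of 140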
{data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/postprocessing/postprocess_partner_invoice.py
RENAMED
@@ -1,7 +1,5 @@
 """This module contains the postprocessing functions for the partner invoice."""
-from
-
-from fuzzywuzzy import fuzz
+from rapidfuzz import fuzz, process
 
 from src.io import logger
 from src.utils import get_tms_mappings
@@ -136,7 +134,7 @@ def update_recipient_and_vendor(aggregated_data, is_recipient_forto):
     ] = "Dasbachstraße 15, 54292 Trier, Germany"
 
 
-def process_partner_invoice(params, aggregated_data, document_type_code):
+async def process_partner_invoice(params, aggregated_data, document_type_code):
     """Process the partner invoice data."""
     # Post process bundeskasse invoices
     if document_type_code == "bundeskasse":
@@ -160,27 +158,80 @@ def process_partner_invoice(params, aggregated_data, document_type_code):
         reverse_charge_info["formattedValue"] = reverse_charge_value
     reverse_charge = aggregated_data.pop("reverseChargeSentence", None)
 
-    # Process
-
-        if line_item.get("lineItemDescription", None) is not None:
-            line_item["itemCode"] = associate_forto_item_code(
-                line_item["lineItemDescription"]["formattedValue"],
-                params,
-            )
+    # Process everything in one go
+    processed_items = await process_line_items_batch(params, line_items, reverse_charge)
 
-
-
+    # Update your main data structure
+    aggregated_data["lineItem"] = processed_items
 
-        if reverse_charge:
-            # Distribute reverseChargeSentence to all line items
-            line_item["reverseChargeSentence"] = reverse_charge
-            line_item["reverseChargeSentence"]["page"] = reverse_charge["page"]
 
+async def process_line_items_batch(
+    params: dict, line_items: list[dict], reverse_charge=None
+):
+    """
+    Processes all line items efficiently using a "Split-Apply-Combine" strategy.
+    """
+    # To store items that need external API lookup
+    pending_line_items = {}
+
+    # Check Fuzzy Matching
+    logger.info(f"Mapping line item codes with Fuzzy matching....")
+    for i, item in enumerate(line_items):
+        description_obj = item.get("lineItemDescription")
+
+        if not description_obj or not description_obj.get("formattedValue"):
+            continue
+        # Get the formatted description text
+        desc = description_obj["formattedValue"]
+
+        # Find Fuzzy Match
+        matched_code = find_matching_lineitem(
+            desc,
+            params["lookup_data"]["item_code"],
+            params["fuzzy_threshold_item_code"],
+        )
+
+        if matched_code:
+            # Set the code to the line item
+            item["itemCode"] = {
+                "documentValue": desc,
+                "formattedValue": matched_code,
+                "page": description_obj.get("page"),
+            }
+        else:
+            # Store for batch API call
+            pending_line_items[i] = desc
+
+    # Batch API Call for Embedding lookups
+    if pending_line_items:
+        values_to_fetch = list(set(pending_line_items.values()))
+        logger.info(f"Mapping {len(values_to_fetch)} line items from Embedding API...")
+
+        # Await the batch response {"desc1": "code1", "desc2": "code2"}
+        api_results = await get_tms_mappings(
+            input_list=values_to_fetch, embedding_type="line_items"
+        )
+
+        # Merge API results back into original list
+        for index, desc in pending_line_items.items():
+            # Get result from API response, or None if API failed for that item
+            forto_code = api_results.get(desc)
+
+            # Update the original item
+            line_items[index]["itemCode"] = {
+                "documentValue": desc,
+                "formattedValue": forto_code,  # Might be None if API failed
+                "page": line_items[index]["lineItemDescription"].get("page"),
+            }
 
-
-
-
+    # Add reverse charge here if exists
+    if reverse_charge:
+        [
+            item.update({"reverseChargeSentence": reverse_charge})
+            for item in line_items
+            if item["itemCode"]["formattedValue"] != "CDU"
+        ]
+    return line_items
 
 
 def get_fuzzy_match_score(target: str, sentences: list, threshold: int):
@@ -195,16 +246,18 @@ def get_fuzzy_match_score(target: str, sentences: list, threshold: int):
         tuple: (best_match, score) if above threshold, else (None, 0)
     """
     # Use multiprocessing to find the best match
-
-
+    result = process.extractOne(
+        target, sentences, scorer=fuzz.WRatio, score_cutoff=threshold
+    )
 
-
-
+    if result is None:
+        return None, False
 
-
-
-    if
-
+    match, score, index = result
+
+    # return best_match if the best match score is above a threshold (e.g., 80)
+    if match:
+        return match, True
 
     return None, False
 
@@ -236,46 +289,59 @@ def find_matching_lineitem(new_lineitem: str, kvp_dict: dict, threshold=90):
     Returns:
         str: The best matching 'Forto SLI' value from the dictionary.
     """
-    new_lineitem = new_lineitem.upper()
-
     # Check if the new line item is already in the dictionary
     if new_lineitem in kvp_dict:
         return kvp_dict[new_lineitem]
 
     # Get the best fuzzy match score for the extracted line item
-
-        new_lineitem,
+    match, _ = get_fuzzy_match_score(
+        new_lineitem,
+        list(kvp_dict.keys()),
+        threshold,
     )
 
-
+    if match:
+        # find the code from the kvp_dict
+        return kvp_dict[match]
 
+    return None
 
-def associate_forto_item_code(input_string, params):
-    """
-    Finds a match for the input string using fuzzy matching first, then embedding fallback.
-
-    1. Tries to find a fuzzy match for input_string against the keys in
-       mapping_data using RapidFuzz, requiring a score >= fuzzy_threshold.
-    2. If found, returns the corresponding value from mapping_data.
-    3. If not found above threshold, calls the embedding_fallback function.
 
+async def associate_forto_item_code(line_item_data, params):
+    """
+    Associates Forto item codes to a list of line item descriptions.
     Args:
-
-        params: Parameters containing
+        line_item_data (dict): A dictionary where keys are original descriptions and values are cleaned descriptions.
+        params (dict): Parameters containing lookup data and thresholds.
 
     Returns:
-
+        list: A list of dictionaries with 'description' and 'itemCode' keys.
     """
-    # Get the Forto item code using fuzzy matching
-    forto_item_code = find_matching_lineitem(
-        new_lineitem=input_string,
-        kvp_dict=params["lookup_data"]["item_code"],  # TODO: Parse the KVP dictionary
-        threshold=params["fuzzy_threshold_item_code"],
-    )
 
-
-
-
+    result = []
+    pending_line_items = {}
+    for desc, f_desc in line_item_data.items():
+        # Get the Forto item code using fuzzy matching
+        code = find_matching_lineitem(
+            new_lineitem=f_desc,
+            kvp_dict=params["lookup_data"]["item_code"],
+            threshold=params["fuzzy_threshold_item_code"],
+        )
+        if code:
+            result.append({"description": desc, "itemCode": code})
+        else:
+            pending_line_items[desc] = f_desc
+
+    # Batch API Call for Embedding lookups
+    if pending_line_items:
+        api_results = await get_tms_mappings(
+            input_list=list(pending_line_items.values()),
+            embedding_type="line_items",
+        )
+
+        # Merge API results back into original list
+        for desc, f_desc in pending_line_items.items():
+            code = api_results.get(f_desc)
+            result.append({"description": desc, "itemCode": code})
 
-    result = {"documentValue": input_string, "formattedValue": forto_item_code}
     return result
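The rapidfuzz call now used in get_fuzzy_match_score returns either a (match, score, index) triple or None when nothing clears score_cutoff. A standalone sketch with invented lookup keys and query:

from rapidfuzz import fuzz, process

choices = ["OCEAN FREIGHT", "CUSTOMS CLEARANCE", "THC"]  # illustrative lookup keys
result = process.extractOne(
    "Seefracht / ocean freight", choices, scorer=fuzz.WRatio, score_cutoff=90
)
if result is None:
    print("no key cleared the cutoff -> fall back to the embedding API")
else:
    match, score, index = result
    print(match, score)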
{data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/customsInvoice/other/prompt.txt
RENAMED
@@ -54,7 +54,7 @@ Your role is to accurately extract specific entities from these invoices to supp
 - unitPrice: Even if the quantity is not mentioned, you can still extract the unit price. Check the naming of the columns in a different languages, it can be "Unit Price", "Prezzo unitario", "Prix Unitaire", "Unitario", etc. Refer to "Prezzo unitario" field in the italian invoice example.
 - totalAmount: The total amount for the item. It can be in different currencies, so ensure to capture the currency as well for the totalAmountCurrency.
 - totalAmountEuro: Few line items contains a total amount in Euro. You can find it by looking for the term "Total EUR" or "Amount in Euro" in the line item but it's always in the EURO / € currency. Sometimes, it can be same as totalAmount if the line item is already in Euro.
-- quantity: The quantity of the item or service provided in the line item.
+- quantity: The quantity of the item or service provided in the line item. Pay attention to 2 x 40HC or 2x40HC. It means, quantity is 2 and 40HC is containerSize but not 240.
 - containerNumber: Container Number always starts with 4 letters and is followed by 7 digits (e.g., ABCD1234567).
 
 - hblNumber and mblNumber:
{data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/library/partnerInvoice/other/prompt.txt
RENAMED
@@ -52,8 +52,8 @@ Your role is to accurately extract specific entities from these invoices to supp
 - unitPrice: Even if the quantity is not mentioned, you can still extract the unit price. Check the naming of the columns in a different languages, it can be "Unit Price", "Prezzo unitario", "Prix Unitaire", "Unitario", etc. Refer to "Prezzo unitario" field in the italian invoice example.
 - totalAmount: The total amount for the item. It can be in different currencies, so ensure to capture the currency as well for the totalAmountCurrency.
 - totalAmountEuro: Few line items contains a total amount in Euro. You can find it by looking for the term "Total EUR" or "Amount in Euro" in the line item but it's always in the EURO / € currency. Sometimes, it can be same as totalAmount if the line item is already in Euro.
-- quantity: The quantity of the item or service provided in the line item. Pay attention to 2x40HC. It means, quantity is 2 and
-- containerNumber: Container Number always starts with 4 letters and is followed by 7 digits (e.g., ABCD1234567).
+- quantity: The quantity of the item or service provided in the line item. Pay attention to 2 x 40HC or 2x40HC. It means, quantity is 2 and 40HC is containerSize but not 240.
+- containerNumber: Container Number always starts with 4 letters and is followed by 7 digits (e.g., ABCD1234567, XALU 8593678).
 
 - hblNumber and mblNumber:
 - The Master Bill of Lading number. Commonly known as "Bill of Lading Number", "BILL OF LADING NO.", "BL Number", "BL No.", "B/L No.", "BL-Nr.", "B/L", or "HBL No.".
@@ -68,6 +68,7 @@ Your role is to accurately extract specific entities from these invoices to supp
 - Example:
 - "COSCO SHIPPING Lines Italy, Poland, or France S.R.L. – Genova Office – As Agent For COSCO SHIPPING Lines Co.,Ltd."
 - vendorName: COSCO SHIPPING Lines Co.,Ltd.
+- From Hapag-Lloyd invoices, look for "Ballindamm 25" address to extract the vendorAddress.
 
 - agentName: Name of the agent. Agencies are offices authorized to act on behalf of a company. This details usually available including the branch name of the parent company name in the invoice.
 - agentKeyWord:
@@ -81,6 +82,7 @@ Your role is to accurately extract specific entities from these invoices to supp
 
 IMPORTANT NOTE:
 - Ensure all extracted values are directly from the document. Do not make assumptions, modifications or calculations.
+- Do not split the quantity into different line items. e.g., if quantity is 2 or 2 CTR or 2 BIL, do not create 2 separate line items with quantity 1 each.
 - Do not normalize or modify any entity values.
 - Pay attention to the line item details and paymentInformation, as they may vary significantly across different invoices.
{data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/utils.py
RENAMED
@@ -6,16 +6,16 @@ import json
 import os
 import pickle
 from datetime import datetime
-from typing import Literal
+from typing import Any, Dict, List, Literal, Optional
 
+import httpx
 import numpy as np
 import openpyxl
 import pandas as pd
-import requests
 from google.cloud import documentai_v1beta3 as docu_ai_beta
 from pypdf import PdfReader, PdfWriter
 
-from src.io import get_storage_client, logger
+from src.io import bq_logs, get_storage_client, logger
 
 
 def get_pdf_page_count(pdf_bytes):
@@ -31,29 +31,6 @@ def get_pdf_page_count(pdf_bytes):
     return len(reader.pages)
 
 
-def bq_logs(data_to_insert, params):
-    """Insert logs into Google BigQuery.
-
-    Args:
-        data_to_insert (list): The data to insert into BigQuery.
-        params (dict): The parameters dictionary.
-    """
-    # Use the pre-initialized BigQuery client
-    bq_client = params["bq_client"]
-    # Get the table string
-    table_string = f"{params['g_ai_project_name']}.{params['g_ai_gbq_db_schema']}.{params['g_ai_gbq_db_table_out']}"
-
-    logger.info(f"Log table: {table_string}")
-    # Insert the rows into the table
-    insert_logs = bq_client.insert_rows_json(table_string, data_to_insert)
-
-    # Check if there were any errors inserting the rows
-    if not insert_logs:
-        logger.info("New rows have been added.")
-    else:
-        logger.info("Errors occurred while inserting rows: ", insert_logs)
-
-
 async def get_data_set_schema_from_docai(
     schema_client, project_id=None, location=None, processor_id=None, name=None
 ):
@@ -383,9 +360,9 @@ def extract_top_pages(pdf_bytes, num_pages=4):
     return output.getvalue()
 
 
-def get_tms_mappings(
-    input_list:
-):
+async def get_tms_mappings(
+    input_list: List[str], embedding_type: str, llm_ports: Optional[List[str]] = None
+) -> Dict[str, Any]:
     """Get TMS mappings for the given values.
 
     Args:
@@ -395,39 +372,66 @@ def get_tms_mappings(
         llm_ports (list[str], optional): List of LLM ports to use. Defaults to None.
 
     Returns:
-        dict: A dictionary with the mapping results.
+        dict or string: A dictionary or a string with the mapping results.
     """
-    # To test the API locally, port-forward the embedding service in the sandbox to 8080:80
-    # If you want to launch uvicorn from the tms-embedding repo, then use --port 8080 in the config file
     base_url = (
         "http://0.0.0.0:8080/"
         if os.getenv("CLUSTER") is None
         else "http://tms-mappings.api.svc.cluster.local./"
     )
 
+    # Ensure clean inputs
+    if not input_list:
+        return {}
+
     # Ensure input_list is a list
     if not isinstance(input_list, list):
         input_list = [input_list]
 
     # Always send a dict with named keys
     payload = {embedding_type: input_list}
+
     if llm_ports:
         payload["llm_ports"] = llm_ports if isinstance(llm_ports, list) else [llm_ports]
 
     # Make the POST request to the TMS mappings API
-    url = f"{base_url}
-    response = requests.post(url=url, json=payload)
+    url = f"{base_url}{embedding_type}"
 
-
-
-
-
+    # Use a timeout so the code doesn't hang forever
+    timeout = httpx.Timeout(60.0, connect=10.0)
+
+    async with httpx.AsyncClient(timeout=timeout) as client:
+        try:
+            response = await client.post(url, json=payload)
+            response.raise_for_status()
 
-
-
+            # Structure expected: {"response": {"data": {"desc1": "code1", "desc2": "code2"}}}
+            return response.json().get("response", {}).get("data", {})
+
+        except httpx.HTTPStatusError as exc:
+            logger.error(
+                f"Error from TMS mappings API: {exc.response.status_code} - {exc.response.text}"
+            )
+            return {}
+
+
+async def batch_fetch_all_mappings(container_types, terminals, depots):
+    """Batch fetch all mappings for container types, terminals, and depots."""
+    # run batch calls concurrently
+    results = await asyncio.gather(
+        get_tms_mappings(list(container_types), "container_types"),
+        get_tms_mappings(list(terminals), "terminals"),
+        get_tms_mappings(list(depots), "depots"),
     )
 
-
+    batch_container_map, batch_terminal_map, batch_depot_map = results
+
+    # Convert lists of tuples to dicts if necessary
+    return (
+        dict(batch_container_map or {}),
+        dict(batch_terminal_map or {}),
+        dict(batch_depot_map or {}),
+    )
 
 
 def transform_schema_strings(schema):
@@ -502,3 +506,21 @@ def estimate_page_count(sheet):
     else:
         return None
     return np.ceil(pg_cnt / 500)
+
+
+def split_pdf_into_chunks(file_content: bytes, chunk_size: int = 1):
+    """Split PDF into smaller page chunks."""
+    pdf = PdfReader(io.BytesIO(file_content))
+    total_pages = len(pdf.pages)
+
+    # TODO: update the chunk_size based on doc length. However, it breaks the page number extraction logic.
+    for i in range(0, total_pages, chunk_size):
+        writer = PdfWriter()
+        for j in range(i, min(i + chunk_size, total_pages)):
+            writer.add_page(pdf.pages[j])
+
+        buffer = io.BytesIO()
+        writer.write(buffer)
+        buffer.seek(0)
+
+        yield buffer.getvalue()
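A consumption sketch for the new generator: each yielded item is a standalone PDF (as bytes) containing chunk_size pages. The input file path is hypothetical; get_pdf_page_count and split_pdf_into_chunks are the utils.py functions shown in this diff:

from src.utils import get_pdf_page_count, split_pdf_into_chunks

with open("sample_invoice.pdf", "rb") as f:   # hypothetical input file
    pdf_bytes = f.read()

for page_index, chunk in enumerate(split_pdf_into_chunks(pdf_bytes, chunk_size=1)):
    # Each chunk is a complete one-page PDF, so downstream code can treat
    # the chunk index as the page number when merging per-chunk LLM results.
    assert get_pdf_page_count(chunk) == 1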
{data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/constants_sandbox.py
RENAMED
File without changes
{data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/docai_processor_config.yaml
RENAMED
File without changes
{data_science_document_ai-1.43.6 → data_science_document_ai-1.45.2}/src/prompts/prompt_library.py
RENAMED
File without changes
The remaining files listed above with +0 -0 were likewise renamed (package directory only) with no content changes.