data-science-document-ai 1.43.7__tar.gz → 1.44.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/PKG-INFO +1 -1
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/pyproject.toml +1 -1
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/constants.py +3 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/excel_processing.py +1 -2
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/io.py +23 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/pdf_processing.py +116 -40
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/postprocessing/common.py +20 -15
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/postprocessing/postprocess_partner_invoice.py +98 -35
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/utils.py +57 -45
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/constants_sandbox.py +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/docai.py +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/docai_processor_config.yaml +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/llm.py +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/log_setup.py +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/postprocessing/postprocess_booking_confirmation.py +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/postprocessing/postprocess_commercial_invoice.py +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/prompts/library/bookingConfirmation/evergreen/prompt.txt +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/prompts/library/bookingConfirmation/maersk/placeholders.json +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/prompts/library/bookingConfirmation/maersk/prompt.txt +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/prompts/library/bookingConfirmation/msc/placeholders.json +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/prompts/library/bookingConfirmation/msc/prompt.txt +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/prompts/library/bookingConfirmation/oocl/placeholders.json +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/prompts/library/bookingConfirmation/oocl/prompt.txt +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/prompts/library/bookingConfirmation/other/placeholders.json +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/prompts/library/bookingConfirmation/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/prompts/library/bookingConfirmation/yangming/placeholders.json +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/prompts/library/bookingConfirmation/yangming/prompt.txt +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/prompts/library/bundeskasse/other/placeholders.json +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/prompts/library/bundeskasse/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/prompts/library/commercialInvoice/other/placeholders.json +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/prompts/library/commercialInvoice/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/prompts/library/customsAssessment/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/prompts/library/customsInvoice/other/placeholders.json +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/prompts/library/customsInvoice/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/prompts/library/deliveryOrder/other/placeholders.json +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/prompts/library/deliveryOrder/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/prompts/library/draftMbl/maersk/prompt.txt +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/prompts/library/draftMbl/other/placeholders.json +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/prompts/library/draftMbl/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/prompts/library/finalMbL/maersk/prompt.txt +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/prompts/library/finalMbL/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/prompts/library/packingList/other/placeholders.json +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/prompts/library/packingList/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/prompts/library/partnerInvoice/other/placeholders.json +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/prompts/library/partnerInvoice/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/prompts/library/postprocessing/port_code/placeholders.json +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/prompts/library/postprocessing/port_code/prompt_port_code.txt +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/prompts/library/preprocessing/carrier/placeholders.json +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/prompts/library/preprocessing/carrier/prompt.txt +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/prompts/library/shippingInstruction/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/prompts/prompt_library.py +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/setup.py +0 -0
- {data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/tms.py +0 -0
{data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/pyproject.toml
RENAMED
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "data-science-document-ai"
-version = "1.43.7"
+version = "1.44.0"
 description = "\"Document AI repo for data science\""
 authors = ["Naomi Nguyen <naomi.nguyen@forto.com>", "Kumar Rajendrababu <kumar.rajendrababu@forto.com>", "Igor Tonko <igor.tonko@forto.com>", "Osman Demirel <osman.demirel@forto.com>"]
 packages = [
{data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/constants.py
RENAMED
@@ -26,6 +26,9 @@ project_parameters = {
     "fuzzy_threshold_item_code": 70,
     "fuzzy_threshold_reverse_charge": 80,
     "fuzzy_threshold_invoice_classification": 70,
+    # Chunking params
+    "chunk_size": 1,  # page (do not change this without changing the page number logic)
+    "chunk_after": 10,  # pages
     # Big Query
     "g_ai_gbq_db_schema": "document_ai",
     "g_ai_gbq_db_table_out": "document_ai_api_calls_v1",
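The two new constants work as a pair: "chunk_after" is the page count at which a PDF switches to chunked processing, and "chunk_size" is the number of pages per chunk. A minimal sketch of the gating logic (the params dict here is a stand-in; the real check lives in process_file_w_llm in src/pdf_processing.py below):

    params = {"chunk_size": 1, "chunk_after": 10}

    def should_chunk(number_of_pages: int) -> bool:
        # Chunking kicks in once a document reaches chunk_after pages;
        # each chunk then carries chunk_size page(s).
        return number_of_pages >= params["chunk_after"]

    assert should_chunk(10) and not should_chunk(9)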
{data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/excel_processing.py
RENAMED
@@ -4,8 +4,6 @@ import logging
 
 from ddtrace import tracer
 
-from src.postprocessing.common import llm_prediction_to_tuples
-
 logger = logging.getLogger(__name__)
 
 import asyncio
@@ -78,6 +76,7 @@ async def extract_data_from_excel(
         "bundeskasse",
         "commercialInvoice",
         "packingList",
+        "bookingConfirmation",
     ]
     else generate_schema_structure(params, input_doc_type)
 )
{data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/io.py
RENAMED
@@ -156,4 +156,27 @@ def download_dir_from_bucket(bucket, directory_cloud, directory_local) -> bool:
     return result
 
 
+def bq_logs(data_to_insert, params):
+    """Insert logs into Google BigQuery.
+
+    Args:
+        data_to_insert (list): The data to insert into BigQuery.
+        params (dict): The parameters dictionary.
+    """
+    # Use the pre-initialized BigQuery client
+    bq_client = params["bq_client"]
+    # Get the table string
+    table_string = f"{params['g_ai_project_name']}.{params['g_ai_gbq_db_schema']}.{params['g_ai_gbq_db_table_out']}"
+
+    logger.info(f"Log table: {table_string}")
+    # Insert the rows into the table
+    insert_logs = bq_client.insert_rows_json(table_string, data_to_insert)
+
+    # Check if there were any errors inserting the rows
+    if not insert_logs:
+        logger.info("New rows have been added.")
+    else:
+        logger.info("Errors occurred while inserting rows: ", insert_logs)
+
+
 # type: ignore
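For context, a hypothetical call site for the relocated bq_logs helper (the bigquery.Client() wiring and the row payload are assumptions; the key names come from src/constants.py):

    from google.cloud import bigquery

    from src.io import bq_logs

    params = {
        "bq_client": bigquery.Client(),         # pre-initialized client
        "g_ai_project_name": "my-gcp-project",  # placeholder project
        "g_ai_gbq_db_schema": "document_ai",
        "g_ai_gbq_db_table_out": "document_ai_api_calls_v1",
    }
    # insert_rows_json returns an empty list on success, which is why
    # bq_logs treats a falsy result as "rows added".
    bq_logs([{"doc_type": "partnerInvoice", "status": "ok"}], params)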
{data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/pdf_processing.py
RENAMED
@@ -36,6 +36,7 @@ from src.utils import (
     get_pdf_page_count,
     get_processor_name,
     run_background_tasks,
+    split_pdf_into_chunks,
     transform_schema_strings,
     validate_based_on_schema,
 )
@@ -195,15 +196,10 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
         result (dict): The structured data extracted from the document, formatted as JSON.
     """
     # Bundeskasse invoices contains all the required information in the first 3 pages.
-    file_content = (
-        extract_top_pages(file_content, num_pages=5)
-        if input_doc_type == "bundeskasse"
-        else file_content
-    )
-    number_of_pages = get_pdf_page_count(file_content)
+    if input_doc_type == "bundeskasse":
+        file_content = extract_top_pages(file_content, num_pages=5)
 
-
-    document = llm_client.prepare_document_for_gemini(file_content)
+    number_of_pages = get_pdf_page_count(file_content)
 
     # get the schema placeholder from the Doc AI and generate the response structure
     response_schema = (
@@ -215,26 +211,28 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
             "bundeskasse",
             "commercialInvoice",
             "packingList",
+            "bookingConfirmation",
         ]
         else generate_schema_structure(params, input_doc_type)
     )
 
     carrier = "other"
-
-
-
-
-
-    )
-    carrier_schema = prompt_library.library["preprocessing"]["carrier"][
-        "placeholders"
-    ][input_doc_type]
+    carrier_schema = (
+        prompt_library.library.get("preprocessing", {})
+        .get("carrier", {})
+        .get("placeholders", {})
+        .get(input_doc_type)
+    )
 
+    if carrier_schema:
         carrier_prompt = prompt_library.library["preprocessing"]["carrier"]["prompt"]
         carrier_prompt = carrier_prompt.replace(
             "DOCUMENT_TYPE_PLACEHOLDER", input_doc_type
         )
 
+        # convert file_content to required document
+        document = llm_client.prepare_document_for_gemini(file_content)
+
         # identify carrier for customized prompting
         carrier = await identify_carrier(
             document,
@@ -244,37 +242,115 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
             doc_type=input_doc_type,
         )
 
-
-    response_schema = prompt_library.library[input_doc_type][carrier][
-        "placeholders"
-    ]
-
+    # Select prompt
     if (
-        input_doc_type in prompt_library.library
-
+        input_doc_type not in prompt_library.library
+        or carrier not in prompt_library.library[input_doc_type]
     ):
-
-        prompt = prompt_library.library[input_doc_type][carrier]["prompt"]
+        return {}
 
-
-
-        response_schema = transform_schema_strings(response_schema)
+    # get the related prompt from predefined prompt library
+    prompt = prompt_library.library[input_doc_type][carrier]["prompt"]
 
-
-
+    # Add page-number extraction for moderately large docs
+    use_chunking = number_of_pages >= params["chunk_after"]
 
-
-
-
-
-
-
+    # Update schema and prompt to extract value-page_number pairs
+    if not use_chunking and number_of_pages > 1:
+        response_schema = transform_schema_strings(response_schema)
+        prompt += "\nFor each field, provide the page number where the information was found. The page numbering starts from 0."
+
+    tasks = []
+    # Process in chunks if number of pages exceeds threshold and Process all chunks concurrently
+    for chunk in (
+        split_pdf_into_chunks(file_content, chunk_size=params["chunk_size"])
+        if use_chunking
+        else [file_content]
+    ):
+        tasks.append(
+            process_chunk_with_retry(
+                chunk, prompt, response_schema, llm_client, input_doc_type
+            )
         )
 
-
+    results = await asyncio.gather(*tasks, return_exceptions=True)
 
-
-
+    if use_chunking:
+        return merge_llm_results(results, response_schema)
+    else:
+        return llm_prediction_to_tuples(results[0], number_of_pages=number_of_pages)
+
+
+async def process_chunk_with_retry(
+    chunk_content, prompt, response_schema, llm_client, input_doc_type, retries=2
+):
+    """Process a chunk with retries in case of failure."""
+    for attempt in range(1, retries + 1):
+        try:
+            return await process_chunk(
+                chunk_content=chunk_content,
+                prompt=prompt,
+                response_schema=response_schema,
+                llm_client=llm_client,
+                input_doc_type=input_doc_type,
+            )
+        except Exception as e:
+            logger.error(f"Chunk failed on attempt {attempt}: {e}")
+            if attempt == retries:
+                raise
+            await asyncio.sleep(1)  # small backoff
+
+
+async def process_chunk(
+    chunk_content, prompt, response_schema, llm_client, input_doc_type
+):
+    """Process a chunk with Gemini."""
+    document = llm_client.prepare_document_for_gemini(chunk_content)
+    return await llm_client.get_unified_json_genai(
+        prompt=prompt,
+        document=document,
+        response_schema=response_schema,
+        doc_type=input_doc_type,
+    )
+
+
+def merge_llm_results(results, response_schema):
+    """Merge LLM results from multiple chunks."""
+    merged = {}
+    for i, result in enumerate(results):
+        if not isinstance(result, dict):
+            continue
+        # Add page number to all values coming from this chunk
+        result = llm_prediction_to_tuples(result, number_of_pages=1, page_number=i)
+
+        # Merge the result into the final merged dictionary
+        for key, value in result.items():
+            field_type = (
+                response_schema["properties"].get(key, {}).get("type", "").upper()
+            )
+
+            if key not in merged:
+                if field_type == "ARRAY":
+                    # append the values as a list
+                    merged[key] = (
+                        value if isinstance(value, list) else ([value] if value else [])
+                    )
+                else:
+                    merged[key] = value
+                continue
+
+            if field_type == "ARRAY":
+                # append list contents across chunks
+                if isinstance(value, list):
+                    merged[key].extend(value)
+                else:
+                    merged[key].append(value)
+
+            # take first non-null value only
+            if merged[key] in (None, "", [], {}):
+                merged[key] = value
+
+    return merged
 
 
 async def extract_data_from_pdf_w_llm(params, input_doc_type, file_content, llm_client):
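The new flow fans each chunk out through process_chunk_with_retry, gathers the per-chunk JSON results, and folds them together in merge_llm_results. A toy illustration of the merge semantics (the schema and chunk outputs are invented): every value is first tagged with its chunk index as the page number via llm_prediction_to_tuples(result, number_of_pages=1, page_number=i), then scalars keep the first non-empty value while ARRAY fields are concatenated:

    schema = {
        "properties": {
            "invoiceNumber": {"type": "STRING"},
            "lineItem": {"type": "ARRAY"},
        }
    }
    chunk_results = [  # one dict per 1-page chunk
        {"invoiceNumber": "INV-1", "lineItem": [{"amount": 100}]},
        {"invoiceNumber": None, "lineItem": [{"amount": 25}]},
    ]
    # merge_llm_results(chunk_results, schema) would yield, roughly:
    # {
    #     "invoiceNumber": ("INV-1", 0),  # first non-empty scalar wins
    #     "lineItem": [{"amount": (100, 0)}, {"amount": (25, 1)}],
    # }

Since asyncio.gather(..., return_exceptions=True) can hand back exception objects, the isinstance(result, dict) guard silently drops chunks that failed all retries.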
{data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/postprocessing/common.py
RENAMED
@@ -405,13 +405,13 @@ async def format_label(entity_k, entity_value, document_type_code, params, mime_type):
         )
 
     elif (entity_key == "containertype") or (entity_key == "containersize"):
-        formatted_value = get_tms_mappings(entity_value, "container_types")
+        formatted_value = await get_tms_mappings(entity_value, "container_types")
 
     elif check_formatting_rule(entity_k, document_type_code, "terminal"):
-        formatted_value = get_tms_mappings(entity_value, "terminals")
+        formatted_value = await get_tms_mappings(entity_value, "terminals")
 
     elif check_formatting_rule(entity_k, document_type_code, "depot"):
-        formatted_value = get_tms_mappings(entity_value, "depots")
+        formatted_value = await get_tms_mappings(entity_value, "depots")
 
     elif entity_key.startswith(("eta", "etd", "duedate", "issuedate", "servicedate")):
         try:
@@ -507,7 +507,7 @@ async def get_port_code_ai(port: str, llm_client, doc_type=None):
     """Get port code using AI model."""
     port_llm = await get_port_code_llm(port, llm_client, doc_type=doc_type)
 
-    return get_tms_mappings(port, "ports", port_llm)
+    return await get_tms_mappings(port, "ports", port_llm)
 
 
 async def get_port_code_llm(port: str, llm_client, doc_type=None):
@@ -619,7 +619,7 @@ async def format_all_entities(result, document_type_code, params, mime_type):
 
     # Process partner invoice on lineitem mapping and reverse charge sentence
     if document_type_code in ["partnerInvoice", "bundeskasse"]:
-        process_partner_invoice(params, aggregated_data, document_type_code)
+        await process_partner_invoice(params, aggregated_data, document_type_code)
 
     logger.info("Data Extraction completed successfully")
     return aggregated_data
@@ -651,41 +651,46 @@ def remove_stop_words(lineitem: str):
     )
 
 
-def llm_prediction_to_tuples(llm_prediction, number_of_pages=-1):
+def llm_prediction_to_tuples(llm_prediction, number_of_pages=-1, page_number=None):
     """Convert LLM prediction dictionary to tuples of (value, page_number)."""
-
     # If only 1 page, simply pair each value with page number 0
     if number_of_pages == 1:
+        effective_page = 0 if page_number is None else page_number
         if isinstance(llm_prediction, dict):
             return {
-                k: llm_prediction_to_tuples(v, number_of_pages)
+                k: llm_prediction_to_tuples(
+                    v, number_of_pages, page_number=effective_page
+                )
                 for k, v in llm_prediction.items()
             }
         elif isinstance(llm_prediction, list):
             return [
-                llm_prediction_to_tuples(v, number_of_pages) for v in llm_prediction
+                llm_prediction_to_tuples(v, number_of_pages, page_number=effective_page)
+                for v in llm_prediction
            ]
        else:
-            return (llm_prediction, 0) if llm_prediction else None
+            return (llm_prediction, effective_page) if llm_prediction else None
 
     # logic for multi-page predictions
     if isinstance(llm_prediction, dict):
         if "page_number" in llm_prediction.keys() and "value" in llm_prediction.keys():
             if llm_prediction["value"]:
                 try:
-                    page_number = int(llm_prediction["page_number"])
+                    _page_number = int(llm_prediction["page_number"])
                 except:  # noqa: E722
-                    page_number = -1
-                return (llm_prediction["value"], page_number)
+                    _page_number = -1
+                return (llm_prediction["value"], _page_number)
             return None
 
         for key, value in llm_prediction.items():
             llm_prediction[key] = llm_prediction_to_tuples(
-                llm_prediction.get(key, value), number_of_pages
+                llm_prediction.get(key, value), number_of_pages, page_number
             )
 
     elif isinstance(llm_prediction, list):
         for i, item in enumerate(llm_prediction):
-            llm_prediction[i] = llm_prediction_to_tuples(item, number_of_pages)
+            llm_prediction[i] = llm_prediction_to_tuples(
+                item, number_of_pages, page_number
+            )
 
     return llm_prediction
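Worked example of the reworked llm_prediction_to_tuples (inputs invented). In multi-page mode the model wraps each value as {"value": ..., "page_number": ...}, which collapses to a (value, page) tuple; in single-chunk mode (number_of_pages == 1) the new page_number argument stamps the chunk's page index onto every raw value:

    pred = {
        "invoiceNumber": {"value": "INV-42", "page_number": "2"},
        "currency": {"value": None, "page_number": "0"},
    }
    # llm_prediction_to_tuples(pred, number_of_pages=3)
    #   -> {"invoiceNumber": ("INV-42", 2), "currency": None}

    # llm_prediction_to_tuples({"currency": "EUR"}, number_of_pages=1, page_number=4)
    #   -> {"currency": ("EUR", 4)}

The page_number -> _page_number rename inside the multi-page branch avoids shadowing the new parameter.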
{data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/postprocessing/postprocess_partner_invoice.py
RENAMED
@@ -136,7 +136,7 @@ def update_recipient_and_vendor(aggregated_data, is_recipient_forto):
     ] = "Dasbachstraße 15, 54292 Trier, Germany"
 
 
-def process_partner_invoice(params, aggregated_data, document_type_code):
+async def process_partner_invoice(params, aggregated_data, document_type_code):
     """Process the partner invoice data."""
     # Post process bundeskasse invoices
     if document_type_code == "bundeskasse":
@@ -160,21 +160,75 @@ def process_partner_invoice(params, aggregated_data, document_type_code):
     reverse_charge_info["formattedValue"] = reverse_charge_value
     reverse_charge = aggregated_data.pop("reverseChargeSentence", None)
 
-    # Process
-
-
-
-
-            params,
-        )
+    # Process everything in one go
+    processed_items = await process_line_items_batch(params, line_items, reverse_charge)
+
+    # Update your main data structure
+    aggregated_data["lineItem"] = processed_items
 
-        # Add page number for the consistency
-        line_item["itemCode"]["page"] = line_item["lineItemDescription"]["page"]
 
-
-
-
+async def process_line_items_batch(
+    params: dict, line_items: list[dict], reverse_charge=None
+):
+    """
+    Processes all line items efficiently using a "Split-Apply-Combine" strategy.
+    """
+    # To store items that need external API lookup
+    pending_line_items = {}
+
+    # Check Fuzzy Matching
+    for i, item in enumerate(line_items):
+        description_obj = item.get("lineItemDescription")
+
+        if not description_obj or not description_obj.get("formattedValue"):
+            continue
+        # Get the formatted description text
+        desc = description_obj["formattedValue"]
+
+        # Find Fuzzy Match
+        matched_code = find_matching_lineitem(
+            desc,
+            params["lookup_data"]["item_code"],
+            params["fuzzy_threshold_item_code"],
+        )
+
+        if matched_code:
+            # Set the code to the line item
+            item["itemCode"] = {
+                "documentValue": desc,
+                "formattedValue": matched_code,
+                "page": description_obj.get("page"),
+            }
+        else:
+            # Store for batch API call
+            pending_line_items[i] = desc
+
+    # Batch API Call for Embedding lookups
+    if pending_line_items:
+        values_to_fetch = list(set(pending_line_items.values()))
+        logger.info(f"Mapping {len(values_to_fetch)} line items from Embedding API...")
+
+        # Await the batch response {"desc1": "code1", "desc2": "code2"}
+        api_results = await get_tms_mappings(
+            input_list=values_to_fetch, embedding_type="line_items"
+        )
+
+        # Merge API results back into original list
+        for index, desc in pending_line_items.items():
+            # Get result from API response, or None if API failed for that item
+            forto_code = api_results.get(desc)
+
+            # Update the original item
+            line_items[index]["itemCode"] = {
+                "documentValue": desc,
+                "formattedValue": forto_code,  # Might be None if API failed
+                "page": line_items[index]["lineItemDescription"].get("page"),
+            }
+
+    # Add reverse charge here if exists
+    if reverse_charge:
+        [item.update({"reverseChargeSentence": reverse_charge}) for item in line_items]
+    return line_items
 
 
 def compute_score(args):
@@ -250,32 +304,41 @@ def find_matching_lineitem(new_lineitem: str, kvp_dict: dict, threshold=90):
     return kvp_dict.get(best_match, None)
 
 
-def associate_forto_item_code(input_string, params):
+async def associate_forto_item_code(line_item_data, params):
     """
-
-
-    1. Tries to find a fuzzy match for input_string against the keys in
-       mapping_data using RapidFuzz, requiring a score >= fuzzy_threshold.
-    2. If found, returns the corresponding value from mapping_data.
-    3. If not found above threshold, calls the embedding_fallback function.
-
+    Associates Forto item codes to a list of line item descriptions.
     Args:
-
-        params: Parameters containing
+        line_item_data (dict): A dictionary where keys are original descriptions and values are cleaned descriptions.
+        params (dict): Parameters containing lookup data and thresholds.
 
     Returns:
-
+        list: A list of dictionaries with 'description' and 'itemCode' keys.
     """
-    # Get the Forto item code using fuzzy matching
-    forto_item_code = find_matching_lineitem(
-        new_lineitem=input_string,
-        kvp_dict=params["lookup_data"]["item_code"],  # TODO: Parse the KVP dictionary
-        threshold=params["fuzzy_threshold_item_code"],
-    )
+
+    result = []
+    pending_line_items = {}
+    for desc, f_desc in line_item_data.items():
+        # Get the Forto item code using fuzzy matching
+        code = find_matching_lineitem(
+            new_lineitem=f_desc,
+            kvp_dict=params["lookup_data"]["item_code"],
+            threshold=params["fuzzy_threshold_item_code"],
+        )
+        if code:
+            result.append({"description": desc, "itemCode": code})
+        else:
+            pending_line_items[desc] = f_desc
+
+    # Batch API Call for Embedding lookups
+    if pending_line_items:
+        api_results = await get_tms_mappings(
+            input_list=list(pending_line_items.values()),
+            embedding_type="line_items",
+        )
+
+        # Merge API results back into original list
+        for desc, f_desc in pending_line_items.items():
+            code = api_results.get(f_desc)
+            result.append({"description": desc, "itemCode": code})
 
-    result = {"documentValue": input_string, "formattedValue": forto_item_code}
     return result
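The batch rewrite follows a split-apply-combine pattern: cheap local fuzzy matching first, then one batched embedding call for everything that missed, instead of one HTTP round-trip per line item. A stand-alone sketch of the pattern (lookup table and descriptions invented; RapidFuzz is what find_matching_lineitem uses per the old docstring):

    from rapidfuzz import fuzz, process

    lookup = {"ocean freight": "FRT-OCEAN", "customs clearance": "CUS-CLR"}

    def fuzzy_code(desc: str, threshold: int = 70):
        # Best fuzzy match against the known descriptions, or None below threshold
        match = process.extractOne(desc, lookup.keys(), scorer=fuzz.token_sort_ratio)
        return lookup[match[0]] if match and match[1] >= threshold else None

    descriptions = ["Ocean Freight", "THC destination"]
    pending = [d for d in descriptions if fuzzy_code(d) is None]
    # pending == ["THC destination"]; these go to the embedding service in ONE
    # call: await get_tms_mappings(input_list=pending, embedding_type="line_items")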
{data_science_document_ai-1.43.7 → data_science_document_ai-1.44.0}/src/utils.py
RENAMED
@@ -6,16 +6,16 @@ import json
 import os
 import pickle
 from datetime import datetime
-from typing import Literal
+from typing import Any, Dict, List, Literal, Optional
 
+import httpx
 import numpy as np
 import openpyxl
 import pandas as pd
-import requests
 from google.cloud import documentai_v1beta3 as docu_ai_beta
 from pypdf import PdfReader, PdfWriter
 
-from src.io import get_storage_client, logger
+from src.io import bq_logs, get_storage_client, logger
 
 
 def get_pdf_page_count(pdf_bytes):
@@ -31,29 +31,6 @@ def get_pdf_page_count(pdf_bytes):
     return len(reader.pages)
 
 
-def bq_logs(data_to_insert, params):
-    """Insert logs into Google BigQuery.
-
-    Args:
-        data_to_insert (list): The data to insert into BigQuery.
-        params (dict): The parameters dictionary.
-    """
-    # Use the pre-initialized BigQuery client
-    bq_client = params["bq_client"]
-    # Get the table string
-    table_string = f"{params['g_ai_project_name']}.{params['g_ai_gbq_db_schema']}.{params['g_ai_gbq_db_table_out']}"
-
-    logger.info(f"Log table: {table_string}")
-    # Insert the rows into the table
-    insert_logs = bq_client.insert_rows_json(table_string, data_to_insert)
-
-    # Check if there were any errors inserting the rows
-    if not insert_logs:
-        logger.info("New rows have been added.")
-    else:
-        logger.info("Errors occurred while inserting rows: ", insert_logs)
-
-
 async def get_data_set_schema_from_docai(
     schema_client, project_id=None, location=None, processor_id=None, name=None
 ):
@@ -383,9 +360,9 @@ def extract_top_pages(pdf_bytes, num_pages=4):
     return output.getvalue()
 
 
-def get_tms_mappings(
-    input_list:
-):
+async def get_tms_mappings(
+    input_list: List[str], embedding_type: str, llm_ports: Optional[List[str]] = None
+) -> Dict[str, Any]:
     """Get TMS mappings for the given values.
 
     Args:
@@ -395,39 +372,56 @@ def get_tms_mappings(
         llm_ports (list[str], optional): List of LLM ports to use. Defaults to None.
 
     Returns:
-        dict: A dictionary with the mapping results.
+        dict or string: A dictionary or a string with the mapping results.
     """
-    # To test the API locally, port-forward the embedding service in the sandbox to 8080:80
-    # If you want to launch uvicorn from the tms-embedding repo, then use --port 8080 in the config file
     base_url = (
         "http://0.0.0.0:8080/"
         if os.getenv("CLUSTER") is None
         else "http://tms-mappings.api.svc.cluster.local./"
    )
+
+    # Ensure clean inputs
+    if not input_list:
+        return {}
 
     # Ensure input_list is a list
     if not isinstance(input_list, list):
         input_list = [input_list]
 
     # Always send a dict with named keys
     payload = {embedding_type: input_list}
+
     if llm_ports:
         payload["llm_ports"] = llm_ports if isinstance(llm_ports, list) else [llm_ports]
 
     # Make the POST request to the TMS mappings API
-    url = f"{base_url}{embedding_type}"
-
-
-
-
-
-
-
-
-
-
-
-
+    url = f"{base_url}{embedding_type}"
+
+    # Use a timeout so the code doesn't hang forever
+    timeout = httpx.Timeout(60.0, connect=10.0)
+
+    async with httpx.AsyncClient(timeout=timeout) as client:
+        try:
+            response = await client.post(url, json=payload)
+            response.raise_for_status()
+
+            # Structure expected: {"response": {"data": {"desc1": "code1", "desc2": "code2"}}}
+            if embedding_type == "line_items":
+                # For line_items, return the full data mapping
+                return response.json().get("response", {}).get("data", {})
+            else:
+                return (
+                    response.json()
+                    .get("response", {})
+                    .get("data", {})
+                    .get(input_list[0], None)
+                )
+
+        except httpx.HTTPStatusError as exc:
+            logger.error(
+                f"Error from TMS mappings API: {exc.response.status_code} - {exc.response.text}"
+            )
+            return {}
 
 
 def transform_schema_strings(schema):
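Hypothetical call sites for the now-async get_tms_mappings (values invented; the tms-mappings service must be reachable, so the call is shown but commented out). Single-value lookups return the one mapped value, while embedding_type="line_items" returns the full {description: code} dict:

    import asyncio

    from src.utils import get_tms_mappings

    async def demo():
        # Scalar result, e.g. a container-type mapping:
        container = await get_tms_mappings(["40HC"], embedding_type="container_types")
        # Dict result for batched line-item lookups:
        codes = await get_tms_mappings(
            input_list=["ocean freight", "thc origin"],
            embedding_type="line_items",
        )
        return container, codes

    # asyncio.run(demo())  # needs the tms-mappings service on port 8080

Because the callers in src/postprocessing/common.py now await these lookups, the whole formatting path had to become async, which is what the async/await churn in the hunks above reflects.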
@@ -502,3 +496,21 @@ def estimate_page_count(sheet):
     else:
         return None
     return np.ceil(pg_cnt / 500)
+
+
+def split_pdf_into_chunks(file_content: bytes, chunk_size: int = 1):
+    """Split PDF into smaller page chunks."""
+    pdf = PdfReader(io.BytesIO(file_content))
+    total_pages = len(pdf.pages)
+
+    # TODO: update the chunk_size based on doc length. However, it breaks the page number extraction logic.
+    for i in range(0, total_pages, chunk_size):
+        writer = PdfWriter()
+        for j in range(i, min(i + chunk_size, total_pages)):
+            writer.add_page(pdf.pages[j])
+
+        buffer = io.BytesIO()
+        writer.write(buffer)
+        buffer.seek(0)
+
+        yield buffer.getvalue()
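A quick usage sketch for the new generator (file path invented; uses the helpers from this module). With the default chunk_size=1, the chunk index equals the page number, which is exactly the invariant the warning comment in src/constants.py protects:

    with open("invoice.pdf", "rb") as f:
        pdf_bytes = f.read()

    for page_idx, chunk in enumerate(split_pdf_into_chunks(pdf_bytes, chunk_size=1)):
        # each chunk is a standalone 1-page PDF
        assert get_pdf_page_count(chunk) == 1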