data-science-document-ai 1.41.0__tar.gz → 1.43.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/PKG-INFO +2 -2
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/pyproject.toml +2 -2
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/constants.py +0 -2
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/docai.py +14 -5
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/docai_processor_config.yaml +0 -14
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/excel_processing.py +40 -12
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/io.py +46 -1
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/llm.py +10 -3
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/pdf_processing.py +91 -29
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/postprocessing/common.py +75 -29
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/postprocessing/postprocess_partner_invoice.py +5 -30
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/bundeskasse/other/placeholders.json +1 -1
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/bundeskasse/other/prompt.txt +2 -2
- data_science_document_ai-1.43.6/src/prompts/library/commercialInvoice/other/placeholders.json +125 -0
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/commercialInvoice/other/prompt.txt +1 -1
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/customsInvoice/other/placeholders.json +1 -1
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/customsInvoice/other/prompt.txt +3 -3
- data_science_document_ai-1.43.6/src/prompts/library/packingList/other/placeholders.json +98 -0
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/partnerInvoice/other/placeholders.json +39 -12
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/partnerInvoice/other/prompt.txt +4 -17
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/prompt_library.py +0 -4
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/setup.py +5 -1
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/utils.py +72 -39
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/constants_sandbox.py +0 -0
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/log_setup.py +0 -0
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/postprocessing/postprocess_booking_confirmation.py +0 -0
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/postprocessing/postprocess_commercial_invoice.py +0 -0
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +0 -0
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/bookingConfirmation/evergreen/prompt.txt +0 -0
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +0 -0
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +0 -0
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/bookingConfirmation/maersk/placeholders.json +0 -0
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/bookingConfirmation/maersk/prompt.txt +0 -0
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/bookingConfirmation/msc/placeholders.json +0 -0
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/bookingConfirmation/msc/prompt.txt +0 -0
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/bookingConfirmation/oocl/placeholders.json +0 -0
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/bookingConfirmation/oocl/prompt.txt +0 -0
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/bookingConfirmation/other/placeholders.json +0 -0
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/bookingConfirmation/other/prompt.txt +0 -0
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/bookingConfirmation/yangming/placeholders.json +0 -0
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/bookingConfirmation/yangming/prompt.txt +0 -0
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/customsAssessment/other/prompt.txt +0 -0
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/deliveryOrder/other/placeholders.json +0 -0
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/deliveryOrder/other/prompt.txt +0 -0
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -0
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/draftMbl/maersk/prompt.txt +0 -0
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/draftMbl/other/placeholders.json +0 -0
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/draftMbl/other/prompt.txt +0 -0
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -0
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/finalMbL/maersk/prompt.txt +0 -0
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/finalMbL/other/prompt.txt +0 -0
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/packingList/other/prompt.txt +0 -0
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/postprocessing/port_code/placeholders.json +0 -0
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/postprocessing/port_code/prompt_port_code.txt +0 -0
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/preprocessing/carrier/placeholders.json +0 -0
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/preprocessing/carrier/prompt.txt +0 -0
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/prompts/library/shippingInstruction/other/prompt.txt +0 -0
- {data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/tms.py +0 -0
{data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/PKG-INFO
RENAMED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: data-science-document-ai
-Version: 1.41.0
+Version: 1.43.6
 Summary: "Document AI repo for data science"
 Author: Naomi Nguyen
 Author-email: naomi.nguyen@forto.com

@@ -38,7 +38,7 @@ Requires-Dist: pdf2image (>=1.17.0,<2.0.0)
 Requires-Dist: pgzip (>=0.3.5,<0.4.0)
 Requires-Dist: pyarrow (==16.1.0)
 Requires-Dist: pymupdf (>=1.23.26,<2.0.0)
-Requires-Dist: …
+Requires-Dist: pypdf (>=6.1.2,<7.0.0)
 Requires-Dist: python-dotenv (>=1.0.0,<2.0.0)
 Requires-Dist: python-multipart (>=0.0.7,<0.0.8)
 Requires-Dist: rapidfuzz (>=3.12.2,<4.0.0)
{data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/pyproject.toml
RENAMED

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "data-science-document-ai"
-version = "1.41.0"
+version = "1.43.6"
 description = "\"Document AI repo for data science\""
 authors = ["Naomi Nguyen <naomi.nguyen@forto.com>", "Kumar Rajendrababu <kumar.rajendrababu@forto.com>", "Igor Tonko <igor.tonko@forto.com>", "Osman Demirel <osman.demirel@forto.com>"]
 packages = [

@@ -48,7 +48,7 @@ rapidfuzz = "^3.12.2"
 fuzzywuzzy = "^0.18.0"
 nltk = "^3.9.1"
 pgzip = "^0.3.5"
-…
+pypdf = "^6.1.2"

 [tool.poetry.dev-dependencies]
 jupyter = "^1.0.0"
{data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/docai.py
RENAMED

@@ -3,11 +3,16 @@ import re

 from google.cloud import documentai

-from src.io import …
+from src.io import (
+    delete_folder_from_bucket,
+    get_gcp_labels,
+    logger,
+    upload_pdf_to_bucket,
+)
 from src.utils import cache_on_disk


-async def _process_pdf_w_docai(image_content, client, processor_name):
+async def _process_pdf_w_docai(image_content, client, processor_name, doc_type=None):
     """Process the PDF using Document AI.

     Args:

@@ -15,6 +20,7 @@ async def _process_pdf_w_docai(image_content, client, processor_name):
         client: The Document AI client.
         processor_name (str): The name of the processor to be used.
             e.g.: projects/{project_id}/locations/{location}/processor/{processor_id}
+        doc_type (str, optional): Document type for cost tracking labels.

     Returns:
         The processed document.

@@ -24,10 +30,11 @@ async def _process_pdf_w_docai(image_content, client, processor_name):
         content=image_content, mime_type="application/pdf"
     )

-    # Configure the process request
+    # Configure the process request with labels for cost tracking
     request = documentai.ProcessRequest(
         name=processor_name,
         raw_document=raw_document,  # field_mask=field_mask
+        labels=get_gcp_labels(doc_type=doc_type),
     )
     result = await cache_on_disk(client.process_document, request=request)

@@ -35,7 +42,7 @@ async def _process_pdf_w_docai(image_content, client, processor_name):


 async def _batch_process_pdf_w_docai(
-    params, image_content, client, processor_name, timeout=1200
+    params, image_content, client, processor_name, timeout=1200, doc_type=None
 ):
     """Process the PDF using Document AI Batch Process API.

@@ -45,6 +52,7 @@ async def _batch_process_pdf_w_docai(
         processor_name (str): The name of the processor to be used.
             e.g.: projects/{project_id}/locations/{location}/processor/{processor_id}
         timeout (int, optional): The timeout in seconds. Defaults to 1200.
+        doc_type (str, optional): Document type for cost tracking labels.

     Returns:
         The processed document.

@@ -72,11 +80,12 @@
     # Where to write results
     output_config = documentai.DocumentOutputConfig(gcs_output_config=gcs_output_config)

-    # The full resource name of the processor
+    # The full resource name of the processor with labels for cost tracking
     request = documentai.BatchProcessRequest(
         name=processor_name,
         input_documents=input_config,
         document_output_config=output_config,
+        labels=get_gcp_labels(doc_type=doc_type),
     )

     # BatchProcess returns a Long Running Operation (LRO)
{data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/docai_processor_config.yaml
RENAMED

@@ -13,20 +13,6 @@ model_config:
       author: "igor.tonko@forto.com"
       created_date: ""

-  packingList:
-    - id: "d967005bd9d45aeb"
-      details:
-        display_name: "doc_cap_packingList"
-        author: "kumar.rajendrababu@forto.com"
-        created_date: ""
-
-  commercialInvoice:
-    - id: "7d37236207f75758"
-      details:
-        display_name: "doc_cap_commercialInvoice"
-        author: "kumar.rajendrababu@forto.com"
-        created_date: ""
-
   finalMbL:
     - id: "1eda2f22d64b1b89"
       details:
{data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/excel_processing.py
RENAMED

@@ -2,23 +2,28 @@
 # flake8: noqa: E402
 import logging

+from ddtrace import tracer
+
 from src.postprocessing.common import llm_prediction_to_tuples

 logger = logging.getLogger(__name__)

 import asyncio
-import json

 import numpy as np
 import pandas as pd

 from src.llm import prompt_excel_extraction
-from src.…
+from src.prompts.prompt_library import prompt_library
+from src.utils import estimate_page_count, generate_schema_structure, get_excel_sheets


-async def extract_data_from_sheet(params, sheet_name, sheet, response_schema):
+async def extract_data_from_sheet(
+    llm_client, sheet_name, sheet, response_schema, doc_type=None
+):
     logger.info(f"Processing sheet: {sheet_name}")
-    excel_content = pd.DataFrame(sheet.values)
+    excel_content = pd.DataFrame(sheet.values).dropna(how="all", axis=1)
+
     # Convert to Markdown format for the LLM model
     worksheet = (
         "This is from a excel. Pay attention to the cell position:\n"

@@ -29,9 +34,10 @@ async def extract_data_from_sheet(params, sheet_name, sheet, response_schema):
     prompt_docai = prompt_excel_extraction(worksheet)

     try:
-        result = await …
+        result = await llm_client.get_unified_json_genai(
             prompt_docai,
             response_schema=response_schema,
+            doc_type=doc_type,
         )
     except Exception as e:
         result = {}

@@ -45,6 +51,7 @@ async def extract_data_from_excel(
     input_doc_type,
     file_content,
     mime_type,
+    llm_client,
 ):
     """Extract data from the Excel file.

@@ -53,6 +60,7 @@ async def extract_data_from_excel(
         input_doc_type (str): The type of the document.
         file_content (bytes): The content of the Excel file to process.
         mime_type (str): The MIME type of the file.
+        llm_client: The LLM client to use for data extraction.

     Returns:
         formatted_data (list): A list of dictionaries containing the extracted data.

@@ -61,22 +69,42 @@ async def extract_data_from_excel(

     """
     # Generate the response structure
-    response_schema = …
+    response_schema = (
+        prompt_library.library[input_doc_type]["other"]["placeholders"]
+        if input_doc_type
+        in [
+            "partnerInvoice",
+            "customsInvoice",
+            "bundeskasse",
+            "commercialInvoice",
+            "packingList",
+        ]
+        else generate_schema_structure(params, input_doc_type)
+    )

     # Load the Excel file and get ONLY the "visible" sheet names
     sheets, workbook = get_excel_sheets(file_content, mime_type)

+    # Track the number of sheets in dd-trace
+    span = tracer.current_span()
+    if span:
+        estimated_page_counts = [
+            estimate_page_count(workbook[sheet]) for sheet in sheets
+        ]
+        est_page_count = sum(estimated_page_counts)
+        span.set_metric("est_page_count", est_page_count)
+
     # Excel files may contain multiple sheets. Extract data from each sheet
     sheet_extract_tasks = [
         extract_data_from_sheet(
-            …
+            llm_client,
+            sheet_name,
+            workbook[sheet_name],
+            response_schema,
+            doc_type=input_doc_type,
         )
         for sheet_name in sheets
     ]
     extracted_data = {k: v for k, v in await asyncio.gather(*sheet_extract_tasks)}

-
-    extracted_data = llm_prediction_to_tuples(extracted_data)
-    stored_data = json.dumps(extracted_data)
-
-    return extracted_data, stored_data, params["gemini_params"]["model_id"]
+    return extracted_data, extracted_data, llm_client.model_id
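The sheet-count tracking above depends on estimate_page_count, which was added in src/utils.py but whose body is not part of this diff. A minimal standalone sketch of the pattern, with a purely hypothetical row-based heuristic standing in for the real helper (openpyxl worksheets expose max_row):

from ddtrace import tracer


def estimate_page_count(sheet, rows_per_page=50):
    # Hypothetical heuristic -- the real implementation in src/utils.py
    # is not shown in this diff.
    return max(1, -(-sheet.max_row // rows_per_page))  # ceiling division


def record_sheet_metric(sheets, workbook):
    # tracer.current_span() returns None outside an active trace, so the
    # metric is only recorded for instrumented requests.
    span = tracer.current_span()
    if span:
        span.set_metric(
            "est_page_count",
            sum(estimate_page_count(workbook[name]) for name in sheets),
        )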
{data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/io.py
RENAMED

@@ -12,13 +12,55 @@ from pathlib import Path
 from google.cloud import bigquery, storage


+def get_gcp_labels(**extra_labels):
+    """Generate standardized GCP labels for cost tracking.
+
+    Args:
+        **extra_labels: Additional custom labels
+
+    Returns:
+        dict: Labels dictionary with keys normalized (lowercase, hyphens, max 63 chars)
+    """
+    project_name = os.getenv("PROJECT_NAME")
+
+    # If not set, detect once and cache it
+    if not project_name:
+        # Try pyproject.toml first
+        try:
+            import toml
+
+            pyproject_path = Path(__file__).parent.parent / "pyproject.toml"
+            if pyproject_path.exists():
+                config = toml.load(pyproject_path)
+                project_name = config.get("tool", {}).get("poetry", {}).get("name")
+        except Exception:
+            pass
+
+        # Fallback to unknown
+        if not project_name:
+            project_name = "unknown"
+
+        # Cache it
+        os.environ["PROJECT_NAME"] = project_name
+
+    labels = {
+        "ds-project-name": project_name.lower(),
+        "ds-env": os.getenv("CLUSTER", "local").lower(),
+    }
+
+    # Add any extra labels
+    labels.update({k.lower(): str(v).lower() for k, v in extra_labels.items()})
+
+    return labels
+
+
 def get_bq_client(params):
     """Get Google BigQuery client."""
     bq_client = bigquery.Client(project=params["g_ai_project_name"])
     job_config = bigquery.QueryJobConfig(
         allow_large_results=True,
         # flatten_results=True,
-        labels=…
+        labels=get_gcp_labels(),
     )
     return bq_client, job_config

@@ -112,3 +154,6 @@ def download_dir_from_bucket(bucket, directory_cloud, directory_local) -> bool:
         Path(directory).mkdir(parents=True, exist_ok=True)
         blob.download_to_filename(directory_local / Path(blob.name))
     return result
+
+
+# type: ignore
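A quick usage sketch of the new helper; the environment values below are assumed for illustration (PROJECT_NAME falls back to the pyproject.toml package name, CLUSTER to "local"):

import os

from src.io import get_gcp_labels

os.environ["PROJECT_NAME"] = "data-science-document-ai"  # else read from pyproject.toml
os.environ["CLUSTER"] = "production"  # assumed value for this example

# Extra keyword labels are lowercased and appended to the base labels.
print(get_gcp_labels(doc_type="partnerInvoice"))
# -> {'ds-project-name': 'data-science-document-ai',
#     'ds-env': 'production',
#     'doc_type': 'partnerinvoice'}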
{data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/llm.py
RENAMED

@@ -15,6 +15,7 @@ from vertexai.generative_models import (
     Part,
 )

+from src.io import get_gcp_labels
 from src.utils import cache_on_disk

@@ -69,6 +70,7 @@ class LlmClient:
         document: str = None,
         response_schema: dict = None,
         response_mime_type: str = "application/json",
+        doc_type: str = None,
     ):
         """Ask the Gemini model a question.

@@ -76,6 +78,7 @@ class LlmClient:
             prompt (str): The prompt to send to the model.
             document (str, optional): An optional document to provide context.
             response_schema (dict, optional): Defines a specific response schema for the model.
+            doc_type (str, optional): Document type for cost tracking labels.

         Returns:
             str: The response from the model.

@@ -96,12 +99,13 @@ class LlmClient:
         # Prepare inputs for the model
         inputs = [document, prompt] if document else prompt

-        # Generate the response
+        # Generate the response with labels for cost tracking
         model_response = await cache_on_disk(
             self.geminy_client.generate_content_async,
             contents=inputs,
             generation_config=config,
             safety_settings=self.safety_config,
+            labels=get_gcp_labels(doc_type=doc_type),
         )

         response_text = model_response.text

@@ -113,7 +117,7 @@ class LlmClient:
             return "{}"

     async def get_unified_json_genai(
-        self, prompt, document=None, response_schema=None, model="gemini"
+        self, prompt, document=None, response_schema=None, model="gemini", doc_type=None
     ):
         """Send a prompt to a Google Cloud AI Platform model and returns the generated json.

@@ -122,6 +126,7 @@ class LlmClient:
             document: Content of the PDF document
             response_schema: The schema to use for the response
             model (str): The model to use for the response ["gemini" or "chatGPT"]. Default is "gemini".
+            doc_type (str, optional): Document type for cost tracking labels.

         Returns:
             dict: The generated json from the model.

@@ -131,7 +136,9 @@ class LlmClient:
             response = await self.ask_chatgpt(prompt, document, response_schema)
         else:
             # Default to Gemini
-            response = await self.ask_gemini(prompt, document, response_schema)
+            response = await self.ask_gemini(
+                prompt, document, response_schema, doc_type=doc_type
+            )

         try:
             return json.loads(response)
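With these changes every Gemini call can be attributed to a document type; a hedged call-site sketch (prompt and schema are placeholders):

# Inside an async context, with llm_client an instance of LlmClient.
result = await llm_client.get_unified_json_genai(
    prompt="Extract the invoice number.",  # placeholder prompt
    response_schema={"type": "object"},    # placeholder schema
    doc_type="partnerInvoice",
)
# ask_gemini forwards doc_type to get_gcp_labels(), so the request carries
# labels like {'ds-project-name': ..., 'ds-env': ..., 'doc_type': 'partnerinvoice'}.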
{data_science_document_ai-1.41.0 → data_science_document_ai-1.43.6}/src/pdf_processing.py
RENAMED

@@ -9,12 +9,17 @@ logger = logging.getLogger(__name__)
 import asyncio
 from collections import defaultdict

+from ddtrace import tracer
 from fastapi import HTTPException
 from google.cloud.documentai_v1 import Document as docaiv1_document

 from src.docai import _batch_process_pdf_w_docai, _process_pdf_w_docai
 from src.excel_processing import extract_data_from_excel
-from src.postprocessing.common import …
+from src.postprocessing.common import (
+    format_all_entities,
+    llm_prediction_to_tuples,
+    remove_none_values,
+)
 from src.postprocessing.postprocess_booking_confirmation import (
     postprocess_booking_confirmation,
 )

@@ -28,14 +33,17 @@ from src.prompts.prompt_library import prompt_library
 from src.utils import (
     extract_top_pages,
     generate_schema_structure,
+    get_pdf_page_count,
     get_processor_name,
     run_background_tasks,
+    transform_schema_strings,
     validate_based_on_schema,
-    transform_schema_strings
 )


-async def process_file_w_docai(params, image_content, client, processor_name):
+async def process_file_w_docai(
+    params, image_content, client, processor_name, doc_type=None
+):
     """
     Process a file using Document AI.

@@ -44,6 +52,7 @@ async def process_file_w_docai(params, image_content, client, processor_name):
         image_content (bytes): The file to be processed. It can be bytes object.
         client: The Document AI client.
         processor_name (str): The name of the processor to be used.
+        doc_type (str, optional): Document type for cost tracking labels.

     Returns:
         The processed document.

@@ -55,7 +64,9 @@ async def process_file_w_docai(params, image_content, client, processor_name):

     try:
         logger.info("Processing document...")
-        result = await _process_pdf_w_docai(image_content, client, processor_name)
+        result = await _process_pdf_w_docai(
+            image_content, client, processor_name, doc_type=doc_type
+        )
     except Exception as e:
         if e.reason == "PAGE_LIMIT_EXCEEDED":
             logger.warning(

@@ -64,7 +75,7 @@ async def process_file_w_docai(params, image_content, client, processor_name):
         # Process the document in batch method (offline processing)
         try:
             result = await _batch_process_pdf_w_docai(
-                params, image_content, client, processor_name
+                params, image_content, client, processor_name, doc_type=doc_type
             )
         except Exception as batch_e:
             logger.error(f"Error processing document {batch_e}.")
@@ -94,7 +105,7 @@ async def extract_data_from_pdf_w_docai(
     )

     result = await process_file_w_docai(
-        params, file_content, processor_client, processor_name
+        params, file_content, processor_client, processor_name, doc_type=input_doc_type
     )

     # Create an entity object to store the result in gcs

@@ -105,16 +116,22 @@
     # Extract entities from the result
     for entity in result.entities:
         value = (
-            {
-                …
-                …
-                …
-            }
+            {
+                child.type_: (
+                    child.mention_text,
+                    child.page_anchor.page_refs[0].page
+                    if hasattr(child.page_anchor.page_refs[0], "page")
+                    else 0,
+                )
+                for child in entity.properties
+            }
             if entity.properties
-            else (
-                …
-                …
-            )
+            else (
+                entity.mention_text,
+                entity.page_anchor.page_refs[0].page
+                if hasattr(entity.page_anchor.page_refs[0], "page")
+                else 0,
+            )
         )
         aggregated_data[entity.type_].append(value)
@@ -145,7 +162,9 @@
     return aggregated_data, result_for_store, processor_version


-async def identify_carrier(document, llm_client, prompt, response_schema):
+async def identify_carrier(
+    document, llm_client, prompt, response_schema, doc_type=None
+):
     """Identify the carrier from the Booking Confirmation document."""

     result = await llm_client.ask_gemini(

@@ -153,6 +172,7 @@ async def identify_carrier(document, llm_client, prompt, response_schema):
         document=document,
         response_schema=response_schema,
         response_mime_type="text/x.enum",
+        doc_type=doc_type,
     )

     if result:

@@ -180,6 +200,7 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
         if input_doc_type == "bundeskasse"
         else file_content
     )
+    number_of_pages = get_pdf_page_count(file_content)

     # convert file_content to required document
     document = llm_client.prepare_document_for_gemini(file_content)
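get_pdf_page_count is added in src/utils.py (changed in this release, hunks not included above); given that pypdf (>=6.1.2) is the dependency introduced alongside it, a plausible sketch:

import io

from pypdf import PdfReader


def get_pdf_page_count(file_content: bytes) -> int:
    # Plausible reconstruction -- the actual helper in src/utils.py is not
    # shown in this diff. PdfReader accepts any binary stream.
    return len(PdfReader(io.BytesIO(file_content)).pages)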
@@ -187,7 +208,14 @@
     # get the schema placeholder from the Doc AI and generate the response structure
     response_schema = (
         prompt_library.library[input_doc_type]["other"]["placeholders"]
-        if input_doc_type …
+        if input_doc_type
+        in [
+            "partnerInvoice",
+            "customsInvoice",
+            "bundeskasse",
+            "commercialInvoice",
+            "packingList",
+        ]
         else generate_schema_structure(params, input_doc_type)
     )

@@ -209,7 +237,11 @@

     # identify carrier for customized prompting
     carrier = await identify_carrier(
-        document, llm_client, carrier_prompt, carrier_schema
+        document,
+        llm_client,
+        carrier_prompt,
+        carrier_schema,
+        doc_type=input_doc_type,
     )

     if input_doc_type == "bookingConfirmation":

@@ -224,12 +256,22 @@
         # get the related prompt from predefined prompt library
         prompt = prompt_library.library[input_doc_type][carrier]["prompt"]

+        # Update schema to extract value-page_number pairs
+        if number_of_pages > 1:
+            response_schema = transform_schema_strings(response_schema)
+
+            # Update the prompt to instruct LLM to include page numbers
+            prompt += "\nFor each field, provide the page number where the information was found. The page numbering starts from 0."
+
         # generate the result with LLM (gemini)
         result = await llm_client.get_unified_json_genai(
-            prompt=prompt, document=document, response_schema=response_schema
+            prompt=prompt,
+            document=document,
+            response_schema=response_schema,
+            doc_type=input_doc_type,
         )

-        result = llm_prediction_to_tuples(result)
+        result = llm_prediction_to_tuples(result, number_of_pages)

         return result
     return {}
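transform_schema_strings and the two-argument llm_prediction_to_tuples used in the hunk above live in src/utils.py and src/postprocessing/common.py, whose hunks are not included here. A sketch of what the schema rewrite plausibly does, assuming string leaves become value/page objects as the "value-page_number pairs" comment suggests:

def transform_schema_strings(schema):
    # Hypothetical reconstruction: replace every string-typed leaf of the
    # response schema with an object pairing the extracted value and the
    # page it was found on.
    if isinstance(schema, dict):
        if schema.get("type") == "string":
            return {
                "type": "object",
                "properties": {
                    "value": {"type": "string"},
                    "page_number": {"type": "integer"},
                },
            }
        return {key: transform_schema_strings(val) for key, val in schema.items()}
    if isinstance(schema, list):
        return [transform_schema_strings(item) for item in schema]
    return schema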
@@ -309,15 +351,9 @@ async def extract_data_by_doctype(
     processor_client,
     if_use_docai,
     if_use_llm,
+    llm_client,
     isBetaTest=False,
 ):
-    # Select LLM client (Using 2.5 Flash model for Bundeskasse)
-    llm_client = (
-        params["LlmClient_Flash"]
-        if input_doc_type == "bundeskasse"
-        else params["LlmClient"]
-    )
-
     async def extract_w_docai():
         return await extract_data_from_pdf_w_docai(
             params=params,
@@ -366,6 +402,7 @@ async def data_extraction_manual_flow(
     meta,
     processor_client,
     schema_client,
+    use_default_logging=False,
 ):
     """
     Process a PDF file and extract data from it.

@@ -386,6 +423,15 @@ async def data_extraction_manual_flow(
     """
     # Get the start time for processing
     start_time = asyncio.get_event_loop().time()
+
+    # Select LLM client (Using 2.5 Pro model only for PI and customsInvoice)
+    llm_client = (
+        params["LlmClient_Flash"]
+        if meta.documentTypeCode not in ["customsInvoice", "partnerInvoice"]
+        else params["LlmClient"]
+    )
+
+    page_count = None
     # Validate the file type
     if mime_type == "application/pdf":
         # Enable Doc Ai only for certain document types.

@@ -407,8 +453,10 @@
             processor_client,
             if_use_docai=if_use_docai,
             if_use_llm=if_use_llm,
+            llm_client=llm_client,
             isBetaTest=False,
         )
+        page_count = get_pdf_page_count(file_content)

     elif "excel" in mime_type or "spreadsheet" in mime_type:
         # Extract data from the Excel file

@@ -417,8 +465,19 @@
             input_doc_type=meta.documentTypeCode,
             file_content=file_content,
             mime_type=mime_type,
+            llm_client=llm_client,
         )

+        # Get sheet count from dd-trace span (set in extract_data_from_excel)
+        # Note: we use the span metric instead of len(extracted_data) because
+        # some sheets may fail extraction and not appear in extracted_data
+        span = tracer.current_span()
+        page_count = span.get_metric("est_page_count") if span else len(extracted_data)
+        if page_count > 100:
+            logger.warning(
+                f"Check logic. Count of sheets in excel file is weirdly large: {page_count}"
+            )
+
     else:
         raise HTTPException(
             status_code=400,

@@ -426,7 +485,7 @@
     )
     # Create the result dictionary with the extracted data
     extracted_data = await format_all_entities(
-        extracted_data, meta.documentTypeCode, params
+        extracted_data, meta.documentTypeCode, params, mime_type
     )
     result = {
         "id": meta.id,

@@ -441,7 +500,9 @@
     logger.info(f"Time taken to process the document: {round(elapsed_time, 4)} seconds")

     # Schedule background tasks without using FastAPI's BackgroundTasks
-    if os.getenv("CLUSTER") != "ode":  # skip data export to bigquery in ODE environment
+    if (
+        os.getenv("CLUSTER") != "ode"
+    ) & use_default_logging:  # skip data export to bigquery in ODE environment
         asyncio.create_task(
             run_background_tasks(
                 params,

@@ -452,6 +513,7 @@
                 processor_version,
                 mime_type,
                 elapsed_time,
+                page_count,
             )
         )
     return result