data-science-document-ai 1.61.1__tar.gz → 1.61.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/PKG-INFO +3 -5
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/pyproject.toml +4 -6
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/constants.py +3 -2
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/llm.py +49 -63
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/postprocessing/postprocess_partner_invoice.py +20 -8
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/setup.py +10 -5
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/constants_sandbox.py +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/docai.py +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/docai_processor_config.yaml +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/excel_processing.py +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/io.py +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/log_setup.py +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/pdf_processing.py +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/postprocessing/common.py +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/postprocessing/postprocess_booking_confirmation.py +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/postprocessing/postprocess_commercial_invoice.py +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/prompts/library/arrivalNotice/other/placeholders.json +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/prompts/library/arrivalNotice/other/prompt.txt +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/prompts/library/bookingConfirmation/evergreen/prompt.txt +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/prompts/library/bookingConfirmation/maersk/placeholders.json +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/prompts/library/bookingConfirmation/maersk/prompt.txt +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/prompts/library/bookingConfirmation/msc/placeholders.json +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/prompts/library/bookingConfirmation/msc/prompt.txt +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/prompts/library/bookingConfirmation/oocl/placeholders.json +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/prompts/library/bookingConfirmation/oocl/prompt.txt +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/prompts/library/bookingConfirmation/other/placeholders.json +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/prompts/library/bookingConfirmation/other/prompt.txt +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/prompts/library/bookingConfirmation/yangming/placeholders.json +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/prompts/library/bookingConfirmation/yangming/prompt.txt +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/prompts/library/bundeskasse/other/placeholders.json +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/prompts/library/bundeskasse/other/prompt.txt +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/prompts/library/commercialInvoice/other/placeholders.json +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/prompts/library/commercialInvoice/other/prompt.txt +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/prompts/library/customsAssessment/other/placeholders.json +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/prompts/library/customsAssessment/other/prompt.txt +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/prompts/library/customsInvoice/other/placeholders.json +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/prompts/library/customsInvoice/other/prompt.txt +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/prompts/library/deliveryOrder/other/placeholders.json +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/prompts/library/deliveryOrder/other/prompt.txt +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/prompts/library/draftMbl/other/placeholders.json +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/prompts/library/draftMbl/other/prompt.txt +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/prompts/library/finalMbL/other/placeholders.json +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/prompts/library/finalMbL/other/prompt.txt +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/prompts/library/packingList/other/placeholders.json +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/prompts/library/packingList/other/prompt.txt +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/prompts/library/partnerInvoice/other/placeholders.json +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/prompts/library/partnerInvoice/other/prompt.txt +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/prompts/library/postprocessing/port_code/placeholders.json +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/prompts/library/postprocessing/port_code/prompt_port_code.txt +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/prompts/library/preprocessing/carrier/placeholders.json +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/prompts/library/preprocessing/carrier/prompt.txt +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/prompts/library/shippingInstruction/other/placeholders.json +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/prompts/library/shippingInstruction/other/prompt.txt +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/prompts/prompt_library.py +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/tms.py +0 -0
- {data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/utils.py +0 -0
|
@@ -1,13 +1,11 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: data-science-document-ai
|
|
3
|
-
Version: 1.61.1
|
|
3
|
+
Version: 1.61.3
|
|
4
4
|
Summary: "Document AI repo for data science"
|
|
5
5
|
Author: Naomi Nguyen
|
|
6
6
|
Author-email: naomi.nguyen@forto.com
|
|
7
|
-
Requires-Python: >=3.9,<3.12
|
|
7
|
+
Requires-Python: >=3.11,<3.12
|
|
8
8
|
Classifier: Programming Language :: Python :: 3
|
|
9
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
10
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
11
9
|
Classifier: Programming Language :: Python :: 3.11
|
|
12
10
|
Requires-Dist: apscheduler (>=3.10.4,<4.0.0)
|
|
13
11
|
Requires-Dist: db-dtypes (>=1.2.0,<2.0.0)
|
|
@@ -24,8 +22,8 @@ Requires-Dist: google-cloud-bigquery-storage (>=2.20.0,<3.0.0)
|
|
|
24
22
|
Requires-Dist: google-cloud-documentai (>=2.23.0,<3.0.0)
|
|
25
23
|
Requires-Dist: google-cloud-storage (>=2.9.0,<3.0.0)
|
|
26
24
|
Requires-Dist: google-cloud-vision (>=3.7.1,<4.0.0)
|
|
25
|
+
Requires-Dist: google-genai (>=1.61.0,<2.0.0)
|
|
27
26
|
Requires-Dist: gspread (>=6.1.0,<7.0.0)
|
|
28
|
-
Requires-Dist: httpx (>=0.26.0,<0.27.0)
|
|
29
27
|
Requires-Dist: jupyter (>=1.0.0,<2.0.0)
|
|
30
28
|
Requires-Dist: kubernetes (>=30.1.0,<31.0.0)
|
|
31
29
|
Requires-Dist: nltk (>=3.9.1,<4.0.0)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "data-science-document-ai"
|
|
3
|
-
version = "1.61.1"
|
|
3
|
+
version = "1.61.3"
|
|
4
4
|
description = "\"Document AI repo for data science\""
|
|
5
5
|
authors = ["Naomi Nguyen <naomi.nguyen@forto.com>", "Kumar Rajendrababu <kumar.rajendrababu@forto.com>", "Igor Tonko <igor.tonko@forto.com>", "Osman Demirel <osman.demirel@forto.com>"]
|
|
6
6
|
packages = [
|
|
@@ -9,7 +9,7 @@ packages = [
|
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
[tool.poetry.dependencies]
|
|
12
|
-
python = ">=3.9,<3.12"
|
|
12
|
+
python = ">=3.11,<3.12"
|
|
13
13
|
pandas = "^2.0.3"
|
|
14
14
|
numpy = "^1.25.1"
|
|
15
15
|
google-cloud-bigquery-storage = "^2.20.0"
|
|
@@ -26,7 +26,6 @@ uvicorn = {extras = ["standard"], version = "^0.27.0.post1"}
|
|
|
26
26
|
requests-toolbelt = "^1.0.0"
|
|
27
27
|
google = "^3.0.0"
|
|
28
28
|
toml = "^0.10.2"
|
|
29
|
-
httpx = "^0.26.0"
|
|
30
29
|
python-multipart = "^0.0.7"
|
|
31
30
|
google-cloud-documentai = "^2.23.0"
|
|
32
31
|
jupyter = "^1.0.0"
|
|
@@ -42,6 +41,7 @@ xlrd = "^2.0.1"
|
|
|
42
41
|
openpyxl = "^3.1.5"
|
|
43
42
|
tabulate = "^0.9.0"
|
|
44
43
|
openai = "^1.53.0"
|
|
44
|
+
google-genai = "^1.61.0"
|
|
45
45
|
parameterized = "^0.9.0"
|
|
46
46
|
ddtrace = "^2.20.0"
|
|
47
47
|
rapidfuzz = "^3.12.2"
|
|
@@ -50,12 +50,10 @@ nltk = "^3.9.1"
|
|
|
50
50
|
pgzip = "^0.3.5"
|
|
51
51
|
pypdf = "^6.1.2"
|
|
52
52
|
|
|
53
|
-
[tool.poetry.dev-dependencies]
|
|
53
|
+
[tool.poetry.group.dev.dependencies]
|
|
54
54
|
jupyter = "^1.0.0"
|
|
55
55
|
ipykernel = "^6.23.1"
|
|
56
56
|
notebook = "^6.5.5"
|
|
57
|
-
|
|
58
|
-
[tool.poetry.group.dev.dependencies]
|
|
59
57
|
deepdiff = "^8.1.1"
|
|
60
58
|
|
|
61
59
|
[build-system]
|
|
@@ -9,7 +9,7 @@ project_parameters = {
|
|
|
9
9
|
"g_ai_project_id": "738250249861",
|
|
10
10
|
"g_api_endpoint": "eu-documentai.googleapis.com",
|
|
11
11
|
"g_location": "eu",
|
|
12
|
-
"g_region": "
|
|
12
|
+
"g_region": "global",
|
|
13
13
|
# Google Cloud Storage
|
|
14
14
|
"doc_ai_bucket_project_name": "forto-data-science-production",
|
|
15
15
|
"doc_ai_bucket_name": "ds-document-capture",
|
|
@@ -60,7 +60,8 @@ project_parameters = {
|
|
|
60
60
|
"top_p": 0.8,
|
|
61
61
|
"top_k": 40,
|
|
62
62
|
"seed": 42,
|
|
63
|
-
"model_id": "gemini-
|
|
63
|
+
"model_id": "gemini-3-flash-preview",
|
|
64
|
+
"thinking_level": "medium",
|
|
64
65
|
},
|
|
65
66
|
"gemini_flash_params": {
|
|
66
67
|
"temperature": 0,
|
|
@@ -3,17 +3,11 @@ import logging
|
|
|
3
3
|
|
|
4
4
|
logger = logging.getLogger(__name__)
|
|
5
5
|
|
|
6
|
-
import base64
|
|
7
6
|
import json
|
|
8
7
|
|
|
8
|
+
from google import genai
|
|
9
|
+
from google.genai import types
|
|
9
10
|
from openai import AsyncOpenAI as OpenAI
|
|
10
|
-
from vertexai.generative_models import (
|
|
11
|
-
GenerationConfig,
|
|
12
|
-
GenerativeModel,
|
|
13
|
-
HarmBlockThreshold,
|
|
14
|
-
HarmCategory,
|
|
15
|
-
Part,
|
|
16
|
-
)
|
|
17
11
|
|
|
18
12
|
from src.io import get_gcp_labels
|
|
19
13
|
from src.utils import cache_on_disk
|
|
@@ -24,8 +18,10 @@ from src.utils import cache_on_disk
|
|
|
24
18
|
class LlmClient:
|
|
25
19
|
"""A client for interacting with large language models (LLMs)."""
|
|
26
20
|
|
|
27
|
-
def __init__(self, openai_key=None, parameters=None):
|
|
21
|
+
def __init__(self, openai_key=None, parameters=None, genai_client=None):
|
|
28
22
|
"""Initialize the LLM client."""
|
|
23
|
+
self.genai_client = genai_client
|
|
24
|
+
|
|
29
25
|
# Initialize the model parameters
|
|
30
26
|
self.model_params = {
|
|
31
27
|
"temperature": parameters.get("temperature", 0),
|
|
@@ -35,31 +31,24 @@ class LlmClient:
|
|
|
35
31
|
"seed": parameters.get("seed", 42),
|
|
36
32
|
}
|
|
37
33
|
self.model_id = parameters.get("model_id", "gemini-2.5-flash")
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
34
|
+
self.thinking_level = parameters.get("thinking_level")
|
|
35
|
+
|
|
36
|
+
# Initialize the safety configuration (new format: list of SafetySetting objects)
|
|
37
|
+
self.safety_settings = [
|
|
38
|
+
types.SafetySetting(
|
|
39
|
+
category="HARM_CATEGORY_DANGEROUS_CONTENT", threshold="OFF"
|
|
40
|
+
),
|
|
41
|
+
types.SafetySetting(category="HARM_CATEGORY_HARASSMENT", threshold="OFF"),
|
|
42
|
+
types.SafetySetting(category="HARM_CATEGORY_HATE_SPEECH", threshold="OFF"),
|
|
43
|
+
types.SafetySetting(
|
|
44
|
+
category="HARM_CATEGORY_SEXUALLY_EXPLICIT", threshold="OFF"
|
|
45
|
+
),
|
|
46
|
+
]
|
|
47
|
+
|
|
47
48
|
if openai_key is not None:
|
|
48
49
|
# Initialize the ChatGPT client
|
|
49
50
|
self.chatgpt_client = self._create_client_chatgpt(openai_key)
|
|
50
51
|
|
|
51
|
-
def _initialize_gemini(self):
|
|
52
|
-
"""Ask the Gemini model a question.
|
|
53
|
-
|
|
54
|
-
Returns:
|
|
55
|
-
str: The response from the model.
|
|
56
|
-
"""
|
|
57
|
-
# Initialize the model if it is not already initialized
|
|
58
|
-
model_gen = GenerativeModel(model_name=self.model_id)
|
|
59
|
-
self.model_config = GenerationConfig(**self.model_params)
|
|
60
|
-
|
|
61
|
-
return model_gen
|
|
62
|
-
|
|
63
52
|
def _create_client_chatgpt(self, openai_key):
|
|
64
53
|
client = OpenAI(api_key=openai_key)
|
|
65
54
|
return client
|
|
@@ -67,7 +56,7 @@ class LlmClient:
|
|
|
67
56
|
async def ask_gemini(
|
|
68
57
|
self,
|
|
69
58
|
prompt: str,
|
|
70
|
-
document:
|
|
59
|
+
document: types.Part = None,
|
|
71
60
|
response_schema: dict = None,
|
|
72
61
|
response_mime_type: str = "application/json",
|
|
73
62
|
doc_type: str = None,
|
|
@@ -76,7 +65,7 @@ class LlmClient:
|
|
|
76
65
|
|
|
77
66
|
Args:
|
|
78
67
|
prompt (str): The prompt to send to the model.
|
|
79
|
-
document (
|
|
68
|
+
document (types.Part, optional): An optional document to provide context.
|
|
80
69
|
response_schema (dict, optional): Defines a specific response schema for the model.
|
|
81
70
|
doc_type (str, optional): Document type for cost tracking labels.
|
|
82
71
|
|
|
@@ -84,33 +73,35 @@ class LlmClient:
|
|
|
84
73
|
str: The response from the model.
|
|
85
74
|
"""
|
|
86
75
|
try:
|
|
76
|
+
# Build config with all parameters
|
|
77
|
+
config_params = {
|
|
78
|
+
**self.model_params,
|
|
79
|
+
"safety_settings": self.safety_settings,
|
|
80
|
+
"labels": get_gcp_labels(doc_type=doc_type),
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
# Add thinking config for Gemini 3 models
|
|
84
|
+
if self.thinking_level:
|
|
85
|
+
config_params["thinking_config"] = types.ThinkingConfig(
|
|
86
|
+
thinking_level=self.thinking_level
|
|
87
|
+
)
|
|
87
88
|
|
|
88
|
-
# Start with the default model configuration
|
|
89
|
-
config = self.model_config
|
|
90
|
-
|
|
91
|
-
# Add response_schema if provided. This is only supported for Gemini 1.5 Flash & Pro models
|
|
92
89
|
if response_schema is not None:
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
response_mime_type=response_mime_type,
|
|
96
|
-
**self.model_params,
|
|
97
|
-
)
|
|
90
|
+
config_params["response_schema"] = response_schema
|
|
91
|
+
config_params["response_mime_type"] = response_mime_type
|
|
98
92
|
|
|
99
|
-
|
|
100
|
-
|
|
93
|
+
config = types.GenerateContentConfig(**config_params)
|
|
94
|
+
contents = [document, prompt] if document else prompt
|
|
101
95
|
|
|
102
|
-
#
|
|
96
|
+
# Use async client
|
|
103
97
|
model_response = await cache_on_disk(
|
|
104
|
-
self.
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
labels=get_gcp_labels(doc_type=doc_type),
|
|
98
|
+
self.genai_client.aio.models.generate_content,
|
|
99
|
+
model=self.model_id,
|
|
100
|
+
contents=contents,
|
|
101
|
+
config=config,
|
|
109
102
|
)
|
|
110
103
|
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
return response_text
|
|
104
|
+
return model_response.text
|
|
114
105
|
|
|
115
106
|
except Exception as e:
|
|
116
107
|
logger.error(f"Failed to generate response: {str(e)}")
|
|
@@ -146,25 +137,20 @@ class LlmClient:
|
|
|
146
137
|
logger.error(e)
|
|
147
138
|
return {}
|
|
148
139
|
|
|
149
|
-
def prepare_document_for_gemini(self, file_content):
|
|
150
|
-
"""Prepare a document from file content
|
|
140
|
+
def prepare_document_for_gemini(self, file_content: bytes) -> types.Part:
|
|
141
|
+
"""Prepare a document from file content for the Gemini model.
|
|
151
142
|
|
|
152
143
|
Args:
|
|
153
144
|
file_content (bytes): The binary content of the file to be processed.
|
|
154
145
|
|
|
155
146
|
Returns:
|
|
156
|
-
Part: A document object ready for processing by the language model.
|
|
147
|
+
types.Part: A document object ready for processing by the language model.
|
|
157
148
|
"""
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
# Create the document for the model
|
|
162
|
-
document = Part.from_data(
|
|
163
|
-
mime_type="application/pdf", data=base64.b64decode(pdf_base64)
|
|
149
|
+
return types.Part.from_bytes(
|
|
150
|
+
data=file_content,
|
|
151
|
+
mime_type="application/pdf",
|
|
164
152
|
)
|
|
165
153
|
|
|
166
|
-
return document
|
|
167
|
-
|
|
168
154
|
async def ask_chatgpt(self, prompt: str, document=None, response_schema=None):
|
|
169
155
|
"""Ask the chatgpt model a question.
|
|
170
156
|
|
|
@@ -119,16 +119,22 @@ def post_process_bundeskasse(aggregated_data):
|
|
|
119
119
|
if is_forto_recipient(line_item):
|
|
120
120
|
is_recipient_forto = True
|
|
121
121
|
|
|
122
|
-
|
|
122
|
+
update_aggregated_data_fields(aggregated_data, is_recipient_forto)
|
|
123
123
|
|
|
124
124
|
|
|
125
|
-
def update_recipient_and_vendor(aggregated_data, is_recipient_forto):
|
|
126
|
-
"""Update the recipient and
|
|
125
|
+
def update_aggregated_data_fields(aggregated_data, is_recipient_forto):
|
|
126
|
+
"""Update the recipient, vendor and deferredDutyPayer information in the aggregated data."""
|
|
127
127
|
# Check if the "recipientName" and "recipientAddress" keys exist
|
|
128
128
|
keys_to_init = ["recipientName", "recipientAddress", "vendorName", "vendorAddress"]
|
|
129
129
|
for key in keys_to_init:
|
|
130
130
|
aggregated_data.setdefault(key, {"formattedValue": "", "documentValue": ""})
|
|
131
131
|
|
|
132
|
+
# Update the vendor details always to Bundeskasse Trier
|
|
133
|
+
aggregated_data["vendorName"]["formattedValue"] = "Bundeskasse Trier"
|
|
134
|
+
aggregated_data["vendorAddress"][
|
|
135
|
+
"formattedValue"
|
|
136
|
+
] = "Dasbachstraße 15, 54292 Trier, Germany"
|
|
137
|
+
|
|
132
138
|
if is_recipient_forto:
|
|
133
139
|
# Update the aggregated data with the recipient information
|
|
134
140
|
aggregated_data["recipientName"][
|
|
@@ -138,11 +144,17 @@ def update_recipient_and_vendor(aggregated_data, is_recipient_forto):
|
|
|
138
144
|
"formattedValue"
|
|
139
145
|
] = "Schönhauser Allee 9, 10119 Berlin, Germany"
|
|
140
146
|
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
147
|
+
# Update the defferDutyPayer to Forto Logistics SE & Co KG 'DE789147263644738 - Fort'
|
|
148
|
+
lineitems = aggregated_data.get("lineItem", [])
|
|
149
|
+
|
|
150
|
+
for lineitem in lineitems:
|
|
151
|
+
if "deferredDutyPayer" in lineitem:
|
|
152
|
+
lineitem["deferredDutyPayer"] = {
|
|
153
|
+
"documentValue": lineitem.get("deferredDutyPayer", {}).get(
|
|
154
|
+
"documentValue"
|
|
155
|
+
),
|
|
156
|
+
"formattedValue": "DE789147263644738",
|
|
157
|
+
}
|
|
146
158
|
|
|
147
159
|
|
|
148
160
|
def select_unique_bank_account(bank_account):
|
|
@@ -5,8 +5,8 @@ import random
|
|
|
5
5
|
import time
|
|
6
6
|
|
|
7
7
|
import toml
|
|
8
|
-
import vertexai
|
|
9
8
|
import yaml
|
|
9
|
+
from google import genai
|
|
10
10
|
from google.api_core.client_options import ClientOptions
|
|
11
11
|
from google.cloud import documentai
|
|
12
12
|
from google.cloud import documentai_v1beta3 as docai_beta
|
|
@@ -134,10 +134,14 @@ def setup_params(args=None):
|
|
|
134
134
|
|
|
135
135
|
# Set up LLM clients
|
|
136
136
|
params["LlmClient"] = LlmClient(
|
|
137
|
-
openai_key=os.getenv("OPENAI_KEY"),
|
|
137
|
+
openai_key=os.getenv("OPENAI_KEY"),
|
|
138
|
+
parameters=params["gemini_params"],
|
|
139
|
+
genai_client=params["genai_client"],
|
|
138
140
|
)
|
|
139
141
|
params["LlmClient_Flash"] = LlmClient(
|
|
140
|
-
openai_key=os.getenv("OPENAI_KEY"),
|
|
142
|
+
openai_key=os.getenv("OPENAI_KEY"),
|
|
143
|
+
parameters=params["gemini_flash_params"],
|
|
144
|
+
genai_client=params["genai_client"],
|
|
141
145
|
)
|
|
142
146
|
|
|
143
147
|
# Load lookup data from GCS bucket
|
|
@@ -167,8 +171,9 @@ def setup_docai_client_and_path(params):
|
|
|
167
171
|
|
|
168
172
|
|
|
169
173
|
def setup_vertexai(params):
|
|
170
|
-
"""Initialize the
|
|
171
|
-
|
|
174
|
+
"""Initialize the Google GenAI client with Vertex AI configuration."""
|
|
175
|
+
params["genai_client"] = genai.Client(
|
|
176
|
+
vertexai=True,
|
|
172
177
|
project=params["g_ai_project_name"],
|
|
173
178
|
location=params["g_region"],
|
|
174
179
|
)
|
{data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/constants_sandbox.py
RENAMED
|
File without changes
|
|
File without changes
|
{data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/docai_processor_config.yaml
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/postprocessing/common.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_science_document_ai-1.61.1 → data_science_document_ai-1.61.3}/src/prompts/prompt_library.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|