data-science-document-ai 1.61.1__py3-none-any.whl → 1.61.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,13 +1,11 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-science-document-ai
3
- Version: 1.61.1
3
+ Version: 1.61.3
4
4
  Summary: "Document AI repo for data science"
5
5
  Author: Naomi Nguyen
6
6
  Author-email: naomi.nguyen@forto.com
7
- Requires-Python: >=3.9,<3.12
7
+ Requires-Python: >=3.11,<3.12
8
8
  Classifier: Programming Language :: Python :: 3
9
- Classifier: Programming Language :: Python :: 3.9
10
- Classifier: Programming Language :: Python :: 3.10
11
9
  Classifier: Programming Language :: Python :: 3.11
12
10
  Requires-Dist: apscheduler (>=3.10.4,<4.0.0)
13
11
  Requires-Dist: db-dtypes (>=1.2.0,<2.0.0)
@@ -24,8 +22,8 @@ Requires-Dist: google-cloud-bigquery-storage (>=2.20.0,<3.0.0)
24
22
  Requires-Dist: google-cloud-documentai (>=2.23.0,<3.0.0)
25
23
  Requires-Dist: google-cloud-storage (>=2.9.0,<3.0.0)
26
24
  Requires-Dist: google-cloud-vision (>=3.7.1,<4.0.0)
25
+ Requires-Dist: google-genai (>=1.61.0,<2.0.0)
27
26
  Requires-Dist: gspread (>=6.1.0,<7.0.0)
28
- Requires-Dist: httpx (>=0.26.0,<0.27.0)
29
27
  Requires-Dist: jupyter (>=1.0.0,<2.0.0)
30
28
  Requires-Dist: kubernetes (>=30.1.0,<31.0.0)
31
29
  Requires-Dist: nltk (>=3.9.1,<4.0.0)
@@ -1,16 +1,16 @@
1
- src/constants.py,sha256=JjSdPW8wbo_sI6NgKHZxVsMBaiWAJs4o018fmOfygg4,3537
1
+ src/constants.py,sha256=qXsKAObEJdfEBL5gEqKNk9mVMe0HsSPxpQCVfscn898,3575
2
2
  src/constants_sandbox.py,sha256=Iu6HdjCoNSmOX0AwoL9qUQkhq_ZnIN5U9e-Q2UfNuGc,547
3
3
  src/docai.py,sha256=dHuR0ehVjUi1CnoNvdp_yxJtpU_HFXqAZ61ywdz7BEo,5655
4
4
  src/docai_processor_config.yaml,sha256=ZewXqbyiftzmVAaS08BoNp5trY6WXx3HMWDfPwmKfaI,256
5
5
  src/excel_processing.py,sha256=TRgAzSHvL1WKbUgjHtpXL701bPhiWGH7kk3S6e1UPaA,3074
6
6
  src/io.py,sha256=rYjXVLlriEacw1uNuPIYhg12bXNu48Qs9GYMY2YcVTE,5563
7
- src/llm.py,sha256=a7UYA4ITUNjzct_2fHgM-bma_XWc28VC0FV71g9tnUI,7137
7
+ src/llm.py,sha256=I2UOCY1I4LjiMUHs11N-hzrG1f5U2Oma4K5O3q578E0,6762
8
8
  src/log_setup.py,sha256=RhHnpXqcl-ii4EJzRt47CF2R-Q3YPF68tepg_Kg7tkw,2895
9
9
  src/pdf_processing.py,sha256=Fx-Glb9niEUU3WUCrBZ02ZYV-E2vWoUM0ifN7-0A1Q4,19961
10
10
  src/postprocessing/common.py,sha256=uFaJYpctS4vr-0Z3InRyfRZcEar0UWpcTxdB_TDCJ5E,26671
11
11
  src/postprocessing/postprocess_booking_confirmation.py,sha256=nK32eDiBNbauyQz0oCa9eraysku8aqzrcoRFoWVumDU,4827
12
12
  src/postprocessing/postprocess_commercial_invoice.py,sha256=3I8ijluTZcOs_sMnFZxfkAPle0UFQ239EMuvZfDZVPg,1028
13
- src/postprocessing/postprocess_partner_invoice.py,sha256=WuaTQK5D09dV_QNrh29ZoKX9IvQn2Ub-WnAMyRjCsvI,14240
13
+ src/postprocessing/postprocess_partner_invoice.py,sha256=ijBh8LRN23HWdotqMw6uf0ro1IaiL_obBiiikMwXYXk,14768
14
14
  src/prompts/library/arrivalNotice/other/placeholders.json,sha256=1vzly1amgyKt3jr2JJQbb24kNZsnI289iduvoUo5dJU,3061
15
15
  src/prompts/library/arrivalNotice/other/prompt.txt,sha256=QNuU-BvMA8VbdupVNapad4O3WmCotH5cKNxImRMbKDk,2906
16
16
  src/prompts/library/bookingConfirmation/evergreen/placeholders.json,sha256=49mmxxjExniMqLkT37zd5q8ILCLPGsfugKOMkR926kk,5854
@@ -52,9 +52,9 @@ src/prompts/library/preprocessing/carrier/prompt.txt,sha256=C2ExzNXXxX1ZU8yGQNah
52
52
  src/prompts/library/shippingInstruction/other/placeholders.json,sha256=eK4AeMfORkGMWVYcqH7NjB56Zb4swHTvcQD5UQbTryg,6374
53
53
  src/prompts/library/shippingInstruction/other/prompt.txt,sha256=CbrqlKMtB-sVY-8E460KP1KNmz169YVPMrH3-uEldPg,2135
54
54
  src/prompts/prompt_library.py,sha256=VJWHeXN-s501C2GiidIIvQQuZdU6T1R27hE2dKBiI40,2555
55
- src/setup.py,sha256=8-vZWjC8Iwa3xxdk3iR4412VCjtNtgzVqkXcFon7UBE,7309
55
+ src/setup.py,sha256=prJNY3N2qu14ttfCYnvRtpCqdHQeSW34zXwmuAGWX_M,7472
56
56
  src/tms.py,sha256=UXbIo1QE--hIX6NZi5Qyp2R_CP338syrY9pCTPrfgnE,1741
57
57
  src/utils.py,sha256=Ow5_Jals88o8mbZ1BoHfZpHZoCfig_UQb5aalH-mpWE,17278
58
- data_science_document_ai-1.61.1.dist-info/METADATA,sha256=knwcr94ybymLp_FaGl4Pu9uRJj-gVWh2lrDFhiPEddg,2152
59
- data_science_document_ai-1.61.1.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
60
- data_science_document_ai-1.61.1.dist-info/RECORD,,
58
+ data_science_document_ai-1.61.3.dist-info/METADATA,sha256=Lq6ZHTHHCJJg5zOaI7awo2Uf4Mx1ZC0FgG7XmXKaETk,2058
59
+ data_science_document_ai-1.61.3.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
60
+ data_science_document_ai-1.61.3.dist-info/RECORD,,
src/constants.py CHANGED
@@ -9,7 +9,7 @@ project_parameters = {
9
9
  "g_ai_project_id": "738250249861",
10
10
  "g_api_endpoint": "eu-documentai.googleapis.com",
11
11
  "g_location": "eu",
12
- "g_region": "europe-west1",
12
+ "g_region": "global",
13
13
  # Google Cloud Storage
14
14
  "doc_ai_bucket_project_name": "forto-data-science-production",
15
15
  "doc_ai_bucket_name": "ds-document-capture",
@@ -60,7 +60,8 @@ project_parameters = {
60
60
  "top_p": 0.8,
61
61
  "top_k": 40,
62
62
  "seed": 42,
63
- "model_id": "gemini-2.5-pro",
63
+ "model_id": "gemini-3-flash-preview",
64
+ "thinking_level": "medium",
64
65
  },
65
66
  "gemini_flash_params": {
66
67
  "temperature": 0,
src/llm.py CHANGED
@@ -3,17 +3,11 @@ import logging
3
3
 
4
4
  logger = logging.getLogger(__name__)
5
5
 
6
- import base64
7
6
  import json
8
7
 
8
+ from google import genai
9
+ from google.genai import types
9
10
  from openai import AsyncOpenAI as OpenAI
10
- from vertexai.generative_models import (
11
- GenerationConfig,
12
- GenerativeModel,
13
- HarmBlockThreshold,
14
- HarmCategory,
15
- Part,
16
- )
17
11
 
18
12
  from src.io import get_gcp_labels
19
13
  from src.utils import cache_on_disk
@@ -24,8 +18,10 @@ from src.utils import cache_on_disk
24
18
  class LlmClient:
25
19
  """A client for interacting with large language models (LLMs)."""
26
20
 
27
- def __init__(self, openai_key=None, parameters=None):
21
+ def __init__(self, openai_key=None, parameters=None, genai_client=None):
28
22
  """Initialize the LLM client."""
23
+ self.genai_client = genai_client
24
+
29
25
  # Initialize the model parameters
30
26
  self.model_params = {
31
27
  "temperature": parameters.get("temperature", 0),
@@ -35,31 +31,24 @@ class LlmClient:
35
31
  "seed": parameters.get("seed", 42),
36
32
  }
37
33
  self.model_id = parameters.get("model_id", "gemini-2.5-flash")
38
- # Initialize the safety configuration
39
- self.safety_config = {
40
- HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
41
- HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
42
- HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
43
- HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
44
- }
45
- # Initialize the Gemini client
46
- self.geminy_client = self._initialize_gemini()
34
+ self.thinking_level = parameters.get("thinking_level")
35
+
36
+ # Initialize the safety configuration (new format: list of SafetySetting objects)
37
+ self.safety_settings = [
38
+ types.SafetySetting(
39
+ category="HARM_CATEGORY_DANGEROUS_CONTENT", threshold="OFF"
40
+ ),
41
+ types.SafetySetting(category="HARM_CATEGORY_HARASSMENT", threshold="OFF"),
42
+ types.SafetySetting(category="HARM_CATEGORY_HATE_SPEECH", threshold="OFF"),
43
+ types.SafetySetting(
44
+ category="HARM_CATEGORY_SEXUALLY_EXPLICIT", threshold="OFF"
45
+ ),
46
+ ]
47
+
47
48
  if openai_key is not None:
48
49
  # Initialize the ChatGPT client
49
50
  self.chatgpt_client = self._create_client_chatgpt(openai_key)
50
51
 
51
- def _initialize_gemini(self):
52
- """Ask the Gemini model a question.
53
-
54
- Returns:
55
- str: The response from the model.
56
- """
57
- # Initialize the model if it is not already initialized
58
- model_gen = GenerativeModel(model_name=self.model_id)
59
- self.model_config = GenerationConfig(**self.model_params)
60
-
61
- return model_gen
62
-
63
52
  def _create_client_chatgpt(self, openai_key):
64
53
  client = OpenAI(api_key=openai_key)
65
54
  return client
@@ -67,7 +56,7 @@ class LlmClient:
67
56
  async def ask_gemini(
68
57
  self,
69
58
  prompt: str,
70
- document: str = None,
59
+ document: types.Part = None,
71
60
  response_schema: dict = None,
72
61
  response_mime_type: str = "application/json",
73
62
  doc_type: str = None,
@@ -76,7 +65,7 @@ class LlmClient:
76
65
 
77
66
  Args:
78
67
  prompt (str): The prompt to send to the model.
79
- document (str, optional): An optional document to provide context.
68
+ document (types.Part, optional): An optional document to provide context.
80
69
  response_schema (dict, optional): Defines a specific response schema for the model.
81
70
  doc_type (str, optional): Document type for cost tracking labels.
82
71
 
@@ -84,33 +73,35 @@ class LlmClient:
84
73
  str: The response from the model.
85
74
  """
86
75
  try:
76
+ # Build config with all parameters
77
+ config_params = {
78
+ **self.model_params,
79
+ "safety_settings": self.safety_settings,
80
+ "labels": get_gcp_labels(doc_type=doc_type),
81
+ }
82
+
83
+ # Add thinking config for Gemini 3 models
84
+ if self.thinking_level:
85
+ config_params["thinking_config"] = types.ThinkingConfig(
86
+ thinking_level=self.thinking_level
87
+ )
87
88
 
88
- # Start with the default model configuration
89
- config = self.model_config
90
-
91
- # Add response_schema if provided. This is only supported for Gemini 1.5 Flash & Pro models
92
89
  if response_schema is not None:
93
- config = GenerationConfig(
94
- response_schema=response_schema,
95
- response_mime_type=response_mime_type,
96
- **self.model_params,
97
- )
90
+ config_params["response_schema"] = response_schema
91
+ config_params["response_mime_type"] = response_mime_type
98
92
 
99
- # Prepare inputs for the model
100
- inputs = [document, prompt] if document else prompt
93
+ config = types.GenerateContentConfig(**config_params)
94
+ contents = [document, prompt] if document else prompt
101
95
 
102
- # Generate the response with labels for cost tracking
96
+ # Use async client
103
97
  model_response = await cache_on_disk(
104
- self.geminy_client.generate_content_async,
105
- contents=inputs,
106
- generation_config=config,
107
- safety_settings=self.safety_config,
108
- labels=get_gcp_labels(doc_type=doc_type),
98
+ self.genai_client.aio.models.generate_content,
99
+ model=self.model_id,
100
+ contents=contents,
101
+ config=config,
109
102
  )
110
103
 
111
- response_text = model_response.text
112
-
113
- return response_text
104
+ return model_response.text
114
105
 
115
106
  except Exception as e:
116
107
  logger.error(f"Failed to generate response: {str(e)}")
@@ -146,25 +137,20 @@ class LlmClient:
146
137
  logger.error(e)
147
138
  return {}
148
139
 
149
- def prepare_document_for_gemini(self, file_content):
150
- """Prepare a document from file content by encoding it to base64.
140
+ def prepare_document_for_gemini(self, file_content: bytes) -> types.Part:
141
+ """Prepare a document from file content for the Gemini model.
151
142
 
152
143
  Args:
153
144
  file_content (bytes): The binary content of the file to be processed.
154
145
 
155
146
  Returns:
156
- Part: A document object ready for processing by the language model.
147
+ types.Part: A document object ready for processing by the language model.
157
148
  """
158
- # Convert binary file to base64
159
- pdf_base64 = base64.b64encode(file_content).decode("utf-8")
160
-
161
- # Create the document for the model
162
- document = Part.from_data(
163
- mime_type="application/pdf", data=base64.b64decode(pdf_base64)
149
+ return types.Part.from_bytes(
150
+ data=file_content,
151
+ mime_type="application/pdf",
164
152
  )
165
153
 
166
- return document
167
-
168
154
  async def ask_chatgpt(self, prompt: str, document=None, response_schema=None):
169
155
  """Ask the chatgpt model a question.
170
156
 
@@ -119,16 +119,22 @@ def post_process_bundeskasse(aggregated_data):
119
119
  if is_forto_recipient(line_item):
120
120
  is_recipient_forto = True
121
121
 
122
- update_recipient_and_vendor(aggregated_data, is_recipient_forto)
122
+ update_aggregated_data_fields(aggregated_data, is_recipient_forto)
123
123
 
124
124
 
125
- def update_recipient_and_vendor(aggregated_data, is_recipient_forto):
126
- """Update the recipient and vendor information in the aggregated data."""
125
+ def update_aggregated_data_fields(aggregated_data, is_recipient_forto):
126
+ """Update the recipient, vendor and deferredDutyPayer information in the aggregated data."""
127
127
  # Check if the "recipientName" and "recipientAddress" keys exist
128
128
  keys_to_init = ["recipientName", "recipientAddress", "vendorName", "vendorAddress"]
129
129
  for key in keys_to_init:
130
130
  aggregated_data.setdefault(key, {"formattedValue": "", "documentValue": ""})
131
131
 
132
+ # Update the vendor details always to Bundeskasse Trier
133
+ aggregated_data["vendorName"]["formattedValue"] = "Bundeskasse Trier"
134
+ aggregated_data["vendorAddress"][
135
+ "formattedValue"
136
+ ] = "Dasbachstraße 15, 54292 Trier, Germany"
137
+
132
138
  if is_recipient_forto:
133
139
  # Update the aggregated data with the recipient information
134
140
  aggregated_data["recipientName"][
@@ -138,11 +144,17 @@ def update_recipient_and_vendor(aggregated_data, is_recipient_forto):
138
144
  "formattedValue"
139
145
  ] = "Schönhauser Allee 9, 10119 Berlin, Germany"
140
146
 
141
- # Update the vendor details always to Bundeskasse Trier
142
- aggregated_data["vendorName"]["formattedValue"] = "Bundeskasse Trier"
143
- aggregated_data["vendorAddress"][
144
- "formattedValue"
145
- ] = "Dasbachstraße 15, 54292 Trier, Germany"
147
+ # Update the deferredDutyPayer to Forto Logistics SE & Co KG 'DE789147263644738 - Fort'
148
+ lineitems = aggregated_data.get("lineItem", [])
149
+
150
+ for lineitem in lineitems:
151
+ if "deferredDutyPayer" in lineitem:
152
+ lineitem["deferredDutyPayer"] = {
153
+ "documentValue": lineitem.get("deferredDutyPayer", {}).get(
154
+ "documentValue"
155
+ ),
156
+ "formattedValue": "DE789147263644738",
157
+ }
146
158
 
147
159
 
148
160
  def select_unique_bank_account(bank_account):
src/setup.py CHANGED
@@ -5,8 +5,8 @@ import random
5
5
  import time
6
6
 
7
7
  import toml
8
- import vertexai
9
8
  import yaml
9
+ from google import genai
10
10
  from google.api_core.client_options import ClientOptions
11
11
  from google.cloud import documentai
12
12
  from google.cloud import documentai_v1beta3 as docai_beta
@@ -134,10 +134,14 @@ def setup_params(args=None):
134
134
 
135
135
  # Set up LLM clients
136
136
  params["LlmClient"] = LlmClient(
137
- openai_key=os.getenv("OPENAI_KEY"), parameters=params["gemini_params"]
137
+ openai_key=os.getenv("OPENAI_KEY"),
138
+ parameters=params["gemini_params"],
139
+ genai_client=params["genai_client"],
138
140
  )
139
141
  params["LlmClient_Flash"] = LlmClient(
140
- openai_key=os.getenv("OPENAI_KEY"), parameters=params["gemini_flash_params"]
142
+ openai_key=os.getenv("OPENAI_KEY"),
143
+ parameters=params["gemini_flash_params"],
144
+ genai_client=params["genai_client"],
141
145
  )
142
146
 
143
147
  # Load lookup data from GCS bucket
@@ -167,8 +171,9 @@ def setup_docai_client_and_path(params):
167
171
 
168
172
 
169
173
  def setup_vertexai(params):
170
- """Initialize the Vertex AI with the specified project and location."""
171
- vertexai.init(
174
+ """Initialize the Google GenAI client with Vertex AI configuration."""
175
+ params["genai_client"] = genai.Client(
176
+ vertexai=True,
172
177
  project=params["g_ai_project_name"],
173
178
  location=params["g_region"],
174
179
  )