data-science-document-ai 1.48.0__py3-none-any.whl → 1.49.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-science-document-ai
3
- Version: 1.48.0
3
+ Version: 1.49.0
4
4
  Summary: "Document AI repo for data science"
5
5
  Author: Naomi Nguyen
6
6
  Author-email: naomi.nguyen@forto.com
@@ -10,7 +10,7 @@ src/pdf_processing.py,sha256=oKVPnIu_keiN17XLOGImeyJ4iMT2H51x4OD1Tp9yw1s,19992
10
10
  src/postprocessing/common.py,sha256=dagAg0hZGuZc03bXdfOolxekewMEVUfz917IGCiAtWI,26118
11
11
  src/postprocessing/postprocess_booking_confirmation.py,sha256=nK32eDiBNbauyQz0oCa9eraysku8aqzrcoRFoWVumDU,4827
12
12
  src/postprocessing/postprocess_commercial_invoice.py,sha256=3I8ijluTZcOs_sMnFZxfkAPle0UFQ239EMuvZfDZVPg,1028
13
- src/postprocessing/postprocess_partner_invoice.py,sha256=p-oC0kUJDI9wsOfIDSrvqeEji3TYiCvP0WJ4qfGVvZ4,12538
13
+ src/postprocessing/postprocess_partner_invoice.py,sha256=Fv4Y6Lc8e6aFFcwX0kLOal2y4TrR-XfAzjtuQnBwo0o,12815
14
14
  src/prompts/library/arrivalNotice/other/placeholders.json,sha256=1vzly1amgyKt3jr2JJQbb24kNZsnI289iduvoUo5dJU,3061
15
15
  src/prompts/library/arrivalNotice/other/prompt.txt,sha256=QNuU-BvMA8VbdupVNapad4O3WmCotH5cKNxImRMbKDk,2906
16
16
  src/prompts/library/bookingConfirmation/evergreen/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
@@ -52,9 +52,9 @@ src/prompts/library/preprocessing/carrier/prompt.txt,sha256=NLvRZQCZ6aWC1yTr7Q93
52
52
  src/prompts/library/shippingInstruction/other/placeholders.json,sha256=eK4AeMfORkGMWVYcqH7NjB56Zb4swHTvcQD5UQbTryg,6374
53
53
  src/prompts/library/shippingInstruction/other/prompt.txt,sha256=CbrqlKMtB-sVY-8E460KP1KNmz169YVPMrH3-uEldPg,2135
54
54
  src/prompts/prompt_library.py,sha256=VJWHeXN-s501C2GiidIIvQQuZdU6T1R27hE2dKBiI40,2555
55
- src/setup.py,sha256=M-p5c8M9ejKcSZ9N86VtmtPc4TYLxe1_4_dxf6jpfVc,7262
55
+ src/setup.py,sha256=EHfAl3Pvb082dl_s6Tk93IjtE3vBmrW_fp2GW4955HQ,6952
56
56
  src/tms.py,sha256=UXbIo1QE--hIX6NZi5Qyp2R_CP338syrY9pCTPrfgnE,1741
57
57
  src/utils.py,sha256=Ow5_Jals88o8mbZ1BoHfZpHZoCfig_UQb5aalH-mpWE,17278
58
- data_science_document_ai-1.48.0.dist-info/METADATA,sha256=TEE-nCF6lxV4DdmQIhbij2ZXGrVwM0wBSLJxJPdUJLE,2152
59
- data_science_document_ai-1.48.0.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
60
- data_science_document_ai-1.48.0.dist-info/RECORD,,
58
+ data_science_document_ai-1.49.0.dist-info/METADATA,sha256=eeNMP49JWiYaKGBimC61dUak20ztpCSmAN3M4hqLA_g,2152
59
+ data_science_document_ai-1.49.0.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
60
+ data_science_document_ai-1.49.0.dist-info/RECORD,,
@@ -103,9 +103,18 @@ def post_process_bundeskasse(aggregated_data):
103
103
  )
104
104
 
105
105
  # Check if the deferredDutyPayer is forto
106
- deferredDutyPayer = line_item.get("deferredDutyPayer", {})
107
- lower = deferredDutyPayer.get("documentValue", "").lower()
108
- if any(key in lower for key in ["de789147263644738", "forto"]):
106
+ KEYWORDS = {"de789147263644738", "forto", "009812"}
107
+
108
+ def is_forto_recipient(line_item: dict) -> bool:
109
+ values_to_check = [
110
+ line_item.get("deferredDutyPayer", {}).get("documentValue", ""),
111
+ line_item.get("vatId", {}).get("documentValue", ""),
112
+ ]
113
+
114
+ combined = " ".join(values_to_check).lower()
115
+ return any(keyword in combined for keyword in KEYWORDS)
116
+
117
+ if is_forto_recipient(line_item):
109
118
  is_recipient_forto = True
110
119
 
111
120
  update_recipient_and_vendor(aggregated_data, is_recipient_forto)
src/setup.py CHANGED
@@ -1,11 +1,8 @@
1
1
  """Contains project setup parameters and initialization functions."""
2
2
  import json
3
-
4
- # import streamlit as st
5
3
  import os
6
4
  import random
7
5
  import time
8
- from pathlib import Path
9
6
 
10
7
  import toml
11
8
  import vertexai
@@ -18,7 +15,7 @@ from src.constants import project_parameters
18
15
  from src.constants_sandbox import project_parameters_sandbox
19
16
 
20
17
  # Parent repos are imported without .
21
- from src.io import download_dir_from_bucket, get_bq_client, get_storage_client, logger
18
+ from src.io import get_bq_client, get_storage_client, logger
22
19
  from src.llm import LlmClient
23
20
 
24
21
 
@@ -133,15 +130,7 @@ def setup_params(args=None):
133
130
  assert params.keys() & yaml_content.keys() == set()
134
131
  params.update(yaml_content)
135
132
 
136
- # Get models meta data from cloud
137
- client = get_storage_client(params)
138
- bucket = client.bucket(params["doc_ai_bucket_name"])
139
- downloaded_meta = download_dir_from_bucket(
140
- bucket, params["g_model_data_folder"], Path(params["local_model_data_folder"])
141
- )
142
- if not downloaded_meta:
143
- logger.info(f"Could not load models metadata from cloud.")
144
-
133
+ # Set up LLM clients
145
134
  params["LlmClient"] = LlmClient(
146
135
  openai_key=os.getenv("OPENAI_KEY"), parameters=params["gemini_params"]
147
136
  )
@@ -149,7 +138,8 @@ def setup_params(args=None):
149
138
  openai_key=os.getenv("OPENAI_KEY"), parameters=params["gemini_flash_params"]
150
139
  )
151
140
 
152
- params["lookup_data"] = setup_lookup_data(params, bucket)
141
+ # Load lookup data from GCS bucket
142
+ setup_lookup_data(params)
153
143
 
154
144
  return params
155
145
 
@@ -182,10 +172,13 @@ def setup_vertexai(params):
182
172
  )
183
173
 
184
174
 
185
- def setup_lookup_data(params, bucket):
175
+ def setup_lookup_data(params):
186
176
  """
187
177
  Loads JSON mapping data from given GCP Bucket.
188
178
  """
179
+ client = get_storage_client(params)
180
+ bucket = client.bucket(params["doc_ai_bucket_name"])
181
+
189
182
  data = dict()
190
183
 
191
184
  input_path_item_code = (
@@ -208,4 +201,4 @@ def setup_lookup_data(params, bucket):
208
201
  input_path_reverse_charge
209
202
  )
210
203
 
211
- return data
204
+ params["lookup_data"] = data