data-science-document-ai 1.48.0__py3-none-any.whl → 1.50.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-science-document-ai
3
- Version: 1.48.0
3
+ Version: 1.50.0
4
4
  Summary: "Document AI repo for data science"
5
5
  Author: Naomi Nguyen
6
6
  Author-email: naomi.nguyen@forto.com
@@ -10,7 +10,7 @@ src/pdf_processing.py,sha256=oKVPnIu_keiN17XLOGImeyJ4iMT2H51x4OD1Tp9yw1s,19992
10
10
  src/postprocessing/common.py,sha256=dagAg0hZGuZc03bXdfOolxekewMEVUfz917IGCiAtWI,26118
11
11
  src/postprocessing/postprocess_booking_confirmation.py,sha256=nK32eDiBNbauyQz0oCa9eraysku8aqzrcoRFoWVumDU,4827
12
12
  src/postprocessing/postprocess_commercial_invoice.py,sha256=3I8ijluTZcOs_sMnFZxfkAPle0UFQ239EMuvZfDZVPg,1028
13
- src/postprocessing/postprocess_partner_invoice.py,sha256=p-oC0kUJDI9wsOfIDSrvqeEji3TYiCvP0WJ4qfGVvZ4,12538
13
+ src/postprocessing/postprocess_partner_invoice.py,sha256=Fv4Y6Lc8e6aFFcwX0kLOal2y4TrR-XfAzjtuQnBwo0o,12815
14
14
  src/prompts/library/arrivalNotice/other/placeholders.json,sha256=1vzly1amgyKt3jr2JJQbb24kNZsnI289iduvoUo5dJU,3061
15
15
  src/prompts/library/arrivalNotice/other/prompt.txt,sha256=QNuU-BvMA8VbdupVNapad4O3WmCotH5cKNxImRMbKDk,2906
16
16
  src/prompts/library/bookingConfirmation/evergreen/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
@@ -27,8 +27,8 @@ src/prompts/library/bookingConfirmation/other/placeholders.json,sha256=IpM9nmSPd
27
27
  src/prompts/library/bookingConfirmation/other/prompt.txt,sha256=kUK7NgVNDYFMnqOcIblCwWSw2SC0YQEtHsYrspiVUMo,3379
28
28
  src/prompts/library/bookingConfirmation/yangming/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
29
29
  src/prompts/library/bookingConfirmation/yangming/prompt.txt,sha256=fYKfusDajDFw0v54-nv2iAqUSp2yCeOzc6G7AFe-h2w,3226
30
- src/prompts/library/bundeskasse/other/placeholders.json,sha256=1ll8AI58F2zRDSwQq_r0gxQdxlQB521l5CuiJ-8G6us,4068
31
- src/prompts/library/bundeskasse/other/prompt.txt,sha256=MBv4MIMASMstkzDS7H0q_pNJbPQeadP1vcmhCRrpjQ4,2906
30
+ src/prompts/library/bundeskasse/other/placeholders.json,sha256=eSIVplLYCseVN17tygyV5TyheJtOTeybfJzFfKyPuUE,4275
31
+ src/prompts/library/bundeskasse/other/prompt.txt,sha256=55b-umkE1648V-UoT-3UJjDEzs53TGQNZaOwVCjQPyc,3089
32
32
  src/prompts/library/commercialInvoice/other/placeholders.json,sha256=zUK2mg9MnHiEQRYF6VgTiUiq68WGy5f7_4qL63CWyR0,4700
33
33
  src/prompts/library/commercialInvoice/other/prompt.txt,sha256=CJapcVrmcvynJUanETDklkzU-0N9hHdhq5wL4MK7OIY,2683
34
34
  src/prompts/library/customsAssessment/other/placeholders.json,sha256=scIV--C9HNWAQbU9zEz3GT_FoAvJqbfuY85YUtt7t-Q,3850
@@ -52,9 +52,9 @@ src/prompts/library/preprocessing/carrier/prompt.txt,sha256=NLvRZQCZ6aWC1yTr7Q93
52
52
  src/prompts/library/shippingInstruction/other/placeholders.json,sha256=eK4AeMfORkGMWVYcqH7NjB56Zb4swHTvcQD5UQbTryg,6374
53
53
  src/prompts/library/shippingInstruction/other/prompt.txt,sha256=CbrqlKMtB-sVY-8E460KP1KNmz169YVPMrH3-uEldPg,2135
54
54
  src/prompts/prompt_library.py,sha256=VJWHeXN-s501C2GiidIIvQQuZdU6T1R27hE2dKBiI40,2555
55
- src/setup.py,sha256=M-p5c8M9ejKcSZ9N86VtmtPc4TYLxe1_4_dxf6jpfVc,7262
55
+ src/setup.py,sha256=EHfAl3Pvb082dl_s6Tk93IjtE3vBmrW_fp2GW4955HQ,6952
56
56
  src/tms.py,sha256=UXbIo1QE--hIX6NZi5Qyp2R_CP338syrY9pCTPrfgnE,1741
57
57
  src/utils.py,sha256=Ow5_Jals88o8mbZ1BoHfZpHZoCfig_UQb5aalH-mpWE,17278
58
- data_science_document_ai-1.48.0.dist-info/METADATA,sha256=TEE-nCF6lxV4DdmQIhbij2ZXGrVwM0wBSLJxJPdUJLE,2152
59
- data_science_document_ai-1.48.0.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
60
- data_science_document_ai-1.48.0.dist-info/RECORD,,
58
+ data_science_document_ai-1.50.0.dist-info/METADATA,sha256=_e_1gpRkzKH5RvZ7yKUfbMNtvWH5Hc-0xS1Gyeq4FUQ,2152
59
+ data_science_document_ai-1.50.0.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
60
+ data_science_document_ai-1.50.0.dist-info/RECORD,,
@@ -103,9 +103,18 @@ def post_process_bundeskasse(aggregated_data):
103
103
  )
104
104
 
105
105
  # Check if the deferredDutyPayer is forto
106
- deferredDutyPayer = line_item.get("deferredDutyPayer", {})
107
- lower = deferredDutyPayer.get("documentValue", "").lower()
108
- if any(key in lower for key in ["de789147263644738", "forto"]):
106
+ KEYWORDS = {"de789147263644738", "forto", "009812"}
107
+
108
+ def is_forto_recipient(line_item: dict) -> bool:
109
+ values_to_check = [
110
+ line_item.get("deferredDutyPayer", {}).get("documentValue", ""),
111
+ line_item.get("vatId", {}).get("documentValue", ""),
112
+ ]
113
+
114
+ combined = " ".join(values_to_check).lower()
115
+ return any(keyword in combined for keyword in KEYWORDS)
116
+
117
+ if is_forto_recipient(line_item):
109
118
  is_recipient_forto = True
110
119
 
111
120
  update_recipient_and_vendor(aggregated_data, is_recipient_forto)
@@ -4,12 +4,12 @@
4
4
  "currencyCode": {
5
5
  "type": "STRING",
6
6
  "nullable": true,
7
- "description": "The currency in which the invoice is issued."
7
+ "description": "The currency in which the invoice is issued. Extract the currency associated with the grand total amount. It is usually stated as EUR, Euro or €."
8
8
  },
9
9
  "grandTotal": {
10
10
  "type": "STRING",
11
11
  "nullable": true,
12
- "description": "The overall total amount of the invoice."
12
+ "description": "The overall total amount of the invoice. It can be found under the keywords Gesamtabgabenbetrag, Gesamtbetrag, or Zu erstattender Abgabenbetrag."
13
13
  },
14
14
  "issueDate": {
15
15
  "type": "STRING",
@@ -54,7 +54,7 @@
54
54
  "deferredDutyPayer": {
55
55
  "type": "STRING",
56
56
  "nullable": true,
57
- "description": "It can be identified under \"Aufschubenhmer\" for each line item"
57
+ "description": "It can be identified under 'Aufschubnehmer' for each line item"
58
58
  },
59
59
  "name": {
60
60
  "type": "STRING",
@@ -31,7 +31,9 @@ Your role is to accurately extract specific entities from these Customs invoices
31
31
  - Credit Note Invoice Number is a unique identifier for the credit note, it starts with "ATS" only (e.g., ATS.....).
32
32
  - NIZZA is not a credit note invoice number.
33
33
 
34
- - grandTotal in the credit notes can be found under "Zu erstattender Abgabenbetrag".
34
+ - grandTotal:
35
+ - It can be found under the keywords Gesamtabgabenbetrag or Gesamtbetrag. In credit notes, it can be found under "Zu erstattender Abgabenbetrag".
36
+ - The grandTotal value is almost always stated in EUR, because the invoice is issued by German Customs.
35
37
 
36
38
  - serviceDate can also be referred to as "Zollanmeldung" or "Eingangdatum" in the invoice.
37
39
  - issueDate can also be referred to as "Einfuhrabgabenbescheid" in the invoice. issueDate and serviceDate can be the same in some cases.
src/setup.py CHANGED
@@ -1,11 +1,8 @@
1
1
  """Contains project setup parameters and initialization functions."""
2
2
  import json
3
-
4
- # import streamlit as st
5
3
  import os
6
4
  import random
7
5
  import time
8
- from pathlib import Path
9
6
 
10
7
  import toml
11
8
  import vertexai
@@ -18,7 +15,7 @@ from src.constants import project_parameters
18
15
  from src.constants_sandbox import project_parameters_sandbox
19
16
 
20
17
  # Parent repos are imported without .
21
- from src.io import download_dir_from_bucket, get_bq_client, get_storage_client, logger
18
+ from src.io import get_bq_client, get_storage_client, logger
22
19
  from src.llm import LlmClient
23
20
 
24
21
 
@@ -133,15 +130,7 @@ def setup_params(args=None):
133
130
  assert params.keys() & yaml_content.keys() == set()
134
131
  params.update(yaml_content)
135
132
 
136
- # Get models meta data from cloud
137
- client = get_storage_client(params)
138
- bucket = client.bucket(params["doc_ai_bucket_name"])
139
- downloaded_meta = download_dir_from_bucket(
140
- bucket, params["g_model_data_folder"], Path(params["local_model_data_folder"])
141
- )
142
- if not downloaded_meta:
143
- logger.info(f"Could not load models metadata from cloud.")
144
-
133
+ # Set up LLM clients
145
134
  params["LlmClient"] = LlmClient(
146
135
  openai_key=os.getenv("OPENAI_KEY"), parameters=params["gemini_params"]
147
136
  )
@@ -149,7 +138,8 @@ def setup_params(args=None):
149
138
  openai_key=os.getenv("OPENAI_KEY"), parameters=params["gemini_flash_params"]
150
139
  )
151
140
 
152
- params["lookup_data"] = setup_lookup_data(params, bucket)
141
+ # Load lookup data from GCS bucket
142
+ setup_lookup_data(params)
153
143
 
154
144
  return params
155
145
 
@@ -182,10 +172,13 @@ def setup_vertexai(params):
182
172
  )
183
173
 
184
174
 
185
- def setup_lookup_data(params, bucket):
175
+ def setup_lookup_data(params):
186
176
  """
187
177
  Loads JSON mapping data from the GCP bucket named in params["doc_ai_bucket_name"] and stores it in params["lookup_data"].
188
178
  """
179
+ client = get_storage_client(params)
180
+ bucket = client.bucket(params["doc_ai_bucket_name"])
181
+
189
182
  data = dict()
190
183
 
191
184
  input_path_item_code = (
@@ -208,4 +201,4 @@ def setup_lookup_data(params, bucket):
208
201
  input_path_reverse_charge
209
202
  )
210
203
 
211
- return data
204
+ params["lookup_data"] = data