data-science-document-ai 1.38.0__tar.gz → 1.39.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/PKG-INFO +1 -1
  2. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/pyproject.toml +1 -1
  3. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/pdf_processing.py +1 -2
  4. data_science_document_ai-1.39.0/src/postprocessing/common.py +595 -0
  5. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/postprocessing/postprocess_partner_invoice.py +8 -8
  6. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/setup.py +0 -45
  7. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/utils.py +48 -0
  8. data_science_document_ai-1.38.0/src/postprocessing/common.py +0 -1136
  9. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/constants.py +0 -0
  10. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/constants_sandbox.py +0 -0
  11. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/docai.py +0 -0
  12. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/docai_processor_config.yaml +0 -0
  13. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/excel_processing.py +0 -0
  14. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/io.py +0 -0
  15. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/llm.py +0 -0
  16. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/log_setup.py +0 -0
  17. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/postprocessing/postprocess_booking_confirmation.py +0 -0
  18. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/postprocessing/postprocess_commercial_invoice.py +0 -0
  19. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +0 -0
  20. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/prompts/library/bookingConfirmation/evergreen/prompt.txt +0 -0
  21. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +0 -0
  22. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +0 -0
  23. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/prompts/library/bookingConfirmation/maersk/placeholders.json +0 -0
  24. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/prompts/library/bookingConfirmation/maersk/prompt.txt +0 -0
  25. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/prompts/library/bookingConfirmation/msc/placeholders.json +0 -0
  26. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/prompts/library/bookingConfirmation/msc/prompt.txt +0 -0
  27. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/prompts/library/bookingConfirmation/oocl/placeholders.json +0 -0
  28. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/prompts/library/bookingConfirmation/oocl/prompt.txt +0 -0
  29. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/prompts/library/bookingConfirmation/other/placeholders.json +0 -0
  30. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/prompts/library/bookingConfirmation/other/prompt.txt +0 -0
  31. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/prompts/library/bookingConfirmation/yangming/placeholders.json +0 -0
  32. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/prompts/library/bookingConfirmation/yangming/prompt.txt +0 -0
  33. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/prompts/library/bundeskasse/other/placeholders.json +0 -0
  34. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/prompts/library/bundeskasse/other/prompt.txt +0 -0
  35. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/prompts/library/commercialInvoice/other/prompt.txt +0 -0
  36. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/prompts/library/customsAssessment/other/placeholders.json +0 -0
  37. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/prompts/library/customsAssessment/other/prompt.txt +0 -0
  38. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/prompts/library/customsInvoice/other/placeholders.json +0 -0
  39. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/prompts/library/customsInvoice/other/prompt.txt +0 -0
  40. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/prompts/library/deliveryOrder/other/placeholders.json +0 -0
  41. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/prompts/library/deliveryOrder/other/prompt.txt +0 -0
  42. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -0
  43. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/prompts/library/draftMbl/maersk/prompt.txt +0 -0
  44. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/prompts/library/draftMbl/other/placeholders.json +0 -0
  45. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/prompts/library/draftMbl/other/prompt.txt +0 -0
  46. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -0
  47. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/prompts/library/finalMbL/maersk/prompt.txt +0 -0
  48. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/prompts/library/finalMbL/other/placeholders.json +0 -0
  49. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/prompts/library/finalMbL/other/prompt.txt +0 -0
  50. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/prompts/library/packingList/other/prompt.txt +0 -0
  51. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/prompts/library/partnerInvoice/other/placeholders.json +0 -0
  52. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/prompts/library/partnerInvoice/other/prompt.txt +0 -0
  53. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/prompts/library/postprocessing/port_code/placeholders.json +0 -0
  54. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/prompts/library/postprocessing/port_code/prompt_port_code.txt +0 -0
  55. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/prompts/library/preprocessing/carrier/placeholders.json +0 -0
  56. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/prompts/library/preprocessing/carrier/prompt.txt +0 -0
  57. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/prompts/library/shippingInstruction/other/prompt.txt +0 -0
  58. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/prompts/prompt_library.py +0 -0
  59. {data_science_document_ai-1.38.0 → data_science_document_ai-1.39.0}/src/tms.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-science-document-ai
3
- Version: 1.38.0
3
+ Version: 1.39.0
4
4
  Summary: "Document AI repo for data science"
5
5
  Author: Naomi Nguyen
6
6
  Author-email: naomi.nguyen@forto.com
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "data-science-document-ai"
3
- version = "1.38.0"
3
+ version = "1.39.0"
4
4
  description = "\"Document AI repo for data science\""
5
5
  authors = ["Naomi Nguyen <naomi.nguyen@forto.com>", "Kumar Rajendrababu <kumar.rajendrababu@forto.com>", "Igor Tonko <igor.tonko@forto.com>", "Osman Demirel <osman.demirel@forto.com>"]
6
6
  packages = [
@@ -355,7 +355,6 @@ async def data_extraction_manual_flow(
355
355
  meta,
356
356
  processor_client,
357
357
  schema_client,
358
- embed_manager,
359
358
  ):
360
359
  """
361
360
  Process a PDF file and extract data from it.
@@ -418,7 +417,7 @@ async def data_extraction_manual_flow(
418
417
  )
419
418
  # Create the result dictionary with the extracted data
420
419
  extracted_data = await format_all_entities(
421
- extracted_data, embed_manager, meta.documentTypeCode, params
420
+ extracted_data, meta.documentTypeCode, params
422
421
  )
423
422
  result = {
424
423
  "id": meta.id,
@@ -0,0 +1,595 @@
1
+ import asyncio
2
+ import json
3
+ import os
4
+ import re
5
+ from datetime import timezone
6
+
7
+ import pandas as pd
8
+ from nltk.corpus import stopwords
9
+ from rapidfuzz import process
10
+
11
+ from src.constants import formatting_rules
12
+ from src.io import logger
13
+ from src.postprocessing.postprocess_partner_invoice import process_partner_invoice
14
+ from src.prompts.prompt_library import prompt_library
15
+ from src.utils import get_tms_mappings
16
+
17
# Read once at import time; raises KeyError immediately if TMS_DOMAIN is unset
# (fail-fast on misconfiguration — NOTE(review): assumed intentional, confirm).
tms_domain = os.environ["TMS_DOMAIN"]
18
+
19
+
20
def convert_container_number(container_number):
    """Normalise a container number to the ISO 6346 ``AAAA1234567`` shape.

    Args:
        container_number (str): Raw container number text.

    Returns:
        str | list[str] | None: One formatted container number, a list of
        them when the text holds several ISO-style numbers (LLM output is a
        list of strings), or None when nothing valid can be derived.
    """
    if not container_number:
        return None

    # e.g. 'FFAU2932130--FX34650895-40HC' -> ['FFAU2932130']
    iso_matches = re.findall(r"[A-Z]{4}\d{7}", container_number)

    # LLMs may extract all container numbers as a list of strings.
    if len(iso_matches) > 1:
        return iso_matches

    candidate = iso_matches[0] if iso_matches else None
    if candidate is None:
        # Keep alphanumerics and '/' so shapes like ABCD1234567/40DC survive.
        candidate = "".join(
            ch for ch in container_number if ch.isalnum() or ch == "/"
        )

    # Catch formats like ABCD1234567/40DC or ABCD1234567/SEAL1234567.
    candidate = candidate.split("/")[0]
    if len(candidate) != 11:
        return None
    # ISO standard: four letters followed by seven digits.
    if not (candidate[:4].isalpha() and candidate[4:].isdigit()):
        return None
    return candidate
55
+
56
+
57
def clean_invoice_number(invoice_number):
    """Strip an invoice number down to word characters only.

    Args:
        invoice_number (str): The invoice number to be cleaned.

    Returns:
        str | None: The invoice number without punctuation or whitespace
        (``\\w`` keeps letters, digits and underscores), or None for falsy
        input.
    """
    if not invoice_number:
        return None
    return re.sub(r"[^\w]", "", invoice_number)
73
+
74
+
75
def clean_shipment_id(shipment_id):
    """
    Convert a shipment ID to Forto's standard form.

    A valid ID is the letter 'S' followed by 5 to 7 digits, so
    '#S123456@-1' yields 'S123456'.

    Args:
        shipment_id (str): The shipment ID to be converted.

    Returns:
        str | None: The extracted shipment ID, or None when none is found.
    """
    if not shipment_id:
        return None
    # The pattern already constrains the result to 6-8 characters
    # ('S' + 5-7 digits); the original separate length check was dead code
    # and its comment ("should be either 7 or 8") was wrong.
    match = re.findall(r"S\d{5,7}", shipment_id)
    return match[0] if match else None
100
+
101
+
102
# Clean the date for date obj parse in tms formatting
def clean_date_string(date_str):
    """Remove hour markers and timezone annotations from a date string."""
    stripped = date_str.strip()
    # Only the first matching marker is removed and the result is returned
    # immediately (possibly with a leftover inner space), as before.
    for marker in ("hrs", "(CET)"):
        if marker in stripped:
            return stripped.replace(marker, "")
    return stripped
111
+
112
+
113
def extract_date(date_str):
    """
    Parse a date string, preferring the European day-first convention.

    Strings beginning with four numeric characters are treated as
    year-first (YYYY-MM-DD); everything else is parsed day-first
    (DD-MM-YYYY).

    Returns:
        datetime.datetime: The parsed timestamp.
    """
    year_first = all(ch.isnumeric() for ch in date_str[:4])
    return pd.to_datetime(date_str, dayfirst=not year_first).to_pydatetime()
124
+
125
+
126
def extract_number(data_field_value):
    """
    Keep only digits plus ',' and '.' from a string.

    Args:
        data_field_value: Raw string possibly containing a number.

    Returns:
        str | None: The numeric substring with leading/trailing separators
        stripped, or None when nothing numeric remains.
    """
    kept = "".join(
        ch for ch in data_field_value if ch.isnumeric() or ch in ",."
    )
    # First and last characters must not be separators.
    kept = kept.strip(",.")
    return None if kept in ("''", "") else kept
146
+
147
+
148
def extract_string(data_field_value):
    """Strip digits and listed punctuation from a string.

    Args:
        data_field_value: Value to clean; non-strings yield None.

    Returns:
        str | None: The remaining characters, or None when nothing is left.
    """
    if not isinstance(data_field_value, str):
        return None

    # Digits and these separators are dropped; everything else is kept.
    dropped = {".", ",", ")", "(", " ", "[", "]"}
    remainder = "".join(
        ch for ch in data_field_value if not ch.isdigit() and ch not in dropped
    )
    return None if remainder in ("''", "") else remainder
167
+
168
+
169
def remove_none_values(d):
    """
    Recursively drop None values from nested dicts and lists.

    Containers that become empty after cleaning collapse to None, so the
    parent container drops them in turn.

    Args:
        d: Arbitrarily nested structure of dicts, lists and scalars.

    Returns:
        The cleaned structure, or None when nothing remains.
    """
    if isinstance(d, dict):
        cleaned_dict = {}
        for key, value in d.items():
            cleaned_value = remove_none_values(value)
            if cleaned_value is not None:  # Only keep non-None values
                cleaned_dict[key] = cleaned_value
        return cleaned_dict if cleaned_dict else None

    if isinstance(d, list):
        cleaned_list = []
        for item in d:
            cleaned_item = remove_none_values(item)
            if cleaned_item is not None:  # Only keep non-None values
                cleaned_list.append(cleaned_item)
        return cleaned_list if cleaned_list else None

    # Scalars pass through unchanged; the original
    # `d if d is not None else None` was equivalent to just `d`.
    return d
191
+
192
+
193
def check_formatting_rule(entity_key, document_type_code, rule):
    """Check whether a formatting rule applies to an entity.

    Returns True only when `formatting_rules` has an entry for
    `document_type_code` that maps `entity_key` to exactly `rule`.
    """
    if document_type_code not in formatting_rules:
        return False
    doc_rules = formatting_rules[document_type_code]
    return entity_key in doc_rules and doc_rules[entity_key] == rule
201
+
202
+
203
def convert_invoice_type(data_field_value, params):
    """
    Map a raw invoice-type string to a standardized type via fuzzy matching.

    Args:
        data_field_value (str): The raw invoice type string from the data.
        params (dict): Must provide `lookup_data["invoice_classification"]`
            (keyword -> standardized type) and
            `fuzzy_threshold_invoice_classification` (minimum score).

    Returns:
        str or None: The standardized invoice type when a keyword scores at
        or above the threshold, otherwise None.
    """
    classification_lookup = params["lookup_data"]["invoice_classification"]
    candidates = list(classification_lookup.keys())

    match = process.extractOne(data_field_value.lower(), candidates)
    if not match:
        return None

    keyword, score, _ = match
    if score < params["fuzzy_threshold_invoice_classification"]:
        return None
    return classification_lookup[keyword]
228
+
229
+
230
# Function to create KVP dictionary using apply method
def create_kvp_dictionary(df_raw: pd.DataFrame):
    """Build a lookup from cleaned line-item text to Forto SLI codes.

    Args:
        df_raw (pd.DataFrame): Must contain 'lineitem' and 'Forto SLI' columns.

    Returns:
        dict: 'Processed Lineitem' (cleaned description) -> 'Forto SLI' value.
    """
    frame = df_raw.copy()
    # Normalise descriptions so later lookups match regardless of dates,
    # numbers, stop words, etc.
    frame["Processed Lineitem"] = frame["lineitem"].apply(clean_item_description)
    return frame.set_index("Processed Lineitem")["Forto SLI"].to_dict()
245
+
246
+
247
def remove_dates(lineitem: str):
    """
    Remove date expressions from a line-item string.

    Handles numeric dates (dd.mm.yy / dd.mm.yyyy), verbose dates such as
    "12 March 2024" (English and German month names) and stand-alone month
    abbreviations (JAN/MRZ/OKT/...).

    Args:
        lineitem (str): The input string from which dates will be removed.

    Returns:
        str: The string with dates removed.
    """
    # Remove dates in the format dd.mm.yy or dd.mm.yyyy
    lineitem = re.sub(r"\b\d{1,2}\.\d{1,2}\.\d{2,4}\b", "", lineitem)

    # Remove dates in the format "dd Month yyyy"
    lineitem = re.sub(
        r"\b\d{1,2} (?:january|february|march|april|may|june|july|august|september|october|november|december|januar|"
        r"februar|märz|mai|juni|juli|oktober|dezember) \d{4}\b",
        "",
        lineitem,
        flags=re.IGNORECASE,
    )

    # Month abbreviations in English and German, deduplicated — the original
    # list repeated the shared abbreviations twice, which a regex alternation
    # does not need; the matched set is unchanged.
    month_abbreviations = (
        "JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG",
        "SEP", "OCT", "NOV", "DEC", "MRZ", "MAI", "OKT", "DEZ",
    )

    # Remove stand-alone month abbreviations (word-bounded, case-insensitive).
    pattern = r"\b(?:{})\b".format("|".join(month_abbreviations))
    lineitem = re.sub(pattern, "", lineitem, flags=re.IGNORECASE)

    return lineitem
304
+
305
+
306
def remove_unwanted_patterns(lineitem: str):
    """
    Strip container numbers and the literal 'HIGH CUBE' from a line item.

    Args:
        lineitem (str): The input string to clean.

    Returns:
        str: The string without container numbers or 'HIGH CUBE'.
    """
    # Container numbers: 4 letters followed by 7 digits, word-bounded.
    without_containers = re.sub(r"\b[A-Z]{4}\d{7}\b", "", lineitem)
    return without_containers.replace("HIGH CUBE", "")
323
+
324
+
325
def clean_item_description(lineitem: str, remove_numbers: bool = True):
    """
    This function removes dates, month names, whitespaces, currency patterns and container numbers from the given line item string. # noqa

    Args:
        lineitem (str): The input string from which unwanted patterns will be removed.
        remove_numbers (bool): When False, digits are preserved; used for the
            reverse-charge sentence, which contains an article number.

    Returns:
        str: The cleaned, upper-cased string.
    """
    currency_codes_pattern = r"\b(USD|EUR|JPY|GBP|CAD|AUD|CHF|CNY|SEK|NZD|KRW|SGD|INR|BRL|ZAR|RUB|MXN|HKD|NOK|TRY|IDR|MYR|PHP|THB|VND|PLN|CZK|HUF|ILS|AED|SAR|QAR|KWD|EGP|NGN|ARS|CLP|COP|PEN|UYU|VEF|INR|PKR|BDT|LKR|NPR|MMK)\b"  # noqa

    # Remove stopwords (English + German; remove_stop_words also upper-cases)
    lineitem = remove_stop_words(lineitem)

    # remove dates
    lineitem = remove_dates(lineitem)

    # collapse runs of whitespace
    lineitem = re.sub(r"\s{2,}", " ", lineitem)

    # remove newlines (both literal backslash-n sequences and real newlines)
    lineitem = re.sub(r"\\n|\n", " ", lineitem)

    # Remove the currency codes
    lineitem = re.sub(currency_codes_pattern, "", lineitem, flags=re.IGNORECASE)

    # Remove numbers from the line item
    if (
        remove_numbers
    ):  # Do not remove numbers for the reverse charge sentence as it contains Article number
        lineitem = re.sub(r"\d+", "", lineitem)

    # remove other patterns (container numbers, 'HIGH CUBE')
    lineitem = remove_unwanted_patterns(lineitem)

    # remove special chars
    lineitem = re.sub(r"[^A-Za-z0-9\s]", " ", lineitem).strip()

    # final whitespace collapse
    return re.sub(r"\s{2,}", " ", lineitem).strip()
365
+
366
+
367
async def format_label(entity_k, entity_value, document_type_code, params):
    """Recursively format one extracted entity (or a nested dict/list of them).

    Leaf values are wrapped as
    ``{"documentValue": raw, "formattedValue": cleaned}`` where the cleaning
    function is chosen from the lower-cased entity key. Returns an
    ``(entity_key, formatted)`` tuple so recursive calls can be gathered back
    into dicts/lists.

    NOTE(review): the top-level call passes entity_k=None with a dict value;
    a scalar at the top level would fail at ``entity_k.lower()`` — confirm
    callers always pass a dict.
    """
    llm_client = params["LlmClient"]
    if isinstance(entity_value, dict):  # if it's a nested entity
        format_tasks = [
            format_label(sub_k, sub_v, document_type_code, params)
            for sub_k, sub_v in entity_value.items()
        ]
        return entity_k, {k: v for k, v in await asyncio.gather(*format_tasks)}
    if isinstance(entity_value, list):
        format_tasks = await asyncio.gather(
            *[
                format_label(entity_k, sub_v, document_type_code, params)
                for sub_v in entity_value
            ]
        )
        return entity_k, [v for _, v in format_tasks]
    # Leaf value: dispatch on the lower-cased key.
    entity_key = entity_k.lower()
    formatted_value = None

    if entity_key.startswith("port"):
        formatted_value = await get_port_code_ai(entity_value, llm_client)

    elif (entity_key == "containertype") or (entity_key == "containersize"):
        formatted_value = get_tms_mappings(entity_value, "container_types")

    elif check_formatting_rule(entity_k, document_type_code, "terminal"):
        formatted_value = get_tms_mappings(entity_value, "terminals")

    elif check_formatting_rule(entity_k, document_type_code, "depot"):
        formatted_value = get_tms_mappings(entity_value, "depots")

    elif entity_key.startswith(("eta", "etd", "duedate", "issuedate", "servicedate")):
        # Plain dates: keep only the calendar date.
        try:
            cleaned_data_field_value = clean_date_string(entity_value)
            dt_obj = extract_date(cleaned_data_field_value)
            formatted_value = str(dt_obj.date())
        except ValueError as e:
            logger.info(f"ParserError: {e}")
    elif "cutoff" in entity_key:
        # Cut-off timestamps: normalise to UTC and emit ISO-8601 with 'Z'.
        try:
            cleaned_data_field_value = clean_date_string(entity_value)
            dt_obj = extract_date(cleaned_data_field_value)
            if dt_obj.tzinfo is None:
                dt_obj = dt_obj.replace(tzinfo=timezone.utc)
            else:
                dt_obj = dt_obj.astimezone(timezone.utc)
            formatted_value = dt_obj.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z"
        except ValueError as e:
            logger.info(f"ParserError: {e}")

    elif entity_key in ["invoicenumber", "creditnoteinvoicenumber"]:
        formatted_value = clean_invoice_number(entity_value)

    elif entity_key in ("shipmentid", "partnerreference"):
        # Clean the shipment ID to match Forto's standard (starts with 'S' followed by 5 to 7 digits)
        formatted_value = clean_shipment_id(entity_value)

    elif entity_key == "containernumber":
        # Remove all non-alphanumeric characters like ' ', '-', etc.
        formatted_value = convert_container_number(entity_value)

    elif (
        document_type_code in ["finalMbL", "draftMbl"] and entity_key == "measurements"
    ):
        formatted_value = decimal_convertor(extract_number(entity_value))
    elif any(
        packaging_type in entity_key
        for packaging_type in ["packagingtype", "packagetype", "currency"]
    ):
        # Remove all numeric characters from the string. For example 23CARTONS -> CARTONS
        formatted_value = extract_string(entity_value)
    elif "lineitemdescription" in entity_key:
        formatted_value = clean_item_description(entity_value)
    elif "documenttype" in entity_key:
        formatted_value = convert_invoice_type(entity_value, params)

    # Handle reverseChargeSentence
    elif "reversechargesentence" in entity_key:
        formatted_value = clean_item_description(entity_value, remove_numbers=False)

    elif any(
        numeric_indicator in entity_key
        for numeric_indicator in [
            "weight",
            "quantity",
            "value",
            "amount",
            "price",
            "totalamount",
            "totalamounteuro",
            "vatamount",
            "vatapplicableamount",
            "grandtotal",
        ]
    ):
        # Convert EU values to English values (e.g., 4.123,45 -> 4123.45)
        formatted_value = decimal_convertor(extract_number(entity_value))

    # Unmatched keys keep formattedValue=None.
    result = {
        "documentValue": entity_value,
        "formattedValue": formatted_value,
    }
    return entity_k, result
470
+
471
+
472
async def get_port_code_ai(port: str, llm_client):
    """Resolve a port name to a port code, using the LLM suggestion as a
    hint for the TMS mapping lookup."""
    llm_suggestion = await get_port_code_llm(port, llm_client)
    return get_tms_mappings(port, "ports", llm_suggestion)
477
+
478
+
479
async def get_port_code_llm(port: str, llm_client):
    """Ask the LLM for the port code of `port` via the prompt library.

    Args:
        port (str): Raw port name to resolve.
        llm_client: Client exposing `get_unified_json_genai`.

    Returns:
        str | None: The port code, or None when the post-processing prompt is
        not configured or the response lacks a 'port' key.
    """
    library = prompt_library.library
    if (
        "postprocessing" not in library
        or "port_code" not in library["postprocessing"]
    ):
        # No prompt configured for port-code post-processing.
        return None

    prompt = library["postprocessing"]["port_code"]["prompt"]
    # Strict JSON schema so the model must answer {"port": "<code>"}.
    response_schema = {
        "type": "json_schema",
        "json_schema": {
            "name": "port",
            "strict": True,
            "schema": {
                "type": "object",
                "properties": {
                    "port": {
                        "type": "string",
                        "description": f"Get the port code for the given port: {port}",
                    }
                },
                "required": ["port"],
                "additionalProperties": False,
            },
        },
    }

    response = await llm_client.get_unified_json_genai(
        prompt, response_schema=response_schema, model="chatgpt"
    )
    try:
        return response["port"]
    except (KeyError, TypeError):
        # Bug fix: the original caught json.JSONDecodeError, but indexing a
        # dict can only raise KeyError (missing key) or TypeError (response
        # not a mapping) — the old handler was unreachable and malformed
        # responses crashed the caller.
        logger.error(f"Error decoding response: {response}")
        return None
514
+
515
+
516
def decimal_convertor(value):
    """Normalise European-formatted number strings to English notation.

    Examples: '4.123,45' -> '4123.45', '2.500' -> '2500',
    '4,123.45' -> '4123.45', '2,500' -> '2500', '65,45' -> '65.45'.

    Returns:
        str | None: The normalised (or unchanged) string; None for None input.
    """
    if value is None:
        return None

    # Spaces never carry meaning in these numbers.
    value = value.strip().replace(" ", "")

    # Each rule: (regex identifying the notation, transform to plain form).
    # The first matching rule wins, mirroring the original elif chain.
    rules = (
        # EU decimal with thousand separators: 4.123,45 -> 4123.45
        (r"^\d{1,3}(\.\d{3})*,\d{1,2}$",
         lambda v: v.replace(".", "").replace(",", ".")),
        # EU integer with thousand separators: 2.500 -> 2500
        (r"^\d{1,3}(\.\d{3})+$", lambda v: v.replace(".", "")),
        # English decimal with thousand separators: 4,123.45 -> 4123.45
        (r"^\d{1,3}(,\d{3})*\.\d{1,2}$", lambda v: v.replace(",", "")),
        # English integer with thousand separators: 2,500 -> 2500
        (r"^\d{1,3}(,\d{3})+$", lambda v: v.replace(",", "")),
        # Bare comma decimal: 65,45 -> 65.45
        (r"^\d+,\d{1,2}$", lambda v: v.replace(",", ".")),
    )
    for pattern, transform in rules:
        if re.match(pattern, value):
            return transform(value)
    return value
545
+
546
+
547
async def format_all_entities(result, document_type_code, params):
    """Format every entity value in the extraction result.

    Drops None values, recursively formats each entity via `format_label`,
    and applies invoice-specific post-processing where relevant.
    """
    # `customsInvoice` is treated exactly like `partnerInvoice`.
    if document_type_code == "customsInvoice":
        document_type_code = "partnerInvoice"

    # Strip None/empty entries before formatting.
    result = remove_none_values(result)
    if result is None:
        logger.info("No data was extracted.")
        return {}

    # Format all entities recursively.
    _, aggregated_data = await format_label(None, result, document_type_code, params)

    # Line-item mapping and reverse-charge handling for invoices.
    if document_type_code in ("partnerInvoice", "bundeskasse"):
        process_partner_invoice(params, aggregated_data, document_type_code)

    logger.info("Data Extraction completed successfully")
    return aggregated_data
570
+
571
+
572
def add_text_without_space(text):
    """Append a whitespace-free copy of `text` when it differs from the
    original. Useful for port names like 'QUINHON - Quinhon'."""
    compact = "".join(text.split())
    return text if text == compact else f"{text} {compact}"
579
+
580
+
581
def remove_stop_words(lineitem: str):
    """Remove English and German stop words from a line item and upper-case it.

    Args:
        lineitem (str): The input string from which stop words will be removed.

    Returns:
        str: Upper-cased string with stop words removed.
    """
    # 'off' is deliberately excluded from the stop-word set
    # (NOTE(review): presumably meaningful in logistics terms — confirm).
    stop_words = set(stopwords.words("english") + stopwords.words("german")) - {"off"}
    kept = [word for word in lineitem.split() if word.lower() not in stop_words]
    return " ".join(kept).upper().strip()
@@ -4,6 +4,7 @@ from concurrent.futures import ThreadPoolExecutor
4
4
  from fuzzywuzzy import fuzz
5
5
 
6
6
  from src.io import logger
7
+ from src.utils import get_tms_mappings
7
8
 
8
9
 
9
10
  def postprocessing_partner_invoice(partner_invoice):
@@ -135,7 +136,7 @@ def update_recipient_and_vendor(aggregated_data, is_recipient_forto):
135
136
  ] = "Dasbachstraße 15, 54292 Trier, Germany"
136
137
 
137
138
 
138
- def process_partner_invoice(params, aggregated_data, embed_manager, document_type_code):
139
+ def process_partner_invoice(params, aggregated_data, document_type_code):
139
140
  """Process the partner invoice data."""
140
141
  # Post process containerNumber.
141
142
  # TODO: Remove this block of code after migrating to LLM completely and update the placeholder in the prompt library
@@ -192,7 +193,6 @@ def process_partner_invoice(params, aggregated_data, embed_manager, document_typ
192
193
  for line_item in line_items:
193
194
  if line_item.get("lineItemDescription", None) is not None:
194
195
  line_item["itemCode"] = associate_forto_item_code(
195
- embed_manager,
196
196
  line_item["lineItemDescription"]["formattedValue"],
197
197
  params,
198
198
  )
@@ -275,7 +275,7 @@ def find_matching_lineitem(new_lineitem: str, kvp_dict: dict, threshold=90):
275
275
  return kvp_dict.get(best_match, None)
276
276
 
277
277
 
278
- def associate_forto_item_code(embed_manager, input_string, params):
278
+ def associate_forto_item_code(input_string, params):
279
279
  """
280
280
  Finds a match for the input string using fuzzy matching first, then embedding fallback.
281
281
 
@@ -286,7 +286,6 @@ def associate_forto_item_code(embed_manager, input_string, params):
286
286
 
287
287
  Args:
288
288
  input_string: The string to find a match for.
289
- embed_manager: The embedding manager instance to use for fallback.
290
289
  params: Parameters containing the lookup data and fuzzy threshold.
291
290
 
292
291
  Returns:
@@ -301,10 +300,11 @@ def associate_forto_item_code(embed_manager, input_string, params):
301
300
 
302
301
  if forto_item_code is None:
303
302
  # 2. Fallback to embedding function if no good fuzzy match
304
- embeddings_dict = embed_manager.embeddings_dict
305
- forto_item_code = embed_manager._find_most_similar_option(
306
- input_string, *embeddings_dict["item_codes_label"]
307
- )
303
+ forto_item_code = get_tms_mappings(input_string, "line_items")
304
+ # embeddings_dict = embed_manager.embeddings_dict
305
+ # forto_item_code = embed_manager._find_most_similar_option(
306
+ # input_string, *embeddings_dict["item_codes_label"]
307
+ # )
308
308
 
309
309
  result = {"documentValue": input_string, "formattedValue": forto_item_code}
310
310
  return result