data-science-document-ai 1.43.0__tar.gz → 1.43.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/PKG-INFO +1 -1
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/pyproject.toml +1 -1
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/postprocessing/common.py +11 -3
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/constants.py +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/constants_sandbox.py +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/docai.py +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/docai_processor_config.yaml +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/excel_processing.py +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/io.py +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/llm.py +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/log_setup.py +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/pdf_processing.py +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/postprocessing/postprocess_booking_confirmation.py +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/postprocessing/postprocess_commercial_invoice.py +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/postprocessing/postprocess_partner_invoice.py +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/bookingConfirmation/evergreen/prompt.txt +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/bookingConfirmation/maersk/placeholders.json +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/bookingConfirmation/maersk/prompt.txt +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/bookingConfirmation/msc/placeholders.json +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/bookingConfirmation/msc/prompt.txt +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/bookingConfirmation/oocl/placeholders.json +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/bookingConfirmation/oocl/prompt.txt +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/bookingConfirmation/other/placeholders.json +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/bookingConfirmation/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/bookingConfirmation/yangming/placeholders.json +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/bookingConfirmation/yangming/prompt.txt +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/bundeskasse/other/placeholders.json +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/bundeskasse/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/commercialInvoice/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/customsAssessment/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/customsInvoice/other/placeholders.json +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/customsInvoice/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/deliveryOrder/other/placeholders.json +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/deliveryOrder/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/draftMbl/maersk/prompt.txt +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/draftMbl/other/placeholders.json +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/draftMbl/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/finalMbL/maersk/prompt.txt +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/finalMbL/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/packingList/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/partnerInvoice/other/placeholders.json +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/partnerInvoice/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/postprocessing/port_code/placeholders.json +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/postprocessing/port_code/prompt_port_code.txt +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/preprocessing/carrier/placeholders.json +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/preprocessing/carrier/prompt.txt +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/shippingInstruction/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/prompt_library.py +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/setup.py +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/tms.py +0 -0
- {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "data-science-document-ai"
|
|
3
|
-
version = "1.43.0"
|
|
3
|
+
version = "1.43.1"
|
|
4
4
|
description = "\"Document AI repo for data science\""
|
|
5
5
|
authors = ["Naomi Nguyen <naomi.nguyen@forto.com>", "Kumar Rajendrababu <kumar.rajendrababu@forto.com>", "Igor Tonko <igor.tonko@forto.com>", "Osman Demirel <osman.demirel@forto.com>"]
|
|
6
6
|
packages = [
|
{data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/postprocessing/common.py
RENAMED
|
@@ -319,6 +319,11 @@ def remove_unwanted_patterns(lineitem: str):
|
|
|
319
319
|
# Remove "HIGH CUBE"
|
|
320
320
|
lineitem = lineitem.replace("HIGH CUBE", "")
|
|
321
321
|
|
|
322
|
+
# Remove container size e.g., 20FT, 40HC, etc.
|
|
323
|
+
lineitem = re.sub(
|
|
324
|
+
r"\b(20|22|40|45)(FT|HC|DC|HD|GP|OT|RF|FR|TK|DV)?\b", "", lineitem
|
|
325
|
+
).strip()
|
|
326
|
+
|
|
322
327
|
return lineitem
|
|
323
328
|
|
|
324
329
|
|
|
@@ -349,18 +354,21 @@ def clean_item_description(lineitem: str, remove_numbers: bool = True):
|
|
|
349
354
|
# Remove the currency codes
|
|
350
355
|
lineitem = re.sub(currency_codes_pattern, "", lineitem, flags=re.IGNORECASE)
|
|
351
356
|
|
|
357
|
+
# remove other patterns
|
|
358
|
+
lineitem = remove_unwanted_patterns(lineitem)
|
|
359
|
+
|
|
352
360
|
# Remove numbers from the line item
|
|
353
361
|
if (
|
|
354
362
|
remove_numbers
|
|
355
363
|
): # Do not remove numbers for the reverse charge sentence as it contains Article number
|
|
356
364
|
lineitem = re.sub(r"\d+", "", lineitem)
|
|
357
365
|
|
|
358
|
-
# remove other patterns
|
|
359
|
-
lineitem = remove_unwanted_patterns(lineitem)
|
|
360
|
-
|
|
361
366
|
# remove special chars
|
|
362
367
|
lineitem = re.sub(r"[^A-Za-z0-9\s]", " ", lineitem).strip()
|
|
363
368
|
|
|
369
|
+
# Remove x from lineitem like 10 x
|
|
370
|
+
lineitem = re.sub(r"\b[xX]\b", " ", lineitem).strip()
|
|
371
|
+
|
|
364
372
|
return re.sub(r"\s{2,}", " ", lineitem).strip()
|
|
365
373
|
|
|
366
374
|
|
|
File without changes
|
{data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/constants_sandbox.py
RENAMED
|
File without changes
|
|
File without changes
|
{data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/docai_processor_config.yaml
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/prompt_library.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|