data-science-document-ai 1.43.0__tar.gz → 1.43.1__tar.gz

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (56)
  1. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/PKG-INFO +1 -1
  2. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/pyproject.toml +1 -1
  3. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/postprocessing/common.py +11 -3
  4. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/constants.py +0 -0
  5. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/constants_sandbox.py +0 -0
  6. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/docai.py +0 -0
  7. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/docai_processor_config.yaml +0 -0
  8. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/excel_processing.py +0 -0
  9. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/io.py +0 -0
  10. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/llm.py +0 -0
  11. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/log_setup.py +0 -0
  12. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/pdf_processing.py +0 -0
  13. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/postprocessing/postprocess_booking_confirmation.py +0 -0
  14. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/postprocessing/postprocess_commercial_invoice.py +0 -0
  15. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/postprocessing/postprocess_partner_invoice.py +0 -0
  16. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +0 -0
  17. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/bookingConfirmation/evergreen/prompt.txt +0 -0
  18. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +0 -0
  19. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +0 -0
  20. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/bookingConfirmation/maersk/placeholders.json +0 -0
  21. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/bookingConfirmation/maersk/prompt.txt +0 -0
  22. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/bookingConfirmation/msc/placeholders.json +0 -0
  23. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/bookingConfirmation/msc/prompt.txt +0 -0
  24. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/bookingConfirmation/oocl/placeholders.json +0 -0
  25. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/bookingConfirmation/oocl/prompt.txt +0 -0
  26. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/bookingConfirmation/other/placeholders.json +0 -0
  27. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/bookingConfirmation/other/prompt.txt +0 -0
  28. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/bookingConfirmation/yangming/placeholders.json +0 -0
  29. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/bookingConfirmation/yangming/prompt.txt +0 -0
  30. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/bundeskasse/other/placeholders.json +0 -0
  31. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/bundeskasse/other/prompt.txt +0 -0
  32. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/commercialInvoice/other/prompt.txt +0 -0
  33. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/customsAssessment/other/prompt.txt +0 -0
  34. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/customsInvoice/other/placeholders.json +0 -0
  35. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/customsInvoice/other/prompt.txt +0 -0
  36. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/deliveryOrder/other/placeholders.json +0 -0
  37. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/deliveryOrder/other/prompt.txt +0 -0
  38. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -0
  39. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/draftMbl/maersk/prompt.txt +0 -0
  40. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/draftMbl/other/placeholders.json +0 -0
  41. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/draftMbl/other/prompt.txt +0 -0
  42. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -0
  43. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/finalMbL/maersk/prompt.txt +0 -0
  44. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/finalMbL/other/prompt.txt +0 -0
  45. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/packingList/other/prompt.txt +0 -0
  46. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/partnerInvoice/other/placeholders.json +0 -0
  47. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/partnerInvoice/other/prompt.txt +0 -0
  48. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/postprocessing/port_code/placeholders.json +0 -0
  49. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/postprocessing/port_code/prompt_port_code.txt +0 -0
  50. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/preprocessing/carrier/placeholders.json +0 -0
  51. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/preprocessing/carrier/prompt.txt +0 -0
  52. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/library/shippingInstruction/other/prompt.txt +0 -0
  53. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/prompts/prompt_library.py +0 -0
  54. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/setup.py +0 -0
  55. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/tms.py +0 -0
  56. {data_science_document_ai-1.43.0 → data_science_document_ai-1.43.1}/src/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-science-document-ai
3
- Version: 1.43.0
3
+ Version: 1.43.1
4
4
  Summary: "Document AI repo for data science"
5
5
  Author: Naomi Nguyen
6
6
  Author-email: naomi.nguyen@forto.com
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "data-science-document-ai"
3
- version = "1.43.0"
3
+ version = "1.43.1"
4
4
  description = "\"Document AI repo for data science\""
5
5
  authors = ["Naomi Nguyen <naomi.nguyen@forto.com>", "Kumar Rajendrababu <kumar.rajendrababu@forto.com>", "Igor Tonko <igor.tonko@forto.com>", "Osman Demirel <osman.demirel@forto.com>"]
6
6
  packages = [
@@ -319,6 +319,11 @@ def remove_unwanted_patterns(lineitem: str):
319
319
  # Remove "HIGH CUBE"
320
320
  lineitem = lineitem.replace("HIGH CUBE", "")
321
321
 
322
+ # Remove container size e.g., 20FT, 40HC, etc.
323
+ lineitem = re.sub(
324
+ r"\b(20|22|40|45)(FT|HC|DC|HD|GP|OT|RF|FR|TK|DV)?\b", "", lineitem
325
+ ).strip()
326
+
322
327
  return lineitem
323
328
 
324
329
 
@@ -349,18 +354,21 @@ def clean_item_description(lineitem: str, remove_numbers: bool = True):
349
354
  # Remove the currency codes
350
355
  lineitem = re.sub(currency_codes_pattern, "", lineitem, flags=re.IGNORECASE)
351
356
 
357
+ # remove other patterns
358
+ lineitem = remove_unwanted_patterns(lineitem)
359
+
352
360
  # Remove numbers from the line item
353
361
  if (
354
362
  remove_numbers
355
363
  ): # Do not remove numbers for the reverse charge sentence as it contains Article number
356
364
  lineitem = re.sub(r"\d+", "", lineitem)
357
365
 
358
- # remove other patterns
359
- lineitem = remove_unwanted_patterns(lineitem)
360
-
361
366
  # remove special chars
362
367
  lineitem = re.sub(r"[^A-Za-z0-9\s]", " ", lineitem).strip()
363
368
 
369
+ # Remove x from lineitem like 10 x
370
+ lineitem = re.sub(r"\b[xX]\b", " ", lineitem).strip()
371
+
364
372
  return re.sub(r"\s{2,}", " ", lineitem).strip()
365
373
 
366
374