data-science-document-ai 1.43.5__tar.gz → 1.43.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/PKG-INFO +1 -1
  2. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/pyproject.toml +1 -1
  3. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/partnerInvoice/other/prompt.txt +2 -1
  4. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/utils.py +34 -23
  5. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/constants.py +0 -0
  6. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/constants_sandbox.py +0 -0
  7. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/docai.py +0 -0
  8. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/docai_processor_config.yaml +0 -0
  9. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/excel_processing.py +0 -0
  10. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/io.py +0 -0
  11. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/llm.py +0 -0
  12. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/log_setup.py +0 -0
  13. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/pdf_processing.py +0 -0
  14. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/postprocessing/common.py +0 -0
  15. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/postprocessing/postprocess_booking_confirmation.py +0 -0
  16. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/postprocessing/postprocess_commercial_invoice.py +0 -0
  17. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/postprocessing/postprocess_partner_invoice.py +0 -0
  18. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +0 -0
  19. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/bookingConfirmation/evergreen/prompt.txt +0 -0
  20. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +0 -0
  21. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +0 -0
  22. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/bookingConfirmation/maersk/placeholders.json +0 -0
  23. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/bookingConfirmation/maersk/prompt.txt +0 -0
  24. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/bookingConfirmation/msc/placeholders.json +0 -0
  25. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/bookingConfirmation/msc/prompt.txt +0 -0
  26. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/bookingConfirmation/oocl/placeholders.json +0 -0
  27. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/bookingConfirmation/oocl/prompt.txt +0 -0
  28. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/bookingConfirmation/other/placeholders.json +0 -0
  29. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/bookingConfirmation/other/prompt.txt +0 -0
  30. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/bookingConfirmation/yangming/placeholders.json +0 -0
  31. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/bookingConfirmation/yangming/prompt.txt +0 -0
  32. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/bundeskasse/other/placeholders.json +0 -0
  33. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/bundeskasse/other/prompt.txt +0 -0
  34. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/commercialInvoice/other/placeholders.json +0 -0
  35. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/commercialInvoice/other/prompt.txt +0 -0
  36. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/customsAssessment/other/prompt.txt +0 -0
  37. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/customsInvoice/other/placeholders.json +0 -0
  38. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/customsInvoice/other/prompt.txt +0 -0
  39. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/deliveryOrder/other/placeholders.json +0 -0
  40. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/deliveryOrder/other/prompt.txt +0 -0
  41. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -0
  42. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/draftMbl/maersk/prompt.txt +0 -0
  43. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/draftMbl/other/placeholders.json +0 -0
  44. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/draftMbl/other/prompt.txt +0 -0
  45. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -0
  46. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/finalMbL/maersk/prompt.txt +0 -0
  47. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/finalMbL/other/prompt.txt +0 -0
  48. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/packingList/other/placeholders.json +0 -0
  49. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/packingList/other/prompt.txt +0 -0
  50. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/partnerInvoice/other/placeholders.json +0 -0
  51. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/postprocessing/port_code/placeholders.json +0 -0
  52. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/postprocessing/port_code/prompt_port_code.txt +0 -0
  53. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/preprocessing/carrier/placeholders.json +0 -0
  54. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/preprocessing/carrier/prompt.txt +0 -0
  55. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/library/shippingInstruction/other/prompt.txt +0 -0
  56. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/prompts/prompt_library.py +0 -0
  57. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/setup.py +0 -0
  58. {data_science_document_ai-1.43.5 → data_science_document_ai-1.43.7}/src/tms.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-science-document-ai
3
- Version: 1.43.5
3
+ Version: 1.43.7
4
4
  Summary: "Document AI repo for data science"
5
5
  Author: Naomi Nguyen
6
6
  Author-email: naomi.nguyen@forto.com
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "data-science-document-ai"
3
- version = "1.43.5"
3
+ version = "1.43.7"
4
4
  description = "\"Document AI repo for data science\""
5
5
  authors = ["Naomi Nguyen <naomi.nguyen@forto.com>", "Kumar Rajendrababu <kumar.rajendrababu@forto.com>", "Igor Tonko <igor.tonko@forto.com>", "Osman Demirel <osman.demirel@forto.com>"]
6
6
  packages = [
@@ -53,7 +53,7 @@ Your role is to accurately extract specific entities from these invoices to supp
53
53
  - totalAmount: The total amount for the item. It can be in different currencies, so ensure to capture the currency as well for the totalAmountCurrency.
54
54
  - totalAmountEuro: Few line items contains a total amount in Euro. You can find it by looking for the term "Total EUR" or "Amount in Euro" in the line item but it's always in the EURO / € currency. Sometimes, it can be same as totalAmount if the line item is already in Euro.
55
55
  - quantity: The quantity of the item or service provided in the line item. Pay attention to 2x40HC. It means, quantity is 2 and containerSize is 40HC but not 240.
56
- - containerNumber: Container Number always starts with 4 letters and is followed by 7 digits (e.g., ABCD1234567).
56
+ - containerNumber: Container Number always starts with 4 letters and is followed by 7 digits (e.g., ABCD1234567, XALU 8593678).
57
57
 
58
58
  - hblNumber and mblNumber:
59
59
  - The Master Bill of Lading number. Commonly known as "Bill of Lading Number", "BILL OF LADING NO.", "BL Number", "BL No.", "B/L No.", "BL-Nr.", "B/L", or "HBL No.".
@@ -81,6 +81,7 @@ Your role is to accurately extract specific entities from these invoices to supp
81
81
 
82
82
  IMPORTANT NOTE:
83
83
  - Ensure all extracted values are directly from the document. Do not make assumptions, modifications or calculations.
84
+ - Do not split the quantity into different line items. e.g., if quantity is 2 or 2 CTR or 2 BIL, do not create 2 separate line items with quantity 1 each.
84
85
  - Do not normalize or modify any entity values.
85
86
  - Pay attention to the line item details and paymentInformation, as they may vary significantly across different invoices.
86
87
 
@@ -443,12 +443,23 @@ def transform_schema_strings(schema):
443
443
  Returns:
444
444
  dict: The transformed schema dictionary.
445
445
  """
446
- # Base case: if the current schema definition is for a string
447
- if isinstance(schema, dict) and schema.get("type").upper() == "STRING":
448
- new_schema = {
446
+ if not isinstance(schema, dict):
447
+ return schema
448
+
449
+ schema_type = schema.get("type")
450
+ if not schema_type:
451
+ return schema
452
+
453
+ # Base case: STRING → OBJECT (only if not already transformed)
454
+ if schema_type.upper() == "STRING":
455
+ return {
449
456
  "type": "OBJECT",
450
457
  "properties": {
451
- "value": {"type": "STRING"},
458
+ "value": {
459
+ "type": "STRING",
460
+ "nullable": schema.get("nullable", False),
461
+ "description": schema.get("description", ""),
462
+ },
452
463
  "page_number": {
453
464
  "type": "STRING",
454
465
  "description": "Number of a page where the value was found in the document starting from 0.",
@@ -457,29 +468,29 @@ def transform_schema_strings(schema):
457
468
  "required": [],
458
469
  }
459
470
 
460
- # Preserve original properties like nullable and description on the new 'value' key
461
- if "nullable" in schema:
462
- new_schema["properties"]["value"]["nullable"] = schema["nullable"]
463
- if "description" in schema:
464
- new_schema["properties"]["value"]["description"] = schema["description"]
471
+ # Skip already transformed OBJECT (has both 'value' & 'page_number')
472
+ if (
473
+ schema_type.upper() == "OBJECT"
474
+ and "properties" in schema
475
+ and {"value", "page_number"}.issubset(schema["properties"].keys())
476
+ ):
477
+ return schema
465
478
 
479
+ # Recursive case for OBJECT
480
+ if schema_type.upper() == "OBJECT" and "properties" in schema:
481
+ new_schema = schema.copy()
482
+ new_schema["properties"] = {
483
+ k: transform_schema_strings(v) for k, v in schema["properties"].items()
484
+ }
466
485
  return new_schema
467
486
 
468
- # Recursive case: if the schema is a dictionary
469
- elif isinstance(schema, dict) and schema.get("type").upper() == "OBJECT":
470
- transformed_schema = schema.copy()
471
- for key, value in schema.get("properties").items():
472
- transformed_schema["properties"][key] = transform_schema_strings(value)
473
- return transformed_schema
474
-
475
- # Recursive case: if the schema is a list
476
- elif isinstance(schema, dict) and schema.get("type").upper() == "ARRAY":
477
- schema["items"] = transform_schema_strings(schema["items"])
478
- return schema
487
+ # Recursive case for ARRAY
488
+ if schema_type.upper() == "ARRAY" and "items" in schema:
489
+ new_schema = schema.copy()
490
+ new_schema["items"] = transform_schema_strings(schema["items"])
491
+ return new_schema
479
492
 
480
- # Base case: for non-dict/list values (e.g., None, bool, str)
481
- else:
482
- return schema
493
+ return schema
483
494
 
484
495
 
485
496
  def estimate_page_count(sheet):