data-science-document-ai 1.42.0__tar.gz → 1.42.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/PKG-INFO +1 -1
  2. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/pyproject.toml +1 -1
  3. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/excel_processing.py +2 -1
  4. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/io.py +24 -4
  5. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/postprocessing/postprocess_partner_invoice.py +5 -30
  6. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/constants.py +0 -0
  7. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/constants_sandbox.py +0 -0
  8. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/docai.py +0 -0
  9. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/docai_processor_config.yaml +0 -0
  10. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/llm.py +0 -0
  11. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/log_setup.py +0 -0
  12. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/pdf_processing.py +0 -0
  13. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/postprocessing/common.py +0 -0
  14. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/postprocessing/postprocess_booking_confirmation.py +0 -0
  15. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/postprocessing/postprocess_commercial_invoice.py +0 -0
  16. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +0 -0
  17. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/bookingConfirmation/evergreen/prompt.txt +0 -0
  18. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +0 -0
  19. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +0 -0
  20. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/bookingConfirmation/maersk/placeholders.json +0 -0
  21. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/bookingConfirmation/maersk/prompt.txt +0 -0
  22. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/bookingConfirmation/msc/placeholders.json +0 -0
  23. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/bookingConfirmation/msc/prompt.txt +0 -0
  24. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/bookingConfirmation/oocl/placeholders.json +0 -0
  25. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/bookingConfirmation/oocl/prompt.txt +0 -0
  26. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/bookingConfirmation/other/placeholders.json +0 -0
  27. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/bookingConfirmation/other/prompt.txt +0 -0
  28. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/bookingConfirmation/yangming/placeholders.json +0 -0
  29. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/bookingConfirmation/yangming/prompt.txt +0 -0
  30. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/bundeskasse/other/placeholders.json +0 -0
  31. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/bundeskasse/other/prompt.txt +0 -0
  32. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/commercialInvoice/other/prompt.txt +0 -0
  33. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/customsAssessment/other/prompt.txt +0 -0
  34. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/customsInvoice/other/placeholders.json +0 -0
  35. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/customsInvoice/other/prompt.txt +0 -0
  36. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/deliveryOrder/other/placeholders.json +0 -0
  37. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/deliveryOrder/other/prompt.txt +0 -0
  38. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -0
  39. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/draftMbl/maersk/prompt.txt +0 -0
  40. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/draftMbl/other/placeholders.json +0 -0
  41. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/draftMbl/other/prompt.txt +0 -0
  42. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -0
  43. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/finalMbL/maersk/prompt.txt +0 -0
  44. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/finalMbL/other/prompt.txt +0 -0
  45. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/packingList/other/prompt.txt +0 -0
  46. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/partnerInvoice/other/placeholders.json +0 -0
  47. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/partnerInvoice/other/prompt.txt +0 -0
  48. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/postprocessing/port_code/placeholders.json +0 -0
  49. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/postprocessing/port_code/prompt_port_code.txt +0 -0
  50. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/preprocessing/carrier/placeholders.json +0 -0
  51. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/preprocessing/carrier/prompt.txt +0 -0
  52. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/shippingInstruction/other/prompt.txt +0 -0
  53. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/prompt_library.py +0 -0
  54. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/setup.py +0 -0
  55. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/tms.py +0 -0
  56. {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-science-document-ai
3
- Version: 1.42.0
3
+ Version: 1.42.2
4
4
  Summary: "Document AI repo for data science"
5
5
  Author: Naomi Nguyen
6
6
  Author-email: naomi.nguyen@forto.com
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "data-science-document-ai"
3
- version = "1.42.0"
3
+ version = "1.42.2"
4
4
  description = "\"Document AI repo for data science\""
5
5
  authors = ["Naomi Nguyen <naomi.nguyen@forto.com>", "Kumar Rajendrababu <kumar.rajendrababu@forto.com>", "Igor Tonko <igor.tonko@forto.com>", "Osman Demirel <osman.demirel@forto.com>"]
6
6
  packages = [
@@ -20,7 +20,8 @@ async def extract_data_from_sheet(
20
20
  params, sheet_name, sheet, response_schema, doc_type=None
21
21
  ):
22
22
  logger.info(f"Processing sheet: {sheet_name}")
23
- excel_content = pd.DataFrame(sheet.values)
23
+ excel_content = pd.DataFrame(sheet.values).dropna(how="all", axis=1)
24
+
24
25
  # Convert to Markdown format for the LLM model
25
26
  worksheet = (
26
27
  "This is from a excel. Pay attention to the cell position:\n"
@@ -11,8 +11,6 @@ from pathlib import Path
11
11
 
12
12
  from google.cloud import bigquery, storage
13
13
 
14
- from src.constants import project_parameters
15
-
16
14
 
17
15
  def get_gcp_labels(**extra_labels):
18
16
  """Generate standardized GCP labels for cost tracking.
@@ -23,12 +21,34 @@ def get_gcp_labels(**extra_labels):
23
21
  Returns:
24
22
  dict: Labels dictionary with keys normalized (lowercase, hyphens, max 63 chars)
25
23
  """
24
+ project_name = os.getenv("PROJECT_NAME")
25
+
26
+ # If not set, detect once and cache it
27
+ if not project_name:
28
+ # Try pyproject.toml first
29
+ try:
30
+ import toml
31
+
32
+ pyproject_path = Path(__file__).parent.parent / "pyproject.toml"
33
+ if pyproject_path.exists():
34
+ config = toml.load(pyproject_path)
35
+ project_name = config.get("tool", {}).get("poetry", {}).get("name")
36
+ except Exception:
37
+ pass
38
+
39
+ # Fallback to unknown
40
+ if not project_name:
41
+ project_name = "unknown"
42
+
43
+ # Cache it
44
+ os.environ["PROJECT_NAME"] = project_name
45
+
26
46
  labels = {
27
- "ds-project-name": project_parameters["project_name"],
47
+ "ds-project-name": project_name.lower(),
28
48
  "ds-env": os.getenv("CLUSTER", "local").lower(),
29
49
  }
30
50
 
31
- # Add any extra labels passed in
51
+ # Add any extra labels
32
52
  labels.update({k.lower(): str(v).lower() for k, v in extra_labels.items()})
33
53
 
34
54
  return labels
@@ -138,36 +138,7 @@ def update_recipient_and_vendor(aggregated_data, is_recipient_forto):
138
138
 
139
139
  def process_partner_invoice(params, aggregated_data, document_type_code):
140
140
  """Process the partner invoice data."""
141
- # Post process containerNumber.
142
- # TODO: Remove this block of code after migrating to LLM completely and update the placeholder in the prompt library
143
- if "containerNumber" in aggregated_data and isinstance(
144
- aggregated_data["containerNumber"], dict
145
- ):
146
- container_number = aggregated_data.get("containerNumber", {}).get(
147
- "formattedValue", None
148
- )
149
- if container_number:
150
- aggregated_data["containerNumber"] = (
151
- [
152
- {
153
- "documentValue": aggregated_data.get("containerNumber", {}).get(
154
- "documentValue", ""
155
- ),
156
- "formattedValue": ctr_number,
157
- }
158
- for ctr_number in container_number
159
- ]
160
- if isinstance(container_number, list)
161
- else [
162
- {
163
- "documentValue": aggregated_data.get("containerNumber", {}).get(
164
- "documentValue", ""
165
- ),
166
- "formattedValue": container_number,
167
- }
168
- ]
169
- )
170
-
141
+ # Post process bundeskasse invoices
171
142
  if document_type_code == "bundeskasse":
172
143
  post_process_bundeskasse(aggregated_data)
173
144
  return
@@ -197,9 +168,13 @@ def process_partner_invoice(params, aggregated_data, document_type_code):
197
168
  params,
198
169
  )
199
170
 
171
+ # Add page number for the consistency
172
+ line_item["itemCode"]["page"] = line_item["lineItemDescription"]["page"]
173
+
200
174
  if reverse_charge:
201
175
  # Distribute reverseChargeSentence to all line items
202
176
  line_item["reverseChargeSentence"] = reverse_charge
177
+ line_item["reverseChargeSentence"]["page"] = reverse_charge["page"]
203
178
 
204
179
 
205
180
  def compute_score(args):