data-science-document-ai 1.42.0__tar.gz → 1.42.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/PKG-INFO +1 -1
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/pyproject.toml +1 -1
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/excel_processing.py +2 -1
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/io.py +24 -4
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/postprocessing/postprocess_partner_invoice.py +5 -30
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/constants.py +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/constants_sandbox.py +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/docai.py +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/docai_processor_config.yaml +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/llm.py +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/log_setup.py +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/pdf_processing.py +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/postprocessing/common.py +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/postprocessing/postprocess_booking_confirmation.py +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/postprocessing/postprocess_commercial_invoice.py +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/bookingConfirmation/evergreen/prompt.txt +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/bookingConfirmation/maersk/placeholders.json +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/bookingConfirmation/maersk/prompt.txt +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/bookingConfirmation/msc/placeholders.json +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/bookingConfirmation/msc/prompt.txt +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/bookingConfirmation/oocl/placeholders.json +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/bookingConfirmation/oocl/prompt.txt +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/bookingConfirmation/other/placeholders.json +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/bookingConfirmation/other/prompt.txt +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/bookingConfirmation/yangming/placeholders.json +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/bookingConfirmation/yangming/prompt.txt +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/bundeskasse/other/placeholders.json +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/bundeskasse/other/prompt.txt +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/commercialInvoice/other/prompt.txt +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/customsAssessment/other/prompt.txt +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/customsInvoice/other/placeholders.json +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/customsInvoice/other/prompt.txt +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/deliveryOrder/other/placeholders.json +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/deliveryOrder/other/prompt.txt +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/draftMbl/maersk/prompt.txt +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/draftMbl/other/placeholders.json +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/draftMbl/other/prompt.txt +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/finalMbL/maersk/prompt.txt +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/finalMbL/other/prompt.txt +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/packingList/other/prompt.txt +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/partnerInvoice/other/placeholders.json +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/partnerInvoice/other/prompt.txt +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/postprocessing/port_code/placeholders.json +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/postprocessing/port_code/prompt_port_code.txt +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/preprocessing/carrier/placeholders.json +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/preprocessing/carrier/prompt.txt +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/library/shippingInstruction/other/prompt.txt +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/prompt_library.py +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/setup.py +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/tms.py +0 -0
- {data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "data-science-document-ai"
|
|
3
|
-
version = "1.42.0"
|
|
3
|
+
version = "1.42.2"
|
|
4
4
|
description = "\"Document AI repo for data science\""
|
|
5
5
|
authors = ["Naomi Nguyen <naomi.nguyen@forto.com>", "Kumar Rajendrababu <kumar.rajendrababu@forto.com>", "Igor Tonko <igor.tonko@forto.com>", "Osman Demirel <osman.demirel@forto.com>"]
|
|
6
6
|
packages = [
|
|
@@ -20,7 +20,8 @@ async def extract_data_from_sheet(
|
|
|
20
20
|
params, sheet_name, sheet, response_schema, doc_type=None
|
|
21
21
|
):
|
|
22
22
|
logger.info(f"Processing sheet: {sheet_name}")
|
|
23
|
-
excel_content = pd.DataFrame(sheet.values)
|
|
23
|
+
excel_content = pd.DataFrame(sheet.values).dropna(how="all", axis=1)
|
|
24
|
+
|
|
24
25
|
# Convert to Markdown format for the LLM model
|
|
25
26
|
worksheet = (
|
|
26
27
|
"This is from a excel. Pay attention to the cell position:\n"
|
|
@@ -11,8 +11,6 @@ from pathlib import Path
|
|
|
11
11
|
|
|
12
12
|
from google.cloud import bigquery, storage
|
|
13
13
|
|
|
14
|
-
from src.constants import project_parameters
|
|
15
|
-
|
|
16
14
|
|
|
17
15
|
def get_gcp_labels(**extra_labels):
|
|
18
16
|
"""Generate standardized GCP labels for cost tracking.
|
|
@@ -23,12 +21,34 @@ def get_gcp_labels(**extra_labels):
|
|
|
23
21
|
Returns:
|
|
24
22
|
dict: Labels dictionary with keys normalized (lowercase, hyphens, max 63 chars)
|
|
25
23
|
"""
|
|
24
|
+
project_name = os.getenv("PROJECT_NAME")
|
|
25
|
+
|
|
26
|
+
# If not set, detect once and cache it
|
|
27
|
+
if not project_name:
|
|
28
|
+
# Try pyproject.toml first
|
|
29
|
+
try:
|
|
30
|
+
import toml
|
|
31
|
+
|
|
32
|
+
pyproject_path = Path(__file__).parent.parent / "pyproject.toml"
|
|
33
|
+
if pyproject_path.exists():
|
|
34
|
+
config = toml.load(pyproject_path)
|
|
35
|
+
project_name = config.get("tool", {}).get("poetry", {}).get("name")
|
|
36
|
+
except Exception:
|
|
37
|
+
pass
|
|
38
|
+
|
|
39
|
+
# Fallback to unknown
|
|
40
|
+
if not project_name:
|
|
41
|
+
project_name = "unknown"
|
|
42
|
+
|
|
43
|
+
# Cache it
|
|
44
|
+
os.environ["PROJECT_NAME"] = project_name
|
|
45
|
+
|
|
26
46
|
labels = {
|
|
27
|
-
"ds-project-name":
|
|
47
|
+
"ds-project-name": project_name.lower(),
|
|
28
48
|
"ds-env": os.getenv("CLUSTER", "local").lower(),
|
|
29
49
|
}
|
|
30
50
|
|
|
31
|
-
# Add any extra labels
|
|
51
|
+
# Add any extra labels
|
|
32
52
|
labels.update({k.lower(): str(v).lower() for k, v in extra_labels.items()})
|
|
33
53
|
|
|
34
54
|
return labels
|
|
@@ -138,36 +138,7 @@ def update_recipient_and_vendor(aggregated_data, is_recipient_forto):
|
|
|
138
138
|
|
|
139
139
|
def process_partner_invoice(params, aggregated_data, document_type_code):
|
|
140
140
|
"""Process the partner invoice data."""
|
|
141
|
-
# Post process
|
|
142
|
-
# TODO: Remove this block of code after migrating to LLM completely and update the placeholder in the prompt library
|
|
143
|
-
if "containerNumber" in aggregated_data and isinstance(
|
|
144
|
-
aggregated_data["containerNumber"], dict
|
|
145
|
-
):
|
|
146
|
-
container_number = aggregated_data.get("containerNumber", {}).get(
|
|
147
|
-
"formattedValue", None
|
|
148
|
-
)
|
|
149
|
-
if container_number:
|
|
150
|
-
aggregated_data["containerNumber"] = (
|
|
151
|
-
[
|
|
152
|
-
{
|
|
153
|
-
"documentValue": aggregated_data.get("containerNumber", {}).get(
|
|
154
|
-
"documentValue", ""
|
|
155
|
-
),
|
|
156
|
-
"formattedValue": ctr_number,
|
|
157
|
-
}
|
|
158
|
-
for ctr_number in container_number
|
|
159
|
-
]
|
|
160
|
-
if isinstance(container_number, list)
|
|
161
|
-
else [
|
|
162
|
-
{
|
|
163
|
-
"documentValue": aggregated_data.get("containerNumber", {}).get(
|
|
164
|
-
"documentValue", ""
|
|
165
|
-
),
|
|
166
|
-
"formattedValue": container_number,
|
|
167
|
-
}
|
|
168
|
-
]
|
|
169
|
-
)
|
|
170
|
-
|
|
141
|
+
# Post process bundeskasse invoices
|
|
171
142
|
if document_type_code == "bundeskasse":
|
|
172
143
|
post_process_bundeskasse(aggregated_data)
|
|
173
144
|
return
|
|
@@ -197,9 +168,13 @@ def process_partner_invoice(params, aggregated_data, document_type_code):
|
|
|
197
168
|
params,
|
|
198
169
|
)
|
|
199
170
|
|
|
171
|
+
# Add page number for the consistency
|
|
172
|
+
line_item["itemCode"]["page"] = line_item["lineItemDescription"]["page"]
|
|
173
|
+
|
|
200
174
|
if reverse_charge:
|
|
201
175
|
# Distribute reverseChargeSentence to all line items
|
|
202
176
|
line_item["reverseChargeSentence"] = reverse_charge
|
|
177
|
+
line_item["reverseChargeSentence"]["page"] = reverse_charge["page"]
|
|
203
178
|
|
|
204
179
|
|
|
205
180
|
def compute_score(args):
|
|
File without changes
|
{data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/constants_sandbox.py
RENAMED
|
File without changes
|
|
File without changes
|
{data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/docai_processor_config.yaml
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/postprocessing/common.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_science_document_ai-1.42.0 → data_science_document_ai-1.42.2}/src/prompts/prompt_library.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|