satif-ai 0.1.1a0__tar.gz → 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {satif_ai-0.1.1a0 → satif_ai-0.1.2}/PKG-INFO +3 -3
- {satif_ai-0.1.1a0 → satif_ai-0.1.2}/pyproject.toml +4 -3
- {satif_ai-0.1.1a0 → satif_ai-0.1.2}/satif_ai/adapters/tidy.py +1 -1
- satif_ai-0.1.2/satif_ai/standardizers/__init__.py +0 -0
- satif_ai-0.1.2/satif_ai/standardizers/ai_csv.py +662 -0
- {satif_ai-0.1.1a0 → satif_ai-0.1.2}/LICENSE +0 -0
- {satif_ai-0.1.1a0 → satif_ai-0.1.2}/README.md +0 -0
- {satif_ai-0.1.1a0 → satif_ai-0.1.2}/satif_ai/__init__.py +0 -0
- {satif_ai-0.1.1a0 → satif_ai-0.1.2}/satif_ai/adapters/__init__.py +0 -0
- {satif_ai-0.1.1a0 → satif_ai-0.1.2}/satif_ai/code_builders/__init__.py +0 -0
- {satif_ai-0.1.1a0 → satif_ai-0.1.2}/satif_ai/code_builders/adaptation.py +0 -0
- {satif_ai-0.1.1a0 → satif_ai-0.1.2}/satif_ai/code_builders/transformation.py +0 -0
- {satif_ai-0.1.1a0 → satif_ai-0.1.2}/satif_ai/plot_builders/__init__.py +0 -0
- {satif_ai-0.1.1a0 → satif_ai-0.1.2}/satif_ai/plot_builders/agent.py +0 -0
- {satif_ai-0.1.1a0 → satif_ai-0.1.2}/satif_ai/plot_builders/prompt.py +0 -0
- {satif_ai-0.1.1a0 → satif_ai-0.1.2}/satif_ai/plot_builders/tool.py +0 -0
{satif_ai-0.1.1a0 → satif_ai-0.1.2}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: satif-ai
-Version: 0.1.1a0
+Version: 0.1.2
 Summary: AI Agents for Satif
 License: MIT
 Author: Bryan Djafer
@@ -13,8 +13,8 @@ Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Requires-Dist: openai-agents (>=0.0.9,<0.0.10)
-Requires-Dist: satif-sdk (>=0.1.0,<0.
-Requires-Dist: sdif-mcp (>=0.1.0,<0.
+Requires-Dist: satif-sdk (>=0.1.0,<1.0.0)
+Requires-Dist: sdif-mcp (>=0.1.0,<1.0.0)
 Description-Content-Type: text/markdown
 
 # SATIF AI
{satif_ai-0.1.1a0 → satif_ai-0.1.2}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "satif-ai"
-version = "0.1.1a0"
+version = "0.1.2"
 description = "AI Agents for Satif"
 authors = [
     {name = "Bryan Djafer", email = "bryan.djafer@syncpulse.fr"}
@@ -12,8 +12,8 @@ requires-python = ">=3.10,<4.0"
 
 [tool.poetry.dependencies]
 openai-agents = ">=0.0.9,<0.0.10"
-satif-sdk = ">=0.1.0,<0.
-sdif-mcp = ">=0.1.0,<0.
+satif-sdk = ">=0.1.0,<1.0.0"
+sdif-mcp = ">=0.1.0,<1.0.0"
 
 [build-system]
 requires = ["poetry-core>=2.0.0,<3.0.0"]
@@ -28,6 +28,7 @@ satif-core = {path = "../core", develop = true}
 satif-sdk = {path = "../sdk", develop = true}
 sdif-mcp = {path = "../mcp", develop = true}
 sdif-db = {path = "../sdif", develop = true}
+ipykernel = "^6.29.5"
 
 
 
{satif_ai-0.1.1a0 → satif_ai-0.1.2}/satif_ai/adapters/tidy.py

@@ -11,11 +11,11 @@ from typing import Optional
 # MCP and Agent imports
 from agents import Agent, Runner, function_tool
 from agents.mcp.server import MCPServerStdio
-from libs.core.satif_core.types import Datasource
 from mcp import ClientSession
 
 # SATIF imports
 from satif_core.adapters.base import Adapter
+from satif_core.types import Datasource
 from satif_sdk import SDIFDatabase
 from satif_sdk.adapters.code import AdapterError, CodeAdapter
 
satif_ai-0.1.2/satif_ai/standardizers/__init__.py (new, empty file)

File without changes
satif_ai-0.1.2/satif_ai/standardizers/ai_csv.py (new file)

@@ -0,0 +1,662 @@
+import contextvars
+import csv
+import json
+import logging
+import re
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+import clevercsv
+from agents import Agent, Runner, function_tool
+from agents.mcp.server import MCPServerStdio
+from charset_normalizer import detect
+from mcp import ClientSession
+from satif_core.types import Datasource, SDIFPath
+from satif_sdk.standardizers.csv import (
+    DELIMITER_SAMPLE_SIZE,
+    ENCODING_SAMPLE_SIZE,
+    CSVStandardizer,
+    SkipColumnsConfig,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# TODO: maybe we want more analysis tools:
+# get empty rows or same values rows.
+
+
+# --- Agent Prompt Definition ---
+AI_CSV_PROMPT = """
+You are an expert CSV Data Standardization Agent. Your mission is to analyze a given CSV file and determine all necessary parameters and metadata so it can be correctly standardized into a well-structured SDIF table using the underlying CSVStandardizer.
+
+**CSV File Path:** {file_path}
+**Initial Guesses (Hints for you to verify or correct):**
+- Encoding: {initial_encoding}
+- Delimiter: '{initial_delimiter}'
+
+**Your Comprehensive Task:**
+
+1. **Core Parsing Parameters:**
+    * Determine the correct file `encoding` (string, e.g., "utf-8", "latin-1").
+    * Determine the correct `delimiter` (string, e.g., ",", ";", "\\t").
+    * Determine if a `has_header` row exists (boolean: true/false).
+    * Determine `skip_rows` (integer for initial N rows OR a list of 0-based specific row indices to skip, e.g., metadata blocks, comments, empty lines, repeated headers). Ensure the list is sorted and contains unique, non-negative integers.
+
+2. **Table Definition:**
+    * Generate a concise, descriptive, and SQL-safe `table_name` for the data in this CSV (string, snake_case preferred, e.g., "customer_orders_2023_q4"). This name will be sanitized by the system, but try to make it good.
+    * Optionally, generate a `table_description` (string) providing a brief semantic overview of what the entire table represents, especially if the `table_name` isn't fully self-explanatory. (e.g., "Contains quarterly sales data for all product lines."). Only provide if it adds clear value.
+
+3. **Column Analysis and Definition:**
+    * For **each column** you identify that should be included in the final table:
+        * `identifier_in_csv` (string): This is how the column is found in the *raw CSV data*.
+            * If `has_header` is true, this is the **exact original header name** from the CSV.
+            * If `has_header` is false, this is a **string representation of the 0-based column index** (e.g., "0", "1", "2").
+        * `final_column_name` (string): This is the desired name for the column in the SDIF database table. It **MUST** be:
+            * Clean and descriptive.
+            * Sanitized by you (snake_case, lowercase, no special characters besides underscore, no spaces). The system will also sanitize it, but aim for a good one.
+            * Potentially an improved/clarified version of the original header (e.g., fixing typos, expanding abbreviations).
+        * `description` (string, OPTIONAL): A concise semantic description of what the data in this specific column represents.
+            * **Provide this ONLY if the `final_column_name` is not entirely self-explanatory or if the column's content is ambiguous.**
+            * Focus on clarity and a human-understandable meaning. (e.g., for a column `order_total_usd`, a description might be "Total amount of the order in US Dollars, including taxes but excluding discounts.")
+            * If the `final_column_name` is very clear (e.g., `customer_email_address`), a separate description is likely NOT needed. Omit the field or set to null.
+
+4. **Final Output:**
+    * Respond ONLY with a single JSON object containing all the determined parameters and metadata.
+    * The JSON object MUST adhere strictly to the following structure:
+
+```json
+{{
+    "table_name": "...",
+    "table_description": null, // Or string value. Null or omit if not generated.
+    "encoding": "...",
+    "delimiter": "...",
+    "has_header": true/false,
+    "skip_rows": 0, // Integer for initial N, or sorted list of 0-based indices e.g. [0, 1, 5]
+    "columns": [
+        {{
+            "identifier_in_csv": "original_header_or_index_string",
+            "final_column_name": "sanitized_snake_case_name",
+            "description": null // Or string value. Null or omit if not generated.
+        }}
+        // ... more column objects
+    ]
+}}
+```
+
+**Tools Available:**
+- `read_csv_sample(encoding: str, delimiter: str, skip_initial_rows: int = 0, row_limit: int = 20, include_row_indices: bool = False)`: Reads a sample from the *beginning* of the file. Crucial for header and initial structure.
+- `read_raw_lines(encoding: str, line_limit: int = 50, start_line: int = 0)`: Reads raw lines. Useful for finding specific rows to skip (empty, repeated headers, footers) by their 0-based index.
+- `get_file_chunk(encoding: str, start_byte: int = 0, end_byte: int = 4096)`: Reads a raw chunk. Good for diagnosing encoding/delimiter issues if `read_csv_sample` returns garbled data or errors.
+
+**General Workflow Guidance:**
+1. **Initial Probe & Core Params:** Use `read_csv_sample` with initial hints (and `include_row_indices=True`) to examine the first few rows. Verify/correct `encoding` and `delimiter`. If `read_csv_sample` reports errors or shows garbled data, use `get_file_chunk` with different encodings to diagnose. Determine `has_header` by looking at the first non-skipped row.
+2. **Identify Skip Rows:**
+    * If there's metadata/comments at the top, determine how many initial rows to skip and use that for `skip_rows` (integer value).
+    * Use `read_raw_lines` to scan for other rows to skip (e.g., empty lines, comment lines, repeated headers mid-file, summary footers). Collect all 0-based indices of such rows. If you have specific indices, `skip_rows` should be a sorted list of these indices. If you only skip initial N rows, it's an integer.
+3. **Column Identification & Definition:**
+    * After settling `skip_rows` and `has_header`, call `read_csv_sample` again with `skip_initial_rows` set appropriately (if `skip_rows` is an int) to see the clean data rows and the header (if present).
+    * If `has_header` is true, the first row from this clean sample gives you the `identifier_in_csv` values (original header names).
+    * If `has_header` is false, the `identifier_in_csv` for each column will be its 0-based index as a string (e.g., "0", "1", "2", ... for as many columns as you see in the first data row).
+    * For each column you decide to include:
+        * Determine its `identifier_in_csv`.
+        * Create a clean, descriptive `final_column_name` (snake_case).
+        * If (and ONLY IF) necessary, write a `description` for that column.
+4. **Table Naming & Description:** Based on the clean data and column names, formulate a `table_name` and, if valuable, a `table_description`.
+5. **Construct Final JSON:** Ensure your output is ONLY the JSON object, perfectly matching the specified schema. Pay close attention to the format of `skip_rows` and how optional fields (`table_description`, column `description`) are handled (either omit the key or set its value to `null`).
+"""
+
+
+# --- Tool Context Manager ---
+class AIStandardizerToolContext:
+    def __init__(self, file_path: Path):
+        self.file_path = file_path
+        self._original_context = None
+
+    def __enter__(self):
+        global _CURRENT_AI_CSV_TOOL_CONTEXT
+        self._original_context = _CURRENT_AI_CSV_TOOL_CONTEXT.set(self)
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        global _CURRENT_AI_CSV_TOOL_CONTEXT
+        if self._original_context is not None:
+            _CURRENT_AI_CSV_TOOL_CONTEXT.reset(self._original_context)
+            self._original_context = None
+
+
+_CURRENT_AI_CSV_TOOL_CONTEXT: contextvars.ContextVar[
+    Optional[AIStandardizerToolContext]
+] = contextvars.ContextVar("current_ai_csv_tool_context", default=None)
+
+
+# --- Tool Implementations (Assumed to be largely the same, ensure JSON output is robust) ---
+@function_tool
+async def read_csv_sample(
+    encoding: str,
+    delimiter: str,
+    skip_initial_rows: int | None,  # Made optional to match agent's potential calls
+    row_limit: int | None,
+    include_row_indices: bool | None,
+) -> str:
+    if skip_initial_rows is None:
+        skip_initial_rows = 0
+    if row_limit is None:
+        row_limit = 20
+    if include_row_indices is None:
+        include_row_indices = False
+
+    context = _CURRENT_AI_CSV_TOOL_CONTEXT.get()
+    if not context or not context.file_path or not context.file_path.exists():
+        return json.dumps({"error": "File path not found in tool context."})
+
+    rows = []
+    error_message = None
+    processed_row_count = 0
+    actual_skipped_count = 0
+    try:
+        with open(context.file_path, encoding=encoding, newline="") as f:
+            for i in range(skip_initial_rows):
+                try:
+                    next(f)
+                    actual_skipped_count += 1
+                except StopIteration:
+                    error_message = f"EOF reached while skipping initial {skip_initial_rows} rows (skipped {actual_skipped_count})."
+                    break
+            if error_message:
+                return json.dumps(
+                    {"error": error_message, "rows": [], "processed_row_count": 0}
+                )
+
+            reader = csv.reader(f, delimiter=delimiter)
+            current_read_index = actual_skipped_count
+            for i, row_fields in enumerate(reader):
+                if i >= row_limit:
+                    break
+                processed_row_count += 1
+                if include_row_indices:
+                    rows.append([current_read_index] + row_fields)
+                else:
+                    rows.append(row_fields)
+                current_read_index += 1
+        return json.dumps(
+            {"rows": rows, "processed_row_count": processed_row_count, "error": None}
+        )
+    except UnicodeDecodeError as e:
+        error_message = f"Encoding error: {e}. Used encoding '{encoding}'."
+    except csv.Error as e:
+        error_message = f"CSV parsing error: {e}. Used delimiter '{delimiter}'. Check if delimiter is correct."
+    except StopIteration:
+        error_message = (
+            "Reached end of file unexpectedly." if processed_row_count == 0 else None
+        )
+    except Exception as e:
+        logger.error(f"Unexpected error in read_csv_sample tool: {e}", exc_info=True)
+        error_message = f"Unexpected error reading sample: {str(e)}"
+    return json.dumps(
+        {
+            "error": error_message,
+            "rows": rows,
+            "processed_row_count": processed_row_count,
+        }
+    )
+
+
+@function_tool
+async def read_raw_lines(
+    encoding: str, line_limit: int | None, start_line: int | None
+) -> str:
+    if line_limit is None:
+        line_limit = 50
+    if start_line is None:
+        start_line = 0
+
+    context = _CURRENT_AI_CSV_TOOL_CONTEXT.get()
+    if not context or not context.file_path or not context.file_path.exists():
+        return json.dumps({"error": "File path not found in tool context."})
+    if start_line < 0:
+        return json.dumps({"error": "start_line cannot be negative."})
+
+    lines = []
+    error_message = None
+    actual_start_line = 0
+    lines_read_count = 0
+    try:
+        with open(context.file_path, encoding=encoding, newline="") as f:
+            for i in range(start_line):
+                try:
+                    next(f)
+                    actual_start_line += 1
+                except StopIteration:
+                    error_message = f"EOF reached while skipping to start_line {start_line} (skipped {actual_start_line})."
+                    break
+            if error_message:
+                return json.dumps(
+                    {
+                        "error": error_message,
+                        "lines": [],
+                        "start_line_processed": actual_start_line,
+                        "lines_read_count": 0,
+                    }
+                )
+
+            for i, line in enumerate(f):
+                if i >= line_limit:
+                    break
+                lines.append(line.rstrip("\r\n"))
+                lines_read_count += 1
+        return json.dumps(
+            {
+                "lines": lines,
+                "start_line_processed": actual_start_line,
+                "lines_read_count": lines_read_count,
+                "error": None,
+            }
+        )
+    except UnicodeDecodeError as e:
+        error_message = f"Encoding error: {e}. Used encoding '{encoding}'."
+    except StopIteration:
+        error_message = (
+            "Reached end of file unexpectedly." if lines_read_count == 0 else None
+        )
+    except Exception as e:
+        logger.error(f"Unexpected error in read_raw_lines tool: {e}", exc_info=True)
+        error_message = f"Unexpected error reading raw lines: {str(e)}"
+    return json.dumps(
+        {
+            "error": error_message,
+            "lines": lines,
+            "start_line_processed": actual_start_line,
+            "lines_read_count": lines_read_count,
+        }
+    )
+
+
+@function_tool
+async def get_file_chunk(
+    encoding: str, start_byte: int | None, end_byte: int | None
+) -> str:
+    if start_byte is None:
+        start_byte = 0
+    if end_byte is None:
+        end_byte = 4096
+    context = _CURRENT_AI_CSV_TOOL_CONTEXT.get()
+    if not context or not context.file_path or not context.file_path.exists():
+        return json.dumps({"error": "File path not found in tool context."})
+    if start_byte < 0 or end_byte < start_byte:
+        return json.dumps({"error": "Invalid byte range specified."})
+
+    chunk_text = ""
+    error_message = None
+    bytes_read = 0
+    try:
+        with open(context.file_path, "rb") as fb:
+            file_size = context.file_path.stat().st_size
+            effective_start_byte = min(start_byte, file_size)
+            fb.seek(effective_start_byte)
+            bytes_to_read = max(0, min(end_byte, file_size) - effective_start_byte)
+            if bytes_to_read > 0:
+                chunk_bytes = fb.read(bytes_to_read)
+                bytes_read = len(chunk_bytes)
+                chunk_text = chunk_bytes.decode(encoding, errors="replace")
+            else:
+                chunk_text = ""
+        return json.dumps(
+            {
+                "chunk": chunk_text,
+                "bytes_read": bytes_read,
+                "requested_range": [start_byte, end_byte],
+                "error": None,
+            }
+        )
+    except (UnicodeDecodeError, ValueError) as e:
+        error_message = f"Failed to decode file chunk: {e}. Used encoding '{encoding}'."
+    except OSError as e:
+        error_message = f"File read error: {e}."
+    except Exception as e:
+        logger.error(f"Unexpected error in get_file_chunk tool: {e}", exc_info=True)
+        error_message = f"Unexpected error reading file chunk: {str(e)}"
+    return json.dumps(
+        {
+            "error": error_message,
+            "chunk": chunk_text,
+            "bytes_read": bytes_read,
+            "requested_range": [start_byte, end_byte],
+        }
+    )
+
+
+# --- AICSVStandardizer Class ---
+class AICSVStandardizer(CSVStandardizer):  # Inherits from the enhanced CSVStandardizer
+    def __init__(
+        self,
+        mcp_server: Optional[MCPServerStdio] = None,
+        mcp_session: Optional[ClientSession] = None,
+        llm_model: str = "gpt-4.1-2025-04-14",
+        # --- Initial Hints (Optional) ---
+        initial_delimiter: Optional[str] = None,
+        initial_encoding: Optional[str] = None,
+        # --- Base Class Args Passthrough (some will be overridden by AI) ---
+        default_skip_columns: SkipColumnsConfig = None,  # Keep for base if AI doesn't define cols
+    ):
+        super().__init__(
+            delimiter=None,  # AI will determine
+            encoding=None,  # AI will determine
+            has_header=True,  # AI will determine
+            skip_rows=0,  # AI will determine
+            skip_columns=default_skip_columns,  # Can still be a fallback
+            descriptions=None,  # AI will generate table_description
+            table_names=None,  # AI will generate table_name
+            file_configs=None,  # AI provides all config for the one file
+            column_definitions=None,  # AI will generate column definitions
+        )
+
+        self.mcp_servers = [mcp_server] if mcp_server else []
+        self.mcp_session = mcp_session
+        self.llm_model = llm_model
+        self._initial_delimiter_hint = initial_delimiter
+        self._initial_encoding_hint = initial_encoding
+        # self.generate_description from prompt structure (table_description, column descriptions)
+
+    async def _run_analysis_agent(
+        self,
+        file_path: Path,
+        initial_encoding: str,
+        initial_delimiter: str,
+    ) -> Dict[str, Any]:
+        with AIStandardizerToolContext(file_path):
+            prompt = AI_CSV_PROMPT.format(
+                file_path=str(file_path),
+                initial_encoding=initial_encoding,
+                initial_delimiter=initial_delimiter,
+            )
+            agent = Agent(
+                name="CSV Detail Analyzer Agent",
+                mcp_servers=self.mcp_servers,
+                tools=[read_csv_sample, read_raw_lines, get_file_chunk],
+                model=self.llm_model,
+            )
+            logger.info(f"Running CSV Detail Analyzer Agent for {file_path.name}...")
+            result = await Runner.run(agent, input=prompt)
+
+            if not result or not result.final_output:
+                raise RuntimeError(
+                    f"Agent execution failed or returned no output for {file_path.name}."
+                )
+            logger.info(
+                f"Agent for {file_path.name} finished. Raw output preview: {result.final_output[:500]}..."
+            )
+
+            try:
+                final_params_text = result.final_output.strip()
+                match = re.search(r"```(?:json)?(.*)```", final_params_text, re.DOTALL)
+                if match:
+                    final_params_text = match.group(1).strip()
+
+                ai_output = json.loads(final_params_text)
+
+                # --- Validate Agent Output Structure ---
+                if not isinstance(ai_output, dict):
+                    raise ValueError("Agent did not return a valid JSON object.")
+
+                required_top_keys = {
+                    "table_name",
+                    "encoding",
+                    "delimiter",
+                    "has_header",
+                    "skip_rows",
+                    "columns",
+                }
+                if not required_top_keys.issubset(ai_output.keys()):
+                    missing = required_top_keys - ai_output.keys()
+                    raise ValueError(
+                        f"Agent JSON missing required top-level keys: {missing}"
+                    )
+
+                if not isinstance(ai_output["columns"], list):
+                    raise ValueError("Agent JSON 'columns' must be a list.")
+                if not ai_output["columns"]:  # Must have at least one column defined
+                    raise ValueError("Agent JSON 'columns' list cannot be empty.")
+
+                for col_spec in ai_output["columns"]:
+                    if not isinstance(col_spec, dict):
+                        raise ValueError(
+                            f"Each item in 'columns' list must be a dictionary. Found: {type(col_spec)}"
+                        )
+                    req_col_keys = {"identifier_in_csv", "final_column_name"}
+                    if not req_col_keys.issubset(col_spec.keys()):
+                        missing_col_keys = req_col_keys - col_spec.keys()
+                        raise ValueError(
+                            f"Column spec {col_spec.get('final_column_name', 'N/A')} missing keys: {missing_col_keys}"
+                        )
+                    # Ensure description is present, even if None (or agent omits it)
+                    if "description" not in col_spec:
+                        col_spec["description"] = None
+
+                sr = ai_output["skip_rows"]
+                if not isinstance(sr, int) and not (
+                    isinstance(sr, list) and all(isinstance(i, int) for i in sr)
+                ):
+                    raise ValueError(
+                        f"Agent JSON 'skip_rows' must be an integer or list of integers, got {type(sr)}"
+                    )
+                if isinstance(sr, list):
+                    ai_output["skip_rows"] = sorted(list(set(i for i in sr if i >= 0)))
+
+                # Ensure table_description is present, even if None
+                if "table_description" not in ai_output:
+                    ai_output["table_description"] = None
+
+                logger.info(
+                    f"Agent successfully determined parameters for {file_path.name}"
+                )
+                return ai_output
+            except json.JSONDecodeError as e:
+                logger.error(
+                    f"Agent for {file_path.name} did not return valid JSON: {e}. Output: {result.final_output}",
+                    exc_info=True,
+                )
+                raise ValueError(
+                    f"Agent failed to produce valid JSON output for {file_path.name}."
+                ) from e
+            except ValueError as e:  # Catch our custom validation errors
+                logger.error(
+                    f"Invalid JSON structure or content from agent for {file_path.name}: {e}. Output: {result.final_output}",
+                    exc_info=True,
+                )
+                raise e  # Re-raise
+
+    async def standardize(
+        self,
+        datasource: Datasource,
+        output_path: SDIFPath,  # Corrected name from output_sdif
+        *,
+        overwrite: bool = False,
+        config: Optional[Dict[str, Any]] = None,
+        **kwargs,
+    ) -> Path:
+        output_path_obj = Path(output_path)
+        if isinstance(datasource, (str, Path)):
+            input_paths = [Path(datasource)]
+        elif isinstance(datasource, list) and all(
+            isinstance(p, (str, Path)) for p in datasource
+        ):
+            input_paths = [Path(p) for p in datasource]
+            if len(input_paths) > 1:
+                logger.warning(
+                    "AICSVStandardizer currently processes one CSV file at a time for detailed AI analysis. Using the first file only."
+                )
+                input_paths = [input_paths[0]]  # Process one file if multiple given
+        else:
+            raise TypeError(
+                "datasource must be a file path string/Path object or a list of such paths."
+            )
+
+        if not input_paths:
+            raise ValueError("No input datasource provided.")
+
+        input_path = input_paths[
+            0
+        ]  # We focus on a single file for this AI standardizer
+
+        if not input_path.exists() or not input_path.is_file():
+            raise FileNotFoundError(
+                f"Input CSV file not found or is not a file: {input_path}"
+            )
+
+        logger.info(f"--- AI Analysis for file: {input_path.name} ---")
+
+        # 1. Initial Guesses for AI
+        initial_encoding_guess = self._initial_encoding_hint
+        if not initial_encoding_guess:
+            try:
+                # Use base class's _detect_encoding, need an instance or make it static/helper
+                # For simplicity, re-implement or call a static version if available.
+                # Here, we simulate it for now or assume base standardizer's helper is callable.
+                with open(input_path, "rb") as fb_enc:
+                    enc_sample = fb_enc.read(ENCODING_SAMPLE_SIZE)
+                detected_enc_info = detect(enc_sample) if enc_sample else None
+                initial_encoding_guess = (
+                    detected_enc_info["encoding"]
+                    if detected_enc_info and detected_enc_info["encoding"]
+                    else "utf-8"
+                )
+                logger.info(
+                    f"Initial encoding guess (detected): {initial_encoding_guess}"
+                )
+            except Exception as e:
+                logger.warning(
+                    f"Initial encoding detection failed: {e}. Using utf-8 as fallback guess."
+                )
+                initial_encoding_guess = "utf-8"
+        else:
+            logger.info(
+                f"Using provided initial encoding hint: {initial_encoding_guess}"
+            )
+
+        initial_delimiter_guess = self._initial_delimiter_hint
+        if not initial_delimiter_guess:
+            try:
+                with open(
+                    input_path, encoding=initial_encoding_guess, errors="ignore"
+                ) as f_delim_sample:
+                    delim_sample_text = f_delim_sample.read(DELIMITER_SAMPLE_SIZE)
+                if delim_sample_text:
+                    # Simulate base class's _detect_delimiter
+                    sniffer = clevercsv.Sniffer()
+                    dialect = sniffer.sniff(delim_sample_text)
+                    initial_delimiter_guess = dialect.delimiter if dialect else ","
+                    logger.info(
+                        f"Initial delimiter guess (detected): '{initial_delimiter_guess}'"
+                    )
+                else:
+                    initial_delimiter_guess = ","  # Fallback
+                    logger.warning(
+                        f"File empty/small, defaulting delimiter guess to ',' for {input_path.name}"
+                    )
+            except Exception as e:
+                logger.warning(
+                    f"Initial delimiter detection failed ({e}). Using ',' as fallback guess for {input_path.name}."
+                )
+                initial_delimiter_guess = ","
+        else:
+            logger.info(
+                f"Using provided initial delimiter hint: '{initial_delimiter_guess}'"
+            )
+
+        # 2. Run AI Agent Analysis
+        try:
+            ai_params = await self._run_analysis_agent(
+                input_path,
+                initial_encoding_guess,
+                initial_delimiter_guess,
+            )
+        except Exception as e:
+            logger.exception(
+                f"AI Agent analysis failed critically for {input_path.name}. Aborting."
+            )
+            raise RuntimeError(f"AI analysis failed for {input_path.name}") from e
+
+        # 3. Prepare parameters for the base CSVStandardizer
+        # The AI provides parameters for a single file processing scenario.
+
+        # Column definitions for the base standardizer:
+        # Base class expects: List[Optional[Dict[str, List[Dict[str, Any]]]]]
+        # For a single file, it's List containing one Dict: [{table_name: [col_specs...]}]
+        # Or, if base class is adapted, List containing one List: [[col_specs...]]
+
+        # The AI output `ai_params["columns"]` is already in the format:
+        # [{"identifier_in_csv": ..., "final_column_name": ..., "description": ...}, ...]
+        # This is exactly what the enhanced CSVStandardizer's `_setup_columns` expects for `defined_columns_spec`
+        # when `column_definitions` is a list containing this list of specs.
+
+        ai_column_definitions = [
+            ai_params["columns"]
+        ]  # Wrap the list of col specs for the single file/table
+
+        # The base CSVStandardizer will use its own _sanitize_name for the table name from AI.
+        # We provide it via table_names list.
+        ai_table_name = [ai_params["table_name"]]
+        ai_table_description = [
+            ai_params.get("table_description")
+        ]  # List of one description
+
+        # File-specific config for the base standardizer
+        # For a single file, this will be a list containing one dictionary.
+        file_specific_config = [
+            {
+                "encoding": ai_params["encoding"],
+                "delimiter": ai_params["delimiter"],
+                "has_header": ai_params["has_header"],
+                "skip_rows": ai_params["skip_rows"],
+                # skip_columns is not used if column_definitions are provided,
+                # as column selection is implicit in the provided definitions.
+                "skip_columns": None,  # Explicitly set to None
+            }
+        ]
+
+        logger.info(f"AI determined parameters for {input_path.name}:")
+        logger.info(f"  Table Name: {ai_table_name[0]}")
+        logger.info(f"  Encoding: {file_specific_config[0]['encoding']}")
+        logger.info(f"  Delimiter: '{file_specific_config[0]['delimiter']}'")
+        logger.info(f"  Has Header: {file_specific_config[0]['has_header']}")
+        logger.info(f"  Skip Rows: {file_specific_config[0]['skip_rows']}")
+        logger.info(
+            f"  Table Description: {ai_table_description[0] if ai_table_description and ai_table_description[0] is not None else 'N/A'}"
+        )
+        logger.info(f"  Column Definitions ({len(ai_column_definitions[0])} cols):")
+        for i, c_def in enumerate(ai_column_definitions[0]):
+            logger.info(
+                f"    {i + 1}. ID in CSV: '{c_def['identifier_in_csv']}', Final Name: '{c_def['final_column_name']}', Desc: '{c_def.get('description', 'N/A')}'"
+            )
+
+        # 4. Call Base Class Standardizer Logic with AI-derived parameters
+        # We instantiate a new CSVStandardizer configured by the AI for this specific file.
+        final_processor = CSVStandardizer(
+            # These are now single-element lists because we process one file
+            table_names=ai_table_name,
+            descriptions=ai_table_description,
+            file_configs=file_specific_config,
+            column_definitions=ai_column_definitions,  # Pass the AI-generated column specs
+            # default_skip_columns from __init__ can remain as a very deep fallback if AI somehow fails for columns
+            skip_columns=self.default_skip_columns,
+        )
+
+        try:
+            # The datasource for the base standardizer is the single input_path
+            result_path = final_processor.standardize(
+                datasource=[input_path],  # Pass as a list of one
+                output_path=output_path_obj,
+                overwrite=overwrite,
+            )
+            logger.info(
+                f"AI CSV Standardization complete for {input_path.name}. Output: {result_path}"
+            )
+            return result_path
+        except Exception as e:
+            logger.exception(
+                f"Error during final standardization step using AI parameters for {input_path.name}: {e}"
+            )
+            raise RuntimeError(
+                f"Final standardization step failed for {input_path.name}."
+            ) from e
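For orientation, below is a minimal usage sketch of the `AICSVStandardizer` introduced in this release. It is an illustration, not part of the package: the file names are hypothetical, the class is imported from its module path (the new `standardizers/__init__.py` is empty), and it assumes the `openai-agents` runtime is configured with valid model credentials.

```python
import asyncio

from satif_ai.standardizers.ai_csv import AICSVStandardizer


async def main() -> None:
    # The agent determines encoding, delimiter, header presence, rows to skip,
    # table name and column definitions, then delegates the actual conversion
    # to a CSVStandardizer configured with those parameters.
    standardizer = AICSVStandardizer()
    result = await standardizer.standardize(
        datasource="orders.csv",    # hypothetical input CSV
        output_path="orders.sdif",  # hypothetical output SDIF path
        overwrite=True,
    )
    print(f"Standardized SDIF written to {result}")


if __name__ == "__main__":
    asyncio.run(main())
```

The call mirrors the flow inside `standardize()` above: the agent analysis yields the parsing parameters and column specs, and a freshly instantiated `CSVStandardizer` performs the standardization.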
Files without changes:

- {satif_ai-0.1.1a0 → satif_ai-0.1.2}/LICENSE
- {satif_ai-0.1.1a0 → satif_ai-0.1.2}/README.md
- {satif_ai-0.1.1a0 → satif_ai-0.1.2}/satif_ai/__init__.py
- {satif_ai-0.1.1a0 → satif_ai-0.1.2}/satif_ai/adapters/__init__.py
- {satif_ai-0.1.1a0 → satif_ai-0.1.2}/satif_ai/code_builders/__init__.py
- {satif_ai-0.1.1a0 → satif_ai-0.1.2}/satif_ai/code_builders/adaptation.py
- {satif_ai-0.1.1a0 → satif_ai-0.1.2}/satif_ai/code_builders/transformation.py
- {satif_ai-0.1.1a0 → satif_ai-0.1.2}/satif_ai/plot_builders/__init__.py
- {satif_ai-0.1.1a0 → satif_ai-0.1.2}/satif_ai/plot_builders/agent.py
- {satif_ai-0.1.1a0 → satif_ai-0.1.2}/satif_ai/plot_builders/prompt.py
- {satif_ai-0.1.1a0 → satif_ai-0.1.2}/satif_ai/plot_builders/tool.py