satif-ai 0.1.1a0__tar.gz → 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: satif-ai
-Version: 0.1.1a0
+Version: 0.1.2
 Summary: AI Agents for Satif
 License: MIT
 Author: Bryan Djafer
@@ -13,8 +13,8 @@ Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Requires-Dist: openai-agents (>=0.0.9,<0.0.10)
-Requires-Dist: satif-sdk (>=0.1.0,<0.2.0)
-Requires-Dist: sdif-mcp (>=0.1.0,<0.2.0)
+Requires-Dist: satif-sdk (>=0.1.0,<1.0.0)
+Requires-Dist: sdif-mcp (>=0.1.0,<1.0.0)
 Description-Content-Type: text/markdown

 # SATIF AI
@@ -1,6 +1,6 @@
 [project]
 name = "satif-ai"
-version = "0.1.1a0"
+version = "0.1.2"
 description = "AI Agents for Satif"
 authors = [
     {name = "Bryan Djafer", email = "bryan.djafer@syncpulse.fr"}
@@ -12,8 +12,8 @@ requires-python = ">=3.10,<4.0"

 [tool.poetry.dependencies]
 openai-agents = ">=0.0.9,<0.0.10"
-satif-sdk = ">=0.1.0,<0.2.0"
-sdif-mcp = ">=0.1.0,<0.2.0"
+satif-sdk = ">=0.1.0,<1.0.0"
+sdif-mcp = ">=0.1.0,<1.0.0"

 [build-system]
 requires = ["poetry-core>=2.0.0,<3.0.0"]
@@ -28,6 +28,7 @@ satif-core = {path = "../core", develop = true}
 satif-sdk = {path = "../sdk", develop = true}
 sdif-mcp = {path = "../mcp", develop = true}
 sdif-db = {path = "../sdif", develop = true}
+ipykernel = "^6.29.5"


@@ -11,11 +11,11 @@ from typing import Optional
 # MCP and Agent imports
 from agents import Agent, Runner, function_tool
 from agents.mcp.server import MCPServerStdio
-from libs.core.satif_core.types import Datasource
 from mcp import ClientSession

 # SATIF imports
 from satif_core.adapters.base import Adapter
+from satif_core.types import Datasource
 from satif_sdk import SDIFDatabase
 from satif_sdk.adapters.code import AdapterError, CodeAdapter

@@ -0,0 +1,662 @@
+import contextvars
+import csv
+import json
+import logging
+import re
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+import clevercsv
+from agents import Agent, Runner, function_tool
+from agents.mcp.server import MCPServerStdio
+from charset_normalizer import detect
+from mcp import ClientSession
+from satif_core.types import Datasource, SDIFPath
+from satif_sdk.standardizers.csv import (
+    DELIMITER_SAMPLE_SIZE,
+    ENCODING_SAMPLE_SIZE,
+    CSVStandardizer,
+    SkipColumnsConfig,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# TODO: maybe we want more analysis tools:
+# get empty rows or same values rows.
+
+
+# --- Agent Prompt Definition ---
+AI_CSV_PROMPT = """
+You are an expert CSV Data Standardization Agent. Your mission is to analyze a given CSV file and determine all necessary parameters and metadata so it can be correctly standardized into a well-structured SDIF table using the underlying CSVStandardizer.
+
+**CSV File Path:** {file_path}
+**Initial Guesses (Hints for you to verify or correct):**
+- Encoding: {initial_encoding}
+- Delimiter: '{initial_delimiter}'
+
+**Your Comprehensive Task:**
+
+1. **Core Parsing Parameters:**
+    * Determine the correct file `encoding` (string, e.g., "utf-8", "latin-1").
+    * Determine the correct `delimiter` (string, e.g., ",", ";", "\\t").
+    * Determine if a `has_header` row exists (boolean: true/false).
+    * Determine `skip_rows` (integer for initial N rows OR a list of 0-based specific row indices to skip, e.g., metadata blocks, comments, empty lines, repeated headers). Ensure the list is sorted and contains unique, non-negative integers.
+
+2. **Table Definition:**
+    * Generate a concise, descriptive, and SQL-safe `table_name` for the data in this CSV (string, snake_case preferred, e.g., "customer_orders_2023_q4"). This name will be sanitized by the system, but try to make it good.
+    * Optionally, generate a `table_description` (string) providing a brief semantic overview of what the entire table represents, especially if the `table_name` isn't fully self-explanatory. (e.g., "Contains quarterly sales data for all product lines."). Only provide if it adds clear value.
+
+3. **Column Analysis and Definition:**
+    * For **each column** you identify that should be included in the final table:
+        * `identifier_in_csv` (string): This is how the column is found in the *raw CSV data*.
+            * If `has_header` is true, this is the **exact original header name** from the CSV.
+            * If `has_header` is false, this is a **string representation of the 0-based column index** (e.g., "0", "1", "2").
+        * `final_column_name` (string): This is the desired name for the column in the SDIF database table. It **MUST** be:
+            * Clean and descriptive.
+            * Sanitized by you (snake_case, lowercase, no special characters besides underscore, no spaces). The system will also sanitize it, but aim for a good one.
+            * Potentially an improved/clarified version of the original header (e.g., fixing typos, expanding abbreviations).
+        * `description` (string, OPTIONAL): A concise semantic description of what the data in this specific column represents.
+            * **Provide this ONLY if the `final_column_name` is not entirely self-explanatory or if the column's content is ambiguous.**
+            * Focus on clarity and a human-understandable meaning. (e.g., for a column `order_total_usd`, a description might be "Total amount of the order in US Dollars, including taxes but excluding discounts.")
+            * If the `final_column_name` is very clear (e.g., `customer_email_address`), a separate description is likely NOT needed. Omit the field or set to null.
+
+4. **Final Output:**
+    * Respond ONLY with a single JSON object containing all the determined parameters and metadata.
+    * The JSON object MUST adhere strictly to the following structure:
+
+```json
+{{
+  "table_name": "...",
+  "table_description": null, // Or string value. Null or omit if not generated.
+  "encoding": "...",
+  "delimiter": "...",
+  "has_header": true/false,
+  "skip_rows": 0, // Integer for initial N, or sorted list of 0-based indices e.g. [0, 1, 5]
+  "columns": [
+    {{
+      "identifier_in_csv": "original_header_or_index_string",
+      "final_column_name": "sanitized_snake_case_name",
+      "description": null // Or string value. Null or omit if not generated.
+    }}
+    // ... more column objects
+  ]
+}}
+```
+
+**Tools Available:**
+- `read_csv_sample(encoding: str, delimiter: str, skip_initial_rows: int = 0, row_limit: int = 20, include_row_indices: bool = False)`: Reads a sample from the *beginning* of the file. Crucial for header and initial structure.
+- `read_raw_lines(encoding: str, line_limit: int = 50, start_line: int = 0)`: Reads raw lines. Useful for finding specific rows to skip (empty, repeated headers, footers) by their 0-based index.
+- `get_file_chunk(encoding: str, start_byte: int = 0, end_byte: int = 4096)`: Reads a raw chunk. Good for diagnosing encoding/delimiter issues if `read_csv_sample` returns garbled data or errors.
+
+**General Workflow Guidance:**
+1. **Initial Probe & Core Params:** Use `read_csv_sample` with initial hints (and `include_row_indices=True`) to examine the first few rows. Verify/correct `encoding` and `delimiter`. If `read_csv_sample` reports errors or shows garbled data, use `get_file_chunk` with different encodings to diagnose. Determine `has_header` by looking at the first non-skipped row.
+2. **Identify Skip Rows:**
+    * If there's metadata/comments at the top, determine how many initial rows to skip and use that for `skip_rows` (integer value).
+    * Use `read_raw_lines` to scan for other rows to skip (e.g., empty lines, comment lines, repeated headers mid-file, summary footers). Collect all 0-based indices of such rows. If you have specific indices, `skip_rows` should be a sorted list of these indices. If you only skip initial N rows, it's an integer.
+3. **Column Identification & Definition:**
+    * After settling `skip_rows` and `has_header`, call `read_csv_sample` again with `skip_initial_rows` set appropriately (if `skip_rows` is an int) to see the clean data rows and the header (if present).
+    * If `has_header` is true, the first row from this clean sample gives you the `identifier_in_csv` values (original header names).
+    * If `has_header` is false, the `identifier_in_csv` for each column will be its 0-based index as a string (e.g., "0", "1", "2", ... for as many columns as you see in the first data row).
+    * For each column you decide to include:
+        * Determine its `identifier_in_csv`.
+        * Create a clean, descriptive `final_column_name` (snake_case).
+        * If (and ONLY IF) necessary, write a `description` for that column.
+4. **Table Naming & Description:** Based on the clean data and column names, formulate a `table_name` and, if valuable, a `table_description`.
+5. **Construct Final JSON:** Ensure your output is ONLY the JSON object, perfectly matching the specified schema. Pay close attention to the format of `skip_rows` and how optional fields (`table_description`, column `description`) are handled (either omit the key or set its value to `null`).
+"""
+
+
+# --- Tool Context Manager ---
+class AIStandardizerToolContext:
+    def __init__(self, file_path: Path):
+        self.file_path = file_path
+        self._original_context = None
+
+    def __enter__(self):
+        global _CURRENT_AI_CSV_TOOL_CONTEXT
+        self._original_context = _CURRENT_AI_CSV_TOOL_CONTEXT.set(self)
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        global _CURRENT_AI_CSV_TOOL_CONTEXT
+        if self._original_context is not None:
+            _CURRENT_AI_CSV_TOOL_CONTEXT.reset(self._original_context)
+            self._original_context = None
+
+
+_CURRENT_AI_CSV_TOOL_CONTEXT: contextvars.ContextVar[
+    Optional[AIStandardizerToolContext]
+] = contextvars.ContextVar("current_ai_csv_tool_context", default=None)
+
+
+# --- Tool Implementations (Assumed to be largely the same, ensure JSON output is robust) ---
+@function_tool
+async def read_csv_sample(
+    encoding: str,
+    delimiter: str,
+    skip_initial_rows: int | None,  # Made optional to match agent's potential calls
+    row_limit: int | None,
+    include_row_indices: bool | None,
+) -> str:
+    if skip_initial_rows is None:
+        skip_initial_rows = 0
+    if row_limit is None:
+        row_limit = 20
+    if include_row_indices is None:
+        include_row_indices = False
+
+    context = _CURRENT_AI_CSV_TOOL_CONTEXT.get()
+    if not context or not context.file_path or not context.file_path.exists():
+        return json.dumps({"error": "File path not found in tool context."})
+
+    rows = []
+    error_message = None
+    processed_row_count = 0
+    actual_skipped_count = 0
+    try:
+        with open(context.file_path, encoding=encoding, newline="") as f:
+            for i in range(skip_initial_rows):
+                try:
+                    next(f)
+                    actual_skipped_count += 1
+                except StopIteration:
+                    error_message = f"EOF reached while skipping initial {skip_initial_rows} rows (skipped {actual_skipped_count})."
+                    break
+            if error_message:
+                return json.dumps(
+                    {"error": error_message, "rows": [], "processed_row_count": 0}
+                )
+
+            reader = csv.reader(f, delimiter=delimiter)
+            current_read_index = actual_skipped_count
+            for i, row_fields in enumerate(reader):
+                if i >= row_limit:
+                    break
+                processed_row_count += 1
+                if include_row_indices:
+                    rows.append([current_read_index] + row_fields)
+                else:
+                    rows.append(row_fields)
+                current_read_index += 1
+        return json.dumps(
+            {"rows": rows, "processed_row_count": processed_row_count, "error": None}
+        )
+    except UnicodeDecodeError as e:
+        error_message = f"Encoding error: {e}. Used encoding '{encoding}'."
+    except csv.Error as e:
+        error_message = f"CSV parsing error: {e}. Used delimiter '{delimiter}'. Check if delimiter is correct."
+    except StopIteration:
+        error_message = (
+            "Reached end of file unexpectedly." if processed_row_count == 0 else None
+        )
+    except Exception as e:
+        logger.error(f"Unexpected error in read_csv_sample tool: {e}", exc_info=True)
+        error_message = f"Unexpected error reading sample: {str(e)}"
+    return json.dumps(
+        {
+            "error": error_message,
+            "rows": rows,
+            "processed_row_count": processed_row_count,
+        }
+    )
+
+
+@function_tool
+async def read_raw_lines(
+    encoding: str, line_limit: int | None, start_line: int | None
+) -> str:
+    if line_limit is None:
+        line_limit = 50
+    if start_line is None:
+        start_line = 0
+
+    context = _CURRENT_AI_CSV_TOOL_CONTEXT.get()
+    if not context or not context.file_path or not context.file_path.exists():
+        return json.dumps({"error": "File path not found in tool context."})
+    if start_line < 0:
+        return json.dumps({"error": "start_line cannot be negative."})
+
+    lines = []
+    error_message = None
+    actual_start_line = 0
+    lines_read_count = 0
+    try:
+        with open(context.file_path, encoding=encoding, newline="") as f:
+            for i in range(start_line):
+                try:
+                    next(f)
+                    actual_start_line += 1
+                except StopIteration:
+                    error_message = f"EOF reached while skipping to start_line {start_line} (skipped {actual_start_line})."
+                    break
+            if error_message:
+                return json.dumps(
+                    {
+                        "error": error_message,
+                        "lines": [],
+                        "start_line_processed": actual_start_line,
+                        "lines_read_count": 0,
+                    }
+                )
+
+            for i, line in enumerate(f):
+                if i >= line_limit:
+                    break
+                lines.append(line.rstrip("\r\n"))
+                lines_read_count += 1
+        return json.dumps(
+            {
+                "lines": lines,
+                "start_line_processed": actual_start_line,
+                "lines_read_count": lines_read_count,
+                "error": None,
+            }
+        )
+    except UnicodeDecodeError as e:
+        error_message = f"Encoding error: {e}. Used encoding '{encoding}'."
+    except StopIteration:
+        error_message = (
+            "Reached end of file unexpectedly." if lines_read_count == 0 else None
+        )
+    except Exception as e:
+        logger.error(f"Unexpected error in read_raw_lines tool: {e}", exc_info=True)
+        error_message = f"Unexpected error reading raw lines: {str(e)}"
+    return json.dumps(
+        {
+            "error": error_message,
+            "lines": lines,
+            "start_line_processed": actual_start_line,
+            "lines_read_count": lines_read_count,
+        }
+    )
+
+
+@function_tool
+async def get_file_chunk(
+    encoding: str, start_byte: int | None, end_byte: int | None
+) -> str:
+    if start_byte is None:
+        start_byte = 0
+    if end_byte is None:
+        end_byte = 4096
+    context = _CURRENT_AI_CSV_TOOL_CONTEXT.get()
+    if not context or not context.file_path or not context.file_path.exists():
+        return json.dumps({"error": "File path not found in tool context."})
+    if start_byte < 0 or end_byte < start_byte:
+        return json.dumps({"error": "Invalid byte range specified."})
+
+    chunk_text = ""
+    error_message = None
+    bytes_read = 0
+    try:
+        with open(context.file_path, "rb") as fb:
+            file_size = context.file_path.stat().st_size
+            effective_start_byte = min(start_byte, file_size)
+            fb.seek(effective_start_byte)
+            bytes_to_read = max(0, min(end_byte, file_size) - effective_start_byte)
+            if bytes_to_read > 0:
+                chunk_bytes = fb.read(bytes_to_read)
+                bytes_read = len(chunk_bytes)
+                chunk_text = chunk_bytes.decode(encoding, errors="replace")
+            else:
+                chunk_text = ""
+        return json.dumps(
+            {
+                "chunk": chunk_text,
+                "bytes_read": bytes_read,
+                "requested_range": [start_byte, end_byte],
+                "error": None,
+            }
+        )
+    except (UnicodeDecodeError, ValueError) as e:
+        error_message = f"Failed to decode file chunk: {e}. Used encoding '{encoding}'."
+    except OSError as e:
+        error_message = f"File read error: {e}."
+    except Exception as e:
+        logger.error(f"Unexpected error in get_file_chunk tool: {e}", exc_info=True)
+        error_message = f"Unexpected error reading file chunk: {str(e)}"
+    return json.dumps(
+        {
+            "error": error_message,
+            "chunk": chunk_text,
+            "bytes_read": bytes_read,
+            "requested_range": [start_byte, end_byte],
+        }
+    )
+
+
+# --- AICSVStandardizer Class ---
+class AICSVStandardizer(CSVStandardizer):  # Inherits from the enhanced CSVStandardizer
+    def __init__(
+        self,
+        mcp_server: Optional[MCPServerStdio] = None,
+        mcp_session: Optional[ClientSession] = None,
+        llm_model: str = "gpt-4.1-2025-04-14",
+        # --- Initial Hints (Optional) ---
+        initial_delimiter: Optional[str] = None,
+        initial_encoding: Optional[str] = None,
+        # --- Base Class Args Passthrough (some will be overridden by AI) ---
+        default_skip_columns: SkipColumnsConfig = None,  # Keep for base if AI doesn't define cols
+    ):
+        super().__init__(
+            delimiter=None,  # AI will determine
+            encoding=None,  # AI will determine
+            has_header=True,  # AI will determine
+            skip_rows=0,  # AI will determine
+            skip_columns=default_skip_columns,  # Can still be a fallback
+            descriptions=None,  # AI will generate table_description
+            table_names=None,  # AI will generate table_name
+            file_configs=None,  # AI provides all config for the one file
+            column_definitions=None,  # AI will generate column definitions
+        )
+
+        self.mcp_servers = [mcp_server] if mcp_server else []
+        self.mcp_session = mcp_session
+        self.llm_model = llm_model
+        self._initial_delimiter_hint = initial_delimiter
+        self._initial_encoding_hint = initial_encoding
+        # self.generate_description from prompt structure (table_description, column descriptions)
+
+    async def _run_analysis_agent(
+        self,
+        file_path: Path,
+        initial_encoding: str,
+        initial_delimiter: str,
+    ) -> Dict[str, Any]:
+        with AIStandardizerToolContext(file_path):
+            prompt = AI_CSV_PROMPT.format(
+                file_path=str(file_path),
+                initial_encoding=initial_encoding,
+                initial_delimiter=initial_delimiter,
+            )
+            agent = Agent(
+                name="CSV Detail Analyzer Agent",
+                mcp_servers=self.mcp_servers,
+                tools=[read_csv_sample, read_raw_lines, get_file_chunk],
+                model=self.llm_model,
+            )
+            logger.info(f"Running CSV Detail Analyzer Agent for {file_path.name}...")
+            result = await Runner.run(agent, input=prompt)
+
+            if not result or not result.final_output:
+                raise RuntimeError(
+                    f"Agent execution failed or returned no output for {file_path.name}."
+                )
+            logger.info(
+                f"Agent for {file_path.name} finished. Raw output preview: {result.final_output[:500]}..."
+            )
+
+            try:
+                final_params_text = result.final_output.strip()
+                match = re.search(r"```(?:json)?(.*)```", final_params_text, re.DOTALL)
+                if match:
+                    final_params_text = match.group(1).strip()
+
+                ai_output = json.loads(final_params_text)
+
+                # --- Validate Agent Output Structure ---
+                if not isinstance(ai_output, dict):
+                    raise ValueError("Agent did not return a valid JSON object.")
+
+                required_top_keys = {
+                    "table_name",
+                    "encoding",
+                    "delimiter",
+                    "has_header",
+                    "skip_rows",
+                    "columns",
+                }
+                if not required_top_keys.issubset(ai_output.keys()):
+                    missing = required_top_keys - ai_output.keys()
+                    raise ValueError(
+                        f"Agent JSON missing required top-level keys: {missing}"
+                    )
+
+                if not isinstance(ai_output["columns"], list):
+                    raise ValueError("Agent JSON 'columns' must be a list.")
+                if not ai_output["columns"]:  # Must have at least one column defined
+                    raise ValueError("Agent JSON 'columns' list cannot be empty.")
+
+                for col_spec in ai_output["columns"]:
+                    if not isinstance(col_spec, dict):
+                        raise ValueError(
+                            f"Each item in 'columns' list must be a dictionary. Found: {type(col_spec)}"
+                        )
+                    req_col_keys = {"identifier_in_csv", "final_column_name"}
+                    if not req_col_keys.issubset(col_spec.keys()):
+                        missing_col_keys = req_col_keys - col_spec.keys()
+                        raise ValueError(
+                            f"Column spec {col_spec.get('final_column_name', 'N/A')} missing keys: {missing_col_keys}"
+                        )
+                    # Ensure description is present, even if None (or agent omits it)
+                    if "description" not in col_spec:
+                        col_spec["description"] = None
+
+                sr = ai_output["skip_rows"]
+                if not isinstance(sr, int) and not (
+                    isinstance(sr, list) and all(isinstance(i, int) for i in sr)
+                ):
+                    raise ValueError(
+                        f"Agent JSON 'skip_rows' must be an integer or list of integers, got {type(sr)}"
+                    )
+                if isinstance(sr, list):
+                    ai_output["skip_rows"] = sorted(list(set(i for i in sr if i >= 0)))
+
+                # Ensure table_description is present, even if None
+                if "table_description" not in ai_output:
+                    ai_output["table_description"] = None
+
+                logger.info(
+                    f"Agent successfully determined parameters for {file_path.name}"
+                )
+                return ai_output
+            except json.JSONDecodeError as e:
+                logger.error(
+                    f"Agent for {file_path.name} did not return valid JSON: {e}. Output: {result.final_output}",
+                    exc_info=True,
+                )
+                raise ValueError(
+                    f"Agent failed to produce valid JSON output for {file_path.name}."
+                ) from e
+            except ValueError as e:  # Catch our custom validation errors
+                logger.error(
+                    f"Invalid JSON structure or content from agent for {file_path.name}: {e}. Output: {result.final_output}",
+                    exc_info=True,
+                )
+                raise e  # Re-raise
+
+    async def standardize(
+        self,
+        datasource: Datasource,
+        output_path: SDIFPath,  # Corrected name from output_sdif
+        *,
+        overwrite: bool = False,
+        config: Optional[Dict[str, Any]] = None,
+        **kwargs,
+    ) -> Path:
+        output_path_obj = Path(output_path)
+        if isinstance(datasource, (str, Path)):
+            input_paths = [Path(datasource)]
+        elif isinstance(datasource, list) and all(
+            isinstance(p, (str, Path)) for p in datasource
+        ):
+            input_paths = [Path(p) for p in datasource]
+            if len(input_paths) > 1:
+                logger.warning(
+                    "AICSVStandardizer currently processes one CSV file at a time for detailed AI analysis. Using the first file only."
+                )
+                input_paths = [input_paths[0]]  # Process one file if multiple given
+        else:
+            raise TypeError(
+                "datasource must be a file path string/Path object or a list of such paths."
+            )
+
+        if not input_paths:
+            raise ValueError("No input datasource provided.")
+
+        input_path = input_paths[
+            0
+        ]  # We focus on a single file for this AI standardizer
+
+        if not input_path.exists() or not input_path.is_file():
+            raise FileNotFoundError(
+                f"Input CSV file not found or is not a file: {input_path}"
+            )
+
+        logger.info(f"--- AI Analysis for file: {input_path.name} ---")
+
+        # 1. Initial Guesses for AI
+        initial_encoding_guess = self._initial_encoding_hint
+        if not initial_encoding_guess:
+            try:
+                # Use base class's _detect_encoding, need an instance or make it static/helper
+                # For simplicity, re-implement or call a static version if available.
+                # Here, we simulate it for now or assume base standardizer's helper is callable.
+                with open(input_path, "rb") as fb_enc:
+                    enc_sample = fb_enc.read(ENCODING_SAMPLE_SIZE)
+                    detected_enc_info = detect(enc_sample) if enc_sample else None
+                    initial_encoding_guess = (
+                        detected_enc_info["encoding"]
+                        if detected_enc_info and detected_enc_info["encoding"]
+                        else "utf-8"
+                    )
+                logger.info(
+                    f"Initial encoding guess (detected): {initial_encoding_guess}"
+                )
+            except Exception as e:
+                logger.warning(
+                    f"Initial encoding detection failed: {e}. Using utf-8 as fallback guess."
+                )
+                initial_encoding_guess = "utf-8"
+        else:
+            logger.info(
+                f"Using provided initial encoding hint: {initial_encoding_guess}"
+            )
+
+        initial_delimiter_guess = self._initial_delimiter_hint
+        if not initial_delimiter_guess:
+            try:
+                with open(
+                    input_path, encoding=initial_encoding_guess, errors="ignore"
+                ) as f_delim_sample:
+                    delim_sample_text = f_delim_sample.read(DELIMITER_SAMPLE_SIZE)
+                if delim_sample_text:
+                    # Simulate base class's _detect_delimiter
+                    sniffer = clevercsv.Sniffer()
+                    dialect = sniffer.sniff(delim_sample_text)
+                    initial_delimiter_guess = dialect.delimiter if dialect else ","
+                    logger.info(
+                        f"Initial delimiter guess (detected): '{initial_delimiter_guess}'"
+                    )
+                else:
+                    initial_delimiter_guess = ","  # Fallback
+                    logger.warning(
+                        f"File empty/small, defaulting delimiter guess to ',' for {input_path.name}"
+                    )
+            except Exception as e:
+                logger.warning(
+                    f"Initial delimiter detection failed ({e}). Using ',' as fallback guess for {input_path.name}."
+                )
+                initial_delimiter_guess = ","
+        else:
+            logger.info(
+                f"Using provided initial delimiter hint: '{initial_delimiter_guess}'"
+            )
+
+        # 2. Run AI Agent Analysis
+        try:
+            ai_params = await self._run_analysis_agent(
+                input_path,
+                initial_encoding_guess,
+                initial_delimiter_guess,
+            )
+        except Exception as e:
+            logger.exception(
+                f"AI Agent analysis failed critically for {input_path.name}. Aborting."
+            )
+            raise RuntimeError(f"AI analysis failed for {input_path.name}") from e
+
+        # 3. Prepare parameters for the base CSVStandardizer
+        # The AI provides parameters for a single file processing scenario.
+
+        # Column definitions for the base standardizer:
+        # Base class expects: List[Optional[Dict[str, List[Dict[str, Any]]]]]
+        # For a single file, it's List containing one Dict: [{table_name: [col_specs...]}]
+        # Or, if base class is adapted, List containing one List: [[col_specs...]]
+
+        # The AI output `ai_params["columns"]` is already in the format:
+        # [{"identifier_in_csv": ..., "final_column_name": ..., "description": ...}, ...]
+        # This is exactly what the enhanced CSVStandardizer's `_setup_columns` expects for `defined_columns_spec`
+        # when `column_definitions` is a list containing this list of specs.
+
+        ai_column_definitions = [
+            ai_params["columns"]
+        ]  # Wrap the list of col specs for the single file/table
+
+        # The base CSVStandardizer will use its own _sanitize_name for the table name from AI.
+        # We provide it via table_names list.
+        ai_table_name = [ai_params["table_name"]]
+        ai_table_description = [
+            ai_params.get("table_description")
+        ]  # List of one description
+
+        # File-specific config for the base standardizer
+        # For a single file, this will be a list containing one dictionary.
+        file_specific_config = [
+            {
+                "encoding": ai_params["encoding"],
+                "delimiter": ai_params["delimiter"],
+                "has_header": ai_params["has_header"],
+                "skip_rows": ai_params["skip_rows"],
+                # skip_columns is not used if column_definitions are provided,
+                # as column selection is implicit in the provided definitions.
+                "skip_columns": None,  # Explicitly set to None
+            }
+        ]
+
+        logger.info(f"AI determined parameters for {input_path.name}:")
+        logger.info(f"  Table Name: {ai_table_name[0]}")
+        logger.info(f"  Encoding: {file_specific_config[0]['encoding']}")
+        logger.info(f"  Delimiter: '{file_specific_config[0]['delimiter']}'")
+        logger.info(f"  Has Header: {file_specific_config[0]['has_header']}")
+        logger.info(f"  Skip Rows: {file_specific_config[0]['skip_rows']}")
+        logger.info(
+            f"  Table Description: {ai_table_description[0] if ai_table_description and ai_table_description[0] is not None else 'N/A'}"
+        )
+        logger.info(f"  Column Definitions ({len(ai_column_definitions[0])} cols):")
+        for i, c_def in enumerate(ai_column_definitions[0]):
+            logger.info(
+                f"  {i + 1}. ID in CSV: '{c_def['identifier_in_csv']}', Final Name: '{c_def['final_column_name']}', Desc: '{c_def.get('description', 'N/A')}'"
+            )
+
+        # 4. Call Base Class Standardizer Logic with AI-derived parameters
+        # We instantiate a new CSVStandardizer configured by the AI for this specific file.
+        final_processor = CSVStandardizer(
+            # These are now single-element lists because we process one file
+            table_names=ai_table_name,
+            descriptions=ai_table_description,
+            file_configs=file_specific_config,
+            column_definitions=ai_column_definitions,  # Pass the AI-generated column specs
+            # default_skip_columns from __init__ can remain as a very deep fallback if AI somehow fails for columns
+            skip_columns=self.default_skip_columns,
+        )
+
+        try:
+            # The datasource for the base standardizer is the single input_path
+            result_path = final_processor.standardize(
+                datasource=[input_path],  # Pass as a list of one
+                output_path=output_path_obj,
+                overwrite=overwrite,
+            )
+            logger.info(
+                f"AI CSV Standardization complete for {input_path.name}. Output: {result_path}"
+            )
+            return result_path
+        except Exception as e:
+            logger.exception(
+                f"Error during final standardization step using AI parameters for {input_path.name}: {e}"
+            )
+            raise RuntimeError(
+                f"Final standardization step failed for {input_path.name}."
+            ) from e
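Below is a minimal, hypothetical usage sketch for the `AICSVStandardizer` introduced in this release. It is not part of the package diff: the module import path and the file paths are assumptions for illustration; only the constructor arguments and the async `standardize()` signature shown in the diff above are taken from the source.

```python
# Hypothetical usage sketch; import path and file paths are assumptions.
import asyncio
from pathlib import Path

from satif_ai.standardizers.ai_csv import AICSVStandardizer  # module path assumed


async def main() -> None:
    # Optional hints; the agent verifies or corrects them during analysis.
    standardizer = AICSVStandardizer(
        initial_encoding="utf-8",
        initial_delimiter=",",
    )
    # The agent analyzes the CSV, then a base CSVStandardizer configured with
    # the AI-derived parameters writes the SDIF output.
    output = await standardizer.standardize(
        datasource=Path("data/input.csv"),      # placeholder input path
        output_path=Path("data/output.sdif"),   # placeholder output path
        overwrite=True,
    )
    print(f"SDIF written to {output}")


if __name__ == "__main__":
    asyncio.run(main())
```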