satif-ai 0.2.10__py3-none-any.whl → 0.2.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two package versions as they appear in their respective public registries.
satif_ai/standardizers/ai.py CHANGED
@@ -63,6 +63,38 @@ class AIStandardizer(AsyncStandardizer):
     ) -> Optional[Type[AsyncStandardizer]]:
         return self.ai_standardizer_map.get(extension.lower())
 
+    def _resolve_file_path(
+        self, raw_path_item: Union[str, Path], temp_processing_dir: Path
+    ) -> List[Path]:
+        """
+        Resolves a single input path to a list of file paths.
+        This method contains blocking file system operations.
+        """
+        raw_path = Path(raw_path_item).resolve()
+        input_file_paths: List[Path] = []
+
+        if not raw_path.exists():
+            raise FileNotFoundError(f"Input path not found: {raw_path}")
+
+        if raw_path.is_file():
+            if raw_path.suffix.lower() == ".zip":
+                # Zip extraction is handled asynchronously in the calling method
+                return [raw_path]
+            else:
+                input_file_paths.append(raw_path)
+        elif raw_path.is_dir():
+            logger.info(f"Processing directory datasource: {raw_path}")
+            for child_item in raw_path.iterdir():
+                if child_item.is_file():
+                    input_file_paths.append(child_item)
+            # Deeper recursion to be implemented.
+        else:
+            logger.warning(
+                f"Input path '{raw_path}' is not a file or directory and will be ignored."
+            )
+
+        return input_file_paths
+
     async def _resolve_input_files(
         self, datasource: Datasource, temp_processing_dir: Path
     ) -> List[Path]:
@@ -70,8 +102,8 @@ class AIStandardizer(AsyncStandardizer):
         Resolves the input datasource to a list of individual file paths.
         Handles single files, lists of files, and extracts ZIP archives.
         """
-        input_file_paths: List[Path] = []
         raw_paths_to_check: List[Union[str, Path]] = []
+        all_input_file_paths: List[Path] = []
 
         if isinstance(datasource, (str, Path)):
             raw_paths_to_check = [datasource]
@@ -88,12 +120,13 @@ class AIStandardizer(AsyncStandardizer):
         if not raw_paths_to_check:  # Should be caught by above, but defensive
             raise ValueError("No input datasource paths provided.")
 
+        # Process each path item in a thread to avoid blocking the event loop
         for raw_path_item in raw_paths_to_check:
-            raw_path = Path(raw_path_item).resolve()
-            if not raw_path.exists():
-                raise FileNotFoundError(f"Input path not found: {raw_path}")
+            resolved_paths = await asyncio.to_thread(
+                self._resolve_file_path, raw_path_item, temp_processing_dir
+            )
 
-            if raw_path.is_file():
+            for raw_path in resolved_paths:
                 if raw_path.suffix.lower() == ".zip":
                     zip_extract_target = (
                         temp_processing_dir
@@ -103,7 +136,7 @@ class AIStandardizer(AsyncStandardizer):
                         extracted_from_zip = await extract_zip_archive_async(
                             raw_path, zip_extract_target
                         )
-                        input_file_paths.extend(extracted_from_zip)
+                        all_input_file_paths.extend(extracted_from_zip)
                     except Exception as e_zip:
                         logger.error(
                             f"Failed to extract ZIP archive '{raw_path}': {e_zip}",
@@ -113,23 +146,14 @@ class AIStandardizer(AsyncStandardizer):
                         # For now, skipping problematic zips.
                         continue
                 else:
-                    input_file_paths.append(raw_path)
-            elif raw_path.is_dir():
-                logger.info(f"Processing directory datasource: {raw_path}")
-                for child_item in raw_path.iterdir():
-                    if child_item.is_file():
-                        input_file_paths.append(child_item)
-                # Deeper recursion to be implemeted.
-            else:
-                logger.warning(
-                    f"Input path '{raw_path}' is not a file or directory and will be ignored."
-                )
+                    all_input_file_paths.append(raw_path)
 
-        if not input_file_paths:
+        if not all_input_file_paths:
             # This means all inputs were invalid, unresolvable, or zips failed etc.
             logger.error("No processable files found after resolving datasource.")
             raise ValueError("Datasource resolution resulted in no processable files.")
-        return input_file_paths
+
+        return all_input_file_paths
 
     def _group_files_by_standardizer(
         self, file_paths: List[Path]
@@ -269,7 +293,7 @@ class AIStandardizer(AsyncStandardizer):
 
         return successful_intermediate_sdif_files, aggregated_file_configs
 
-    async def _consolidate_results(
+    def _consolidate_results(
         self,
         intermediate_sdif_files: List[Path],
         aggregated_file_configs: Optional[List[Dict[str, Any]]],
@@ -362,6 +386,59 @@ class AIStandardizer(AsyncStandardizer):
             file_configs=aggregated_file_configs if aggregated_file_configs else None,
         )
 
+    async def _setup_workspace(
+        self, output_path: Path, overwrite: bool
+    ) -> Tuple[Path, Path, Path]:
+        """
+        Sets up the temporary workspace directories and validates the output path.
+        Contains blocking file system operations.
+        """
+        final_sdif_file_target = output_path.resolve()
+
+        if final_sdif_file_target.is_dir():
+            raise ValueError(
+                f"Target output_path '{final_sdif_file_target}' is a directory. "
+                "It must be a full file path for the target SDIF SQLite database (e.g., data.sqlite or data.sdif)."
+            )
+        if not final_sdif_file_target.suffix:
+            logger.warning(
+                f"Target output_path '{final_sdif_file_target}' has no file extension. "
+                "It should be a path to an SDIF SQLite database file (e.g., data.sqlite or data.sdif)."
+            )
+        elif final_sdif_file_target.suffix.lower() not in (".sdif", ".sqlite", ".db"):
+            logger.warning(
+                f"Target output_path '{final_sdif_file_target}' does not have a common SQLite extension. "
+                "Ensure this is the intended SQLite file path."
+            )
+
+        # Create a unique temporary directory for this standardization run
+        run_temp_dir = Path(tempfile.mkdtemp(prefix="satif_aistd_run_"))
+        intermediate_sdif_files_dir = run_temp_dir / "intermediate_sdif_files"
+        intermediate_sdif_files_dir.mkdir(parents=True, exist_ok=True)
+        file_processing_temp_dir = run_temp_dir / "file_processing_temp"
+        file_processing_temp_dir.mkdir(parents=True, exist_ok=True)
+
+        return (
+            final_sdif_file_target,
+            intermediate_sdif_files_dir,
+            file_processing_temp_dir,
+        )
+
+    async def _cleanup_workspace(self, run_temp_dir: Path) -> None:
+        """
+        Cleans up the temporary workspace directory.
+        Contains blocking file system operations.
+        """
+        if run_temp_dir.exists():
+            try:
+                await asyncio.to_thread(shutil.rmtree, run_temp_dir)
+                logger.info(f"Cleaned up temporary run directory: {run_temp_dir}")
+            except Exception as e_clean:
+                logger.error(
+                    f"Error cleaning up temporary run directory {run_temp_dir}: {e_clean}",
+                    exc_info=True,
+                )
+
     async def standardize(
         self,
         datasource: Datasource,
@@ -387,31 +464,15 @@ class AIStandardizer(AsyncStandardizer):
         logger.info(
             f"AIStandardizer starting process for output SDIF file: {output_path}"
         )
-        final_sdif_file_target = Path(output_path).resolve()
 
-        if final_sdif_file_target.is_dir():
-            raise ValueError(
-                f"Target output_path '{final_sdif_file_target}' is a directory. "
-                "It must be a full file path for the target SDIF SQLite database (e.g., data.sqlite or data.sdif)."
-            )
-        if not final_sdif_file_target.suffix:
-            logger.warning(
-                f"Target output_path '{final_sdif_file_target}' has no file extension. "
-                "It should be a path to an SDIF SQLite database file (e.g., data.sqlite or data.sdif)."
-            )
-        elif final_sdif_file_target.suffix.lower() not in (".sdif", ".sqlite", ".db"):
-            logger.warning(
-                f"Target output_path '{final_sdif_file_target}' does not have a common SQLite extension. "
-                "Ensure this is the intended SQLite file path."
-            )
+        # Setup workspace and validate output path - moved to a separate async function
+        (
+            final_sdif_file_target,
+            intermediate_sdif_files_dir,
+            file_processing_temp_dir,
+        ) = await asyncio.to_thread(self._setup_workspace, Path(output_path), overwrite)
 
-        # Create a unique temporary directory for this standardization run
-        # This directory will hold intermediate files and ZIP extractions.
-        run_temp_dir = Path(tempfile.mkdtemp(prefix="satif_aistd_run_"))
-        intermediate_sdif_files_dir = run_temp_dir / "intermediate_sdif_files"
-        intermediate_sdif_files_dir.mkdir(parents=True, exist_ok=True)
-        file_processing_temp_dir = run_temp_dir / "file_processing_temp"
-        file_processing_temp_dir.mkdir(parents=True, exist_ok=True)
+        run_temp_dir = file_processing_temp_dir.parent
 
         try:
             resolved_files = await self._resolve_input_files(
@@ -419,9 +480,11 @@ class AIStandardizer(AsyncStandardizer):
             )
             logger.info(f"Resolved {len(resolved_files)} file(s) for standardization.")
 
-            grouped_by_std, unsupported = self._group_files_by_standardizer(
-                resolved_files
+            # File grouping - potentially move to a thread if the list is very large
+            grouped_by_std, unsupported = await asyncio.to_thread(
+                self._group_files_by_standardizer, resolved_files
             )
+
             if not grouped_by_std:
                 user_message = (
                     "No files found that can be handled by configured AI standardizers."
@@ -451,7 +514,8 @@ class AIStandardizer(AsyncStandardizer):
                 f"Successfully generated {len(intermediate_sdif_files)} intermediate SDIF SQLite file(s)."
             )
 
-            final_result = await self._consolidate_results(
+            final_result = await asyncio.to_thread(
+                self._consolidate_results,
                 intermediate_sdif_files,
                 aggregated_file_configs,
                 final_sdif_file_target,
@@ -469,13 +533,5 @@ class AIStandardizer(AsyncStandardizer):
                 raise
             raise RuntimeError(f"AIStandardizer processing error: {e}") from e
         finally:
-            # Clean up the entire temporary directory for this run
-            if run_temp_dir.exists():
-                try:
-                    shutil.rmtree(run_temp_dir)
-                    logger.info(f"Cleaned up temporary run directory: {run_temp_dir}")
-                except Exception as e_clean:
-                    logger.error(
-                        f"Error cleaning up temporary run directory {run_temp_dir}: {e_clean}",
-                        exc_info=True,
-                    )
+            # Clean up using a dedicated async method
+            await self._cleanup_workspace(run_temp_dir)
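
The recurring change in the hunks above is to keep blocking file-system work off the event loop by pushing it into a worker thread. A minimal, self-contained sketch of that pattern (illustrative names only, not code from this package):

```python
# Sketch of the asyncio.to_thread offloading pattern used by the new
# _resolve_file_path / _setup_workspace / _cleanup_workspace helpers.
# All names below are hypothetical.
import asyncio
from pathlib import Path
from typing import List


def list_files(root: Path) -> List[Path]:
    # Blocking file-system scan; safe to run in a worker thread.
    return [p for p in root.iterdir() if p.is_file()]


async def resolve(root: Path) -> List[Path]:
    # asyncio.to_thread runs the blocking call in the default thread
    # pool, keeping the event loop responsive in the meantime.
    return await asyncio.to_thread(list_files, root)


if __name__ == "__main__":
    print(asyncio.run(resolve(Path("."))))
```

`asyncio.to_thread` ships with Python 3.9+, so it is available across this package's supported range (Requires-Python >=3.10).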
satif_ai/transform.py CHANGED
@@ -90,7 +90,6 @@ async def atransform(
     # If code isn't provided, we need a builder. If a builder isn't provided, we create one.
     if current_transformation_code is None:
         if active_builder is None:
-            # Create SyncpulseTransformationBuilder
            _effective_mcp_server = mcp_server if mcp_server is not None else mcp
 
            _openai_mcp_instance = OpenAICompatibleMCP(mcp=_effective_mcp_server)
@@ -137,7 +136,6 @@ async def atransform(
     if current_transformation_code is None:
         raise ValueError("Transformation code could not be obtained or generated.")
 
-    # Code Executor and Transformation
     _code_executor = code_executor if code_executor is not None else LocalCodeExecutor()
 
     transformer = CodeTransformer(
satif_ai/transformation_builders/syncpulse.py CHANGED
@@ -2,6 +2,7 @@ import base64
 import os
 import re
 from collections import defaultdict
+from contextvars import ContextVar
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
 
@@ -15,10 +16,15 @@ from satif_sdk.comparators import get_comparator
 from satif_sdk.representers import get_representer
 from satif_sdk.transformers import CodeTransformer
 
-# Global variables for transformation
-INPUT_SDIF_PATH: Optional[Path] = None
-OUTPUT_TARGET_FILES: Optional[Dict[Union[str, Path], str]] = None
-SCHEMA_ONLY: Optional[bool] = None
+CONTEXT_INPUT_SDIF_PATH: ContextVar[Optional[Path]] = ContextVar(
+    "CONTEXT_INPUT_SDIF_PATH", default=None
+)
+CONTEXT_OUTPUT_TARGET_FILES: ContextVar[Optional[Dict[Union[str, Path], str]]] = (
+    ContextVar("CONTEXT_OUTPUT_TARGET_FILES", default=None)
+)
+CONTEXT_SCHEMA_ONLY: ContextVar[Optional[bool]] = ContextVar(
+    "CONTEXT_SCHEMA_ONLY", default=None
+)
 
 
 def _format_comparison_output(
@@ -60,18 +66,22 @@ async def execute_transformation(code: str) -> str:
     Args:
         code: The code to execute on the input.
     """
-    if INPUT_SDIF_PATH is None or OUTPUT_TARGET_FILES is None:
-        return "Error: Transformation context not initialized"
+    input_sdif_path = CONTEXT_INPUT_SDIF_PATH.get()
+    output_target_files_dict = CONTEXT_OUTPUT_TARGET_FILES.get()
+    schema_only_flag = CONTEXT_SCHEMA_ONLY.get()
+
+    if input_sdif_path is None or output_target_files_dict is None:
+        return "Error: Transformation context not initialized correctly via contextvars"
 
     code_transformer = CodeTransformer(
         function=code,
         code_executor=LocalCodeExecutor(disable_security_warning=True),
     )
-    generated_output_path = code_transformer.export(INPUT_SDIF_PATH)
+    generated_output_path = code_transformer.export(input_sdif_path)
 
     comparisons = []
     comparator_kwargs = {}
-    if SCHEMA_ONLY:
+    if schema_only_flag:
         comparator_kwargs["check_structure_only"] = True
 
     if os.path.isdir(generated_output_path):
@@ -81,7 +91,7 @@ async def execute_transformation(code: str) -> str:
         for (
             output_base_file,
             output_target_file_name,
-        ) in OUTPUT_TARGET_FILES.items():
+        ) in output_target_files_dict.items():
             if output_target_file_name in generated_files:
                 generated_file_path = os.path.join(
                     generated_output_path, output_target_file_name
@@ -92,7 +102,7 @@ async def execute_transformation(code: str) -> str:
                 )
                 formatted_message = _format_comparison_output(
                     comparison,
-                    SCHEMA_ONLY,
+                    schema_only_flag,
                     generated_file_path,
                     output_target_file_name,
                 )
@@ -103,16 +113,18 @@ async def execute_transformation(code: str) -> str:
                )
    else:
        # If it's a single file, ensure there's only one target and compare
-        if len(OUTPUT_TARGET_FILES) == 1:
-            output_file = list(OUTPUT_TARGET_FILES.keys())[0]
-            output_target_file_name = list(OUTPUT_TARGET_FILES.values())[0]
-            comparator = get_comparator(output_file.split(".")[-1])
+        if len(output_target_files_dict) == 1:
+            output_file = list(output_target_files_dict.keys())[0]
+            output_target_file_name = list(output_target_files_dict.values())[0]
+            comparator = get_comparator(
+                str(output_file).split(".")[-1]
+            )  # Ensure output_file is string for split
            comparison = comparator.compare(
                generated_output_path, output_file, **comparator_kwargs
            )
            formatted_message = _format_comparison_output(
                comparison,
-                SCHEMA_ONLY,
+                schema_only_flag,
                str(generated_output_path),
                output_target_file_name,
            )
@@ -147,126 +159,144 @@ class SyncpulseTransformationBuilder(AsyncTransformationBuilder):
         schema_only: bool = False,
         representer_kwargs: Optional[Dict[str, Any]] = None,
     ) -> str:
-        global INPUT_SDIF_PATH, OUTPUT_TARGET_FILES, SCHEMA_ONLY
-
-        INPUT_SDIF_PATH = Path(sdif).resolve()
-        SCHEMA_ONLY = schema_only
-        # We must encode the path because special characters are not allowed in mcp read_resource()
-        input_sdif_mcp_uri_path = base64.b64encode(str(sdif).encode()).decode()
-        output_sdif_mcp_uri_path = (
-            base64.b64encode(str(output_sdif).encode()).decode()
-            if output_sdif
-            else None
-        )
-
-        input_schema = await self.mcp_session.read_resource(
-            f"schema://{input_sdif_mcp_uri_path}"
-        )
-        input_sample = await self.mcp_session.read_resource(
-            f"sample://{input_sdif_mcp_uri_path}"
-        )
-
-        output_schema_text = "N/A"
-        output_sample_text = "N/A"
-        if output_sdif_mcp_uri_path:
-            try:
-                output_schema_content = await self.mcp_session.read_resource(
-                    f"schema://{output_sdif_mcp_uri_path}"
-                )
-                if output_schema_content.contents:
-                    output_schema_text = output_schema_content.contents[0].text
-            except Exception as e:
-                print(
-                    f"Warning: Could not read schema for output_sdif {output_sdif_mcp_uri_path}: {e}"
-                )
-
-            try:
-                output_sample_content = await self.mcp_session.read_resource(
-                    f"sample://{output_sdif_mcp_uri_path}"
-                )
-                if output_sample_content.contents:
-                    output_sample_text = output_sample_content.contents[0].text
-            except Exception as e:
-                print(
-                    f"Warning: Could not read sample for output_sdif {output_sdif_mcp_uri_path}: {e}"
-                )
+        resolved_input_sdif_path = Path(sdif).resolve()
 
         # OUTPUT_TARGET_FILES keys are absolute paths to original example files for local reading by representers/comparators.
         # Values are agent-facing filenames.
+        resolved_output_target_files: Dict[Union[str, Path], str]
         if isinstance(output_target_files, FilePath):
-            OUTPUT_TARGET_FILES = {
+            resolved_output_target_files = {
                 Path(output_target_files).resolve(): Path(output_target_files).name
             }
         elif isinstance(output_target_files, list):
-            OUTPUT_TARGET_FILES = {
+            resolved_output_target_files = {
                 Path(file_path).resolve(): Path(file_path).name
                 for file_path in output_target_files
             }
         elif isinstance(output_target_files, dict):
             temp_map = {}
             for k, v in output_target_files.items():
-                if isinstance(k, Path):
-                    temp_map[k.resolve()] = v
-                else:
-                    temp_map[k] = v
-            OUTPUT_TARGET_FILES = temp_map
+                # Resolve Path keys to absolute paths
+                key_to_resolve = k
+                if (
+                    isinstance(key_to_resolve, str) and Path(key_to_resolve).exists()
+                ):  # Check if string is a valid path
+                    key_to_resolve = Path(key_to_resolve)
+
+                if isinstance(key_to_resolve, Path):
+                    temp_map[key_to_resolve.resolve()] = v
+                else:  # Keep non-Path keys as they are (e.g. if it's already a resolved string path from somewhere else)
+                    temp_map[key_to_resolve] = v
+            resolved_output_target_files = temp_map
         else:
-            OUTPUT_TARGET_FILES = {}
+            resolved_output_target_files = {}
+
+        token_input_path = CONTEXT_INPUT_SDIF_PATH.set(resolved_input_sdif_path)
+        token_output_files = CONTEXT_OUTPUT_TARGET_FILES.set(
+            resolved_output_target_files
+        )
+        token_schema_only = CONTEXT_SCHEMA_ONLY.set(schema_only)
 
-        output_representation = defaultdict(dict)
-        if OUTPUT_TARGET_FILES:
-            for file_key_abs_path in list(OUTPUT_TARGET_FILES.keys()):
-                agent_facing_name = OUTPUT_TARGET_FILES[file_key_abs_path]
-                print(f"Representing {agent_facing_name} from {file_key_abs_path}")
+        try:
+            # We must encode the path because special characters are not allowed in mcp read_resource()
+            input_sdif_mcp_uri_path = base64.b64encode(
+                str(resolved_input_sdif_path).encode()
+            ).decode()
+            output_sdif_mcp_uri_path = (
+                base64.b64encode(str(output_sdif).encode()).decode()
+                if output_sdif
+                else None
+            )
+
+            input_schema = await self.mcp_session.read_resource(
+                f"schema://{input_sdif_mcp_uri_path}"
+            )
+            input_sample = await self.mcp_session.read_resource(
+                f"sample://{input_sdif_mcp_uri_path}"
+            )
+
+            output_schema_text = "N/A"
+            output_sample_text = "N/A"
+            if output_sdif_mcp_uri_path:
                 try:
-                    # Representer uses the absolute path (file_key_abs_path) to read the example file.
-                    representer = get_representer(file_key_abs_path)
-                    representation, used_params = representer.represent(
-                        file_key_abs_path, **(representer_kwargs or {})
+                    output_schema_content = await self.mcp_session.read_resource(
+                        f"schema://{output_sdif_mcp_uri_path}"
                     )
-                    output_representation[agent_facing_name] = {
-                        "representation": representation,
-                        "used_params": used_params,
-                    }
+                    if output_schema_content.contents:
+                        output_schema_text = output_schema_content.contents[0].text
                 except Exception as e:
                     print(
-                        f"Warning: Could not get representation for {agent_facing_name} (path {file_key_abs_path}): {e}"
+                        f"Warning: Could not read schema for output_sdif {output_sdif_mcp_uri_path}: {e}"
+                    )
+
+                try:
+                    output_sample_content = await self.mcp_session.read_resource(
+                        f"sample://{output_sdif_mcp_uri_path}"
                     )
-                    output_representation[agent_facing_name] = (
-                        f"Error representing file: {e}"
+                    if output_sample_content.contents:
+                        output_sample_text = output_sample_content.contents[0].text
+                except Exception as e:
+                    print(
+                        f"Warning: Could not read sample for output_sdif {output_sdif_mcp_uri_path}: {e}"
                     )
+            output_representation = defaultdict(dict)
+            if resolved_output_target_files:
+                for file_key_abs_path in list(resolved_output_target_files.keys()):
+                    agent_facing_name = resolved_output_target_files[file_key_abs_path]
+                    try:
+                        # Representer uses the absolute path (file_key_abs_path) to read the example file.
+                        representer = get_representer(file_key_abs_path)
+                        representation, used_params = representer.represent(
+                            file_key_abs_path, **(representer_kwargs or {})
+                        )
+                        output_representation[agent_facing_name] = {
+                            "representation": representation,
+                            "used_params": used_params,
+                        }
+                    except Exception as e:
+                        print(
+                            f"Warning: Could not get representation for {agent_facing_name} (path {file_key_abs_path}): {e}"
+                        )
+                        output_representation[agent_facing_name] = (
+                            f"Error representing file: {e}"
+                        )
 
-        prompt = await self.mcp_session.get_prompt(
-            "create_transformation",
-            arguments={
-                "input_file": Path(
-                    input_sdif_mcp_uri_path
-                ).name,  # Display name for prompt (from relative path)
-                "input_schema": input_schema.contents[0].text
-                if input_schema.contents
-                else "Error reading input schema",
-                "input_sample": input_sample.contents[0].text
-                if input_sample.contents
-                else "Error reading input sample",
-                "output_files": str(list(OUTPUT_TARGET_FILES.values())),
-                "output_schema": output_schema_text,
-                "output_sample": output_sample_text
-                if not SCHEMA_ONLY
-                else "Sample not available. File is empty (no data).",
-                "output_representation": str(output_representation),
-                "instructions": instructions
-                or "No instructions provided. Use the output example.",
-            },
-        )
-        agent = Agent(
-            name="Transformation Builder",
-            mcp_servers=[self.mcp_server],
-            tools=[execute_transformation],
-            model=self.llm_model,
-        )
-        result = await Runner.run(agent, prompt.messages[0].content.text)
-        transformation_code = self.parse_code(result.final_output)
-        return transformation_code
+            prompt = await self.mcp_session.get_prompt(
+                "create_transformation",
+                arguments={
+                    "input_file": Path(
+                        input_sdif_mcp_uri_path  # Use the original sdif path for display name logic if needed
+                    ).name,
+                    "input_schema": input_schema.contents[0].text
+                    if input_schema.contents
+                    else "Error reading input schema",
+                    "input_sample": input_sample.contents[0].text
+                    if input_sample.contents
+                    else "Error reading input sample",
+                    "output_files": str(list(resolved_output_target_files.values())),
+                    "output_schema": output_schema_text,
+                    "output_sample": output_sample_text
+                    if not schema_only
+                    else "Sample not available. File is empty (no data).",
+                    "output_representation": str(output_representation),
+                    "instructions": instructions
+                    or "No instructions provided. Use the output example.",
+                },
+            )
+            agent = Agent(
+                name="Transformation Builder",
+                mcp_servers=[self.mcp_server],
+                tools=[execute_transformation],
+                model=self.llm_model,
+            )
+            result = await Runner.run(agent, prompt.messages[0].content.text)
+            transformation_code = self.parse_code(result.final_output)
+            return transformation_code
+        finally:
+            # Reset context variables after the task is done
+            CONTEXT_INPUT_SDIF_PATH.reset(token_input_path)
+            CONTEXT_OUTPUT_TARGET_FILES.reset(token_output_files)
+            CONTEXT_SCHEMA_ONLY.reset(token_schema_only)
 
     def parse_code(self, code) -> str:
         match = re.search(r"```(?:python)?(.*?)```", code, re.DOTALL)
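
The hunk above swaps module-level globals for `contextvars.ContextVar`, so concurrent builds no longer overwrite each other's state. The set/try/finally-reset discipline it follows, in isolation (hypothetical names, not code from this package):

```python
# Sketch of the ContextVar token set/reset pattern adopted above.
import asyncio
from contextvars import ContextVar
from pathlib import Path
from typing import Optional

CURRENT_INPUT: ContextVar[Optional[Path]] = ContextVar("CURRENT_INPUT", default=None)


async def build(input_path: Path) -> None:
    # set() returns a token that remembers the previous value.
    token = CURRENT_INPUT.set(input_path.resolve())
    try:
        # Each concurrently running task sees its own value here,
        # unlike with a module-level global.
        assert CURRENT_INPUT.get() == input_path.resolve()
    finally:
        # Restore the previous value even if the body raises.
        CURRENT_INPUT.reset(token)


if __name__ == "__main__":
    asyncio.run(build(Path(".")))
```

Because a `ContextVar` value set in one asyncio task is invisible to sibling tasks on the same loop, two builders can run concurrently without clobbering each other's input path, target files, or schema-only flag.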
satif_ai-0.2.12.dist-info/METADATA ADDED
@@ -0,0 +1,175 @@
+Metadata-Version: 2.3
+Name: satif-ai
+Version: 0.2.12
+Summary: AI Agents for Satif
+License: MIT
+Author: Syncpulse
+Maintainer: Bryan Djafer
+Maintainer-email: bryan.djafer@syncpulse.fr
+Requires-Python: >=3.10,<3.14
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Provides-Extra: xlsx
+Requires-Dist: openai-agents (>=0.0.9,<0.0.10)
+Requires-Dist: satif-sdk (>=0.1.0,<1.0.0)
+Requires-Dist: sdif-mcp (>=0.1.0,<1.0.0)
+Description-Content-Type: text/markdown
+
+# SATIF AI
+
+[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
+[![Python Version](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org/downloads/)
+[![Status: Experimental](https://img.shields.io/badge/Status-Experimental-orange.svg)](https://github.com/syncpulse-solutions/satif)
+
+AI toolkit for transforming any input files into any output files.
+
+## ⚠️ Disclaimer
+
+**EXPERIMENTAL STATUS**: This package is in early development and not production-ready. The API may change significantly between versions.
+
+**BLOCKING I/O**: Despite the async API, some operations may contain blocking I/O. This package should be used for testing and experimental purposes only.
+
+## Installation
+
+```bash
+pip install satif-ai
+```
+
+## Overview
+
+SATIF AI enables automated transformation of heterogeneous data sources (CSV, Excel, PDF, XML, etc.) into any desired output format in 2 steps:
+
+1. **Standardization**: Ingests heterogeneous source files (CSV, Excel, PDF, XML, etc.) and transforms them into SDIF, a structured intermediate format.
+2. **Transformation**: Applies business logic to the standardized data to generate the target output files, with transformation code generated by AI.
+
+## Key Features
+
+- **Any Format Support**: Process virtually any input, even challenging unstructured content (PDFs, complex Excel sheets)
+- **AI-Powered Code Generation**: Automatically generate transformation code from examples and natural language instructions
+- **Robust Schema Enforcement**: Handle input data drift and schema inconsistencies through configurable validation
+- **SQL-Based Data Processing**: Query and manipulate all data using SQL
+- **Decoupled Processing Stages**: Standardize once, transform many times with different logic
+
+## Usage
+
+### Basic Workflow
+
+```python
+import asyncio
+from satif_ai import astandardize, atransform
+
+async def main():
+    # Step 1: Standardize input files into SDIF
+    sdif_path = await astandardize(
+        datasource=["data.csv", "reference.xlsx"],
+        output_path="standardized.sdif",
+        overwrite=True
+    )
+
+    # Step 2: Transform SDIF into desired output using AI
+    await atransform(
+        sdif=sdif_path,
+        output_target_files="output.json",
+        instructions="Extract customer IDs and purchase totals, calculate the average purchase value per customer, and output as JSON with customer_id and avg_purchase_value fields.",
+        llm_model="o4-mini"  # Choose AI model based on needs
+    )
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+## Architecture
+
+```
+┌─────────────────┐     ┌───────────────────────┐     ┌─────────────────┐
+│  Source Files   │────▶│ Standardization Layer │────▶│    SDIF File    │
+│  CSV/Excel/PDF/ │     │                       │     │ (SQLite-based)  │
+│  XML/JSON/etc.  │     └───────────────────────┘     └────────┬────────┘
+└─────────────────┘                                            │
+
+┌─────────────────┐     ┌───────────────────────┐              │
+│  Output Files   │◀────│ Transformation Layer  │◀─────────────┘
+│   Any format    │     │ (AI-generated code)   │
+└─────────────────┘     └───────────────────────┘
+```
+
+SDIF (Standardized Data Interoperable Format) is the intermediate SQLite-based format that:
+
+- Stores structured tables alongside JSON objects and binary media
+- Maintains rich metadata about data origins and relationships
+- Provides direct SQL queryability for complex transformations
+
+## Documentation
+
+For detailed documentation, examples, and advanced features, visit [SATIF Documentation](https://satif.io/docs).
+
+## Contributing
+
+Contributions are welcome! Whether it's bug reports, feature requests, or code contributions, please feel free to get involved.
+
+### Contribution Workflow
+
+1. **Fork the repository** on GitHub.
+2. **Clone your fork** locally:
+
+   ```bash
+   git clone https://github.com/syncpulse-solutions/satif.git
+   cd satif/libs/ai
+   ```
+3. **Create a new branch** for your feature or bug fix:
+
+   ```bash
+   git checkout -b feature/your-feature-name
+   ```
+
+   or
+
+   ```bash
+   git checkout -b fix/your-bug-fix-name
+   ```
+4. **Set up the development environment** as described in the [From Source (for Development)](#from-source-for-development) section:
+
+   ```bash
+   make install  # or poetry install
+   ```
+5. **Make your changes.** Ensure your code follows the project's style guidelines.
+6. **Format and lint your code:**
+
+   ```bash
+   make format
+   make lint
+   ```
+7. **Run type checks:**
+
+   ```bash
+   make typecheck
+   ```
+8. **Run tests** to ensure your changes don't break existing functionality:
+
+   ```bash
+   make test
+   ```
+
+   To also generate a coverage report:
+
+   ```bash
+   make coverage
+   ```
+9. **Commit your changes** with a clear and descriptive commit message.
10. **Push your changes** to your fork on GitHub:
+
+    ```bash
+    git push origin feature/your-feature-name
+    ```
+11. **Submit a Pull Request (PR)** to the `main` branch of the original `syncpulse-solutions/satif` repository.
+
+## License
+
+This project is licensed under the MIT License.
+
+Maintainer: Bryan Djafer (bryan.djafer@syncpulse.fr)
+
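
The README above describes SDIF as a plain SQLite-based file that is directly queryable with SQL. Under only that assumption, a standardized output can be inspected with Python's standard library (the file name matches the usage example above; table names vary by input):

```python
# Sketch: listing the tables inside an SDIF output with sqlite3,
# assuming only that SDIF is a SQLite database as the README states.
import sqlite3

con = sqlite3.connect("standardized.sdif")
try:
    rows = con.execute(
        "SELECT name FROM sqlite_master WHERE type = 'table'"
    ).fetchall()
    print("tables:", [r[0] for r in rows])
finally:
    con.close()
```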
satif_ai-0.2.12.dist-info/RECORD CHANGED
@@ -3,18 +3,18 @@ satif_ai/adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
 satif_ai/adapters/tidy.py,sha256=1g7Wcq8agAZhaAqQDhhD8yh3iO5gZ4mwdKHsiNN3hHY,18540
 satif_ai/standardize.py,sha256=TgAB_nhcHY8zqlfT1PpgfgSswqdE-ly-dheQz-7NC7Q,5674
 satif_ai/standardizers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-satif_ai/standardizers/ai.py,sha256=jtYM-ChjLtkpFaubz980CTCNAoC33iYxB3pq0_hn2lU,21045
+satif_ai/standardizers/ai.py,sha256=2dz5LC5mAM7G1ZpDJPb7whdYIBLfwIPFOFNZJIhHxvk,22920
 satif_ai/standardizers/ai_csv.py,sha256=LbCRaLleujQRgSRRyt9ujbED-PIGRq1J8zRnejGM5nc,25437
 satif_ai/standardizers/ai_xlsx.py,sha256=558Bzfy8WGuk5mdnjMvvtakQXcU3rmwK3ykPjpXKwmQ,15863
-satif_ai/transform.py,sha256=g5XNeVCIKUgDW3UIhf02MN9xkXnWF3EJXS0Eig_hfD8,7677
+satif_ai/transform.py,sha256=CoaCtIvJjJuIJ2HgU_yU8QZVGi73PcJNfke9w3sDBoc,7586
 satif_ai/transformation_builders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-satif_ai/transformation_builders/syncpulse.py,sha256=c59BZicNnqs3NDKpflBAPqw42pGb6nYB2Zps0ChGyaM,11368
+satif_ai/transformation_builders/syncpulse.py,sha256=WhvS-HTzs7DQj-tfJ12Xk2rYGYdn8pv_x5jtU7WN2h4,13258
 satif_ai/utils/__init__.py,sha256=F-usaCt_vX872mXvtukuZdNMPnkVqDb8RaDgox2uow4,212
 satif_ai/utils/merge_sdif.py,sha256=y4C6pgkdyer0QugroFKUck4Eud4Ap-tJzM-eclMo3Rw,25629
 satif_ai/utils/openai_mcp.py,sha256=duCQZXG0mBs9DOOFIUvzraJhxD2IDzegWO9iOiLfFwY,3938
 satif_ai/utils/zip.py,sha256=G_GK8629Iw0TLFCQJfnqOscv7MoKF5zdzxvEAbL7Gss,5186
-satif_ai-0.2.10.dist-info/LICENSE,sha256=kS8EN6yAaGZd7V5z6GKSn_x3ozcZltrfRky4vMPRCw8,1072
-satif_ai-0.2.10.dist-info/METADATA,sha256=O5QWv8YJFtB5AIniv0LRgmSgpEaRLVdlz8WHZAru1X8,719
-satif_ai-0.2.10.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
-satif_ai-0.2.10.dist-info/entry_points.txt,sha256=Mz2SwYALjktap1bF-Q3EWBgiZVNT6QJCVsCs_fCV33Y,43
-satif_ai-0.2.10.dist-info/RECORD,,
+satif_ai-0.2.12.dist-info/LICENSE,sha256=kS8EN6yAaGZd7V5z6GKSn_x3ozcZltrfRky4vMPRCw8,1072
+satif_ai-0.2.12.dist-info/METADATA,sha256=m89TCjz21zi-fPOei5CRxxWbNxIghiMGDEQgWpRxt_U,6485
+satif_ai-0.2.12.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+satif_ai-0.2.12.dist-info/entry_points.txt,sha256=Mz2SwYALjktap1bF-Q3EWBgiZVNT6QJCVsCs_fCV33Y,43
+satif_ai-0.2.12.dist-info/RECORD,,
satif_ai-0.2.10.dist-info/METADATA DELETED
@@ -1,23 +0,0 @@
-Metadata-Version: 2.3
-Name: satif-ai
-Version: 0.2.10
-Summary: AI Agents for Satif
-License: MIT
-Author: Syncpulse
-Maintainer: Bryan Djafer
-Maintainer-email: bryan.djafer@syncpulse.fr
-Requires-Python: >=3.10,<3.14
-Classifier: License :: OSI Approved :: MIT License
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.10
-Classifier: Programming Language :: Python :: 3.11
-Classifier: Programming Language :: Python :: 3.12
-Classifier: Programming Language :: Python :: 3.13
-Provides-Extra: xlsx
-Requires-Dist: openai-agents (>=0.0.9,<0.0.10)
-Requires-Dist: satif-sdk (>=0.1.0,<1.0.0)
-Requires-Dist: sdif-mcp (>=0.1.0,<1.0.0)
-Description-Content-Type: text/markdown
-
-# SATIF AI
-