satif-ai 0.2.10__tar.gz → 0.2.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- satif_ai-0.2.12/PKG-INFO +175 -0
- satif_ai-0.2.12/README.md +153 -0
- {satif_ai-0.2.10 → satif_ai-0.2.12}/pyproject.toml +1 -1
- {satif_ai-0.2.10 → satif_ai-0.2.12}/satif_ai/standardizers/ai.py +112 -56
- {satif_ai-0.2.10 → satif_ai-0.2.12}/satif_ai/transform.py +0 -2
- satif_ai-0.2.12/satif_ai/transformation_builders/syncpulse.py +307 -0
- satif_ai-0.2.10/PKG-INFO +0 -23
- satif_ai-0.2.10/README.md +0 -1
- satif_ai-0.2.10/satif_ai/transformation_builders/syncpulse.py +0 -277
- {satif_ai-0.2.10 → satif_ai-0.2.12}/LICENSE +0 -0
- {satif_ai-0.2.10 → satif_ai-0.2.12}/satif_ai/__init__.py +0 -0
- {satif_ai-0.2.10 → satif_ai-0.2.12}/satif_ai/adapters/__init__.py +0 -0
- {satif_ai-0.2.10 → satif_ai-0.2.12}/satif_ai/adapters/tidy.py +0 -0
- {satif_ai-0.2.10 → satif_ai-0.2.12}/satif_ai/standardize.py +0 -0
- {satif_ai-0.2.10 → satif_ai-0.2.12}/satif_ai/standardizers/__init__.py +0 -0
- {satif_ai-0.2.10 → satif_ai-0.2.12}/satif_ai/standardizers/ai_csv.py +0 -0
- {satif_ai-0.2.10 → satif_ai-0.2.12}/satif_ai/standardizers/ai_xlsx.py +0 -0
- {satif_ai-0.2.10 → satif_ai-0.2.12}/satif_ai/transformation_builders/__init__.py +0 -0
- {satif_ai-0.2.10 → satif_ai-0.2.12}/satif_ai/utils/__init__.py +0 -0
- {satif_ai-0.2.10 → satif_ai-0.2.12}/satif_ai/utils/merge_sdif.py +0 -0
- {satif_ai-0.2.10 → satif_ai-0.2.12}/satif_ai/utils/openai_mcp.py +0 -0
- {satif_ai-0.2.10 → satif_ai-0.2.12}/satif_ai/utils/zip.py +0 -0
satif_ai-0.2.12/PKG-INFO
ADDED
@@ -0,0 +1,175 @@
+Metadata-Version: 2.3
+Name: satif-ai
+Version: 0.2.12
+Summary: AI Agents for Satif
+License: MIT
+Author: Syncpulse
+Maintainer: Bryan Djafer
+Maintainer-email: bryan.djafer@syncpulse.fr
+Requires-Python: >=3.10,<3.14
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Provides-Extra: xlsx
+Requires-Dist: openai-agents (>=0.0.9,<0.0.10)
+Requires-Dist: satif-sdk (>=0.1.0,<1.0.0)
+Requires-Dist: sdif-mcp (>=0.1.0,<1.0.0)
+Description-Content-Type: text/markdown
+
+# SATIF AI
+
+[](https://opensource.org/licenses/MIT)
+[](https://www.python.org/downloads/)
+[](https://github.com/syncpulse-solutions/satif)
+
+AI toolkit for transforming any input files into any output files.
+
+## ⚠️ Disclaimer
+
+**EXPERIMENTAL STATUS**: This package is in early development and not production-ready. The API may change significantly between versions.
+
+**BLOCKING I/O**: Despite the async API, some operations may contain blocking I/O. This package should be used for testing and experimental purposes only.
+
+## Installation
+
+```bash
+pip install satif-ai
+```
+
+## Overview
+
+SATIF AI enables automated transformation of heterogeneous data sources (CSV, Excel, PDF, XML, etc.) into any desired output format in 2 steps:
+
+1. **Standardization**: Ingests heterogeneous source files (CSV, Excel, PDF, XML, etc.) and transforms them into SDIF, a structured intermediate format.
+2. **Transformation**: Applies business logic to the standardized data to generate the target output files, with transformation code generated by AI.
+
+## Key Features
+
+- **Any Format Support**: Process virtually any input, even challenging unstructured content (PDFs, complex Excel sheets)
+- **AI-Powered Code Generation**: Automatically generate transformation code from examples and natural language instructions
+- **Robust Schema Enforcement**: Handle input data drift and schema inconsistencies through configurable validation
+- **SQL-Based Data Processing**: Query and manipulate all data using SQL
+- **Decoupled Processing Stages**: Standardize once, transform many times with different logic
+
+## Usage
+
+### Basic Workflow
+
+```python
+import asyncio
+from satif_ai import astandardize, atransform
+
+async def main():
+    # Step 1: Standardize input files into SDIF
+    sdif_path = await astandardize(
+        datasource=["data.csv", "reference.xlsx"],
+        output_path="standardized.sdif",
+        overwrite=True
+    )
+
+    # Step 2: Transform SDIF into desired output using AI
+    await atransform(
+        sdif=sdif_path,
+        output_target_files="output.json",
+        instructions="Extract customer IDs and purchase totals, calculate the average purchase value per customer, and output as JSON with customer_id and avg_purchase_value fields.",
+        llm_model="o4-mini"  # Choose AI model based on needs
+    )
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+## Architecture
+
+```
+┌─────────────────┐     ┌───────────────────────┐     ┌─────────────────┐
+│  Source Files   │────▶│ Standardization Layer │────▶│   SDIF File     │
+│ CSV/Excel/PDF/  │     │                       │     │ (SQLite-based)  │
+│  XML/JSON/etc.  │     └───────────────────────┘     └────────┬────────┘
+└─────────────────┘                                            │
+                                                               │
+┌─────────────────┐     ┌───────────────────────┐              │
+│  Output Files   │◀────│ Transformation Layer  │◀─────────────┘
+│   Any format    │     │ (AI-generated code)   │
+└─────────────────┘     └───────────────────────┘
+```
+
+SDIF (Standardized Data Interoperable Format) is the intermediate SQLite-based format that:
+
+- Stores structured tables alongside JSON objects and binary media
+- Maintains rich metadata about data origins and relationships
+- Provides direct SQL queryability for complex transformations
+
+## Documentation
+
+For detailed documentation, examples, and advanced features, visit [SATIF Documentation](https://satif.io/docs).
+
+## Contributing
+
+Contributions are welcome! Whether it's bug reports, feature requests, or code contributions, please feel free to get involved.
+
+### Contribution Workflow
+
+1. **Fork the repository** on GitHub.
+2. **Clone your fork** locally:
+
+   ```bash
+   git clone https://github.com/syncpulse-solutions/satif.git
+   cd satif/libs/ai
+   ```
+3. **Create a new branch** for your feature or bug fix:
+
+   ```bash
+   git checkout -b feature/your-feature-name
+   ```
+
+   or
+
+   ```bash
+   git checkout -b fix/your-bug-fix-name
+   ```
+4. **Set up the development environment** as described in the [From Source (for Development)](#from-source-for-development) section:
+
+   ```bash
+   make install  # or poetry install
+   ```
+5. **Make your changes.** Ensure your code follows the project's style guidelines.
+6. **Format and lint your code:**
+
+   ```bash
+   make format
+   make lint
+   ```
+7. **Run type checks:**
+
+   ```bash
+   make typecheck
+   ```
+8. **Run tests** to ensure your changes don't break existing functionality:
+
+   ```bash
+   make test
+   ```
+
+   To also generate a coverage report:
+
+   ```bash
+   make coverage
+   ```
+9. **Commit your changes** with a clear and descriptive commit message.
10. **Push your changes** to your fork on GitHub:
+
+    ```bash
+    git push origin feature/your-feature-name
+    ```
11. **Submit a Pull Request (PR)** to the `main` branch of the original `syncpulse-solutions/satif` repository.
+
+## License
+
+This project is licensed under the MIT License.
+
+Maintainer: Bryan Djafer (bryan.djafer@syncpulse.fr)
+
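The package description above calls SDIF a SQLite-based file that is directly SQL-queryable. A minimal sketch of what that enables, assuming only the standard library; actual SDIF table names depend on the standardizer, so the query below discovers them rather than assuming any:

```python
import sqlite3

# An SDIF file is a SQLite database, so the stdlib sqlite3 module can open it.
con = sqlite3.connect("standardized.sdif")

# Discover what tables the standardization step produced.
tables = [name for (name,) in con.execute(
    "SELECT name FROM sqlite_master WHERE type = 'table'"
)]
print(tables)

# Peek at the first table found; in a real SDIF file the names depend on
# the standardizer (e.g. one table per CSV file or Excel sheet).
if tables:
    for row in con.execute(f"SELECT * FROM {tables[0]} LIMIT 5"):
        print(row)

con.close()
```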
satif_ai-0.2.12/README.md
ADDED
@@ -0,0 +1,153 @@
(The 153 added lines are identical to the Markdown package description embedded in satif_ai-0.2.12/PKG-INFO above, from "# SATIF AI" through the final maintainer line.)
{satif_ai-0.2.10 → satif_ai-0.2.12}/satif_ai/standardizers/ai.py
@@ -63,6 +63,38 @@ class AIStandardizer(AsyncStandardizer):
     ) -> Optional[Type[AsyncStandardizer]]:
         return self.ai_standardizer_map.get(extension.lower())
 
+    def _resolve_file_path(
+        self, raw_path_item: Union[str, Path], temp_processing_dir: Path
+    ) -> List[Path]:
+        """
+        Resolves a single input path to a list of file paths.
+        This method contains blocking file system operations.
+        """
+        raw_path = Path(raw_path_item).resolve()
+        input_file_paths: List[Path] = []
+
+        if not raw_path.exists():
+            raise FileNotFoundError(f"Input path not found: {raw_path}")
+
+        if raw_path.is_file():
+            if raw_path.suffix.lower() == ".zip":
+                # Zip extraction is handled asynchronously in the calling method
+                return [raw_path]
+            else:
+                input_file_paths.append(raw_path)
+        elif raw_path.is_dir():
+            logger.info(f"Processing directory datasource: {raw_path}")
+            for child_item in raw_path.iterdir():
+                if child_item.is_file():
+                    input_file_paths.append(child_item)
+            # Deeper recursion to be implemented.
+        else:
+            logger.warning(
+                f"Input path '{raw_path}' is not a file or directory and will be ignored."
+            )
+
+        return input_file_paths
+
     async def _resolve_input_files(
         self, datasource: Datasource, temp_processing_dir: Path
     ) -> List[Path]:
@@ -70,8 +102,8 @@ class AIStandardizer(AsyncStandardizer):
         Resolves the input datasource to a list of individual file paths.
         Handles single files, lists of files, and extracts ZIP archives.
         """
-        input_file_paths: List[Path] = []
         raw_paths_to_check: List[Union[str, Path]] = []
+        all_input_file_paths: List[Path] = []
 
         if isinstance(datasource, (str, Path)):
             raw_paths_to_check = [datasource]
@@ -88,12 +120,13 @@ class AIStandardizer(AsyncStandardizer):
         if not raw_paths_to_check:  # Should be caught by above, but defensive
             raise ValueError("No input datasource paths provided.")
 
+        # Process each path item in a thread to avoid blocking the event loop
         for raw_path_item in raw_paths_to_check:
-
-
-
+            resolved_paths = await asyncio.to_thread(
+                self._resolve_file_path, raw_path_item, temp_processing_dir
+            )
 
-
+            for raw_path in resolved_paths:
                 if raw_path.suffix.lower() == ".zip":
                     zip_extract_target = (
                         temp_processing_dir
@@ -103,7 +136,7 @@ class AIStandardizer(AsyncStandardizer):
                         extracted_from_zip = await extract_zip_archive_async(
                             raw_path, zip_extract_target
                         )
-
+                        all_input_file_paths.extend(extracted_from_zip)
                     except Exception as e_zip:
                         logger.error(
                             f"Failed to extract ZIP archive '{raw_path}': {e_zip}",
@@ -113,23 +146,14 @@ class AIStandardizer(AsyncStandardizer):
                         # For now, skipping problematic zips.
                         continue
                 else:
-
-            elif raw_path.is_dir():
-                logger.info(f"Processing directory datasource: {raw_path}")
-                for child_item in raw_path.iterdir():
-                    if child_item.is_file():
-                        input_file_paths.append(child_item)
-                # Deeper recursion to be implemeted.
-            else:
-                logger.warning(
-                    f"Input path '{raw_path}' is not a file or directory and will be ignored."
-                )
+                    all_input_file_paths.append(raw_path)
 
-        if not
+        if not all_input_file_paths:
             # This means all inputs were invalid, unresolvable, or zips failed etc.
             logger.error("No processable files found after resolving datasource.")
             raise ValueError("Datasource resolution resulted in no processable files.")
-
+
+        return all_input_file_paths
 
     def _group_files_by_standardizer(
         self, file_paths: List[Path]
@@ -269,7 +293,7 @@ class AIStandardizer(AsyncStandardizer):
 
         return successful_intermediate_sdif_files, aggregated_file_configs
 
-
+    def _consolidate_results(
         self,
         intermediate_sdif_files: List[Path],
         aggregated_file_configs: Optional[List[Dict[str, Any]]],
@@ -362,6 +386,59 @@ class AIStandardizer(AsyncStandardizer):
             file_configs=aggregated_file_configs if aggregated_file_configs else None,
         )
 
+    async def _setup_workspace(
+        self, output_path: Path, overwrite: bool
+    ) -> Tuple[Path, Path, Path]:
+        """
+        Sets up the temporary workspace directories and validates the output path.
+        Contains blocking file system operations.
+        """
+        final_sdif_file_target = output_path.resolve()
+
+        if final_sdif_file_target.is_dir():
+            raise ValueError(
+                f"Target output_path '{final_sdif_file_target}' is a directory. "
+                "It must be a full file path for the target SDIF SQLite database (e.g., data.sqlite or data.sdif)."
+            )
+        if not final_sdif_file_target.suffix:
+            logger.warning(
+                f"Target output_path '{final_sdif_file_target}' has no file extension. "
+                "It should be a path to an SDIF SQLite database file (e.g., data.sqlite or data.sdif)."
+            )
+        elif final_sdif_file_target.suffix.lower() not in (".sdif", ".sqlite", ".db"):
+            logger.warning(
+                f"Target output_path '{final_sdif_file_target}' does not have a common SQLite extension. "
+                "Ensure this is the intended SQLite file path."
+            )
+
+        # Create a unique temporary directory for this standardization run
+        run_temp_dir = Path(tempfile.mkdtemp(prefix="satif_aistd_run_"))
+        intermediate_sdif_files_dir = run_temp_dir / "intermediate_sdif_files"
+        intermediate_sdif_files_dir.mkdir(parents=True, exist_ok=True)
+        file_processing_temp_dir = run_temp_dir / "file_processing_temp"
+        file_processing_temp_dir.mkdir(parents=True, exist_ok=True)
+
+        return (
+            final_sdif_file_target,
+            intermediate_sdif_files_dir,
+            file_processing_temp_dir,
+        )
+
+    async def _cleanup_workspace(self, run_temp_dir: Path) -> None:
+        """
+        Cleans up the temporary workspace directory.
+        Contains blocking file system operations.
+        """
+        if run_temp_dir.exists():
+            try:
+                await asyncio.to_thread(shutil.rmtree, run_temp_dir)
+                logger.info(f"Cleaned up temporary run directory: {run_temp_dir}")
+            except Exception as e_clean:
+                logger.error(
+                    f"Error cleaning up temporary run directory {run_temp_dir}: {e_clean}",
+                    exc_info=True,
+                )
+
     async def standardize(
         self,
         datasource: Datasource,
@@ -387,31 +464,15 @@ class AIStandardizer(AsyncStandardizer):
         logger.info(
             f"AIStandardizer starting process for output SDIF file: {output_path}"
         )
-        final_sdif_file_target = Path(output_path).resolve()
 
-
-
-
-
-
-
-        logger.warning(
-            f"Target output_path '{final_sdif_file_target}' has no file extension. "
-            "It should be a path to an SDIF SQLite database file (e.g., data.sqlite or data.sdif)."
-        )
-        elif final_sdif_file_target.suffix.lower() not in (".sdif", ".sqlite", ".db"):
-            logger.warning(
-                f"Target output_path '{final_sdif_file_target}' does not have a common SQLite extension. "
-                "Ensure this is the intended SQLite file path."
-            )
+        # Setup workspace and validate output path - moved to a separate async function
+        (
+            final_sdif_file_target,
+            intermediate_sdif_files_dir,
+            file_processing_temp_dir,
+        ) = await asyncio.to_thread(self._setup_workspace, Path(output_path), overwrite)
 
-
-        # This directory will hold intermediate files and ZIP extractions.
-        run_temp_dir = Path(tempfile.mkdtemp(prefix="satif_aistd_run_"))
-        intermediate_sdif_files_dir = run_temp_dir / "intermediate_sdif_files"
-        intermediate_sdif_files_dir.mkdir(parents=True, exist_ok=True)
-        file_processing_temp_dir = run_temp_dir / "file_processing_temp"
-        file_processing_temp_dir.mkdir(parents=True, exist_ok=True)
+        run_temp_dir = file_processing_temp_dir.parent
 
         try:
             resolved_files = await self._resolve_input_files(
@@ -419,9 +480,11 @@ class AIStandardizer(AsyncStandardizer):
             )
             logger.info(f"Resolved {len(resolved_files)} file(s) for standardization.")
 
-
-
+            # File grouping - potentially move to a thread if the list is very large
+            grouped_by_std, unsupported = await asyncio.to_thread(
+                self._group_files_by_standardizer, resolved_files
             )
+
            if not grouped_by_std:
                 user_message = (
                     "No files found that can be handled by configured AI standardizers."
@@ -451,7 +514,8 @@ class AIStandardizer(AsyncStandardizer):
                 f"Successfully generated {len(intermediate_sdif_files)} intermediate SDIF SQLite file(s)."
             )
 
-            final_result = await
+            final_result = await asyncio.to_thread(
+                self._consolidate_results,
                 intermediate_sdif_files,
                 aggregated_file_configs,
                 final_sdif_file_target,
@@ -469,13 +533,5 @@ class AIStandardizer(AsyncStandardizer):
                 raise
             raise RuntimeError(f"AIStandardizer processing error: {e}") from e
         finally:
-            # Clean up
-
-            try:
-                shutil.rmtree(run_temp_dir)
-                logger.info(f"Cleaned up temporary run directory: {run_temp_dir}")
-            except Exception as e_clean:
-                logger.error(
-                    f"Error cleaning up temporary run directory {run_temp_dir}: {e_clean}",
-                    exc_info=True,
-                )
+            # Clean up using a dedicated async method
+            await self._cleanup_workspace(run_temp_dir)
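The thread running through the ai.py changes above is moving blocking filesystem work off the event loop with `asyncio.to_thread`, in line with the BLOCKING I/O disclaimer in the README. A minimal sketch of that offloading pattern, using standalone stand-in functions rather than the package's actual methods:

```python
import asyncio
import shutil
import tempfile
from pathlib import Path


def setup_workspace() -> Path:
    # Blocking filesystem call: runs on a worker thread, not the event loop.
    return Path(tempfile.mkdtemp(prefix="satif_aistd_run_"))


async def main() -> None:
    # asyncio.to_thread runs the sync callable in the default thread pool
    # and suspends this coroutine until it finishes, keeping the loop free.
    run_temp_dir = await asyncio.to_thread(setup_workspace)
    try:
        print(f"working in {run_temp_dir}")  # resolve, standardize, consolidate...
    finally:
        # Cleanup is blocking too, so it is offloaded the same way.
        await asyncio.to_thread(shutil.rmtree, run_temp_dir)


asyncio.run(main())
```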
{satif_ai-0.2.10 → satif_ai-0.2.12}/satif_ai/transform.py
@@ -90,7 +90,6 @@ async def atransform(
     # If code isn't provided, we need a builder. If a builder isn't provided, we create one.
     if current_transformation_code is None:
         if active_builder is None:
-            # Create SyncpulseTransformationBuilder
             _effective_mcp_server = mcp_server if mcp_server is not None else mcp
 
             _openai_mcp_instance = OpenAICompatibleMCP(mcp=_effective_mcp_server)
@@ -137,7 +136,6 @@ async def atransform(
     if current_transformation_code is None:
         raise ValueError("Transformation code could not be obtained or generated.")
 
-    # Code Executor and Transformation
     _code_executor = code_executor if code_executor is not None else LocalCodeExecutor()
 
     transformer = CodeTransformer(
satif_ai-0.2.12/satif_ai/transformation_builders/syncpulse.py
ADDED
@@ -0,0 +1,307 @@
+import base64
+import os
+import re
+from collections import defaultdict
+from contextvars import ContextVar
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
+
+from agents import Agent, Runner, function_tool
+from agents.mcp.server import MCPServer
+from mcp import ClientSession
+from satif_core import AsyncTransformationBuilder
+from satif_core.types import FilePath
+from satif_sdk.code_executors.local_executor import LocalCodeExecutor
+from satif_sdk.comparators import get_comparator
+from satif_sdk.representers import get_representer
+from satif_sdk.transformers import CodeTransformer
+
+CONTEXT_INPUT_SDIF_PATH: ContextVar[Optional[Path]] = ContextVar(
+    "CONTEXT_INPUT_SDIF_PATH", default=None
+)
+CONTEXT_OUTPUT_TARGET_FILES: ContextVar[Optional[Dict[Union[str, Path], str]]] = (
+    ContextVar("CONTEXT_OUTPUT_TARGET_FILES", default=None)
+)
+CONTEXT_SCHEMA_ONLY: ContextVar[Optional[bool]] = ContextVar(
+    "CONTEXT_SCHEMA_ONLY", default=None
+)
+
+
+def _format_comparison_output(
+    comparison_result: Dict[str, Any],
+    schema_only_mode: Optional[bool],
+    source_file_display_name: str,
+    target_file_display_name: str,
+) -> str:
+    """
+    Formats the comparison result string, with special handling for schema_only mode
+    where files are equivalent due to being empty.
+    """
+    base_message_prefix = f"Comparison for {source_file_display_name} [SOURCE] with {target_file_display_name} [TARGET]:"
+
+    if schema_only_mode is True and comparison_result.get("are_equivalent") is True:
+        details = comparison_result.get("details", {})
+        row_comparison = details.get("row_comparison", {})
+
+        row_count1 = row_comparison.get("row_count1")
+        row_count2 = row_comparison.get("row_count2")
+
+        if (
+            isinstance(row_count1, (int, float))
+            and row_count1 == 0
+            and isinstance(row_count2, (int, float))
+            and row_count2 == 0
+        ):
+            return f"{base_message_prefix} Files have the same headers but are both empty (no data rows). This should not happen. Please verify the instructions and try again."
+
+    # Default formatting if the special condition isn't met
+    return f"{base_message_prefix} {comparison_result}"
+
+
+@function_tool
+async def execute_transformation(code: str) -> str:
+    """Executes the transformation code on the input and returns the
+    comparison difference between the transformed output and the target output example.
+
+    Args:
+        code: The code to execute on the input.
+    """
+    input_sdif_path = CONTEXT_INPUT_SDIF_PATH.get()
+    output_target_files_dict = CONTEXT_OUTPUT_TARGET_FILES.get()
+    schema_only_flag = CONTEXT_SCHEMA_ONLY.get()
+
+    if input_sdif_path is None or output_target_files_dict is None:
+        return "Error: Transformation context not initialized correctly via contextvars"
+
+    code_transformer = CodeTransformer(
+        function=code,
+        code_executor=LocalCodeExecutor(disable_security_warning=True),
+    )
+    generated_output_path = code_transformer.export(input_sdif_path)
+
+    comparisons = []
+    comparator_kwargs = {}
+    if schema_only_flag:
+        comparator_kwargs["check_structure_only"] = True
+
+    if os.path.isdir(generated_output_path):
+        # If it's a directory, compare each file with its corresponding target
+        generated_files = os.listdir(generated_output_path)
+
+        for (
+            output_base_file,
+            output_target_file_name,
+        ) in output_target_files_dict.items():
+            if output_target_file_name in generated_files:
+                generated_file_path = os.path.join(
+                    generated_output_path, output_target_file_name
+                )
+                comparator = get_comparator(output_target_file_name.split(".")[-1])
+                comparison = comparator.compare(
+                    generated_file_path, output_base_file, **comparator_kwargs
+                )
+                formatted_message = _format_comparison_output(
+                    comparison,
+                    schema_only_flag,
+                    generated_file_path,
+                    output_target_file_name,
+                )
+                comparisons.append(formatted_message)
+            else:
+                comparisons.append(
+                    f"Error: {output_target_file_name} not found in the generated output"
+                )
+    else:
+        # If it's a single file, ensure there's only one target and compare
+        if len(output_target_files_dict) == 1:
+            output_file = list(output_target_files_dict.keys())[0]
+            output_target_file_name = list(output_target_files_dict.values())[0]
+            comparator = get_comparator(
+                str(output_file).split(".")[-1]
+            )  # Ensure output_file is string for split
+            comparison = comparator.compare(
+                generated_output_path, output_file, **comparator_kwargs
+            )
+            formatted_message = _format_comparison_output(
+                comparison,
+                schema_only_flag,
+                str(generated_output_path),
+                output_target_file_name,
+            )
+            comparisons.append(formatted_message)
+        else:
+            comparisons.append(
+                "Error: Single output file generated but multiple target files expected"
+            )
+
+    return "\n".join(comparisons)
+
+
+class SyncpulseTransformationBuilder(AsyncTransformationBuilder):
+    """This class is used to build a transformation code that will be used to transform a SDIF database into a set of files following the format of the given output files."""
+
+    def __init__(
+        self,
+        mcp_server: MCPServer,
+        mcp_session: ClientSession,
+        llm_model: str = "o4-mini",
+    ):
+        self.mcp_server = mcp_server
+        self.mcp_session = mcp_session
+        self.llm_model = llm_model
+
+    async def build(
+        self,
+        sdif: Path,
+        output_target_files: Dict[FilePath, str] | List[FilePath] | FilePath,
+        output_sdif: Optional[Path] = None,
+        instructions: str = "",
+        schema_only: bool = False,
+        representer_kwargs: Optional[Dict[str, Any]] = None,
+    ) -> str:
+        resolved_input_sdif_path = Path(sdif).resolve()
+
+        # OUTPUT_TARGET_FILES keys are absolute paths to original example files for local reading by representers/comparators.
+        # Values are agent-facing filenames.
+        resolved_output_target_files: Dict[Union[str, Path], str]
+        if isinstance(output_target_files, FilePath):
+            resolved_output_target_files = {
+                Path(output_target_files).resolve(): Path(output_target_files).name
+            }
+        elif isinstance(output_target_files, list):
+            resolved_output_target_files = {
+                Path(file_path).resolve(): Path(file_path).name
+                for file_path in output_target_files
+            }
+        elif isinstance(output_target_files, dict):
+            temp_map = {}
+            for k, v in output_target_files.items():
+                # Resolve Path keys to absolute paths
+                key_to_resolve = k
+                if (
+                    isinstance(key_to_resolve, str) and Path(key_to_resolve).exists()
+                ):  # Check if string is a valid path
+                    key_to_resolve = Path(key_to_resolve)
+
+                if isinstance(key_to_resolve, Path):
+                    temp_map[key_to_resolve.resolve()] = v
+                else:  # Keep non-Path keys as they are (e.g. if it's already a resolved string path from somewhere else)
+                    temp_map[key_to_resolve] = v
+            resolved_output_target_files = temp_map
+        else:
+            resolved_output_target_files = {}
+
+        token_input_path = CONTEXT_INPUT_SDIF_PATH.set(resolved_input_sdif_path)
+        token_output_files = CONTEXT_OUTPUT_TARGET_FILES.set(
+            resolved_output_target_files
+        )
+        token_schema_only = CONTEXT_SCHEMA_ONLY.set(schema_only)
+
+        try:
+            # We must encode the path because special characters are not allowed in mcp read_resource()
+            input_sdif_mcp_uri_path = base64.b64encode(
+                str(resolved_input_sdif_path).encode()
+            ).decode()
+            output_sdif_mcp_uri_path = (
+                base64.b64encode(str(output_sdif).encode()).decode()
+                if output_sdif
+                else None
+            )
+
+            input_schema = await self.mcp_session.read_resource(
+                f"schema://{input_sdif_mcp_uri_path}"
+            )
+            input_sample = await self.mcp_session.read_resource(
+                f"sample://{input_sdif_mcp_uri_path}"
+            )
+
+            output_schema_text = "N/A"
+            output_sample_text = "N/A"
+            if output_sdif_mcp_uri_path:
+                try:
+                    output_schema_content = await self.mcp_session.read_resource(
+                        f"schema://{output_sdif_mcp_uri_path}"
+                    )
+                    if output_schema_content.contents:
+                        output_schema_text = output_schema_content.contents[0].text
+                except Exception as e:
+                    print(
+                        f"Warning: Could not read schema for output_sdif {output_sdif_mcp_uri_path}: {e}"
+                    )
+
+                try:
+                    output_sample_content = await self.mcp_session.read_resource(
+                        f"sample://{output_sdif_mcp_uri_path}"
+                    )
+                    if output_sample_content.contents:
+                        output_sample_text = output_sample_content.contents[0].text
+                except Exception as e:
+                    print(
+                        f"Warning: Could not read sample for output_sdif {output_sdif_mcp_uri_path}: {e}"
+                    )
+            output_representation = defaultdict(dict)
+            if resolved_output_target_files:
+                for file_key_abs_path in list(resolved_output_target_files.keys()):
+                    agent_facing_name = resolved_output_target_files[file_key_abs_path]
+                    try:
+                        # Representer uses the absolute path (file_key_abs_path) to read the example file.
+                        representer = get_representer(file_key_abs_path)
+                        representation, used_params = representer.represent(
+                            file_key_abs_path, **(representer_kwargs or {})
+                        )
+                        output_representation[agent_facing_name] = {
+                            "representation": representation,
+                            "used_params": used_params,
+                        }
+                    except Exception as e:
+                        print(
+                            f"Warning: Could not get representation for {agent_facing_name} (path {file_key_abs_path}): {e}"
+                        )
+                        output_representation[agent_facing_name] = (
+                            f"Error representing file: {e}"
+                        )
+
+            prompt = await self.mcp_session.get_prompt(
+                "create_transformation",
+                arguments={
+                    "input_file": Path(
+                        input_sdif_mcp_uri_path  # Use the original sdif path for display name logic if needed
+                    ).name,
+                    "input_schema": input_schema.contents[0].text
+                    if input_schema.contents
+                    else "Error reading input schema",
+                    "input_sample": input_sample.contents[0].text
+                    if input_sample.contents
+                    else "Error reading input sample",
+                    "output_files": str(list(resolved_output_target_files.values())),
+                    "output_schema": output_schema_text,
+                    "output_sample": output_sample_text
+                    if not schema_only
+                    else "Sample not available. File is empty (no data).",
+                    "output_representation": str(output_representation),
+                    "instructions": instructions
+                    or "No instructions provided. Use the output example.",
+                },
+            )
+            agent = Agent(
+                name="Transformation Builder",
+                mcp_servers=[self.mcp_server],
+                tools=[execute_transformation],
+                model=self.llm_model,
+            )
+            result = await Runner.run(agent, prompt.messages[0].content.text)
+            transformation_code = self.parse_code(result.final_output)
+            return transformation_code
+        finally:
+            # Reset context variables after the task is done
+            CONTEXT_INPUT_SDIF_PATH.reset(token_input_path)
+            CONTEXT_OUTPUT_TARGET_FILES.reset(token_output_files)
+            CONTEXT_SCHEMA_ONLY.reset(token_schema_only)
+
+    def parse_code(self, code) -> str:
+        match = re.search(r"```(?:python)?(.*?)```", code, re.DOTALL)
+        if match:
+            return match.group(1).strip()
+        else:
+            # Handle case where no code block is found
+            return code.strip()
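The central difference from the 0.2.10 version deleted below is that the module-level globals (INPUT_SDIF_PATH, OUTPUT_TARGET_FILES, SCHEMA_ONLY) are replaced by `ContextVar`s that are set before the agent run and reset in a `finally` block, so concurrent `build()` calls no longer share mutable state. A minimal sketch of that set/reset pattern, assuming only the standard library:

```python
import asyncio
from contextvars import ContextVar
from typing import Optional

# Each asyncio task sees its own value; the default None marks "unset".
CURRENT_JOB: ContextVar[Optional[str]] = ContextVar("CURRENT_JOB", default=None)


async def work() -> str:
    # Reads the value set by whichever task is running this coroutine.
    return f"working on {CURRENT_JOB.get()}"


async def build(job_name: str) -> str:
    token = CURRENT_JOB.set(job_name)  # set() returns a Token for restoration
    try:
        return await work()
    finally:
        CURRENT_JOB.reset(token)  # restore the prior value, even on error


async def main() -> None:
    # Two concurrent builds do not clobber each other, unlike module globals.
    print(await asyncio.gather(build("invoices"), build("orders")))


asyncio.run(main())
```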
satif_ai-0.2.10/PKG-INFO
DELETED
@@ -1,23 +0,0 @@
-Metadata-Version: 2.3
-Name: satif-ai
-Version: 0.2.10
-Summary: AI Agents for Satif
-License: MIT
-Author: Syncpulse
-Maintainer: Bryan Djafer
-Maintainer-email: bryan.djafer@syncpulse.fr
-Requires-Python: >=3.10,<3.14
-Classifier: License :: OSI Approved :: MIT License
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.10
-Classifier: Programming Language :: Python :: 3.11
-Classifier: Programming Language :: Python :: 3.12
-Classifier: Programming Language :: Python :: 3.13
-Provides-Extra: xlsx
-Requires-Dist: openai-agents (>=0.0.9,<0.0.10)
-Requires-Dist: satif-sdk (>=0.1.0,<1.0.0)
-Requires-Dist: sdif-mcp (>=0.1.0,<1.0.0)
-Description-Content-Type: text/markdown
-
-# SATIF AI
-
satif_ai-0.2.10/README.md
DELETED
@@ -1 +0,0 @@
-# SATIF AI
satif_ai-0.2.10/satif_ai/transformation_builders/syncpulse.py
DELETED
@@ -1,277 +0,0 @@
-import base64
-import os
-import re
-from collections import defaultdict
-from pathlib import Path
-from typing import Any, Dict, List, Optional, Union
-
-from agents import Agent, Runner, function_tool
-from agents.mcp.server import MCPServer
-from mcp import ClientSession
-from satif_core import AsyncTransformationBuilder
-from satif_core.types import FilePath
-from satif_sdk.code_executors.local_executor import LocalCodeExecutor
-from satif_sdk.comparators import get_comparator
-from satif_sdk.representers import get_representer
-from satif_sdk.transformers import CodeTransformer
-
-# Global variables for transformation
-INPUT_SDIF_PATH: Optional[Path] = None
-OUTPUT_TARGET_FILES: Optional[Dict[Union[str, Path], str]] = None
-SCHEMA_ONLY: Optional[bool] = None
-
-
-def _format_comparison_output(
-    comparison_result: Dict[str, Any],
-    schema_only_mode: Optional[bool],
-    source_file_display_name: str,
-    target_file_display_name: str,
-) -> str:
-    """
-    Formats the comparison result string, with special handling for schema_only mode
-    where files are equivalent due to being empty.
-    """
-    base_message_prefix = f"Comparison for {source_file_display_name} [SOURCE] with {target_file_display_name} [TARGET]:"
-
-    if schema_only_mode is True and comparison_result.get("are_equivalent") is True:
-        details = comparison_result.get("details", {})
-        row_comparison = details.get("row_comparison", {})
-
-        row_count1 = row_comparison.get("row_count1")
-        row_count2 = row_comparison.get("row_count2")
-
-        if (
-            isinstance(row_count1, (int, float))
-            and row_count1 == 0
-            and isinstance(row_count2, (int, float))
-            and row_count2 == 0
-        ):
-            return f"{base_message_prefix} Files have the same headers but are both empty (no data rows). This should not happen. Please verify the instructions and try again."
-
-    # Default formatting if the special condition isn't met
-    return f"{base_message_prefix} {comparison_result}"
-
-
-@function_tool
-async def execute_transformation(code: str) -> str:
-    """Executes the transformation code on the input and returns the
-    comparison difference between the transformed output and the target output example.
-
-    Args:
-        code: The code to execute on the input.
-    """
-    if INPUT_SDIF_PATH is None or OUTPUT_TARGET_FILES is None:
-        return "Error: Transformation context not initialized"
-
-    code_transformer = CodeTransformer(
-        function=code,
-        code_executor=LocalCodeExecutor(disable_security_warning=True),
-    )
-    generated_output_path = code_transformer.export(INPUT_SDIF_PATH)
-
-    comparisons = []
-    comparator_kwargs = {}
-    if SCHEMA_ONLY:
-        comparator_kwargs["check_structure_only"] = True
-
-    if os.path.isdir(generated_output_path):
-        # If it's a directory, compare each file with its corresponding target
-        generated_files = os.listdir(generated_output_path)
-
-        for (
-            output_base_file,
-            output_target_file_name,
-        ) in OUTPUT_TARGET_FILES.items():
-            if output_target_file_name in generated_files:
-                generated_file_path = os.path.join(
-                    generated_output_path, output_target_file_name
-                )
-                comparator = get_comparator(output_target_file_name.split(".")[-1])
-                comparison = comparator.compare(
-                    generated_file_path, output_base_file, **comparator_kwargs
-                )
-                formatted_message = _format_comparison_output(
-                    comparison,
-                    SCHEMA_ONLY,
-                    generated_file_path,
-                    output_target_file_name,
-                )
-                comparisons.append(formatted_message)
-            else:
-                comparisons.append(
-                    f"Error: {output_target_file_name} not found in the generated output"
-                )
-    else:
-        # If it's a single file, ensure there's only one target and compare
-        if len(OUTPUT_TARGET_FILES) == 1:
-            output_file = list(OUTPUT_TARGET_FILES.keys())[0]
-            output_target_file_name = list(OUTPUT_TARGET_FILES.values())[0]
-            comparator = get_comparator(output_file.split(".")[-1])
-            comparison = comparator.compare(
-                generated_output_path, output_file, **comparator_kwargs
-            )
-            formatted_message = _format_comparison_output(
-                comparison,
-                SCHEMA_ONLY,
-                str(generated_output_path),
-                output_target_file_name,
-            )
-            comparisons.append(formatted_message)
-        else:
-            comparisons.append(
-                "Error: Single output file generated but multiple target files expected"
-            )
-
-    return "\n".join(comparisons)
-
-
-class SyncpulseTransformationBuilder(AsyncTransformationBuilder):
-    """This class is used to build a transformation code that will be used to transform a SDIF database into a set of files following the format of the given output files."""
-
-    def __init__(
-        self,
-        mcp_server: MCPServer,
-        mcp_session: ClientSession,
-        llm_model: str = "o4-mini",
-    ):
-        self.mcp_server = mcp_server
-        self.mcp_session = mcp_session
-        self.llm_model = llm_model
-
-    async def build(
-        self,
-        sdif: Path,
-        output_target_files: Dict[FilePath, str] | List[FilePath] | FilePath,
-        output_sdif: Optional[Path] = None,
-        instructions: str = "",
-        schema_only: bool = False,
-        representer_kwargs: Optional[Dict[str, Any]] = None,
-    ) -> str:
-        global INPUT_SDIF_PATH, OUTPUT_TARGET_FILES, SCHEMA_ONLY
-
-        INPUT_SDIF_PATH = Path(sdif).resolve()
-        SCHEMA_ONLY = schema_only
-        # We must encode the path because special characters are not allowed in mcp read_resource()
-        input_sdif_mcp_uri_path = base64.b64encode(str(sdif).encode()).decode()
-        output_sdif_mcp_uri_path = (
-            base64.b64encode(str(output_sdif).encode()).decode()
-            if output_sdif
-            else None
-        )
-
-        input_schema = await self.mcp_session.read_resource(
-            f"schema://{input_sdif_mcp_uri_path}"
-        )
-        input_sample = await self.mcp_session.read_resource(
-            f"sample://{input_sdif_mcp_uri_path}"
-        )
-
-        output_schema_text = "N/A"
-        output_sample_text = "N/A"
-        if output_sdif_mcp_uri_path:
-            try:
-                output_schema_content = await self.mcp_session.read_resource(
-                    f"schema://{output_sdif_mcp_uri_path}"
-                )
-                if output_schema_content.contents:
-                    output_schema_text = output_schema_content.contents[0].text
-            except Exception as e:
-                print(
-                    f"Warning: Could not read schema for output_sdif {output_sdif_mcp_uri_path}: {e}"
-                )
-
-            try:
-                output_sample_content = await self.mcp_session.read_resource(
-                    f"sample://{output_sdif_mcp_uri_path}"
-                )
-                if output_sample_content.contents:
-                    output_sample_text = output_sample_content.contents[0].text
-            except Exception as e:
-                print(
-                    f"Warning: Could not read sample for output_sdif {output_sdif_mcp_uri_path}: {e}"
-                )
-
-        # OUTPUT_TARGET_FILES keys are absolute paths to original example files for local reading by representers/comparators.
-        # Values are agent-facing filenames.
-        if isinstance(output_target_files, FilePath):
-            OUTPUT_TARGET_FILES = {
-                Path(output_target_files).resolve(): Path(output_target_files).name
-            }
-        elif isinstance(output_target_files, list):
-            OUTPUT_TARGET_FILES = {
-                Path(file_path).resolve(): Path(file_path).name
-                for file_path in output_target_files
-            }
-        elif isinstance(output_target_files, dict):
-            temp_map = {}
-            for k, v in output_target_files.items():
-                if isinstance(k, Path):
-                    temp_map[k.resolve()] = v
-                else:
-                    temp_map[k] = v
-            OUTPUT_TARGET_FILES = temp_map
-        else:
-            OUTPUT_TARGET_FILES = {}
-
-        output_representation = defaultdict(dict)
-        if OUTPUT_TARGET_FILES:
-            for file_key_abs_path in list(OUTPUT_TARGET_FILES.keys()):
-                agent_facing_name = OUTPUT_TARGET_FILES[file_key_abs_path]
-                print(f"Representing {agent_facing_name} from {file_key_abs_path}")
-                try:
-                    # Representer uses the absolute path (file_key_abs_path) to read the example file.
-                    representer = get_representer(file_key_abs_path)
-                    representation, used_params = representer.represent(
-                        file_key_abs_path, **(representer_kwargs or {})
-                    )
-                    output_representation[agent_facing_name] = {
-                        "representation": representation,
-                        "used_params": used_params,
-                    }
-                except Exception as e:
-                    print(
-                        f"Warning: Could not get representation for {agent_facing_name} (path {file_key_abs_path}): {e}"
-                    )
-                    output_representation[agent_facing_name] = (
-                        f"Error representing file: {e}"
-                    )
-
-        prompt = await self.mcp_session.get_prompt(
-            "create_transformation",
-            arguments={
-                "input_file": Path(
-                    input_sdif_mcp_uri_path
-                ).name,  # Display name for prompt (from relative path)
-                "input_schema": input_schema.contents[0].text
-                if input_schema.contents
-                else "Error reading input schema",
-                "input_sample": input_sample.contents[0].text
-                if input_sample.contents
-                else "Error reading input sample",
-                "output_files": str(list(OUTPUT_TARGET_FILES.values())),
-                "output_schema": output_schema_text,
-                "output_sample": output_sample_text
-                if not SCHEMA_ONLY
-                else "Sample not available. File is empty (no data).",
-                "output_representation": str(output_representation),
-                "instructions": instructions
-                or "No instructions provided. Use the output example.",
-            },
-        )
-        agent = Agent(
-            name="Transformation Builder",
-            mcp_servers=[self.mcp_server],
-            tools=[execute_transformation],
-            model=self.llm_model,
-        )
-        result = await Runner.run(agent, prompt.messages[0].content.text)
-        transformation_code = self.parse_code(result.final_output)
-        return transformation_code
-
-    def parse_code(self, code) -> str:
-        match = re.search(r"```(?:python)?(.*?)```", code, re.DOTALL)
-        if match:
-            return match.group(1).strip()
-        else:
-            # Handle case where no code block is found
-            return code.strip()
All other files listed at the top are unchanged between 0.2.10 and 0.2.12.