dataforge-07-mcp 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataforge_07_mcp-0.1.0/PKG-INFO +124 -0
- dataforge_07_mcp-0.1.0/README.md +105 -0
- dataforge_07_mcp-0.1.0/dataforge_07_mcp.egg-info/PKG-INFO +124 -0
- dataforge_07_mcp-0.1.0/dataforge_07_mcp.egg-info/SOURCES.txt +13 -0
- dataforge_07_mcp-0.1.0/dataforge_07_mcp.egg-info/dependency_links.txt +1 -0
- dataforge_07_mcp-0.1.0/dataforge_07_mcp.egg-info/entry_points.txt +3 -0
- dataforge_07_mcp-0.1.0/dataforge_07_mcp.egg-info/requires.txt +5 -0
- dataforge_07_mcp-0.1.0/dataforge_07_mcp.egg-info/top_level.txt +1 -0
- dataforge_07_mcp-0.1.0/dataforge_mcp/__init__.py +7 -0
- dataforge_07_mcp-0.1.0/dataforge_mcp/server.py +104 -0
- dataforge_07_mcp-0.1.0/dataforge_mcp/tools.py +396 -0
- dataforge_07_mcp-0.1.0/pyproject.toml +39 -0
- dataforge_07_mcp-0.1.0/setup.cfg +4 -0
- dataforge_07_mcp-0.1.0/tests/test_server_integration.py +62 -0
- dataforge_07_mcp-0.1.0/tests/test_tools.py +153 -0
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dataforge_07_mcp
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Model Context Protocol server for DataForge data-quality tools.
|
|
5
|
+
License-Expression: Apache-2.0
|
|
6
|
+
Project-URL: Homepage, https://github.com/Aegis15/dataforge
|
|
7
|
+
Project-URL: Repository, https://github.com/Aegis15/dataforge
|
|
8
|
+
Project-URL: Documentation, https://dataforge.praneshrajan15.workers.dev/playground
|
|
9
|
+
Keywords: data-quality,dataforge,mcp,model-context-protocol
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Requires-Python: <3.13,>=3.11
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
Requires-Dist: dataforge_07<0.2,>=0.1.0
|
|
16
|
+
Requires-Dist: mcp>=1.27
|
|
17
|
+
Provides-Extra: dev
|
|
18
|
+
Requires-Dist: pytest>=9.0.3; extra == "dev"
|
|
19
|
+
|
|
20
|
+
# dataforge-mcp
|
|
21
|
+
|
|
22
|
+
`dataforge-mcp` exposes DataForge's shipped CSV profiling, detection, repair,
|
|
23
|
+
verification, and transaction-revert paths as Model Context Protocol tools.
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
cd dataforge-mcp
|
|
27
|
+
python -m pip install -e ".[dev]"
|
|
28
|
+
dataforge-mcp serve --allowed-root /path/to/csv/workspace
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
For local development from this repository:
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
cd dataforge-mcp
|
|
35
|
+
python -m pip install -e ".[dev]"
|
|
36
|
+
dataforge-mcp serve --allowed-root ..
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
The default transport is stdio, which is what local desktop MCP clients expect.
|
|
40
|
+
For local Streamable HTTP experiments:
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
dataforge-mcp serve --transport streamable-http --host 127.0.0.1 --port 8000
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
`dry_run` is the safe default. To allow file mutation through MCP, start the
|
|
47
|
+
server with an explicit allowed root and `--enable-apply`:
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
dataforge-mcp serve --allowed-root /path/to/csv/workspace --enable-apply
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Tools
|
|
54
|
+
|
|
55
|
+
- `dataforge_profile(path: str)` - summarize CSV shape plus detected issues.
|
|
56
|
+
- `dataforge_detect_errors(path: str)` - return detected issues only.
|
|
57
|
+
- `dataforge_verify_fix(fix_spec: dict)` - run one candidate fix through stale
|
|
58
|
+
value checks, safety, and verification.
|
|
59
|
+
- `dataforge_apply_repairs(path: str, mode: "dry_run" | "apply")` - propose
|
|
60
|
+
verified repairs and optionally write a reversible transaction.
|
|
61
|
+
- `dataforge_revert(txn_id: str)` - restore a transaction's original bytes.
|
|
62
|
+
|
|
63
|
+
## Client Configuration
|
|
64
|
+
|
|
65
|
+
Use the same server command for Claude Desktop, Cursor, Windsurf, or any local
|
|
66
|
+
MCP client that supports stdio servers:
|
|
67
|
+
|
|
68
|
+
```json
|
|
69
|
+
{
|
|
70
|
+
"mcpServers": {
|
|
71
|
+
"dataforge": {
|
|
72
|
+
"command": "dataforge-mcp",
|
|
73
|
+
"args": ["serve", "--allowed-root", "/path/to/csv/workspace"]
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
}
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
If your client cannot resolve the console script, replace `command` with the
|
|
80
|
+
absolute path returned by your shell:
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
which dataforge-mcp
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
On Windows PowerShell:
|
|
87
|
+
|
|
88
|
+
```powershell
|
|
89
|
+
Get-Command dataforge-mcp
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
Before describing a build as agent-ready, run an MCP Inspector smoke check
|
|
93
|
+
against a fixture directory and confirm the profile, detect, verify, dry-run
|
|
94
|
+
apply, and disabled-apply paths:
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
npx @modelcontextprotocol/inspector dataforge-mcp serve --allowed-root /path/to/csv/workspace
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
## Safety Model
|
|
101
|
+
|
|
102
|
+
`apply` mode uses DataForge's detector -> repairer -> SafetyFilter ->
|
|
103
|
+
SMTVerifier -> transaction-log path. The tool writes the transaction journal and
|
|
104
|
+
source snapshot before mutating the CSV, and `dataforge_revert` restores the
|
|
105
|
+
snapshot only when the current file still matches the recorded post-state hash.
|
|
106
|
+
|
|
107
|
+
The MCP server does not enable live LLM repair fallback by default. It does not
|
|
108
|
+
send CSV contents to any external model provider. It also rejects CSV and schema
|
|
109
|
+
paths outside the configured allowed roots, and `apply` mode is disabled unless
|
|
110
|
+
the server is started with `--enable-apply` or `DATAFORGE_MCP_ENABLE_APPLY=1`.
|
|
111
|
+
|
|
112
|
+
## Release
|
|
113
|
+
|
|
114
|
+
The package is intended to release independently from the nested
|
|
115
|
+
`dataforge-mcp/` source directory as the `dataforge_07_mcp` distribution, but
|
|
116
|
+
it is not published yet. After PyPI Trusted Publishing is configured, the
|
|
117
|
+
workflow will build on tags matching:
|
|
118
|
+
|
|
119
|
+
```text
|
|
120
|
+
dataforge-mcp-v*
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
The package depends on `dataforge_07` and the official Python `mcp` SDK; it does
|
|
124
|
+
not vendor DataForge or add MCP dependencies to the core package.
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
# dataforge-mcp
|
|
2
|
+
|
|
3
|
+
`dataforge-mcp` exposes DataForge's shipped CSV profiling, detection, repair,
|
|
4
|
+
verification, and transaction-revert paths as Model Context Protocol tools.
|
|
5
|
+
|
|
6
|
+
```bash
|
|
7
|
+
cd dataforge-mcp
|
|
8
|
+
python -m pip install -e ".[dev]"
|
|
9
|
+
dataforge-mcp serve --allowed-root /path/to/csv/workspace
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
For local development from this repository:
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
cd dataforge-mcp
|
|
16
|
+
python -m pip install -e ".[dev]"
|
|
17
|
+
dataforge-mcp serve --allowed-root ..
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
The default transport is stdio, which is what local desktop MCP clients expect.
|
|
21
|
+
For local Streamable HTTP experiments:
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
dataforge-mcp serve --transport streamable-http --host 127.0.0.1 --port 8000
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
`dry_run` is the safe default. To allow file mutation through MCP, start the
|
|
28
|
+
server with an explicit allowed root and `--enable-apply`:
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
dataforge-mcp serve --allowed-root /path/to/csv/workspace --enable-apply
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## Tools
|
|
35
|
+
|
|
36
|
+
- `dataforge_profile(path: str)` - summarize CSV shape plus detected issues.
|
|
37
|
+
- `dataforge_detect_errors(path: str)` - return detected issues only.
|
|
38
|
+
- `dataforge_verify_fix(fix_spec: dict)` - run one candidate fix through stale
|
|
39
|
+
value checks, safety, and verification.
|
|
40
|
+
- `dataforge_apply_repairs(path: str, mode: "dry_run" | "apply")` - propose
|
|
41
|
+
verified repairs and optionally write a reversible transaction.
|
|
42
|
+
- `dataforge_revert(txn_id: str)` - restore a transaction's original bytes.
|
|
43
|
+
|
|
44
|
+
## Client Configuration
|
|
45
|
+
|
|
46
|
+
Use the same server command for Claude Desktop, Cursor, Windsurf, or any local
|
|
47
|
+
MCP client that supports stdio servers:
|
|
48
|
+
|
|
49
|
+
```json
|
|
50
|
+
{
|
|
51
|
+
"mcpServers": {
|
|
52
|
+
"dataforge": {
|
|
53
|
+
"command": "dataforge-mcp",
|
|
54
|
+
"args": ["serve", "--allowed-root", "/path/to/csv/workspace"]
|
|
55
|
+
}
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
If your client cannot resolve the console script, replace `command` with the
|
|
61
|
+
absolute path returned by your shell:
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
which dataforge-mcp
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
On Windows PowerShell:
|
|
68
|
+
|
|
69
|
+
```powershell
|
|
70
|
+
Get-Command dataforge-mcp
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
Before describing a build as agent-ready, run an MCP Inspector smoke check
|
|
74
|
+
against a fixture directory and confirm the profile, detect, verify, dry-run
|
|
75
|
+
apply, and disabled-apply paths:
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
npx @modelcontextprotocol/inspector dataforge-mcp serve --allowed-root /path/to/csv/workspace
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
## Safety Model
|
|
82
|
+
|
|
83
|
+
`apply` mode uses DataForge's detector -> repairer -> SafetyFilter ->
|
|
84
|
+
SMTVerifier -> transaction-log path. The tool writes the transaction journal and
|
|
85
|
+
source snapshot before mutating the CSV, and `dataforge_revert` restores the
|
|
86
|
+
snapshot only when the current file still matches the recorded post-state hash.
|
|
87
|
+
|
|
88
|
+
The MCP server does not enable live LLM repair fallback by default. It does not
|
|
89
|
+
send CSV contents to any external model provider. It also rejects CSV and schema
|
|
90
|
+
paths outside the configured allowed roots, and `apply` mode is disabled unless
|
|
91
|
+
the server is started with `--enable-apply` or `DATAFORGE_MCP_ENABLE_APPLY=1`.
|
|
92
|
+
|
|
93
|
+
## Release
|
|
94
|
+
|
|
95
|
+
The package is intended to release independently from the nested
|
|
96
|
+
`dataforge-mcp/` source directory as the `dataforge_07_mcp` distribution, but
|
|
97
|
+
it is not published yet. After PyPI Trusted Publishing is configured, the
|
|
98
|
+
workflow will build on tags matching:
|
|
99
|
+
|
|
100
|
+
```text
|
|
101
|
+
dataforge-mcp-v*
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
The package depends on `dataforge_07` and the official Python `mcp` SDK; it does
|
|
105
|
+
not vendor DataForge or add MCP dependencies to the core package.
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dataforge_07_mcp
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Model Context Protocol server for DataForge data-quality tools.
|
|
5
|
+
License-Expression: Apache-2.0
|
|
6
|
+
Project-URL: Homepage, https://github.com/Aegis15/dataforge
|
|
7
|
+
Project-URL: Repository, https://github.com/Aegis15/dataforge
|
|
8
|
+
Project-URL: Documentation, https://dataforge.praneshrajan15.workers.dev/playground
|
|
9
|
+
Keywords: data-quality,dataforge,mcp,model-context-protocol
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Requires-Python: <3.13,>=3.11
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
Requires-Dist: dataforge_07<0.2,>=0.1.0
|
|
16
|
+
Requires-Dist: mcp>=1.27
|
|
17
|
+
Provides-Extra: dev
|
|
18
|
+
Requires-Dist: pytest>=9.0.3; extra == "dev"
|
|
19
|
+
|
|
20
|
+
# dataforge-mcp
|
|
21
|
+
|
|
22
|
+
`dataforge-mcp` exposes DataForge's shipped CSV profiling, detection, repair,
|
|
23
|
+
verification, and transaction-revert paths as Model Context Protocol tools.
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
cd dataforge-mcp
|
|
27
|
+
python -m pip install -e ".[dev]"
|
|
28
|
+
dataforge-mcp serve --allowed-root /path/to/csv/workspace
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
For local development from this repository:
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
cd dataforge-mcp
|
|
35
|
+
python -m pip install -e ".[dev]"
|
|
36
|
+
dataforge-mcp serve --allowed-root ..
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
The default transport is stdio, which is what local desktop MCP clients expect.
|
|
40
|
+
For local Streamable HTTP experiments:
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
dataforge-mcp serve --transport streamable-http --host 127.0.0.1 --port 8000
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
`dry_run` is the safe default. To allow file mutation through MCP, start the
|
|
47
|
+
server with an explicit allowed root and `--enable-apply`:
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
dataforge-mcp serve --allowed-root /path/to/csv/workspace --enable-apply
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Tools
|
|
54
|
+
|
|
55
|
+
- `dataforge_profile(path: str)` - summarize CSV shape plus detected issues.
|
|
56
|
+
- `dataforge_detect_errors(path: str)` - return detected issues only.
|
|
57
|
+
- `dataforge_verify_fix(fix_spec: dict)` - run one candidate fix through stale
|
|
58
|
+
value checks, safety, and verification.
|
|
59
|
+
- `dataforge_apply_repairs(path: str, mode: "dry_run" | "apply")` - propose
|
|
60
|
+
verified repairs and optionally write a reversible transaction.
|
|
61
|
+
- `dataforge_revert(txn_id: str)` - restore a transaction's original bytes.
|
|
62
|
+
|
|
63
|
+
## Client Configuration
|
|
64
|
+
|
|
65
|
+
Use the same server command for Claude Desktop, Cursor, Windsurf, or any local
|
|
66
|
+
MCP client that supports stdio servers:
|
|
67
|
+
|
|
68
|
+
```json
|
|
69
|
+
{
|
|
70
|
+
"mcpServers": {
|
|
71
|
+
"dataforge": {
|
|
72
|
+
"command": "dataforge-mcp",
|
|
73
|
+
"args": ["serve", "--allowed-root", "/path/to/csv/workspace"]
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
}
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
If your client cannot resolve the console script, replace `command` with the
|
|
80
|
+
absolute path returned by your shell:
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
which dataforge-mcp
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
On Windows PowerShell:
|
|
87
|
+
|
|
88
|
+
```powershell
|
|
89
|
+
Get-Command dataforge-mcp
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
Before describing a build as agent-ready, run an MCP Inspector smoke check
|
|
93
|
+
against a fixture directory and confirm the profile, detect, verify, dry-run
|
|
94
|
+
apply, and disabled-apply paths:
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
npx @modelcontextprotocol/inspector dataforge-mcp serve --allowed-root /path/to/csv/workspace
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
## Safety Model
|
|
101
|
+
|
|
102
|
+
`apply` mode uses DataForge's detector -> repairer -> SafetyFilter ->
|
|
103
|
+
SMTVerifier -> transaction-log path. The tool writes the transaction journal and
|
|
104
|
+
source snapshot before mutating the CSV, and `dataforge_revert` restores the
|
|
105
|
+
snapshot only when the current file still matches the recorded post-state hash.
|
|
106
|
+
|
|
107
|
+
The MCP server does not enable live LLM repair fallback by default. It does not
|
|
108
|
+
send CSV contents to any external model provider. It also rejects CSV and schema
|
|
109
|
+
paths outside the configured allowed roots, and `apply` mode is disabled unless
|
|
110
|
+
the server is started with `--enable-apply` or `DATAFORGE_MCP_ENABLE_APPLY=1`.
|
|
111
|
+
|
|
112
|
+
## Release
|
|
113
|
+
|
|
114
|
+
The package is intended to release independently from the nested
|
|
115
|
+
`dataforge-mcp/` source directory as the `dataforge_07_mcp` distribution, but
|
|
116
|
+
it is not published yet. After PyPI Trusted Publishing is configured, the
|
|
117
|
+
workflow will build on tags matching:
|
|
118
|
+
|
|
119
|
+
```text
|
|
120
|
+
dataforge-mcp-v*
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
The package depends on `dataforge_07` and the official Python `mcp` SDK; it does
|
|
124
|
+
not vendor DataForge or add MCP dependencies to the core package.
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
dataforge_07_mcp.egg-info/PKG-INFO
|
|
4
|
+
dataforge_07_mcp.egg-info/SOURCES.txt
|
|
5
|
+
dataforge_07_mcp.egg-info/dependency_links.txt
|
|
6
|
+
dataforge_07_mcp.egg-info/entry_points.txt
|
|
7
|
+
dataforge_07_mcp.egg-info/requires.txt
|
|
8
|
+
dataforge_07_mcp.egg-info/top_level.txt
|
|
9
|
+
dataforge_mcp/__init__.py
|
|
10
|
+
dataforge_mcp/server.py
|
|
11
|
+
dataforge_mcp/tools.py
|
|
12
|
+
tests/test_server_integration.py
|
|
13
|
+
tests/test_tools.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
dataforge_mcp
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
"""Executable MCP server for DataForge."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
from typing import Literal
|
|
7
|
+
|
|
8
|
+
from mcp.server.fastmcp import FastMCP
|
|
9
|
+
|
|
10
|
+
from dataforge_mcp.tools import (
|
|
11
|
+
configure_mcp_security,
|
|
12
|
+
dataforge_apply_repairs,
|
|
13
|
+
dataforge_detect_errors,
|
|
14
|
+
dataforge_profile,
|
|
15
|
+
dataforge_revert,
|
|
16
|
+
dataforge_verify_fix,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
TransportLiteral = Literal["stdio", "streamable-http"]
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def create_server(*, host: str = "127.0.0.1", port: int = 8000) -> FastMCP:
    """Build the DataForge ``FastMCP`` application and register its tools.

    Args:
        host: Host value forwarded to ``FastMCP``.
        port: Port value forwarded to ``FastMCP``.

    Returns:
        A ``FastMCP`` instance named ``"DataForge"`` with the five DataForge
        tool functions registered under their public tool names.
    """
    server_instructions = (
        "DataForge profiles CSVs, detects data-quality issues, proposes "
        "verified repairs, applies reversible transactions, and reverts them."
    )
    app = FastMCP(
        "DataForge",
        instructions=server_instructions,
        host=host,
        port=port,
        stateless_http=True,
        json_response=True,
    )

    # Tool name -> implementation, registered in this fixed order.
    registrations = (
        ("dataforge_profile", dataforge_profile),
        ("dataforge_detect_errors", dataforge_detect_errors),
        ("dataforge_verify_fix", dataforge_verify_fix),
        ("dataforge_apply_repairs", dataforge_apply_repairs),
        ("dataforge_revert", dataforge_revert),
    )
    for tool_name, tool_fn in registrations:
        app.tool(name=tool_name)(tool_fn)
    return app
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def serve(
    *,
    transport: TransportLiteral = "stdio",
    host: str = "127.0.0.1",
    port: int = 8000,
    enable_apply: bool = False,
    allowed_roots: list[str] | None = None,
) -> None:
    """Configure MCP security settings, then start the DataForge server.

    Args:
        transport: MCP transport handed to ``FastMCP.run``.
        host: Host forwarded to ``create_server``.
        port: Port forwarded to ``create_server``.
        enable_apply: Forwarded to ``configure_mcp_security``.
        allowed_roots: Forwarded to ``configure_mcp_security``; ``None`` is
            passed through unchanged.
    """
    # Security settings are installed before the server object is created.
    configure_mcp_security(allowed_roots=allowed_roots, enable_apply=enable_apply)
    create_server(host=host, port=port).run(transport=transport)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _build_parser() -> argparse.ArgumentParser:
    """Assemble the ``dataforge-mcp`` parser with its ``serve`` subcommand.

    Returns:
        A parser whose namespace carries ``command`` (``None`` when no
        subcommand is given) and, for ``serve``, the attributes ``transport``,
        ``host``, ``port``, ``enable_apply`` and ``allowed_roots``.
    """
    root_parser = argparse.ArgumentParser(prog="dataforge-mcp")
    commands = root_parser.add_subparsers(dest="command")
    serve_cmd = commands.add_parser("serve", help="Start the MCP server.")
    add_option = serve_cmd.add_argument

    add_option(
        "--transport",
        default="stdio",
        choices=("stdio", "streamable-http"),
        help="MCP transport to use.",
    )
    add_option("--host", default="127.0.0.1", help="HTTP host.")
    add_option("--port", type=int, default=8000, help="HTTP port.")
    add_option(
        "--enable-apply",
        action="store_true",
        help="Allow MCP clients to mutate CSV files through reversible transactions.",
    )
    # "append" gathers every occurrence into a list; the attribute stays None
    # when the flag is never supplied.
    add_option(
        "--allowed-root",
        dest="allowed_roots",
        action="append",
        help="Filesystem root that MCP tools may read or mutate. May be repeated.",
    )
    return root_parser
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def main(argv: list[str] | None = None) -> None:
    """Console entry point for ``dataforge-mcp``.

    Args:
        argv: Argument list handed to the parser; ``None`` is passed through
            to ``argparse`` unchanged.

    Raises:
        SystemExit: With status 0 after printing help when no subcommand was
            given, or via ``parser.error`` for a command this function does
            not handle.
    """
    parser = _build_parser()
    options = parser.parse_args(argv)
    command = options.command

    if command == "serve":
        serve(
            transport=options.transport,
            host=options.host,
            port=options.port,
            enable_apply=options.enable_apply,
            allowed_roots=options.allowed_roots,
        )
        return

    if command is None:
        # No subcommand: show usage and exit successfully.
        parser.print_help()
        raise SystemExit(0)

    parser.error(f"Unknown command: {command}")
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
if __name__ == "__main__":
|
|
104
|
+
main()
|
|
@@ -0,0 +1,396 @@
|
|
|
1
|
+
"""Structured MCP tool functions backed by DataForge's public API."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from collections.abc import Sequence
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any, Literal
|
|
9
|
+
|
|
10
|
+
from pydantic import BaseModel, Field
|
|
11
|
+
|
|
12
|
+
from dataforge import (
|
|
13
|
+
CONTRACT_VERSION,
|
|
14
|
+
CellFix,
|
|
15
|
+
Issue,
|
|
16
|
+
ProposedFix,
|
|
17
|
+
RepairPipelineRequest,
|
|
18
|
+
SafetyContext,
|
|
19
|
+
SafetyFilter,
|
|
20
|
+
SafetyVerdict,
|
|
21
|
+
Schema,
|
|
22
|
+
SMTVerifier,
|
|
23
|
+
TransactionLogError,
|
|
24
|
+
VerificationVerdict,
|
|
25
|
+
VerifiedFix,
|
|
26
|
+
load_schema,
|
|
27
|
+
read_csv,
|
|
28
|
+
revert_transaction,
|
|
29
|
+
run_all_detectors,
|
|
30
|
+
run_repair_pipeline,
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
_APPLY_ENABLED = False
|
|
34
|
+
_ALLOWED_ROOTS: tuple[Path, ...] | None = None
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class IssueResult(BaseModel):
    """MCP-safe representation of a DataForge issue."""

    # Cell location of the issue.
    row: int
    column: str

    # Classification; populated from the Issue's type, severity value and
    # confidence by the converter in this module.
    issue_type: str
    severity: str
    confidence: float

    # `expected` is the only field that may be None; every field is required.
    expected: str | None
    actual: str
    reason: str
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class FixResult(BaseModel):
    """MCP-safe representation of an accepted repair proposal."""

    # Target cell and the value change being proposed.
    row: int
    column: str
    old_value: str
    new_value: str

    # Identifiers copied from the underlying fix object.
    detector_id: str
    operation: str

    # Justification metadata; all fields are required (no defaults).
    reason: str
    confidence: float
    provenance: str
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class ProfileResult(BaseModel):
    """Structured result returned by the profile tool."""

    # Path of the profiled CSV, as a string.
    path: str

    # Shape information: row count, column count and the column labels.
    rows: int
    columns: int
    column_names: list[str]

    # Detected issues plus their count.
    total_issues: int
    issues: list[IssueResult]
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class VerifyFixResult(BaseModel):
    """Structured result returned by the fix verifier tool."""

    # Required outcome fields.
    accept: bool
    reason: str

    # Optional gate details; they default to None / an empty list when the
    # producing code does not supply them.
    safety_verdict: str | None = None
    verifier_verdict: str | None = None
    unsat_core: list[str] = Field(default_factory=list)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
class TxnReceipt(BaseModel):
    """Structured receipt returned by the repair tool."""

    path: str

    # Both version markers are pinned to the single literal "repair_receipt_v1".
    schema_version: Literal["repair_receipt_v1"] = "repair_receipt_v1"
    receipt_version: Literal["repair_receipt_v1"] = "repair_receipt_v1"

    # Only "dry_run" and "apply" are accepted.
    mode: Literal["dry_run", "apply"]

    # Defaults to the CONTRACT_VERSION constant imported from dataforge.
    contract_version: str = CONTRACT_VERSION

    # Transaction outcome; txn_id is required but may be None.
    applied: bool
    txn_id: str | None
    reversible: bool

    # Content hashes; post_sha256 defaults to None.
    source_sha256: str
    post_sha256: str | None = None

    # Gate verdicts, as strings.
    safety_verdict: str
    verifier_verdict: str

    # Optional plan hash and revert hint, both defaulting to None.
    patch_plan_sha256: str | None = None
    revert_command: str | None = None

    allowed_columns: list[str]
    valid_rows: list[int]

    # Free-form detail lists; each defaults to a fresh empty list.
    root_causes: list[dict[str, Any]] = Field(default_factory=list)
    candidate_repairs: list[dict[str, Any]] = Field(default_factory=list)
    proof_obligations: list[dict[str, Any]] = Field(default_factory=list)
    limitations: list[str] = Field(default_factory=list)

    # Summary counts, explanation and the fixes themselves.
    issues_count: int
    fixes_count: int
    reason: str
    fixes: list[FixResult]
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
class RevertReceipt(BaseModel):
    """Structured receipt returned by the revert tool."""

    # Transaction identifier and the file it refers to.
    txn_id: str
    source_path: str

    # Outcome; reverted_at is required but may be None.
    restored: bool
    reverted_at: str | None
    reason: str
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def configure_mcp_security(
    *,
    enable_apply: bool = False,
    allowed_roots: Sequence[str | Path] | None = None,
) -> None:
    """Set the module-wide apply switch and filesystem allowlist.

    Args:
        enable_apply: Value stored in the module-level ``_APPLY_ENABLED`` flag.
        allowed_roots: Roots to store in ``_ALLOWED_ROOTS``. Each entry is
            user-expanded and resolved to an absolute path. ``None`` clears
            the stored allowlist.
    """
    global _APPLY_ENABLED, _ALLOWED_ROOTS
    _APPLY_ENABLED = enable_apply
    # Normalise once here so later containment checks compare resolved paths.
    _ALLOWED_ROOTS = (
        None
        if allowed_roots is None
        else tuple(Path(entry).expanduser().resolve() for entry in allowed_roots)
    )
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def _env_flag_enabled(name: str) -> bool:
    """Report whether environment variable *name* holds a truthy flag value.

    The value is stripped and lower-cased, then compared against
    ``1``, ``true``, ``yes`` and ``on``. An unset variable counts as false.
    """
    raw_value = os.environ.get(name, "")
    normalized = raw_value.strip().lower()
    return normalized in {"1", "true", "yes", "on"}
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _apply_is_enabled() -> bool:
    """Report whether apply mode has been switched on.

    The configured module flag is consulted first; the
    ``DATAFORGE_MCP_ENABLE_APPLY`` environment flag is only read when the
    module flag is falsy.
    """
    if _APPLY_ENABLED:
        return _APPLY_ENABLED
    return _env_flag_enabled("DATAFORGE_MCP_ENABLE_APPLY")
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _allowed_roots() -> tuple[Path, ...]:
    """Return the filesystem roots that MCP tools are allowed to touch.

    Lookup order:

    1. ``DATAFORGE_MCP_ALLOWED_ROOTS`` when it is set to a non-blank value.
       The value is split on ``os.pathsep``; each entry is stripped of
       surrounding whitespace, and blank entries are dropped.
    2. The roots stored by ``configure_mcp_security`` when they are not ``None``.
    3. The current working directory.

    Returns:
        A tuple of user-expanded, resolved paths. It can be empty when the
        environment variable contains only separators/whitespace entries.
    """
    raw_roots = os.environ.get("DATAFORGE_MCP_ALLOWED_ROOTS", "")
    if raw_roots.strip():
        # Strip each entry *before* building the Path: an unstripped entry such
        # as " ~/data" would skip "~" expansion and resolve relative to the
        # working directory instead of naming the intended root.
        entries = (entry.strip() for entry in raw_roots.split(os.pathsep))
        return tuple(Path(entry).expanduser().resolve() for entry in entries if entry)
    if _ALLOWED_ROOTS is not None:
        return _ALLOWED_ROOTS
    return (Path.cwd().resolve(),)
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def _ensure_under_allowed_root(path: Path) -> Path:
    """Resolve *path* and confirm it sits inside one of the allowed roots.

    Args:
        path: Candidate path; it is user-expanded and resolved before checking.

    Returns:
        The resolved path when it equals, or is relative to, an allowed root.

    Raises:
        ValueError: If no allowed roots are available, or the resolved path is
            outside all of them.
    """
    candidate = path.expanduser().resolve()
    permitted = _allowed_roots()
    if not permitted:
        raise ValueError("At least one MCP allowed root must be configured.")

    if any(candidate == root or candidate.is_relative_to(root) for root in permitted):
        return candidate

    allowed = ", ".join(map(str, permitted))
    raise ValueError(
        f"Path is outside configured MCP allowed roots: {candidate}. Allowed: {allowed}"
    )
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def _resolve_csv_path(path: str) -> Path:
    """Turn a client-supplied CSV path into a validated, resolved ``Path``.

    Args:
        path: Path string received from an MCP client.

    Returns:
        The resolved path of an existing regular file inside the allowlist.

    Raises:
        ValueError: If the path is outside the allowed roots, does not exist,
            or exists but is not a file.
    """
    csv_file = _ensure_under_allowed_root(Path(path))
    if csv_file.is_file():
        return csv_file
    # Not a file: distinguish "missing" from "present but not a regular file".
    if not csv_file.exists():
        raise ValueError(f"CSV file does not exist: {csv_file}")
    raise ValueError(f"CSV path is not a file: {csv_file}")
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def _load_optional_schema(raw_path: object) -> Schema | None:
    """Load an optional schema path taken from an untrusted payload.

    Args:
        raw_path: Value from the client payload. ``None`` means no schema was
            supplied; any other value is converted with ``str`` and treated as
            a filesystem path.

    Returns:
        ``None`` when *raw_path* is ``None``, otherwise the result of
        ``load_schema`` for the validated path.

    Raises:
        ValueError: If the path is outside the allowed roots, does not exist,
            or is not a regular file.
    """
    if raw_path is None:
        return None
    schema_path = _ensure_under_allowed_root(Path(str(raw_path)))
    if not schema_path.exists():
        raise ValueError(f"Schema file does not exist: {schema_path}")
    # Mirror the CSV path validation: a directory (e.g. "" resolves to the
    # working directory) must be rejected here rather than reaching load_schema.
    if not schema_path.is_file():
        raise ValueError(f"Schema path is not a file: {schema_path}")
    return load_schema(schema_path)
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def _issue_to_result(issue: Issue) -> IssueResult:
    """Map a DataForge ``Issue`` onto the ``IssueResult`` MCP payload model.

    Every field is copied as-is except ``severity``, which is taken from the
    severity object's ``value`` attribute.
    """
    payload = {
        "row": issue.row,
        "column": issue.column,
        "issue_type": issue.issue_type,
        "severity": issue.severity.value,
        "confidence": issue.confidence,
        "expected": issue.expected,
        "actual": issue.actual,
        "reason": issue.reason,
    }
    return IssueResult(**payload)
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def _fix_to_result(proposed_fix: ProposedFix) -> FixResult:
    """Map a ``ProposedFix`` onto the ``FixResult`` MCP payload model.

    Cell-level fields come from ``proposed_fix.fix``; ``reason``,
    ``confidence`` and ``provenance`` come from the proposal itself.
    """
    cell = proposed_fix.fix
    cell_field_names = (
        "row",
        "column",
        "old_value",
        "new_value",
        "detector_id",
        "operation",
    )
    cell_fields = {name: getattr(cell, name) for name in cell_field_names}
    return FixResult(
        **cell_fields,
        reason=proposed_fix.reason,
        confidence=proposed_fix.confidence,
        provenance=proposed_fix.provenance,
    )
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def _verified_fix_to_result(verified_fix: VerifiedFix) -> FixResult:
    """Map a ``VerifiedFix`` onto the ``FixResult`` MCP payload model.

    All nine ``FixResult`` fields are read from same-named attributes on
    *verified_fix*.
    """
    field_names = (
        "row",
        "column",
        "old_value",
        "new_value",
        "detector_id",
        "operation",
        "reason",
        "confidence",
        "provenance",
    )
    return FixResult(**{name: getattr(verified_fix, name) for name in field_names})
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
def _run_detection(path: Path, schema: Schema | None = None) -> tuple[Any, list[Issue]]:
    """Load the CSV at *path* and run every DataForge detector over it.

    Args:
        path: CSV file to read with ``read_csv``.
        schema: Optional schema forwarded to ``run_all_detectors``.

    Returns:
        A ``(frame, issues)`` pair: the object returned by ``read_csv`` and
        the detector output for it.
    """
    frame = read_csv(path)
    detected = run_all_detectors(frame, schema)
    return frame, detected
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def _proposed_fix_from_spec(fix_spec: dict[str, Any]) -> tuple[Path, Schema | None, ProposedFix]:
    """Split a verifier payload into CSV path, optional schema and proposed fix.

    The cell fix may be supplied either as a nested ``"fix"`` dict or as flat
    top-level keys (``row``, ``column``, ``old_value``, ``new_value``,
    ``detector_id``, ``operation``).

    Args:
        fix_spec: Payload received from the MCP client.

    Returns:
        ``(csv_path, schema, proposed_fix)``.

    Raises:
        ValueError: If ``"path"`` is missing or falsy (path/schema validation
            helpers may raise it as well).
    """
    csv_ref = fix_spec.get("path")
    if not csv_ref:
        raise ValueError("fix_spec must include a CSV 'path'.")
    csv_path = _resolve_csv_path(str(csv_ref))
    schema = _load_optional_schema(fix_spec.get("schema_path"))

    nested = fix_spec.get("fix")
    if isinstance(nested, dict):
        fix_payload = nested
    else:
        # No nested dict: collect the flat cell-fix keys from the payload itself.
        flat_keys = {"row", "column", "old_value", "new_value", "detector_id", "operation"}
        fix_payload = {key: value for key, value in fix_spec.items() if key in flat_keys}

    candidate = ProposedFix(
        fix=CellFix.model_validate(fix_payload),
        reason=str(fix_spec.get("reason", "MCP-provided candidate fix.")),
        confidence=float(fix_spec.get("confidence", 1.0)),
        provenance=fix_spec.get("provenance", "deterministic"),
    )
    return csv_path, schema, candidate
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
def dataforge_profile(path: str) -> ProfileResult:
    """Profile a CSV file and return detected DataForge issues."""
    # The path is validated against the allowlist before anything is read.
    csv_path = _resolve_csv_path(path)
    frame, found = _run_detection(csv_path)

    column_labels = [str(label) for label in frame.columns]
    issue_payloads = list(map(_issue_to_result, found))
    return ProfileResult(
        path=str(csv_path),
        rows=len(frame.index),
        columns=len(frame.columns),
        column_names=column_labels,
        total_issues=len(found),
        issues=issue_payloads,
    )
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
def dataforge_detect_errors(path: str) -> list[IssueResult]:
    """Run the detectors on a CSV file and return only the issue list.

    Args:
        path: CSV location; resolved with ``_resolve_csv_path`` first.

    Returns:
        One ``IssueResult`` per detected issue, in detector order.
    """
    # The loaded frame is not needed here; only the findings are returned.
    _, found = _run_detection(_resolve_csv_path(path))
    return list(map(_issue_to_result, found))
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
def dataforge_verify_fix(fix_spec: dict[str, Any]) -> VerifyFixResult:
    """Decide whether a single candidate fix passes the DataForge gates.

    Checks run in order and the first failure is returned:

    1. the target column exists in the CSV,
    2. the target row index is within the loaded row count,
    3. the current cell (compared as a string) equals ``old_value``,
    4. ``SafetyFilter`` returns ``ALLOW``,
    5. ``SMTVerifier`` returns ``ACCEPT``.

    Args:
        fix_spec: Verifier payload, parsed by ``_proposed_fix_from_spec``.

    Returns:
        A ``VerifyFixResult``; ``accept`` is true only when every gate passes.
    """
    csv_path, schema, candidate = _proposed_fix_from_spec(fix_spec)
    frame = read_csv(csv_path)
    cell = candidate.fix

    # Structural gates: the fix must address a cell that actually exists.
    if cell.column not in frame.columns:
        return VerifyFixResult(accept=False, reason=f"Column '{cell.column}' does not exist.")
    if not 0 <= cell.row < len(frame.index):
        return VerifyFixResult(accept=False, reason=f"Row {cell.row} is out of bounds.")

    # Staleness gate: the file must still hold the value the fix was built for.
    observed = str(frame.at[cell.row, cell.column])
    if observed != cell.old_value:
        stale_reason = (
            f"Refusing stale fix for row {cell.row}, column '{cell.column}': "
            f"expected '{cell.old_value}', found '{observed}'."
        )
        return VerifyFixResult(accept=False, reason=stale_reason)

    safety = SafetyFilter().evaluate(candidate, schema, SafetyContext())
    safety_label = safety.verdict.value
    if safety.verdict != SafetyVerdict.ALLOW:
        return VerifyFixResult(
            accept=False,
            reason=safety.reason,
            safety_verdict=safety_label,
        )

    outcome = SMTVerifier().verify(frame, [candidate], schema)
    accepted = outcome.verdict == VerificationVerdict.ACCEPT
    return VerifyFixResult(
        accept=accepted,
        reason=outcome.reason,
        safety_verdict=safety_label,
        verifier_verdict=outcome.verdict.value,
        unsat_core=list(outcome.unsat_core),
    )
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
def dataforge_apply_repairs(path: str, mode: Literal["dry_run", "apply"]) -> TxnReceipt:
    """Run the repair pipeline on a CSV file and report what happened.

    The pipeline is always invoked with ``schema=None`` and
    ``allow_llm=False``.  ``"apply"`` is only permitted when
    ``_apply_is_enabled()`` says so.

    Args:
        path: CSV location; resolved with ``_resolve_csv_path`` before the
            mode is validated.
        mode: ``"dry_run"`` or ``"apply"``.

    Returns:
        A ``TxnReceipt`` mirroring the pipeline receipt plus the verified
        fixes.

    Raises:
        ValueError: If ``mode`` is not one of the two accepted values, or if
            ``"apply"`` is requested while apply mode is disabled.
    """
    csv_path = _resolve_csv_path(path)
    if mode not in {"dry_run", "apply"}:
        raise ValueError("mode must be 'dry_run' or 'apply'.")

    wants_write = mode == "apply"
    if wants_write and not _apply_is_enabled():
        disabled_message = (
            "MCP apply mode is disabled. Start the server with --enable-apply or set "
            "DATAFORGE_MCP_ENABLE_APPLY=1."
        )
        raise ValueError(disabled_message)

    request = RepairPipelineRequest(
        source_path=csv_path,
        mode=mode,
        schema=None,
        allow_llm=False,
    )
    outcome = run_repair_pipeline(request)
    receipt = outcome.receipt

    def _dumped(models: Any) -> list[Any]:
        """Convert a sequence of receipt models to plain ``model_dump`` payloads."""
        return [entry.model_dump() for entry in models]

    verified_fixes = [_verified_fix_to_result(item) for item in outcome.fixes]
    return TxnReceipt(
        path=str(csv_path),
        mode=mode,
        applied=receipt.applied,
        txn_id=receipt.txn_id,
        reversible=receipt.reversible,
        source_sha256=receipt.source_sha256,
        post_sha256=receipt.post_sha256,
        safety_verdict=receipt.safety_verdict,
        verifier_verdict=receipt.verifier_verdict,
        patch_plan_sha256=receipt.patch_plan_sha256,
        revert_command=receipt.revert_command,
        allowed_columns=receipt.allowed_columns,
        valid_rows=receipt.valid_rows,
        root_causes=_dumped(receipt.root_causes),
        candidate_repairs=_dumped(receipt.candidate_repairs),
        proof_obligations=_dumped(receipt.proof_obligations),
        limitations=receipt.limitations,
        issues_count=receipt.issues_count,
        fixes_count=receipt.fixes_count,
        reason=receipt.reason,
        fixes=verified_fixes,
    )
|
|
373
|
+
|
|
374
|
+
|
|
375
|
+
def dataforge_revert(txn_id: str) -> RevertReceipt:
    """Revert a previously applied DataForge repair transaction.

    Each configured allowed root is tried in turn until
    ``revert_transaction`` succeeds for one of them.

    Args:
        txn_id: Identifier of the transaction to revert.

    Returns:
        A ``RevertReceipt``.  ``restored`` is true only when the transaction
        carries a ``reverted_at`` timestamp, and ``reason`` now agrees with
        that flag instead of always claiming success.

    Raises:
        ValueError: If no allowed root yields the transaction.  When at least
            one root raised ``TransactionLogError``, the last such error is
            chained as the cause and supplies the message.
    """
    transaction = None
    last_error: Exception | None = None
    for root in _allowed_roots():
        try:
            transaction = revert_transaction(txn_id, search_root=root)
        except TransactionLogError as exc:
            # Remember the failure and keep looking under the next root.
            last_error = exc
        else:
            break

    if transaction is None:
        if last_error is not None:
            raise ValueError(str(last_error)) from last_error
        raise ValueError(f"Could not find transaction '{txn_id}' under configured allowed roots.")

    restored = transaction.reverted_at is not None
    if restored:
        reason = "Source restored successfully."
    else:
        # Do not report success when the transaction shows no revert timestamp.
        reason = "Transaction was found but has no revert timestamp; the source may not have been restored."
    return RevertReceipt(
        txn_id=transaction.txn_id,
        source_path=transaction.source_path,
        restored=restored,
        reverted_at=transaction.reverted_at.isoformat() if restored else None,
        reason=reason,
    )
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "dataforge_07_mcp"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Model Context Protocol server for DataForge data-quality tools."
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
license = "Apache-2.0"
|
|
7
|
+
requires-python = ">=3.11,<3.13"
|
|
8
|
+
dependencies = [
|
|
9
|
+
"dataforge_07>=0.1.0,<0.2",
|
|
10
|
+
"mcp>=1.27",
|
|
11
|
+
]
|
|
12
|
+
classifiers = [
|
|
13
|
+
"Development Status :: 3 - Alpha",
|
|
14
|
+
"Programming Language :: Python :: 3.11",
|
|
15
|
+
"Programming Language :: Python :: 3.12",
|
|
16
|
+
]
|
|
17
|
+
keywords = ["data-quality", "dataforge", "mcp", "model-context-protocol"]
|
|
18
|
+
|
|
19
|
+
[project.urls]
|
|
20
|
+
Homepage = "https://github.com/Aegis15/dataforge"
|
|
21
|
+
Repository = "https://github.com/Aegis15/dataforge"
|
|
22
|
+
Documentation = "https://dataforge.praneshrajan15.workers.dev/playground"
|
|
23
|
+
|
|
24
|
+
[project.optional-dependencies]
|
|
25
|
+
dev = [
|
|
26
|
+
"pytest>=9.0.3",
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
[project.scripts]
|
|
30
|
+
dataforge-mcp = "dataforge_mcp.server:main"
|
|
31
|
+
dataforge15-mcp = "dataforge_mcp.server:main"
|
|
32
|
+
|
|
33
|
+
[build-system]
|
|
34
|
+
requires = ["setuptools>=68", "wheel"]
|
|
35
|
+
build-backend = "setuptools.build_meta"
|
|
36
|
+
|
|
37
|
+
[tool.setuptools.packages.find]
|
|
38
|
+
where = ["."]
|
|
39
|
+
include = ["dataforge_mcp*"]
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""Stdio integration tests for the DataForge MCP server."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import json
|
|
7
|
+
import os
|
|
8
|
+
import sys
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
from mcp import ClientSession, StdioServerParameters
|
|
12
|
+
from mcp.client.stdio import stdio_client
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _write_repairable_csv(path: Path) -> None:
|
|
16
|
+
"""Write a small CSV with a deterministic decimal-shift issue."""
|
|
17
|
+
path.write_text(
|
|
18
|
+
"id,amount\n1,100\n2,105\n3,98\n4,1020\n5,103\n",
|
|
19
|
+
encoding="utf-8",
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def test_stdio_server_lists_and_calls_profile_tool(tmp_path: Path) -> None:
    """Start the server over stdio, list its tools, and call ``dataforge_profile``.

    The server runs as ``python -m dataforge_mcp.server serve`` in a child
    process whose allowed roots are limited to ``tmp_path``.
    """
    csv_path = tmp_path / "amounts.csv"
    _write_repairable_csv(csv_path)
    package_root = Path(__file__).resolve().parents[1]
    repo_root = package_root.parent

    env = os.environ.copy()
    # Append the inherited PYTHONPATH only when it is non-empty. Joining an
    # empty string would leave a trailing separator, i.e. an empty path entry
    # that resolves to the child's working directory.
    search_path = [str(package_root), str(repo_root)]
    inherited = env.get("PYTHONPATH")
    if inherited:
        search_path.append(inherited)
    env["PYTHONPATH"] = os.pathsep.join(search_path)
    env["DATAFORGE_MCP_ALLOWED_ROOTS"] = str(tmp_path)

    async def run_client() -> None:
        """Drive one initialize / list_tools / call_tool round trip."""
        server_params = StdioServerParameters(
            command=sys.executable,
            args=["-m", "dataforge_mcp.server", "serve"],
            env=env,
        )
        async with (
            stdio_client(server_params) as (read_stream, write_stream),
            ClientSession(read_stream, write_stream) as session,
        ):
            await session.initialize()
            tools = await session.list_tools()
            names = {tool.name for tool in tools.tools}
            assert "dataforge_profile" in names

            result = await session.call_tool(
                "dataforge_profile",
                {"path": str(csv_path)},
            )
            assert result.isError is False
            assert result.structuredContent is not None
            payload = result.structuredContent
            # The structured output may be wrapped under "result"; unwrap it
            # when present (presumably depends on the MCP SDK; verify).
            if "result" in payload:
                payload = payload["result"]
            assert json.dumps(payload)
            assert payload["rows"] == 5
            assert payload["total_issues"] >= 1

    asyncio.run(run_client())
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
"""Unit tests for DataForge MCP tool functions."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import pytest
|
|
9
|
+
from dataforge_mcp.server import create_server
|
|
10
|
+
from dataforge_mcp.tools import (
|
|
11
|
+
configure_mcp_security,
|
|
12
|
+
dataforge_apply_repairs,
|
|
13
|
+
dataforge_detect_errors,
|
|
14
|
+
dataforge_profile,
|
|
15
|
+
dataforge_revert,
|
|
16
|
+
dataforge_verify_fix,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _write_repairable_csv(path: Path) -> None:
|
|
21
|
+
"""Write a small CSV with a deterministic decimal-shift repair."""
|
|
22
|
+
path.write_text(
|
|
23
|
+
"id,amount\n1,100\n2,105\n3,98\n4,1020\n5,103\n",
|
|
24
|
+
encoding="utf-8",
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _fix_spec(path: Path, *, old_value: str = "1020", new_value: str = "102") -> dict[str, object]:
|
|
29
|
+
"""Build a verifier payload for the decimal-shift fixture."""
|
|
30
|
+
return {
|
|
31
|
+
"path": str(path),
|
|
32
|
+
"fix": {
|
|
33
|
+
"row": 3,
|
|
34
|
+
"column": "amount",
|
|
35
|
+
"old_value": old_value,
|
|
36
|
+
"new_value": new_value,
|
|
37
|
+
"detector_id": "decimal_shift",
|
|
38
|
+
},
|
|
39
|
+
"reason": "candidate decimal-shift repair",
|
|
40
|
+
"confidence": 0.9,
|
|
41
|
+
"provenance": "deterministic",
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@pytest.fixture(autouse=True)
def _mcp_security(tmp_path: Path) -> None:
    """Point MCP security at the test's temp directory with apply mode on.

    Runs automatically for every test in this module, so each test can read
    and write files under its own ``tmp_path`` and exercise ``"apply"``.
    """
    sandbox_roots = [tmp_path]
    configure_mcp_security(enable_apply=True, allowed_roots=sandbox_roots)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class TestDataForgeMcpTools:
    """Direct coverage for MCP tool behavior."""

    def test_server_registers_expected_tools(self) -> None:
        """The server registers exactly the five DataForge tools."""
        expected = {
            "dataforge_profile",
            "dataforge_detect_errors",
            "dataforge_verify_fix",
            "dataforge_apply_repairs",
            "dataforge_revert",
        }
        registered = {tool.name for tool in create_server()._tool_manager.list_tools()}

        assert registered == expected

    def test_registered_tools_have_output_schemas(self) -> None:
        """Every registered tool advertises an object-typed output schema."""
        registry = create_server()._tool_manager

        for tool in registry.list_tools():
            output_schema = tool.output_schema
            assert output_schema["type"] == "object"
            assert "properties" in output_schema

    def test_profile_and_detect_errors_return_decimal_shift_issue(self, tmp_path: Path) -> None:
        """Profiling and detection both surface the fixture's decimal shift."""
        source = tmp_path / "amounts.csv"
        _write_repairable_csv(source)
        source_arg = str(source)

        profile = dataforge_profile(source_arg)
        detected = dataforge_detect_errors(source_arg)

        assert profile.rows == 5
        assert profile.columns == 2
        assert profile.total_issues >= 1
        assert any(found.issue_type == "decimal_shift" for found in detected)

    def test_verify_fix_accepts_valid_candidate(self, tmp_path: Path) -> None:
        """A fix matching the current cell value passes both gates."""
        source = tmp_path / "amounts.csv"
        _write_repairable_csv(source)
        payload = _fix_spec(source)

        verdict = dataforge_verify_fix(payload)

        assert verdict.accept is True
        assert verdict.safety_verdict == "allow"
        assert verdict.verifier_verdict == "accept"

    def test_verify_fix_rejects_stale_candidate(self, tmp_path: Path) -> None:
        """A fix whose ``old_value`` no longer matches the file is refused."""
        source = tmp_path / "amounts.csv"
        _write_repairable_csv(source)
        payload = _fix_spec(source, old_value="999")

        verdict = dataforge_verify_fix(payload)

        assert verdict.accept is False
        assert "stale fix" in verdict.reason.lower()

    def test_dry_run_does_not_mutate_source(self, tmp_path: Path) -> None:
        """Dry-run mode yields a populated receipt and leaves the file untouched."""
        source = tmp_path / "amounts.csv"
        _write_repairable_csv(source)
        before = source.read_bytes()

        receipt = dataforge_apply_repairs(str(source), "dry_run")

        assert receipt.receipt_version == "repair_receipt_v1"
        assert receipt.applied is False
        assert receipt.txn_id is None
        assert receipt.fixes_count >= 1
        assert receipt.root_causes
        assert receipt.candidate_repairs
        assert receipt.proof_obligations
        assert receipt.patch_plan_sha256 is not None
        assert receipt.limitations
        after = source.read_bytes()
        assert after == before

    def test_apply_requires_explicit_enablement(self, tmp_path: Path) -> None:
        """Apply mode raises while the security configuration disables it."""
        source = tmp_path / "amounts.csv"
        _write_repairable_csv(source)
        # Override the autouse fixture, which enables apply mode.
        configure_mcp_security(enable_apply=False, allowed_roots=[tmp_path])

        with pytest.raises(ValueError, match="apply mode is disabled"):
            dataforge_apply_repairs(str(source), "apply")

    def test_apply_then_revert_restores_source_bytes(
        self,
        tmp_path: Path,
        monkeypatch,
    ) -> None:
        """Applying changes the file; reverting restores the original bytes."""
        monkeypatch.chdir(tmp_path)
        source = tmp_path / "amounts.csv"
        _write_repairable_csv(source)
        before = source.read_bytes()
        txn_id_pattern = r"txn-\d{4}-\d{2}-\d{2}-[0-9a-f]{6}"

        receipt = dataforge_apply_repairs(str(source), "apply")

        assert receipt.applied is True
        assert receipt.txn_id is not None
        assert re.fullmatch(txn_id_pattern, receipt.txn_id)
        assert source.read_bytes() != before

        reverted = dataforge_revert(receipt.txn_id)

        assert reverted.restored is True
        assert source.read_bytes() == before
|