ml-validate 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ml_validate-0.1.0/PKG-INFO +180 -0
- ml_validate-0.1.0/README.md +146 -0
- ml_validate-0.1.0/ml_validate/__init__.py +40 -0
- ml_validate-0.1.0/ml_validate/_data/__init__.py +0 -0
- ml_validate-0.1.0/ml_validate/_data/repositories/chemotion/schema.json +1105 -0
- ml_validate-0.1.0/ml_validate/_data/schemas/Databases/croissant/RAI/1.0/schema.json +11 -0
- ml_validate-0.1.0/ml_validate/_data/schemas/Databases/croissant/geo/v1.0/schema.json +11 -0
- ml_validate-0.1.0/ml_validate/_data/schemas/Databases/croissant/v1.0/schema.json +11 -0
- ml_validate-0.1.0/ml_validate/_data/schemas/Datasets/coco/2023/schema.json +11 -0
- ml_validate-0.1.0/ml_validate/_data/schemas/Datasets/huggingface/2.0/schema.json +11 -0
- ml_validate-0.1.0/ml_validate/_data/schemas/Datasets/nfdi4chem/0.1/schema.json +11 -0
- ml_validate-0.1.0/ml_validate/_data/schemas/Documentation/modelcard/1.0/schema.json +11 -0
- ml_validate-0.1.0/ml_validate/_data/schemas/Documentation/xdm/2024.1/schema.json +11 -0
- ml_validate-0.1.0/ml_validate/_data/schemas/Machine Learning/featurestore/3.0/schema.json +11 -0
- ml_validate-0.1.0/ml_validate/_data/schemas/Machine Learning/ml-schema/v2.0/schema.json +11 -0
- ml_validate-0.1.0/ml_validate/_validators/__init__.py +13 -0
- ml_validate-0.1.0/ml_validate/_validators/base.py +24 -0
- ml_validate-0.1.0/ml_validate/_validators/croissant.py +70 -0
- ml_validate-0.1.0/ml_validate/_validators/generic.py +44 -0
- ml_validate-0.1.0/ml_validate/_validators/jsonschema_validator.py +69 -0
- ml_validate-0.1.0/ml_validate/_validators/ml_schema.py +48 -0
- ml_validate-0.1.0/ml_validate/cli.py +284 -0
- ml_validate-0.1.0/ml_validate/exceptions.py +10 -0
- ml_validate-0.1.0/ml_validate/registry.py +202 -0
- ml_validate-0.1.0/ml_validate/result.py +34 -0
- ml_validate-0.1.0/ml_validate/validator.py +183 -0
- ml_validate-0.1.0/ml_validate.egg-info/PKG-INFO +180 -0
- ml_validate-0.1.0/ml_validate.egg-info/SOURCES.txt +35 -0
- ml_validate-0.1.0/ml_validate.egg-info/dependency_links.txt +1 -0
- ml_validate-0.1.0/ml_validate.egg-info/entry_points.txt +2 -0
- ml_validate-0.1.0/ml_validate.egg-info/requires.txt +13 -0
- ml_validate-0.1.0/ml_validate.egg-info/top_level.txt +1 -0
- ml_validate-0.1.0/pyproject.toml +59 -0
- ml_validate-0.1.0/setup.cfg +4 -0
- ml_validate-0.1.0/tests/test_cli.py +115 -0
- ml_validate-0.1.0/tests/test_registry.py +59 -0
- ml_validate-0.1.0/tests/test_validator.py +92 -0
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ml-validate
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Offline-first ML dataset metadata validator for NFDI schemas (Croissant, ML-Schema, XDM, Chemotion, and more)
|
|
5
|
+
Author: NFDI Data Registry
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Repository, https://github.com/your-org/nfdi-data-registry
|
|
8
|
+
Project-URL: Documentation, https://github.com/your-org/nfdi-data-registry/tree/main/ml_validate
|
|
9
|
+
Keywords: nfdi,validation,croissant,ml,schema,metadata,fair
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
20
|
+
Classifier: Topic :: Software Development :: Quality Assurance
|
|
21
|
+
Requires-Python: >=3.9
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
Requires-Dist: jsonschema>=4.17
|
|
24
|
+
Requires-Dist: typer>=0.9
|
|
25
|
+
Requires-Dist: rich>=13.0
|
|
26
|
+
Provides-Extra: fetch
|
|
27
|
+
Requires-Dist: httpx>=0.25; extra == "fetch"
|
|
28
|
+
Provides-Extra: dev
|
|
29
|
+
Requires-Dist: pytest>=7; extra == "dev"
|
|
30
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
31
|
+
Requires-Dist: ruff; extra == "dev"
|
|
32
|
+
Requires-Dist: mypy; extra == "dev"
|
|
33
|
+
Requires-Dist: httpx>=0.25; extra == "dev"
|
|
34
|
+
|
|
35
|
+
# ml-validate
|
|
36
|
+
|
|
37
|
+
Offline-first Python validator for NFDI and ML metadata schemas.
|
|
38
|
+
|
|
39
|
+
## Features
|
|
40
|
+
|
|
41
|
+
- Bundled schema registry (no network required for schemas)
|
|
42
|
+
- Python API via `Validator` and `Profiler`
|
|
43
|
+
- CLI for CI/CD pipelines
|
|
44
|
+
- Supports Croissant, ML-Schema, JSON Schema-based formats, and repository schemas
|
|
45
|
+
|
|
46
|
+
## Install
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
pip install ml-validate
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
For URL validation support, install the optional `fetch` extra:
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
pip install "ml-validate[fetch]"
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Quickstart
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
from ml_validate import Validator
|
|
62
|
+
|
|
63
|
+
validator = Validator()
|
|
64
|
+
result = validator.validate("dataset.json", "croissant-1.0")
|
|
65
|
+
print(result.passed, result.score)
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
ml-validate validate dataset.json --schema croissant-1.0
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
## Example Data (Chemotion JSON-LD)
|
|
73
|
+
|
|
74
|
+
This repository includes reusable example files in `examples/`, derived from:
|
|
75
|
+
|
|
76
|
+
- `../example_datasets/JSON-LD_Analysis_874350-20260219122553.json`
|
|
77
|
+
|
|
78
|
+
Included examples:
|
|
79
|
+
|
|
80
|
+
- `examples/chemotion_jsonld_sample.json`: curated, realistic JSON-LD sample
|
|
81
|
+
- `examples/chemotion_jsonld_incomplete.json`: intentionally incomplete sample for failure demos
|
|
82
|
+
|
|
83
|
+
The sample keeps the metadata structure and key fields used by the validators while avoiding large personal-author blocks.
|
|
84
|
+
|
|
85
|
+
## CLI Examples
|
|
86
|
+
|
|
87
|
+
Validate a realistic Chemotion-style JSON-LD example with Croissant heuristics:
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
ml-validate validate examples/chemotion_jsonld_sample.json --schema croissant-1.0
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
Run a profile across two schemas and get machine-readable output:
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
ml-validate profile examples/chemotion_jsonld_sample.json --schemas croissant-1.0,w3c-ml-2.0 --output json
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
Show a failing validation case:
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
ml-validate validate examples/chemotion_jsonld_incomplete.json --schema croissant-1.0
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
## Python API Examples
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
from pathlib import Path
|
|
109
|
+
from ml_validate import Profiler, Validator
|
|
110
|
+
|
|
111
|
+
payload = Path("examples/chemotion_jsonld_sample.json").read_text(encoding="utf-8")
|
|
112
|
+
|
|
113
|
+
validator = Validator()
|
|
114
|
+
single = validator.validate(payload, "croissant-1.0", source_type="json")
|
|
115
|
+
print(single.passed, single.score)
|
|
116
|
+
|
|
117
|
+
profiler = Profiler()
|
|
118
|
+
results = profiler.profile(payload, schema_ids=["croissant-1.0", "w3c-ml-2.0"], source_type="json")
|
|
119
|
+
for schema_id, result in results.items():
|
|
120
|
+
    print(schema_id, result.status, result.score)
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
## CI/CD Example (GitHub Actions)
|
|
124
|
+
|
|
125
|
+
You can validate metadata files automatically in pull requests and fail the build when the validation requirements are not met.
|
|
126
|
+
|
|
127
|
+
### PR validation workflow
|
|
128
|
+
|
|
129
|
+
Create `.github/workflows/metadata-validation.yml`:
|
|
130
|
+
|
|
131
|
+
```yaml
|
|
132
|
+
name: Metadata Validation
|
|
133
|
+
|
|
134
|
+
on:
|
|
135
|
+
pull_request:
|
|
136
|
+
paths:
|
|
137
|
+
- "datasets/**/*.json"
|
|
138
|
+
- ".github/workflows/metadata-validation.yml"
|
|
139
|
+
|
|
140
|
+
jobs:
|
|
141
|
+
validate:
|
|
142
|
+
runs-on: ubuntu-latest
|
|
143
|
+
steps:
|
|
144
|
+
- name: Checkout
|
|
145
|
+
uses: actions/checkout@v4
|
|
146
|
+
|
|
147
|
+
- name: Set up Python
|
|
148
|
+
uses: actions/setup-python@v5
|
|
149
|
+
with:
|
|
150
|
+
python-version: "3.12"
|
|
151
|
+
|
|
152
|
+
- name: Install validator
|
|
153
|
+
run: pip install ml-validate
|
|
154
|
+
|
|
155
|
+
- name: Validate a dataset (strict)
|
|
156
|
+
run: |
|
|
157
|
+
ml-validate validate datasets/metadata.json \
|
|
158
|
+
--schema croissant-1.0 \
|
|
159
|
+
--fail-under 90
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
### Multi-schema profile gate
|
|
163
|
+
|
|
164
|
+
Use profiling if your pipeline must satisfy more than one schema:
|
|
165
|
+
|
|
166
|
+
```yaml
|
|
167
|
+
- name: Profile against multiple schemas
|
|
168
|
+
run: |
|
|
169
|
+
ml-validate profile datasets/metadata.json \
|
|
170
|
+
--schemas croissant-1.0,w3c-ml-2.0 \
|
|
171
|
+
--fail-under 80
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
### Publish workflow note
|
|
175
|
+
|
|
176
|
+
If you publish this package from GitHub Actions, see:
|
|
177
|
+
|
|
178
|
+
- `.github/workflows/publish.yml`
|
|
179
|
+
|
|
180
|
+
That workflow builds the package when a tag is pushed (for example `v0.1.0`) and uploads the distributions to PyPI.
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
# ml-validate
|
|
2
|
+
|
|
3
|
+
Offline-first Python validator for NFDI and ML metadata schemas.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- Bundled schema registry (no network required for schemas)
|
|
8
|
+
- Python API via `Validator` and `Profiler`
|
|
9
|
+
- CLI for CI/CD pipelines
|
|
10
|
+
- Supports Croissant, ML-Schema, JSON Schema-based formats, and repository schemas
|
|
11
|
+
|
|
12
|
+
## Install
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
pip install ml-validate
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
For URL validation support, install the optional `fetch` extra:
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
pip install "ml-validate[fetch]"
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## Quickstart
|
|
25
|
+
|
|
26
|
+
```python
|
|
27
|
+
from ml_validate import Validator
|
|
28
|
+
|
|
29
|
+
validator = Validator()
|
|
30
|
+
result = validator.validate("dataset.json", "croissant-1.0")
|
|
31
|
+
print(result.passed, result.score)
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
ml-validate validate dataset.json --schema croissant-1.0
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Example Data (Chemotion JSON-LD)
|
|
39
|
+
|
|
40
|
+
This repository includes reusable example files in `examples/`, derived from:
|
|
41
|
+
|
|
42
|
+
- `../example_datasets/JSON-LD_Analysis_874350-20260219122553.json`
|
|
43
|
+
|
|
44
|
+
Included examples:
|
|
45
|
+
|
|
46
|
+
- `examples/chemotion_jsonld_sample.json`: curated, realistic JSON-LD sample
|
|
47
|
+
- `examples/chemotion_jsonld_incomplete.json`: intentionally incomplete sample for failure demos
|
|
48
|
+
|
|
49
|
+
The sample keeps the metadata structure and key fields used by the validators while avoiding large personal-author blocks.
|
|
50
|
+
|
|
51
|
+
## CLI Examples
|
|
52
|
+
|
|
53
|
+
Validate a realistic Chemotion-style JSON-LD example with Croissant heuristics:
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
ml-validate validate examples/chemotion_jsonld_sample.json --schema croissant-1.0
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
Run a profile across two schemas and get machine-readable output:
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
ml-validate profile examples/chemotion_jsonld_sample.json --schemas croissant-1.0,w3c-ml-2.0 --output json
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
Show a failing validation case:
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
ml-validate validate examples/chemotion_jsonld_incomplete.json --schema croissant-1.0
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## Python API Examples
|
|
72
|
+
|
|
73
|
+
```python
|
|
74
|
+
from pathlib import Path
|
|
75
|
+
from ml_validate import Profiler, Validator
|
|
76
|
+
|
|
77
|
+
payload = Path("examples/chemotion_jsonld_sample.json").read_text(encoding="utf-8")
|
|
78
|
+
|
|
79
|
+
validator = Validator()
|
|
80
|
+
single = validator.validate(payload, "croissant-1.0", source_type="json")
|
|
81
|
+
print(single.passed, single.score)
|
|
82
|
+
|
|
83
|
+
profiler = Profiler()
|
|
84
|
+
results = profiler.profile(payload, schema_ids=["croissant-1.0", "w3c-ml-2.0"], source_type="json")
|
|
85
|
+
for schema_id, result in results.items():
|
|
86
|
+
    print(schema_id, result.status, result.score)
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
## CI/CD Example (GitHub Actions)
|
|
90
|
+
|
|
91
|
+
You can validate metadata files automatically in pull requests and fail the build when the validation requirements are not met.
|
|
92
|
+
|
|
93
|
+
### PR validation workflow
|
|
94
|
+
|
|
95
|
+
Create `.github/workflows/metadata-validation.yml`:
|
|
96
|
+
|
|
97
|
+
```yaml
|
|
98
|
+
name: Metadata Validation
|
|
99
|
+
|
|
100
|
+
on:
|
|
101
|
+
pull_request:
|
|
102
|
+
paths:
|
|
103
|
+
- "datasets/**/*.json"
|
|
104
|
+
- ".github/workflows/metadata-validation.yml"
|
|
105
|
+
|
|
106
|
+
jobs:
|
|
107
|
+
validate:
|
|
108
|
+
runs-on: ubuntu-latest
|
|
109
|
+
steps:
|
|
110
|
+
- name: Checkout
|
|
111
|
+
uses: actions/checkout@v4
|
|
112
|
+
|
|
113
|
+
- name: Set up Python
|
|
114
|
+
uses: actions/setup-python@v5
|
|
115
|
+
with:
|
|
116
|
+
python-version: "3.12"
|
|
117
|
+
|
|
118
|
+
- name: Install validator
|
|
119
|
+
run: pip install ml-validate
|
|
120
|
+
|
|
121
|
+
- name: Validate a dataset (strict)
|
|
122
|
+
run: |
|
|
123
|
+
ml-validate validate datasets/metadata.json \
|
|
124
|
+
--schema croissant-1.0 \
|
|
125
|
+
--fail-under 90
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
### Multi-schema profile gate
|
|
129
|
+
|
|
130
|
+
Use profiling if your pipeline must satisfy more than one schema:
|
|
131
|
+
|
|
132
|
+
```yaml
|
|
133
|
+
- name: Profile against multiple schemas
|
|
134
|
+
run: |
|
|
135
|
+
ml-validate profile datasets/metadata.json \
|
|
136
|
+
--schemas croissant-1.0,w3c-ml-2.0 \
|
|
137
|
+
--fail-under 80
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
### Publish workflow note
|
|
141
|
+
|
|
142
|
+
If you publish this package from GitHub Actions, see:
|
|
143
|
+
|
|
144
|
+
- `.github/workflows/publish.yml`
|
|
145
|
+
|
|
146
|
+
That workflow builds the package when a tag is pushed (for example `v0.1.0`) and uploads the distributions to PyPI.
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ml-validate — Offline-first ML metadata schema validator.
|
|
3
|
+
|
|
4
|
+
Public API::
|
|
5
|
+
|
|
6
|
+
from ml_validate import Validator, Profiler, SchemaRegistry, ValidationResult
|
|
7
|
+
|
|
8
|
+
# List available schemas
|
|
9
|
+
registry = SchemaRegistry()
|
|
10
|
+
for schema in registry.list():
|
|
11
|
+
    print(schema.id, schema.name)
|
|
12
|
+
|
|
13
|
+
# Validate a single dataset
|
|
14
|
+
v = Validator()
|
|
15
|
+
result = v.validate('{"name": "my-dataset", "license": "CC-BY-4.0"}', 'croissant-1.0')
|
|
16
|
+
print(result.score, result.status)
|
|
17
|
+
|
|
18
|
+
# Profile against all standard schemas
|
|
19
|
+
p = Profiler()
|
|
20
|
+
results = p.profile('{"name": "my-dataset"}')
|
|
21
|
+
for schema_id, r in results.items():
|
|
22
|
+
    print(schema_id, r.score)
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from ml_validate.result import ValidationResult
|
|
26
|
+
from ml_validate.exceptions import SchemaNotFoundError, ValidationInputError
|
|
27
|
+
from ml_validate.registry import SchemaRegistry, SchemaInfo
|
|
28
|
+
from ml_validate.validator import Validator, Profiler
|
|
29
|
+
|
|
30
|
+
__all__ = [
|
|
31
|
+
"ValidationResult",
|
|
32
|
+
"SchemaNotFoundError",
|
|
33
|
+
"ValidationInputError",
|
|
34
|
+
"SchemaRegistry",
|
|
35
|
+
"SchemaInfo",
|
|
36
|
+
"Validator",
|
|
37
|
+
"Profiler",
|
|
38
|
+
]
|
|
39
|
+
|
|
40
|
+
__version__ = "0.1.0"
|
|
File without changes
|