ml-validate 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. ml_validate-0.1.0/PKG-INFO +180 -0
  2. ml_validate-0.1.0/README.md +146 -0
  3. ml_validate-0.1.0/ml_validate/__init__.py +40 -0
  4. ml_validate-0.1.0/ml_validate/_data/__init__.py +0 -0
  5. ml_validate-0.1.0/ml_validate/_data/repositories/chemotion/schema.json +1105 -0
  6. ml_validate-0.1.0/ml_validate/_data/schemas/Databases/croissant/RAI/1.0/schema.json +11 -0
  7. ml_validate-0.1.0/ml_validate/_data/schemas/Databases/croissant/geo/v1.0/schema.json +11 -0
  8. ml_validate-0.1.0/ml_validate/_data/schemas/Databases/croissant/v1.0/schema.json +11 -0
  9. ml_validate-0.1.0/ml_validate/_data/schemas/Datasets/coco/2023/schema.json +11 -0
  10. ml_validate-0.1.0/ml_validate/_data/schemas/Datasets/huggingface/2.0/schema.json +11 -0
  11. ml_validate-0.1.0/ml_validate/_data/schemas/Datasets/nfdi4chem/0.1/schema.json +11 -0
  12. ml_validate-0.1.0/ml_validate/_data/schemas/Documentation/modelcard/1.0/schema.json +11 -0
  13. ml_validate-0.1.0/ml_validate/_data/schemas/Documentation/xdm/2024.1/schema.json +11 -0
  14. ml_validate-0.1.0/ml_validate/_data/schemas/Machine Learning/featurestore/3.0/schema.json +11 -0
  15. ml_validate-0.1.0/ml_validate/_data/schemas/Machine Learning/ml-schema/v2.0/schema.json +11 -0
  16. ml_validate-0.1.0/ml_validate/_validators/__init__.py +13 -0
  17. ml_validate-0.1.0/ml_validate/_validators/base.py +24 -0
  18. ml_validate-0.1.0/ml_validate/_validators/croissant.py +70 -0
  19. ml_validate-0.1.0/ml_validate/_validators/generic.py +44 -0
  20. ml_validate-0.1.0/ml_validate/_validators/jsonschema_validator.py +69 -0
  21. ml_validate-0.1.0/ml_validate/_validators/ml_schema.py +48 -0
  22. ml_validate-0.1.0/ml_validate/cli.py +284 -0
  23. ml_validate-0.1.0/ml_validate/exceptions.py +10 -0
  24. ml_validate-0.1.0/ml_validate/registry.py +202 -0
  25. ml_validate-0.1.0/ml_validate/result.py +34 -0
  26. ml_validate-0.1.0/ml_validate/validator.py +183 -0
  27. ml_validate-0.1.0/ml_validate.egg-info/PKG-INFO +180 -0
  28. ml_validate-0.1.0/ml_validate.egg-info/SOURCES.txt +35 -0
  29. ml_validate-0.1.0/ml_validate.egg-info/dependency_links.txt +1 -0
  30. ml_validate-0.1.0/ml_validate.egg-info/entry_points.txt +2 -0
  31. ml_validate-0.1.0/ml_validate.egg-info/requires.txt +13 -0
  32. ml_validate-0.1.0/ml_validate.egg-info/top_level.txt +1 -0
  33. ml_validate-0.1.0/pyproject.toml +59 -0
  34. ml_validate-0.1.0/setup.cfg +4 -0
  35. ml_validate-0.1.0/tests/test_cli.py +115 -0
  36. ml_validate-0.1.0/tests/test_registry.py +59 -0
  37. ml_validate-0.1.0/tests/test_validator.py +92 -0
@@ -0,0 +1,180 @@
1
+ Metadata-Version: 2.4
2
+ Name: ml-validate
3
+ Version: 0.1.0
4
+ Summary: Offline-first ML dataset metadata validator for NFDI schemas (Croissant, ML-Schema, XDM, Chemotion, and more)
5
+ Author: NFDI Data Registry
6
+ License: MIT
7
+ Project-URL: Repository, https://github.com/your-org/nfdi-data-registry
8
+ Project-URL: Documentation, https://github.com/your-org/nfdi-data-registry/tree/main/ml_validate
9
+ Keywords: nfdi,validation,croissant,ml,schema,metadata,fair
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Classifier: Topic :: Software Development :: Quality Assurance
21
+ Requires-Python: >=3.9
22
+ Description-Content-Type: text/markdown
23
+ Requires-Dist: jsonschema>=4.17
24
+ Requires-Dist: typer>=0.9
25
+ Requires-Dist: rich>=13.0
26
+ Provides-Extra: fetch
27
+ Requires-Dist: httpx>=0.25; extra == "fetch"
28
+ Provides-Extra: dev
29
+ Requires-Dist: pytest>=7; extra == "dev"
30
+ Requires-Dist: pytest-cov; extra == "dev"
31
+ Requires-Dist: ruff; extra == "dev"
32
+ Requires-Dist: mypy; extra == "dev"
33
+ Requires-Dist: httpx>=0.25; extra == "dev"
34
+
35
+ # ml-validate
36
+
37
+ Offline-first Python validator for NFDI and ML metadata schemas.
38
+
39
+ ## Features
40
+
41
+ - Bundled schema registry (no network required for schemas)
42
+ - Python API via `Validator` and `Profiler`
43
+ - CLI for CI/CD pipelines
44
+ - Supports Croissant, ML-Schema, JSON Schema-based formats, and repository schemas
45
+
46
+ ## Install
47
+
48
+ ```bash
49
+ pip install ml-validate
50
+ ```
51
+
52
+ For URL validation support, install the optional `fetch` extra (adds `httpx`):
53
+
54
+ ```bash
55
+ pip install "ml-validate[fetch]"
56
+ ```
57
+
58
+ ## Quickstart
59
+
60
+ ```python
61
+ from ml_validate import Validator
62
+
63
+ validator = Validator()
64
+ result = validator.validate("dataset.json", "croissant-1.0")
65
+ print(result.passed, result.score)
66
+ ```
67
+
68
+ ```bash
69
+ ml-validate validate dataset.json --schema croissant-1.0
70
+ ```
71
+
72
+ ## Example Data (Chemotion JSON-LD)
73
+
74
+ The source repository includes reusable example files in `examples/` (not bundled in the published package), derived from:
75
+
76
+ - `../example_datasets/JSON-LD_Analysis_874350-20260219122553.json`
77
+
78
+ Included examples:
79
+
80
+ - `examples/chemotion_jsonld_sample.json`: curated, realistic JSON-LD sample
81
+ - `examples/chemotion_jsonld_incomplete.json`: intentionally incomplete sample for failure demos
82
+
83
+ The sample keeps the metadata structure and key fields used by the validators while avoiding large personal-author blocks.
84
+
85
+ ## CLI Examples
86
+
87
+ Validate a realistic Chemotion-style JSON-LD example with Croissant heuristics:
88
+
89
+ ```bash
90
+ ml-validate validate examples/chemotion_jsonld_sample.json --schema croissant-1.0
91
+ ```
92
+
93
+ Run a profile across two schemas and get machine-readable output:
94
+
95
+ ```bash
96
+ ml-validate profile examples/chemotion_jsonld_sample.json --schemas croissant-1.0,w3c-ml-2.0 --output json
97
+ ```
98
+
99
+ Show a failing validation case:
100
+
101
+ ```bash
102
+ ml-validate validate examples/chemotion_jsonld_incomplete.json --schema croissant-1.0
103
+ ```
104
+
105
+ ## Python API Examples
106
+
107
+ ```python
108
+ from pathlib import Path
109
+ from ml_validate import Profiler, Validator
110
+
111
+ payload = Path("examples/chemotion_jsonld_sample.json").read_text(encoding="utf-8")
112
+
113
+ validator = Validator()
114
+ single = validator.validate(payload, "croissant-1.0", source_type="json")
115
+ print(single.passed, single.score)
116
+
117
+ profiler = Profiler()
118
+ results = profiler.profile(payload, schema_ids=["croissant-1.0", "w3c-ml-2.0"], source_type="json")
119
+ for schema_id, result in results.items():
120
+     print(schema_id, result.status, result.score)
121
+ ```
122
+
123
+ ## CI/CD Example (GitHub Actions)
124
+
125
+ You can validate metadata files automatically in pull requests and fail the build when rules are not met.
126
+
127
+ ### PR validation workflow
128
+
129
+ Create `.github/workflows/metadata-validation.yml`:
130
+
131
+ ```yaml
132
+ name: Metadata Validation
133
+
134
+ on:
135
+   pull_request:
136
+     paths:
137
+       - "datasets/**/*.json"
138
+       - ".github/workflows/metadata-validation.yml"
139
+
140
+ jobs:
141
+   validate:
142
+     runs-on: ubuntu-latest
143
+     steps:
144
+       - name: Checkout
145
+         uses: actions/checkout@v4
146
+
147
+       - name: Set up Python
148
+         uses: actions/setup-python@v5
149
+         with:
150
+           python-version: "3.12"
151
+
152
+       - name: Install validator
153
+         run: pip install ml-validate
154
+
155
+       - name: Validate a dataset (strict)
156
+         run: |
157
+           ml-validate validate datasets/metadata.json \
158
+             --schema croissant-1.0 \
159
+             --fail-under 90
160
+ ```
161
+
162
+ ### Multi-schema profile gate
163
+
164
+ Use profiling if your pipeline must satisfy more than one schema:
165
+
166
+ ```yaml
167
+ - name: Profile against multiple schemas
168
+   run: |
169
+     ml-validate profile datasets/metadata.json \
170
+       --schemas croissant-1.0,w3c-ml-2.0 \
171
+       --fail-under 80
172
+ ```
173
+
174
+ ### Publish workflow note
175
+
176
+ If you publish this package from GitHub Actions, see:
177
+
178
+ - `.github/workflows/publish.yml`
179
+
180
+ That workflow builds on tag push (for example `v0.1.0`) and uploads distributions to PyPI.
@@ -0,0 +1,146 @@
1
+ # ml-validate
2
+
3
+ Offline-first Python validator for NFDI and ML metadata schemas.
4
+
5
+ ## Features
6
+
7
+ - Bundled schema registry (no network required for schemas)
8
+ - Python API via `Validator` and `Profiler`
9
+ - CLI for CI/CD pipelines
10
+ - Supports Croissant, ML-Schema, JSON Schema-based formats, and repository schemas
11
+
12
+ ## Install
13
+
14
+ ```bash
15
+ pip install ml-validate
16
+ ```
17
+
18
+ For URL validation support, install the optional `fetch` extra (adds `httpx`):
19
+
20
+ ```bash
21
+ pip install "ml-validate[fetch]"
22
+ ```
23
+
24
+ ## Quickstart
25
+
26
+ ```python
27
+ from ml_validate import Validator
28
+
29
+ validator = Validator()
30
+ result = validator.validate("dataset.json", "croissant-1.0")
31
+ print(result.passed, result.score)
32
+ ```
33
+
34
+ ```bash
35
+ ml-validate validate dataset.json --schema croissant-1.0
36
+ ```
37
+
38
+ ## Example Data (Chemotion JSON-LD)
39
+
40
+ The source repository includes reusable example files in `examples/` (not bundled in the published package), derived from:
41
+
42
+ - `../example_datasets/JSON-LD_Analysis_874350-20260219122553.json`
43
+
44
+ Included examples:
45
+
46
+ - `examples/chemotion_jsonld_sample.json`: curated, realistic JSON-LD sample
47
+ - `examples/chemotion_jsonld_incomplete.json`: intentionally incomplete sample for failure demos
48
+
49
+ The sample keeps the metadata structure and key fields used by the validators while avoiding large personal-author blocks.
50
+
51
+ ## CLI Examples
52
+
53
+ Validate a realistic Chemotion-style JSON-LD example with Croissant heuristics:
54
+
55
+ ```bash
56
+ ml-validate validate examples/chemotion_jsonld_sample.json --schema croissant-1.0
57
+ ```
58
+
59
+ Run a profile across two schemas and get machine-readable output:
60
+
61
+ ```bash
62
+ ml-validate profile examples/chemotion_jsonld_sample.json --schemas croissant-1.0,w3c-ml-2.0 --output json
63
+ ```
64
+
65
+ Show a failing validation case:
66
+
67
+ ```bash
68
+ ml-validate validate examples/chemotion_jsonld_incomplete.json --schema croissant-1.0
69
+ ```
70
+
71
+ ## Python API Examples
72
+
73
+ ```python
74
+ from pathlib import Path
75
+ from ml_validate import Profiler, Validator
76
+
77
+ payload = Path("examples/chemotion_jsonld_sample.json").read_text(encoding="utf-8")
78
+
79
+ validator = Validator()
80
+ single = validator.validate(payload, "croissant-1.0", source_type="json")
81
+ print(single.passed, single.score)
82
+
83
+ profiler = Profiler()
84
+ results = profiler.profile(payload, schema_ids=["croissant-1.0", "w3c-ml-2.0"], source_type="json")
85
+ for schema_id, result in results.items():
86
+     print(schema_id, result.status, result.score)
87
+ ```
88
+
89
+ ## CI/CD Example (GitHub Actions)
90
+
91
+ You can validate metadata files automatically in pull requests and fail the build when rules are not met.
92
+
93
+ ### PR validation workflow
94
+
95
+ Create `.github/workflows/metadata-validation.yml`:
96
+
97
+ ```yaml
98
+ name: Metadata Validation
99
+
100
+ on:
101
+   pull_request:
102
+     paths:
103
+       - "datasets/**/*.json"
104
+       - ".github/workflows/metadata-validation.yml"
105
+
106
+ jobs:
107
+   validate:
108
+     runs-on: ubuntu-latest
109
+     steps:
110
+       - name: Checkout
111
+         uses: actions/checkout@v4
112
+
113
+       - name: Set up Python
114
+         uses: actions/setup-python@v5
115
+         with:
116
+           python-version: "3.12"
117
+
118
+       - name: Install validator
119
+         run: pip install ml-validate
120
+
121
+       - name: Validate a dataset (strict)
122
+         run: |
123
+           ml-validate validate datasets/metadata.json \
124
+             --schema croissant-1.0 \
125
+             --fail-under 90
126
+ ```
127
+
128
+ ### Multi-schema profile gate
129
+
130
+ Use profiling if your pipeline must satisfy more than one schema:
131
+
132
+ ```yaml
133
+ - name: Profile against multiple schemas
134
+   run: |
135
+     ml-validate profile datasets/metadata.json \
136
+       --schemas croissant-1.0,w3c-ml-2.0 \
137
+       --fail-under 80
138
+ ```
139
+
140
+ ### Publish workflow note
141
+
142
+ If you publish this package from GitHub Actions, see:
143
+
144
+ - `.github/workflows/publish.yml`
145
+
146
+ That workflow builds on tag push (for example `v0.1.0`) and uploads distributions to PyPI.
@@ -0,0 +1,40 @@
1
+ """
2
+ ml-validate — Offline-first ML metadata schema validator.
3
+
4
+ Public API::
5
+
6
+ from ml_validate import Validator, Profiler, SchemaRegistry, ValidationResult
7
+
8
+ # List available schemas
9
+ registry = SchemaRegistry()
10
+ for schema in registry.list():
11
+     print(schema.id, schema.name)
12
+
13
+ # Validate a single dataset
14
+ v = Validator()
15
+ result = v.validate('{"name": "my-dataset", "license": "CC-BY-4.0"}', 'croissant-1.0')
16
+ print(result.score, result.status)
17
+
18
+ # Profile against all standard schemas
19
+ p = Profiler()
20
+ results = p.profile('{"name": "my-dataset"}')
21
+ for schema_id, r in results.items():
22
+     print(schema_id, r.score)
23
+ """
24
+
25
+ from ml_validate.result import ValidationResult
26
+ from ml_validate.exceptions import SchemaNotFoundError, ValidationInputError
27
+ from ml_validate.registry import SchemaRegistry, SchemaInfo
28
+ from ml_validate.validator import Validator, Profiler
29
+
30
+ __all__ = [
31
+ "ValidationResult",
32
+ "SchemaNotFoundError",
33
+ "ValidationInputError",
34
+ "SchemaRegistry",
35
+ "SchemaInfo",
36
+ "Validator",
37
+ "Profiler",
38
+ ]
39
+
40
+ __version__ = "0.1.0"
File without changes