cml-schemas 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 NHS England
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: cml-schemas
|
|
3
|
+
Version: 0.1.2
|
|
4
|
+
Summary: CML Spark schemas
|
|
5
|
+
License: MIT
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Author: NHS England Data Science team
|
|
8
|
+
Author-email: datascience@nhs.net
|
|
9
|
+
Requires-Python: >=3.10,<4
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
17
|
+
Requires-Dist: pyspark (>=3.5)
|
|
18
|
+
Project-URL: Repository, https://github.com/nhsengland/cml-schemas
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
|
|
21
|
+
# Central Metrics Library Schemas (Python)
|
|
22
|
+
|
|
23
|
+
A lightweight Python package providing validated **schemas for the Central Metrics Library (CML)** in (eventually) multiple formats.
|
|
24
|
+
|
|
25
|
+
**Current focus:** Apache Spark (`pyspark.sql.types.StructType`).
|
|
26
|
+
|
|
27
|
+
> The Central Metrics Library (CML) defines a common structure for **metrics** and the **metadata** that describes them, so analytical teams can produce, discover, and reuse metrics consistently across the NHS. This package implements those schemas for use in data pipelines.
|
|
28
|
+
|
|
29
|
+
***
|
|
30
|
+
|
|
31
|
+
## Why this exists
|
|
32
|
+
|
|
33
|
+
Today, metrics live in many places and many shapes—hard to find, easy to duplicate, and sometimes inconsistent. The CML aims to unify metric structures and metadata into a single, curated, service-managed library so analysts can source authoritative, consistently defined metrics, supported by appropriate security tagging and clear SME-owned definitions (purpose, methods, limitations, differences from similar measures). This repo hosts code-first schemas aligned to that aim.
|
|
34
|
+
|
|
35
|
+
***
|
|
36
|
+
|
|
37
|
+
## Status: **BETA**
|
|
38
|
+
|
|
39
|
+
The CML—and therefore these schemas—are in **beta** while we pilot with analytical teams and iterate on feedback. Expect **breaking changes** as the specification evolves. Please adopt **resilient coding practices** and pin schema versions where appropriate.
|
|
40
|
+
|
|
41
|
+
***
|
|
42
|
+
|
|
43
|
+
## What's in the box
|
|
44
|
+
|
|
45
|
+
**Spark schemas** for core CML entities (initial set):
|
|
46
|
+
|
|
47
|
+
* `METRIC_SCHEMA` — the measured value(s) and identifiers
|
|
48
|
+
* `DIMENSIONS_SCHEMA` — base schema for dimensions used to slice metrics
|
|
49
|
+
|
|
50
|
+
**Helper functions:**
|
|
51
|
+
|
|
52
|
+
* `create_dimensions_schema(dimensions)` — builds a full dimensions schema from a list of dimension column names
|
|
53
|
+
* `select_from_schema(df, schema)` — selects and reorders DataFrame columns to match a schema
|
|
54
|
+
* `validate_schema(df, schema)` — validates a DataFrame's column names and types against a schema
|
|
55
|
+
|
|
56
|
+
These mirror the "draft standardised schema" referenced in the CML materials and will track the official spec as it matures.
|
|
57
|
+
|
|
58
|
+
**Coming soon:**
|
|
59
|
+
|
|
60
|
+
* `metadata` schema — descriptive info: purpose, methodology, caveats, lineage, etc.
|
|
61
|
+
* `relationship` schema — links between metrics and other artefacts
|
|
62
|
+
|
|
63
|
+
***
|
|
64
|
+
|
|
65
|
+
## Installation
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
pip install cml-schemas
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
> Tip: Pin to a specific version (`cml-schemas==x.y.z`) to protect your pipelines from breaking changes during beta.
|
|
72
|
+
|
|
73
|
+
***
|
|
74
|
+
|
|
75
|
+
## Quick start (Spark)
|
|
76
|
+
|
|
77
|
+
### Use a built-in schema
|
|
78
|
+
|
|
79
|
+
```python
|
|
80
|
+
from cml_schemas import spark_schemas
|
|
81
|
+
|
|
82
|
+
# Create an empty, schema-correct DataFrame (assumes an active SparkSession named `spark`)
|
|
83
|
+
empty_df = spark.createDataFrame([], schema=spark_schemas.METRIC_SCHEMA)
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### Build a dimensions schema dynamically
|
|
87
|
+
|
|
88
|
+
```python
|
|
89
|
+
from cml_schemas import spark_schemas
|
|
90
|
+
|
|
91
|
+
dimensions = ["AgeGroup", "Region", "Ethnicity"]
|
|
92
|
+
schema = spark_schemas.create_dimensions_schema(dimensions)
|
|
93
|
+
|
|
94
|
+
empty_df = spark.createDataFrame([], schema=schema)
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### Validate a DataFrame against a schema
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
from cml_schemas import spark_schemas
|
|
101
|
+
|
|
102
|
+
# Raises TypeError with all mismatches listed if validation fails
|
|
103
|
+
spark_schemas.validate_schema(df, spark_schemas.METRIC_SCHEMA)
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### Select and reorder columns to match a schema
|
|
107
|
+
|
|
108
|
+
```python
|
|
109
|
+
from cml_schemas import spark_schemas
|
|
110
|
+
|
|
111
|
+
# Selects only the columns defined in the schema, in schema order
|
|
112
|
+
df = spark_schemas.select_from_schema(df, spark_schemas.METRIC_SCHEMA)
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
***
|
|
116
|
+
|
|
117
|
+
## Principles for usage
|
|
118
|
+
|
|
119
|
+
* **Spec-first**: Schemas track the CML Data Specification (draft during beta). When the official fields or formats change, this package releases a new minor or major version, with changelog notes. We recommend locking to a specific version of this package to avoid breaking changes when the schema is updated.
|
|
120
|
+
* **Build from tidy data where possible**: Aim to produce metrics by first producing outputs in tidy-data format and converting from there to the CML spec. See the [CML conversion helper functions](https://github.com/nhsengland/cml-conversion-helpers).
|
|
121
|
+
* **RAP**: Aim to develop your pipelines in line with RAP (Reproducible Analytical Pipelines) principles — see the [RAP Community of Practice website](https://nhsdigital.github.io/rap-community-of-practice/) for guidance.
|
|
122
|
+
|
|
123
|
+
***
|
|
124
|
+
|
|
125
|
+
## How this maps to the CML artefacts
|
|
126
|
+
|
|
127
|
+
* **CML Proforma & Spec**: Informs field names, types, nullability, and relationships for `metric`, `metadata`, `relationship`, `dimension`.
|
|
128
|
+
Producers can continue to complete the proforma as documentation while using these programmatic schemas in code.
|
|
129
|
+
* **Ownership & curation**: This repo does not own business definitions; SMEs own and maintain metric definitions. We only provide the technical shapes to carry those definitions consistently.
|
|
130
|
+
* **Discovery & serving**: FDP National/Metadata Explore Hub will surface metrics/metadata to end users. This package helps you produce compliant data for that ecosystem.
|
|
131
|
+
|
|
132
|
+
***
|
|
133
|
+
|
|
134
|
+
## Versioning
|
|
135
|
+
|
|
136
|
+
* **0.x**: Beta; spec and code may change.
|
|
137
|
+
|
|
138
|
+
During beta, **breaking changes can occur**—please pin versions and read the [changelog](CHANGELOG.md).
|
|
139
|
+
|
|
140
|
+
***
|
|
141
|
+
|
|
142
|
+
## Contributing
|
|
143
|
+
|
|
144
|
+
We welcome issues and PRs, especially for:
|
|
145
|
+
|
|
146
|
+
* Gaps or mismatches vs the CML spec (with references)
|
|
147
|
+
* Additional runtime formats (e.g., JSON Schema, SQL DDL)
|
|
148
|
+
* Validation and test data generators
|
|
149
|
+
* Developer experience improvements
|
|
150
|
+
|
|
151
|
+
***
|
|
152
|
+
|
|
153
|
+
## License
|
|
154
|
+
|
|
155
|
+
MIT
|
|
156
|
+
|
|
157
|
+
***
|
|
158
|
+
|
|
159
|
+
## Acknowledgements
|
|
160
|
+
|
|
161
|
+
This package is inspired by and aligned to the **Central Metrics Library** initiative, developed with analytical teams and Platform Modernisation to fit the developing **FDP National** platform.
|
|
162
|
+
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
# Central Metrics Library Schemas (Python)
|
|
2
|
+
|
|
3
|
+
A lightweight Python package providing validated **schemas for the Central Metrics Library (CML)** in (eventually) multiple formats.
|
|
4
|
+
|
|
5
|
+
**Current focus:** Apache Spark (`pyspark.sql.types.StructType`).
|
|
6
|
+
|
|
7
|
+
> The Central Metrics Library (CML) defines a common structure for **metrics** and the **metadata** that describes them, so analytical teams can produce, discover, and reuse metrics consistently across the NHS. This package implements those schemas for use in data pipelines.
|
|
8
|
+
|
|
9
|
+
***
|
|
10
|
+
|
|
11
|
+
## Why this exists
|
|
12
|
+
|
|
13
|
+
Today, metrics live in many places and many shapes—hard to find, easy to duplicate, and sometimes inconsistent. The CML aims to unify metric structures and metadata into a single, curated, service-managed library so analysts can source authoritative, consistently defined metrics, supported by appropriate security tagging and clear SME-owned definitions (purpose, methods, limitations, differences from similar measures). This repo hosts code-first schemas aligned to that aim.
|
|
14
|
+
|
|
15
|
+
***
|
|
16
|
+
|
|
17
|
+
## Status: **BETA**
|
|
18
|
+
|
|
19
|
+
The CML—and therefore these schemas—are in **beta** while we pilot with analytical teams and iterate on feedback. Expect **breaking changes** as the specification evolves. Please adopt **resilient coding practices** and pin schema versions where appropriate.
|
|
20
|
+
|
|
21
|
+
***
|
|
22
|
+
|
|
23
|
+
## What's in the box
|
|
24
|
+
|
|
25
|
+
**Spark schemas** for core CML entities (initial set):
|
|
26
|
+
|
|
27
|
+
* `METRIC_SCHEMA` — the measured value(s) and identifiers
|
|
28
|
+
* `DIMENSIONS_SCHEMA` — base schema for dimensions used to slice metrics
|
|
29
|
+
|
|
30
|
+
**Helper functions:**
|
|
31
|
+
|
|
32
|
+
* `create_dimensions_schema(dimensions)` — builds a full dimensions schema from a list of dimension column names
|
|
33
|
+
* `select_from_schema(df, schema)` — selects and reorders DataFrame columns to match a schema
|
|
34
|
+
* `validate_schema(df, schema)` — validates a DataFrame's column names and types against a schema
|
|
35
|
+
|
|
36
|
+
These mirror the "draft standardised schema" referenced in the CML materials and will track the official spec as it matures.
|
|
37
|
+
|
|
38
|
+
**Coming soon:**
|
|
39
|
+
|
|
40
|
+
* `metadata` schema — descriptive info: purpose, methodology, caveats, lineage, etc.
|
|
41
|
+
* `relationship` schema — links between metrics and other artefacts
|
|
42
|
+
|
|
43
|
+
***
|
|
44
|
+
|
|
45
|
+
## Installation
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
pip install cml-schemas
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
> Tip: Pin to a specific version (`cml-schemas==x.y.z`) to protect your pipelines from breaking changes during beta.
|
|
52
|
+
|
|
53
|
+
***
|
|
54
|
+
|
|
55
|
+
## Quick start (Spark)
|
|
56
|
+
|
|
57
|
+
### Use a built-in schema
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
from cml_schemas import spark_schemas
|
|
61
|
+
|
|
62
|
+
# Create an empty, schema-correct DataFrame (assumes an active SparkSession named `spark`)
|
|
63
|
+
empty_df = spark.createDataFrame([], schema=spark_schemas.METRIC_SCHEMA)
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
### Build a dimensions schema dynamically
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
from cml_schemas import spark_schemas
|
|
70
|
+
|
|
71
|
+
dimensions = ["AgeGroup", "Region", "Ethnicity"]
|
|
72
|
+
schema = spark_schemas.create_dimensions_schema(dimensions)
|
|
73
|
+
|
|
74
|
+
empty_df = spark.createDataFrame([], schema=schema)
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### Validate a DataFrame against a schema
|
|
78
|
+
|
|
79
|
+
```python
|
|
80
|
+
from cml_schemas import spark_schemas
|
|
81
|
+
|
|
82
|
+
# Raises TypeError with all mismatches listed if validation fails
|
|
83
|
+
spark_schemas.validate_schema(df, spark_schemas.METRIC_SCHEMA)
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### Select and reorder columns to match a schema
|
|
87
|
+
|
|
88
|
+
```python
|
|
89
|
+
from cml_schemas import spark_schemas
|
|
90
|
+
|
|
91
|
+
# Selects only the columns defined in the schema, in schema order
|
|
92
|
+
df = spark_schemas.select_from_schema(df, spark_schemas.METRIC_SCHEMA)
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
***
|
|
96
|
+
|
|
97
|
+
## Principles for usage
|
|
98
|
+
|
|
99
|
+
* **Spec-first**: Schemas track the CML Data Specification (draft during beta). When the official fields or formats change, this package releases a new minor or major version, with changelog notes. We recommend locking to a specific version of this package to avoid breaking changes when the schema is updated.
|
|
100
|
+
* **Build from tidy data where possible**: Aim to produce metrics by first producing outputs in tidy-data format and converting from there to the CML spec. See the [CML conversion helper functions](https://github.com/nhsengland/cml-conversion-helpers).
|
|
101
|
+
* **RAP**: Aim to develop your pipelines in line with RAP (Reproducible Analytical Pipelines) principles — see the [RAP Community of Practice website](https://nhsdigital.github.io/rap-community-of-practice/) for guidance.
|
|
102
|
+
|
|
103
|
+
***
|
|
104
|
+
|
|
105
|
+
## How this maps to the CML artefacts
|
|
106
|
+
|
|
107
|
+
* **CML Proforma & Spec**: Informs field names, types, nullability, and relationships for `metric`, `metadata`, `relationship`, `dimension`.
|
|
108
|
+
Producers can continue to complete the proforma as documentation while using these programmatic schemas in code.
|
|
109
|
+
* **Ownership & curation**: This repo does not own business definitions; SMEs own and maintain metric definitions. We only provide the technical shapes to carry those definitions consistently.
|
|
110
|
+
* **Discovery & serving**: FDP National/Metadata Explore Hub will surface metrics/metadata to end users. This package helps you produce compliant data for that ecosystem.
|
|
111
|
+
|
|
112
|
+
***
|
|
113
|
+
|
|
114
|
+
## Versioning
|
|
115
|
+
|
|
116
|
+
* **0.x**: Beta; spec and code may change.
|
|
117
|
+
|
|
118
|
+
During beta, **breaking changes can occur**—please pin versions and read the [changelog](CHANGELOG.md).
|
|
119
|
+
|
|
120
|
+
***
|
|
121
|
+
|
|
122
|
+
## Contributing
|
|
123
|
+
|
|
124
|
+
We welcome issues and PRs, especially for:
|
|
125
|
+
|
|
126
|
+
* Gaps or mismatches vs the CML spec (with references)
|
|
127
|
+
* Additional runtime formats (e.g., JSON Schema, SQL DDL)
|
|
128
|
+
* Validation and test data generators
|
|
129
|
+
* Developer experience improvements
|
|
130
|
+
|
|
131
|
+
***
|
|
132
|
+
|
|
133
|
+
## License
|
|
134
|
+
|
|
135
|
+
MIT
|
|
136
|
+
|
|
137
|
+
***
|
|
138
|
+
|
|
139
|
+
## Acknowledgements
|
|
140
|
+
|
|
141
|
+
This package is inspired by and aligned to the **Central Metrics Library** initiative, developed with analytical teams and Platform Modernisation to fit the developing **FDP National** platform.
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
[tool.poetry]
|
|
2
|
+
name = "cml-schemas"
|
|
3
|
+
version = "0.1.2"
|
|
4
|
+
description = "CML Spark schemas"
|
|
5
|
+
authors = ["NHS England Data Science team <datascience@nhs.net>"]
|
|
6
|
+
readme = "README.md"
|
|
7
|
+
license = "MIT"
|
|
8
|
+
repository = "https://github.com/nhsengland/cml-schemas"
|
|
9
|
+
packages = [{include = "cml_schemas", from = "src"}]
|
|
10
|
+
|
|
11
|
+
[tool.poetry.dependencies]
|
|
12
|
+
python = ">=3.10, <4"
|
|
13
|
+
pyspark = ">=3.5"
|
|
14
|
+
|
|
15
|
+
[build-system]
|
|
16
|
+
requires = ["poetry-core"]
|
|
17
|
+
build-backend = "poetry.core.masonry.api"
|
|
18
|
+
|
|
19
|
+
[dependency-groups]
|
|
20
|
+
dev = [
|
|
21
|
+
"pytest (>=9.0.2,<10.0.0)"
|
|
22
|
+
]
|
|
File without changes
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
from pyspark.sql import DataFrame
|
|
2
|
+
from pyspark.sql.types import StructType, StructField, StringType, TimestampType
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
# Spark schema for CML metric data: the identifiers, location, timestamps and
# measured value(s) of each datapoint.
# Each entry below is (column name, Spark data type, nullable); the order of
# the entries is the column order of the resulting schema.
METRIC_SCHEMA = StructType([
    StructField(column_name, data_type, nullable=is_nullable)
    for column_name, data_type, is_nullable in (
        ("datapoint_id", StringType(), False),
        ("metric_id", StringType(), False),
        ("metric_dimension_id", StringType(), False),
        ("dimension_cohort_id", StringType(), False),
        ("location_id", StringType(), False),
        ("location_type", StringType(), False),
        ("reporting_period_start_datetime", TimestampType(), False),
        ("last_record_timestamp", TimestampType(), False),
        ("last_ingest_timestamp", TimestampType(), False),
        ("publication_date", TimestampType(), False),
        # Only the two value columns may be null; both are typed as strings.
        ("metric_value", StringType(), True),
        ("additional_metric_values", StringType(), True),
    )
])
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# Base schema for dimensions data: the two required (non-nullable) string key
# columns, in this order. create_dimensions_schema() appends the per-dimension
# columns after these.
DIMENSIONS_SCHEMA = StructType([
    StructField(key_column, StringType(), nullable=False)
    for key_column in ("dimension_cohort_id", "metric_id")
])
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def create_dimensions_schema(dimensions: list[str]) -> StructType:
    """Build a full dimensions schema from a list of dimension column names.

    The result starts with the columns of ``DIMENSIONS_SCHEMA`` and is
    followed by one nullable string column per entry in ``dimensions``, in the
    order given. ``DIMENSIONS_SCHEMA`` itself is not modified.

    Args:
        dimensions: Names of the dimension columns to append. An empty list
            yields a schema containing only the base columns.

    Returns:
        A new ``StructType`` holding the base columns plus the dimension
        columns.

    Raises:
        TypeError: If ``dimensions`` is a single string instead of a list of
            names.
        ValueError: If a name is repeated or clashes with a base column.
    """
    # A bare string is iterable, so without this guard "Region" would
    # silently become six single-character columns.
    if isinstance(dimensions, str):
        raise TypeError(
            "dimensions must be a list of column names, not a single string: "
            f"{dimensions!r}"
        )

    names = list(dimensions)

    # Duplicate field names make columns ambiguous downstream, so reject them
    # up front and report every offender at once.
    seen = {field.name for field in DIMENSIONS_SCHEMA.fields}
    clashes = []
    for name in names:
        if name in seen:
            clashes.append(name)
        seen.add(name)
    if clashes:
        raise ValueError(
            "Dimension names must be unique and must not clash with the base "
            f"columns: {clashes}"
        )

    dimension_fields = [StructField(name, StringType(), nullable=True) for name in names]
    return StructType(DIMENSIONS_SCHEMA.fields + dimension_fields)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def select_from_schema(df: DataFrame, schema: StructType) -> DataFrame:
    """Select the schema's columns from a DataFrame, in schema order.

    Args:
        df: DataFrame to select from.
        schema: Schema whose field names determine which columns are selected
            and in what order.

    Returns:
        The result of ``df.select`` called with the schema's field names.
    """
    wanted_columns = tuple(field.name for field in schema.fields)
    return df.select(*wanted_columns)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def validate_schema(df: DataFrame, schema: StructType) -> None:
    """Check that a DataFrame has every schema column with the expected type.

    Only the columns named in ``schema`` are checked; additional columns in
    ``df`` are ignored. Types are compared by their simple string form, and
    nullability is not compared.

    Args:
        df: DataFrame to validate.
        schema: Schema describing the expected column names and data types.

    Raises:
        TypeError: If any schema column is missing from ``df`` or has a
            different type. The message lists every problem found.
    """
    actual_types = dict(df.dtypes)
    problems = []

    for field in schema.fields:
        if field.name in actual_types:
            found = actual_types[field.name]
            wanted = field.dataType.simpleString()
            if found != wanted:
                problems.append(f" - '{field.name}': expected {wanted}, got {found}")
        else:
            problems.append(f" - '{field.name}': column is missing")

    if not problems:
        return

    # Report all problems in one exception rather than stopping at the first.
    raise TypeError("Schema validation failed:\n" + "\n".join(problems))
|