dml-dev 0.1.0__tar.gz → 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dml_dev-0.1.1/MANIFEST.in +4 -0
- dml_dev-0.1.1/PKG-INFO +137 -0
- dml_dev-0.1.1/README.md +109 -0
- dml_dev-0.1.1/agent.md +78 -0
- dml_dev-0.1.1/dml_code/pipeline/__init__.py +1 -0
- dml_dev-0.1.0/project_code/pipeline/build.py → dml_dev-0.1.1/dml_code/pipeline/step1_build.py +9 -9
- dml_dev-0.1.0/project_code/pipeline/estimate.py → dml_dev-0.1.1/dml_code/pipeline/step2_estimate.py +14 -13
- {dml_dev-0.1.0/project_code → dml_dev-0.1.1/dml_code}/src/build_helpers.py +47 -41
- {dml_dev-0.1.0/project_code → dml_dev-0.1.1/dml_code}/src/estimate_helpers.py +34 -33
- dml_dev-0.1.1/dml_code/src/outputs.py +662 -0
- {dml_dev-0.1.0/project_code → dml_dev-0.1.1/dml_code}/src/paths.py +7 -14
- {dml_dev-0.1.0/project_code → dml_dev-0.1.1/dml_code}/src/utils.py +1 -17
- dml_dev-0.1.1/dml_dev.egg-info/PKG-INFO +137 -0
- dml_dev-0.1.1/dml_dev.egg-info/SOURCES.txt +30 -0
- dml_dev-0.1.1/dml_dev.egg-info/entry_points.txt +3 -0
- {dml_dev-0.1.0 → dml_dev-0.1.1}/dml_dev.egg-info/requires.txt +1 -0
- dml_dev-0.1.1/dml_dev.egg-info/top_level.txt +2 -0
- dml_dev-0.1.1/project_configuration/__init__.py +5 -0
- dml_dev-0.1.1/project_configuration/build_spec.py +43 -0
- dml_dev-0.1.1/project_configuration/estimation_experiments/__init__.py +1 -0
- dml_dev-0.1.1/project_configuration/estimation_experiments/example_experiment.yaml +16 -0
- dml_dev-0.1.1/project_configuration/estimation_experiments/synthetic_example.yaml +16 -0
- dml_dev-0.1.1/project_configuration/registries/__init__.py +1 -0
- dml_dev-0.1.1/project_configuration/registries/covariate_sets.py +24 -0
- dml_dev-0.1.1/project_configuration/registries/filter_sets.py +11 -0
- dml_dev-0.1.1/project_configuration/registries/models.py +32 -0
- dml_dev-0.1.1/project_configuration/registries/programs.py +21 -0
- {dml_dev-0.1.0 → dml_dev-0.1.1}/pyproject.toml +9 -8
- dml_dev-0.1.0/MANIFEST.in +0 -4
- dml_dev-0.1.0/PKG-INFO +0 -35
- dml_dev-0.1.0/README.md +0 -8
- dml_dev-0.1.0/agent.md +0 -87
- dml_dev-0.1.0/config_template/__init__.py +0 -1
- dml_dev-0.1.0/config_template/build_spec.py +0 -15
- dml_dev-0.1.0/config_template/experiments/__init__.py +0 -2
- dml_dev-0.1.0/config_template/experiments/example_experiment.yaml +0 -7
- dml_dev-0.1.0/config_template/registries/__init__.py +0 -1
- dml_dev-0.1.0/config_template/registries/covariate_sets.py +0 -3
- dml_dev-0.1.0/config_template/registries/filter_sets.py +0 -4
- dml_dev-0.1.0/config_template/registries/models.py +0 -6
- dml_dev-0.1.0/config_template/registries/programs.py +0 -4
- dml_dev-0.1.0/dml_dev.egg-info/PKG-INFO +0 -35
- dml_dev-0.1.0/dml_dev.egg-info/SOURCES.txt +0 -29
- dml_dev-0.1.0/dml_dev.egg-info/entry_points.txt +0 -3
- dml_dev-0.1.0/dml_dev.egg-info/top_level.txt +0 -2
- dml_dev-0.1.0/project_code/pipeline/__init__.py +0 -2
- dml_dev-0.1.0/project_code/src/plotting.py +0 -253
- {dml_dev-0.1.0/project_code → dml_dev-0.1.1/dml_code}/__init__.py +0 -0
- {dml_dev-0.1.0/project_code → dml_dev-0.1.1/dml_code}/src/__init__.py +0 -0
- {dml_dev-0.1.0 → dml_dev-0.1.1}/dml_dev.egg-info/dependency_links.txt +0 -0
- {dml_dev-0.1.0 → dml_dev-0.1.1}/setup.cfg +0 -0
dml_dev-0.1.1/PKG-INFO
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dml-dev
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: DoubleML build, estimation, plotting, and utility pipelines.
|
|
5
|
+
Author: DML Pipeline Contributors
|
|
6
|
+
Keywords: administrative-data,causal-inference,doubleml,observational-data,program-evaluation
|
|
7
|
+
Classifier: Development Status :: 3 - Alpha
|
|
8
|
+
Classifier: Intended Audience :: Science/Research
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
12
|
+
Classifier: Topic :: Scientific/Engineering
|
|
13
|
+
Requires-Python: >=3.12
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
Requires-Dist: doubleml
|
|
16
|
+
Requires-Dist: joblib
|
|
17
|
+
Requires-Dist: oi-tools[figures]
|
|
18
|
+
Requires-Dist: plotnine
|
|
19
|
+
Requires-Dist: polars
|
|
20
|
+
Requires-Dist: pyarrow
|
|
21
|
+
Requires-Dist: psutil
|
|
22
|
+
Requires-Dist: PyYAML
|
|
23
|
+
Requires-Dist: scikit-learn
|
|
24
|
+
Requires-Dist: threadpoolctl
|
|
25
|
+
Provides-Extra: dev
|
|
26
|
+
Requires-Dist: build; extra == "dev"
|
|
27
|
+
Requires-Dist: twine; extra == "dev"
|
|
28
|
+
|
|
29
|
+
# DML Pipeline
|
|
30
|
+
|
|
31
|
+
This repo is a small framework for running DoubleML on administrative-style
|
|
32
|
+
program data. It separates project-specific choices from reusable pipeline code:
|
|
33
|
+
you edit `project_configuration/`, then run the pipeline in `dml_code/`.
|
|
34
|
+
|
|
35
|
+
The repo is currently filled with a synthetic example so you can run the whole
|
|
36
|
+
flow before replacing it with real project data.
|
|
37
|
+
|
|
38
|
+
## Mental Model
|
|
39
|
+
|
|
40
|
+
The workflow has two main steps:
|
|
41
|
+
|
|
42
|
+
1. **Build an analysis dataset.** Start from a databank and program file,
|
|
43
|
+
join them, construct event-time variables, and write processed panels to
|
|
44
|
+
`data/build_output/`.
|
|
45
|
+
2. **Estimate DML effects.** Read a YAML experiment, resolve its program,
|
|
46
|
+
covariates, filters, and models from the registries, then write logs to
|
|
47
|
+
`outputs/raw/`.
|
|
48
|
+
|
|
49
|
+
After estimation, scripts in `project_scripts/` can turn the raw logs into plots and tables.
|
|
50
|
+
|
|
51
|
+
```text
|
|
52
|
+
project_configuration/ + data/build/
|
|
53
|
+
|
|
|
54
|
+
v
|
|
55
|
+
dml_code.pipeline.step1_build
|
|
56
|
+
|
|
|
57
|
+
v
|
|
58
|
+
data/build_output/
|
|
59
|
+
|
|
|
60
|
+
v
|
|
61
|
+
dml_code.pipeline.step2_estimate
|
|
62
|
+
|
|
|
63
|
+
v
|
|
64
|
+
outputs/raw/ -> outputs/plots/ and outputs/tables/
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Run The Example
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
python project_scripts/generate_example.py
|
|
71
|
+
python -m dml_code.pipeline.step1_build example_program
|
|
72
|
+
python -m dml_code.pipeline.step2_estimate synthetic_example
|
|
73
|
+
python project_scripts/plot_example.py
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
The first command creates synthetic input data in `data/build/`. Step 1 writes
|
|
77
|
+
processed panels to `data/build_output/`. Step 2 writes estimation and
|
|
78
|
+
prediction logs to `outputs/raw/`. The plotting script writes diagnostics to
|
|
79
|
+
`outputs/plots/` and `outputs/tables/`.
|
|
80
|
+
|
|
81
|
+
## What You Edit
|
|
82
|
+
|
|
83
|
+
Most project setup happens in `project_configuration/`.
|
|
84
|
+
|
|
85
|
+
- `project_configuration/build_spec.py`: define the databank files, columns to carry through,
|
|
86
|
+
relative-time columns to generate, and any generated features created after
|
|
87
|
+
panel construction.
|
|
88
|
+
- `project_configuration/registries/programs.py`: define each program: its source file,
|
|
89
|
+
treatment column, enrollment-year column, and program-specific columns.
|
|
90
|
+
- `project_configuration/registries/covariate_sets.py`: name reusable covariate lists and mark
|
|
91
|
+
categorical covariates for dummy encoding.
|
|
92
|
+
- `project_configuration/registries/filter_sets.py`: name reusable Polars filters for
|
|
93
|
+
estimation samples.
|
|
94
|
+
- `project_configuration/registries/models.py`: name outcome and propensity learners.
|
|
95
|
+
- `project_configuration/estimation_experiments/*.yaml`: choose combinations of programs, outcomes,
|
|
96
|
+
covariates, filters, models, and control sampling rates to estimate.
|
|
97
|
+
|
|
98
|
+
The pipeline code in `dml_code/` is meant to stay reusable.
|
|
99
|
+
|
|
100
|
+
- `dml_code/pipeline/`: runnable steps, `step1_build.py` and
|
|
101
|
+
`step2_estimate.py`.
|
|
102
|
+
- `dml_code/src/`: shared helpers for building, estimating, paths, outputs,
|
|
103
|
+
and logging.
|
|
104
|
+
|
|
105
|
+
`project_scripts/` is for ad hoc project work tied to particular runs:
|
|
106
|
+
generating example data, viewing outputs, making plots, running diagnostics,
|
|
107
|
+
and writing small experiment-specific analyses.
|
|
108
|
+
|
|
109
|
+
## How To Add A Real Project
|
|
110
|
+
|
|
111
|
+
1. Put source parquet files somewhere under `data/` or point `project_configuration/` at their
|
|
112
|
+
real locations.
|
|
113
|
+
2. Update `project_configuration/build_spec.py` with the databank files and feature-generation
|
|
114
|
+
logic.
|
|
115
|
+
3. Add program definitions in `project_configuration/registries/programs.py`.
|
|
116
|
+
4. Add covariate sets, filters, and models in the registry files.
|
|
117
|
+
5. Create or copy a YAML file in `project_configuration/estimation_experiments/`.
|
|
118
|
+
6. Run step 1 for a program, then step 2 for an experiment.
|
|
119
|
+
|
|
120
|
+
Example:
|
|
121
|
+
|
|
122
|
+
```bash
|
|
123
|
+
python -m dml_code.pipeline.step1_build my_program
|
|
124
|
+
python -m dml_code.pipeline.step2_estimate my_experiment
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
Use `project_scripts/` for project-specific follow-up work: viewing outputs
|
|
128
|
+
from particular runs, making plots and tables, running diagnostics, robustness
|
|
129
|
+
checks, and other exploratory analyses.
|
|
130
|
+
|
|
131
|
+
## Where Results Go
|
|
132
|
+
|
|
133
|
+
- `data/build/`: input data used by the example.
|
|
134
|
+
- `data/build_output/`: processed analysis datasets created by step 1.
|
|
135
|
+
- `outputs/raw/`: machine-readable estimation, prediction, and diagnostic logs.
|
|
136
|
+
- `outputs/plots/`: generated figures.
|
|
137
|
+
- `outputs/tables/`: generated tables.
|
dml_dev-0.1.1/README.md
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
# DML Pipeline
|
|
2
|
+
|
|
3
|
+
This repo is a small framework for running DoubleML on administrative-style
|
|
4
|
+
program data. It separates project-specific choices from reusable pipeline code:
|
|
5
|
+
you edit `project_configuration/`, then run the pipeline in `dml_code/`.
|
|
6
|
+
|
|
7
|
+
The repo is currently filled with a synthetic example so you can run the whole
|
|
8
|
+
flow before replacing it with real project data.
|
|
9
|
+
|
|
10
|
+
## Mental Model
|
|
11
|
+
|
|
12
|
+
The workflow has two main steps:
|
|
13
|
+
|
|
14
|
+
1. **Build an analysis dataset.** Start from a databank and program file,
|
|
15
|
+
join them, construct event-time variables, and write processed panels to
|
|
16
|
+
`data/build_output/`.
|
|
17
|
+
2. **Estimate DML effects.** Read a YAML experiment, resolve its program,
|
|
18
|
+
covariates, filters, and models from the registries, then write logs to
|
|
19
|
+
`outputs/raw/`.
|
|
20
|
+
|
|
21
|
+
After estimation, scripts in `project_scripts/` can turn the raw logs into plots and tables.
|
|
22
|
+
|
|
23
|
+
```text
|
|
24
|
+
project_configuration/ + data/build/
|
|
25
|
+
|
|
|
26
|
+
v
|
|
27
|
+
dml_code.pipeline.step1_build
|
|
28
|
+
|
|
|
29
|
+
v
|
|
30
|
+
data/build_output/
|
|
31
|
+
|
|
|
32
|
+
v
|
|
33
|
+
dml_code.pipeline.step2_estimate
|
|
34
|
+
|
|
|
35
|
+
v
|
|
36
|
+
outputs/raw/ -> outputs/plots/ and outputs/tables/
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Run The Example
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
python project_scripts/generate_example.py
|
|
43
|
+
python -m dml_code.pipeline.step1_build example_program
|
|
44
|
+
python -m dml_code.pipeline.step2_estimate synthetic_example
|
|
45
|
+
python project_scripts/plot_example.py
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
The first command creates synthetic input data in `data/build/`. Step 1 writes
|
|
49
|
+
processed panels to `data/build_output/`. Step 2 writes estimation and
|
|
50
|
+
prediction logs to `outputs/raw/`. The plotting script writes diagnostics to
|
|
51
|
+
`outputs/plots/` and `outputs/tables/`.
|
|
52
|
+
|
|
53
|
+
## What You Edit
|
|
54
|
+
|
|
55
|
+
Most project setup happens in `project_configuration/`.
|
|
56
|
+
|
|
57
|
+
- `project_configuration/build_spec.py`: define the databank files, columns to carry through,
|
|
58
|
+
relative-time columns to generate, and any generated features created after
|
|
59
|
+
panel construction.
|
|
60
|
+
- `project_configuration/registries/programs.py`: define each program: its source file,
|
|
61
|
+
treatment column, enrollment-year column, and program-specific columns.
|
|
62
|
+
- `project_configuration/registries/covariate_sets.py`: name reusable covariate lists and mark
|
|
63
|
+
categorical covariates for dummy encoding.
|
|
64
|
+
- `project_configuration/registries/filter_sets.py`: name reusable Polars filters for
|
|
65
|
+
estimation samples.
|
|
66
|
+
- `project_configuration/registries/models.py`: name outcome and propensity learners.
|
|
67
|
+
- `project_configuration/estimation_experiments/*.yaml`: choose combinations of programs, outcomes,
|
|
68
|
+
covariates, filters, models, and control sampling rates to estimate.
|
|
69
|
+
|
|
70
|
+
The pipeline code in `dml_code/` is meant to stay reusable.
|
|
71
|
+
|
|
72
|
+
- `dml_code/pipeline/`: runnable steps, `step1_build.py` and
|
|
73
|
+
`step2_estimate.py`.
|
|
74
|
+
- `dml_code/src/`: shared helpers for building, estimating, paths, outputs,
|
|
75
|
+
and logging.
|
|
76
|
+
|
|
77
|
+
`project_scripts/` is for ad hoc project work tied to particular runs:
|
|
78
|
+
generating example data, viewing outputs, making plots, running diagnostics,
|
|
79
|
+
and writing small experiment-specific analyses.
|
|
80
|
+
|
|
81
|
+
## How To Add A Real Project
|
|
82
|
+
|
|
83
|
+
1. Put source parquet files somewhere under `data/` or point `project_configuration/` at their
|
|
84
|
+
real locations.
|
|
85
|
+
2. Update `project_configuration/build_spec.py` with the databank files and feature-generation
|
|
86
|
+
logic.
|
|
87
|
+
3. Add program definitions in `project_configuration/registries/programs.py`.
|
|
88
|
+
4. Add covariate sets, filters, and models in the registry files.
|
|
89
|
+
5. Create or copy a YAML file in `project_configuration/estimation_experiments/`.
|
|
90
|
+
6. Run step 1 for a program, then step 2 for an experiment.
|
|
91
|
+
|
|
92
|
+
Example:
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
python -m dml_code.pipeline.step1_build my_program
|
|
96
|
+
python -m dml_code.pipeline.step2_estimate my_experiment
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
Use `project_scripts/` for project-specific follow-up work: viewing outputs
|
|
100
|
+
from particular runs, making plots and tables, running diagnostics, robustness
|
|
101
|
+
checks, and other exploratory analyses.
|
|
102
|
+
|
|
103
|
+
## Where Results Go
|
|
104
|
+
|
|
105
|
+
- `data/build/`: input data used by the example.
|
|
106
|
+
- `data/build_output/`: processed analysis datasets created by step 1.
|
|
107
|
+
- `outputs/raw/`: machine-readable estimation, prediction, and diagnostic logs.
|
|
108
|
+
- `outputs/plots/`: generated figures.
|
|
109
|
+
- `outputs/tables/`: generated tables.
|
dml_dev-0.1.1/agent.md
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
# Repository Guide
|
|
2
|
+
|
|
3
|
+
This repository contains a general-purpose Polars build pipeline and DoubleML
|
|
4
|
+
estimation pipeline for administrative observational data. The package exposes
|
|
5
|
+
the full implementation under `dml_code`, including executable pipeline
|
|
6
|
+
entrypoints and shared helper modules.
|
|
7
|
+
|
|
8
|
+
## Runtime Paths
|
|
9
|
+
|
|
10
|
+
Runtime locations are defined in `dml_code/src/paths.py`. `LOCAL_DIR` is
|
|
11
|
+
resolved from the repository location, and data, project configuration, and
|
|
12
|
+
outputs are defined relative to it.
|
|
13
|
+
|
|
14
|
+
## Main Pipeline
|
|
15
|
+
|
|
16
|
+
### Build Processed Panels
|
|
17
|
+
|
|
18
|
+
`dml_code/pipeline/step1_build.py` builds program-specific processed parquet
|
|
19
|
+
files from a configured administrative data source and program registry entry.
|
|
20
|
+
|
|
21
|
+
The script:
|
|
22
|
+
|
|
23
|
+
1. Loads source panel files from the build spec.
|
|
24
|
+
2. Loads a selected program source definition.
|
|
25
|
+
3. Joins program records to the source panel on the configured join key.
|
|
26
|
+
4. Constructs relative-time panels for treated and eligible comparison records.
|
|
27
|
+
5. Creates relative-time variables from calendar-time source columns.
|
|
28
|
+
6. Applies configured post-panel feature transforms.
|
|
29
|
+
7. Writes processed parquet files for downstream estimation.
|
|
30
|
+
|
|
31
|
+
CLI usage:
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
python dml_code/pipeline/step1_build.py <program_pointer>
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
### Estimate Effects
|
|
38
|
+
|
|
39
|
+
`dml_code/pipeline/step2_estimate.py` runs DoubleML estimation from a YAML
|
|
40
|
+
experiment spec.
|
|
41
|
+
|
|
42
|
+
The script:
|
|
43
|
+
|
|
44
|
+
1. Loads an experiment YAML file.
|
|
45
|
+
2. Expands registry pointer lists into concrete runs.
|
|
46
|
+
3. Loads processed parquet files for the selected program.
|
|
47
|
+
4. Cleans missing values and applies configured filters.
|
|
48
|
+
5. Selects treatment, outcome, covariates, and join key columns.
|
|
49
|
+
6. Encodes configured categorical covariates.
|
|
50
|
+
7. Fits DoubleML IRM with configured outcome and propensity models.
|
|
51
|
+
8. Logs estimation summaries and row-level predictions.
|
|
52
|
+
|
|
53
|
+
CLI usage:
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
python dml_code/pipeline/step2_estimate.py <experiment_name>
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
## Configuration
|
|
60
|
+
|
|
61
|
+
The repository includes a runnable example configuration under `project_configuration`.
|
|
62
|
+
|
|
63
|
+
Expected project configuration shape:
|
|
64
|
+
|
|
65
|
+
- `project_configuration/build_spec.py`: databank spec and generated feature transforms.
|
|
66
|
+
- `project_configuration/estimation_experiments/*.yaml`: experiment specs with registry pointer lists.
|
|
67
|
+
- `project_configuration/registries/programs.py`: program spec registry.
|
|
68
|
+
- `project_configuration/registries/covariate_sets.py`: covariate set registry.
|
|
69
|
+
- `project_configuration/registries/filter_sets.py`: filter expression registry.
|
|
70
|
+
- `project_configuration/registries/models.py`: outcome and propensity model registries.
|
|
71
|
+
|
|
72
|
+
## Shared Modules
|
|
73
|
+
|
|
74
|
+
- `dml_code/src/paths.py`: path configuration and output path helpers.
|
|
75
|
+
- `dml_code/src/build_helpers.py`: build dataclasses and panel construction helpers.
|
|
76
|
+
- `dml_code/src/estimate_helpers.py`: experiment loading, run expansion, validation, and DoubleML fitting.
|
|
77
|
+
- `dml_code/src/outputs.py`: diagnostic plots and tables from estimation and prediction logs.
|
|
78
|
+
- `dml_code/src/utils.py`: logging, table export, timing, and resource helpers.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Executable DML pipeline steps."""
|
dml_dev-0.1.0/project_code/pipeline/build.py → dml_dev-0.1.1/dml_code/pipeline/step1_build.py
RENAMED
|
@@ -12,12 +12,12 @@ from pathlib import Path
|
|
|
12
12
|
LOCAL_DIR = Path(__file__).resolve().parents[2]
|
|
13
13
|
sys.path.insert(0, str(LOCAL_DIR))
|
|
14
14
|
|
|
15
|
-
from
|
|
15
|
+
from dml_code.src.build_helpers import (
|
|
16
16
|
backup_existing_output,
|
|
17
17
|
build_cohort_file,
|
|
18
|
-
|
|
18
|
+
get_generated_features,
|
|
19
19
|
get_program_spec,
|
|
20
|
-
|
|
20
|
+
get_databank_spec,
|
|
21
21
|
time_elapsed,
|
|
22
22
|
)
|
|
23
23
|
|
|
@@ -27,19 +27,19 @@ def main(program: str) -> None:
|
|
|
27
27
|
|
|
28
28
|
start = time.time()
|
|
29
29
|
|
|
30
|
-
|
|
30
|
+
databank_spec = get_databank_spec()
|
|
31
31
|
program_spec = get_program_spec(program)
|
|
32
|
-
|
|
32
|
+
generated_features = get_generated_features()
|
|
33
33
|
|
|
34
34
|
backup_existing_output(program)
|
|
35
35
|
|
|
36
|
-
for
|
|
36
|
+
for databank_path in databank_spec.paths:
|
|
37
37
|
build_cohort_file(
|
|
38
|
-
|
|
38
|
+
databank_path=databank_path,
|
|
39
39
|
program=program,
|
|
40
|
-
|
|
40
|
+
databank_spec=databank_spec,
|
|
41
41
|
program_spec=program_spec,
|
|
42
|
-
|
|
42
|
+
generated_features=generated_features,
|
|
43
43
|
)
|
|
44
44
|
|
|
45
45
|
end = time.time()
|
dml_dev-0.1.0/project_code/pipeline/estimate.py → dml_dev-0.1.1/dml_code/pipeline/step2_estimate.py
RENAMED
|
@@ -19,14 +19,14 @@ if os.environ.get("NCPUS"):
|
|
|
19
19
|
LOCAL_DIR = Path(__file__).resolve().parents[2]
|
|
20
20
|
sys.path.insert(0, str(LOCAL_DIR))
|
|
21
21
|
|
|
22
|
-
from
|
|
22
|
+
from dml_code.src.estimate_helpers import (
|
|
23
23
|
fit_doubleml_irm,
|
|
24
24
|
get_experiment,
|
|
25
25
|
prepare_estimation_data,
|
|
26
26
|
unpack_runs,
|
|
27
27
|
validate_runs,
|
|
28
28
|
)
|
|
29
|
-
from
|
|
29
|
+
from dml_code.src.utils import log_process_resources, log_results, time_elapsed, trim_memory
|
|
30
30
|
|
|
31
31
|
|
|
32
32
|
def main(experiment_name: str) -> None:
|
|
@@ -39,7 +39,7 @@ def main(experiment_name: str) -> None:
|
|
|
39
39
|
stop_resource_logging = log_process_resources(interval=30)
|
|
40
40
|
try:
|
|
41
41
|
for run_number, run in enumerate(runs, start=1):
|
|
42
|
-
print(f"
|
|
42
|
+
print(f"\nRun {run_number}/{len(runs)}: {run.program_name}, outcome={run.outcome}")
|
|
43
43
|
start = time.time()
|
|
44
44
|
|
|
45
45
|
df, x_cols, summary = prepare_estimation_data(run)
|
|
@@ -56,7 +56,7 @@ def main(experiment_name: str) -> None:
|
|
|
56
56
|
estimation_run_time = time_elapsed(start_estimation, end)
|
|
57
57
|
estimation_run_time_hours = (end - start_estimation) / (60 * 60)
|
|
58
58
|
|
|
59
|
-
print("
|
|
59
|
+
print("Writing estimation and prediction logs...")
|
|
60
60
|
estimation_log = pl.DataFrame({
|
|
61
61
|
"program": [run.program_name],
|
|
62
62
|
"treatment": [run.treatment],
|
|
@@ -78,12 +78,12 @@ def main(experiment_name: str) -> None:
|
|
|
78
78
|
"estimation_run_time": [estimation_run_time],
|
|
79
79
|
"estimation_run_time_hours": [estimation_run_time_hours],
|
|
80
80
|
"timestamp": [datetime.now()],
|
|
81
|
-
"
|
|
82
|
-
"
|
|
83
|
-
"
|
|
84
|
-
"
|
|
85
|
-
"
|
|
86
|
-
"
|
|
81
|
+
"num_controls": [summary["num_controls"]],
|
|
82
|
+
"num_unique_controls": [summary["num_unique_controls"]],
|
|
83
|
+
"num_covariates": [summary["num_covariates"]],
|
|
84
|
+
"num_treated": [summary["num_treated"]],
|
|
85
|
+
"num_null_rows_dropped": [summary["num_null_rows_dropped"]],
|
|
86
|
+
"num_rows": [summary["num_rows"]],
|
|
87
87
|
"run_number": [run_number],
|
|
88
88
|
})
|
|
89
89
|
predictions_log = pl.DataFrame({
|
|
@@ -98,9 +98,10 @@ def main(experiment_name: str) -> None:
|
|
|
98
98
|
log_results("estimation", estimation_log, experiment_name, run_number)
|
|
99
99
|
log_results("predictions", predictions_log, experiment_name, run_number)
|
|
100
100
|
|
|
101
|
-
print(
|
|
102
|
-
|
|
103
|
-
|
|
101
|
+
print(
|
|
102
|
+
f"Run {run_number}/{len(runs)} complete. "
|
|
103
|
+
f"Estimation: {estimation_run_time}; total: {total_run_time}.\n"
|
|
104
|
+
)
|
|
104
105
|
|
|
105
106
|
del dml_obj, df
|
|
106
107
|
trim_memory()
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from collections.abc import Callable, Sequence
|
|
2
2
|
from dataclasses import dataclass, field
|
|
3
|
+
import importlib
|
|
3
4
|
from pathlib import Path
|
|
4
5
|
import shutil
|
|
5
6
|
import sys
|
|
@@ -8,8 +9,13 @@ import time
|
|
|
8
9
|
|
|
9
10
|
import polars as pl
|
|
10
11
|
|
|
11
|
-
from
|
|
12
|
-
|
|
12
|
+
from dml_code.src.paths import (
|
|
13
|
+
CONFIG_DIR,
|
|
14
|
+
CONFIG_PACKAGE,
|
|
15
|
+
processed_data_out_folder,
|
|
16
|
+
processed_data_out_path,
|
|
17
|
+
)
|
|
18
|
+
from dml_code.src.utils import time_elapsed, trim_memory
|
|
13
19
|
|
|
14
20
|
TREATMENT_COL = "treatment"
|
|
15
21
|
OBSERVATION_COL = "observation_year"
|
|
@@ -26,8 +32,8 @@ class RelativeCol:
|
|
|
26
32
|
|
|
27
33
|
|
|
28
34
|
@dataclass(frozen=True, kw_only=True)
|
|
29
|
-
class
|
|
30
|
-
"""
|
|
35
|
+
class DatabankSpec:
|
|
36
|
+
"""Shared input files and columns to carry into every program build."""
|
|
31
37
|
|
|
32
38
|
paths: Sequence[Path]
|
|
33
39
|
passthrough_cols: Sequence[pl.Expr]
|
|
@@ -36,7 +42,7 @@ class BuildSource:
|
|
|
36
42
|
|
|
37
43
|
|
|
38
44
|
@dataclass(frozen=True, kw_only=True)
|
|
39
|
-
class
|
|
45
|
+
class ProgramSpec(DatabankSpec):
|
|
40
46
|
"""Program-specific source data and column mappings."""
|
|
41
47
|
|
|
42
48
|
name: str
|
|
@@ -46,36 +52,36 @@ class ProgramSource(BuildSource):
|
|
|
46
52
|
|
|
47
53
|
@dataclass(frozen=True, init=False)
|
|
48
54
|
class BuildSpec:
|
|
49
|
-
"""Complete build recipe:
|
|
55
|
+
"""Complete build recipe: databank, programs, and generated features."""
|
|
50
56
|
|
|
51
|
-
|
|
52
|
-
programs: dict[str,
|
|
53
|
-
|
|
57
|
+
databank: DatabankSpec
|
|
58
|
+
programs: dict[str, ProgramSpec]
|
|
59
|
+
generated_features: Sequence[Transform]
|
|
54
60
|
|
|
55
61
|
def __init__(
|
|
56
62
|
self,
|
|
57
|
-
|
|
58
|
-
programs: dict[str,
|
|
59
|
-
|
|
63
|
+
databank: DatabankSpec | None = None,
|
|
64
|
+
programs: dict[str, ProgramSpec] | None = None,
|
|
65
|
+
generated_features: Sequence[Transform] = (),
|
|
60
66
|
):
|
|
61
|
-
if
|
|
62
|
-
raise ValueError("BuildSpec requires
|
|
67
|
+
if databank is None:
|
|
68
|
+
raise ValueError("BuildSpec requires databank")
|
|
63
69
|
|
|
64
|
-
object.__setattr__(self, "
|
|
70
|
+
object.__setattr__(self, "databank", databank)
|
|
65
71
|
object.__setattr__(self, "programs", programs or {})
|
|
66
|
-
object.__setattr__(self, "
|
|
72
|
+
object.__setattr__(self, "generated_features", generated_features)
|
|
67
73
|
|
|
68
74
|
|
|
69
75
|
def get_build_spec() -> BuildSpec:
|
|
70
76
|
"""Load the configured build recipe lazily to avoid import cycles."""
|
|
71
77
|
|
|
72
78
|
sys.path.insert(0, str(CONFIG_DIR.parent))
|
|
73
|
-
|
|
79
|
+
build_spec_module = importlib.import_module(f"{CONFIG_PACKAGE}.build_spec")
|
|
74
80
|
|
|
75
|
-
return BUILD_SPEC
|
|
81
|
+
return build_spec_module.BUILD_SPEC
|
|
76
82
|
|
|
77
83
|
|
|
78
|
-
def get_program_spec(program: str) ->
|
|
84
|
+
def get_program_spec(program: str) -> ProgramSpec:
|
|
79
85
|
"""Return the configured source definition for one program."""
|
|
80
86
|
|
|
81
87
|
try:
|
|
@@ -84,16 +90,16 @@ def get_program_spec(program: str) -> ProgramSource:
|
|
|
84
90
|
raise ValueError(f"Unknown program: {program}") from e
|
|
85
91
|
|
|
86
92
|
|
|
87
|
-
def
|
|
88
|
-
"""Return the shared
|
|
93
|
+
def get_databank_spec() -> DatabankSpec:
|
|
94
|
+
"""Return the shared databank input definition."""
|
|
89
95
|
|
|
90
|
-
return get_build_spec().
|
|
96
|
+
return get_build_spec().databank
|
|
91
97
|
|
|
92
98
|
|
|
93
|
-
def
|
|
94
|
-
"""Return transforms applied after
|
|
99
|
+
def get_generated_features() -> Sequence[Transform]:
|
|
100
|
+
"""Return generated feature transforms applied after panel construction."""
|
|
95
101
|
|
|
96
|
-
return get_build_spec().
|
|
102
|
+
return get_build_spec().generated_features
|
|
97
103
|
|
|
98
104
|
|
|
99
105
|
def backup_existing_output(program: str) -> None:
|
|
@@ -110,7 +116,7 @@ def backup_existing_output(program: str) -> None:
|
|
|
110
116
|
|
|
111
117
|
|
|
112
118
|
|
|
113
|
-
def load_program_lf(program_spec:
|
|
119
|
+
def load_program_lf(program_spec: ProgramSpec) -> pl.LazyFrame:
|
|
114
120
|
"""Load treated program records and normalize key build columns."""
|
|
115
121
|
|
|
116
122
|
return (
|
|
@@ -180,11 +186,11 @@ def apply_transforms(
|
|
|
180
186
|
|
|
181
187
|
|
|
182
188
|
def build_cohort_file(
|
|
183
|
-
|
|
189
|
+
databank_path: Path,
|
|
184
190
|
program: str,
|
|
185
|
-
|
|
186
|
-
program_spec:
|
|
187
|
-
|
|
191
|
+
databank_spec: DatabankSpec,
|
|
192
|
+
program_spec: ProgramSpec,
|
|
193
|
+
generated_features: Sequence[Transform],
|
|
188
194
|
) -> None:
|
|
189
195
|
"""Build and write one processed parquet file for one birth cohort.
|
|
190
196
|
|
|
@@ -193,21 +199,21 @@ def build_cohort_file(
|
|
|
193
199
|
"""
|
|
194
200
|
|
|
195
201
|
start = time.time()
|
|
196
|
-
cohort = int(
|
|
202
|
+
cohort = int(databank_path.stem.split("=")[1])
|
|
197
203
|
print(f"\n \n Starting cohort {cohort}")
|
|
198
204
|
|
|
199
205
|
# Temporary cohort window used to avoid scanning out-of-scope source files.
|
|
200
206
|
if cohort < 1940 or cohort > 1995:
|
|
201
207
|
return
|
|
202
208
|
|
|
203
|
-
|
|
204
|
-
|
|
209
|
+
databank_lf = pl.scan_parquet(databank_path).with_columns(
|
|
210
|
+
databank_spec.join_key_col.alias(JOIN_KEY)
|
|
205
211
|
)
|
|
206
212
|
program_lf = load_program_lf(program_spec)
|
|
207
213
|
treated_enrollment_years = get_treated_enrollment_years(program_lf)
|
|
208
214
|
|
|
209
215
|
# Join once at calendar time, then slice into event-time panels below.
|
|
210
|
-
merged_lf =
|
|
216
|
+
merged_lf = databank_lf.join(program_lf, on=JOIN_KEY, how="left")
|
|
211
217
|
merged_lf = merged_lf.with_columns(pl.col(TREATMENT_COL).fill_null(0))
|
|
212
218
|
|
|
213
219
|
available_cols = set(merged_lf.collect_schema().names())
|
|
@@ -216,11 +222,11 @@ def build_cohort_file(
|
|
|
216
222
|
pl.col(OBSERVATION_COL),
|
|
217
223
|
pl.col(TREATMENT_COL),
|
|
218
224
|
*program_spec.passthrough_cols,
|
|
219
|
-
*
|
|
225
|
+
*databank_spec.passthrough_cols,
|
|
220
226
|
]
|
|
221
227
|
passthrough_cols_as_lag = [
|
|
222
228
|
*program_spec.passthrough_cols_as_lag,
|
|
223
|
-
*
|
|
229
|
+
*databank_spec.passthrough_cols_as_lag,
|
|
224
230
|
]
|
|
225
231
|
|
|
226
232
|
missing_cols = set()
|
|
@@ -263,8 +269,8 @@ def build_cohort_file(
|
|
|
263
269
|
[pl.scan_parquet(path) for path in cohort_panel_paths],
|
|
264
270
|
how="vertical_relaxed",
|
|
265
271
|
)
|
|
266
|
-
# Add
|
|
267
|
-
result = apply_transforms(result,
|
|
272
|
+
# Add generated features after all relative columns exist.
|
|
273
|
+
result = apply_transforms(result, generated_features)
|
|
268
274
|
|
|
269
275
|
out_path = processed_data_out_path(program, cohort)
|
|
270
276
|
result.sink_parquet(out_path, engine="streaming")
|
|
@@ -278,12 +284,12 @@ def build_cohort_file(
|
|
|
278
284
|
|
|
279
285
|
|
|
280
286
|
def add_derived_columns(program: str) -> None:
|
|
281
|
-
"""Re-apply
|
|
287
|
+
"""Re-apply generated feature transforms to files that have already been built."""
|
|
282
288
|
|
|
283
289
|
folder = processed_data_out_folder(program)
|
|
284
|
-
|
|
290
|
+
generated_features = get_generated_features()
|
|
285
291
|
|
|
286
292
|
for path in folder.iterdir():
|
|
287
293
|
lf = pl.scan_parquet(path)
|
|
288
|
-
lf = apply_transforms(lf,
|
|
294
|
+
lf = apply_transforms(lf, generated_features)
|
|
289
295
|
lf.sink_parquet(path, engine="streaming")
|