dml-dev 0.1.0__tar.gz → 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. dml_dev-0.1.1/MANIFEST.in +4 -0
  2. dml_dev-0.1.1/PKG-INFO +137 -0
  3. dml_dev-0.1.1/README.md +109 -0
  4. dml_dev-0.1.1/agent.md +78 -0
  5. dml_dev-0.1.1/dml_code/pipeline/__init__.py +1 -0
  6. dml_dev-0.1.0/project_code/pipeline/build.py → dml_dev-0.1.1/dml_code/pipeline/step1_build.py +9 -9
  7. dml_dev-0.1.0/project_code/pipeline/estimate.py → dml_dev-0.1.1/dml_code/pipeline/step2_estimate.py +14 -13
  8. {dml_dev-0.1.0/project_code → dml_dev-0.1.1/dml_code}/src/build_helpers.py +47 -41
  9. {dml_dev-0.1.0/project_code → dml_dev-0.1.1/dml_code}/src/estimate_helpers.py +34 -33
  10. dml_dev-0.1.1/dml_code/src/outputs.py +662 -0
  11. {dml_dev-0.1.0/project_code → dml_dev-0.1.1/dml_code}/src/paths.py +7 -14
  12. {dml_dev-0.1.0/project_code → dml_dev-0.1.1/dml_code}/src/utils.py +1 -17
  13. dml_dev-0.1.1/dml_dev.egg-info/PKG-INFO +137 -0
  14. dml_dev-0.1.1/dml_dev.egg-info/SOURCES.txt +30 -0
  15. dml_dev-0.1.1/dml_dev.egg-info/entry_points.txt +3 -0
  16. {dml_dev-0.1.0 → dml_dev-0.1.1}/dml_dev.egg-info/requires.txt +1 -0
  17. dml_dev-0.1.1/dml_dev.egg-info/top_level.txt +2 -0
  18. dml_dev-0.1.1/project_configuration/__init__.py +5 -0
  19. dml_dev-0.1.1/project_configuration/build_spec.py +43 -0
  20. dml_dev-0.1.1/project_configuration/estimation_experiments/__init__.py +1 -0
  21. dml_dev-0.1.1/project_configuration/estimation_experiments/example_experiment.yaml +16 -0
  22. dml_dev-0.1.1/project_configuration/estimation_experiments/synthetic_example.yaml +16 -0
  23. dml_dev-0.1.1/project_configuration/registries/__init__.py +1 -0
  24. dml_dev-0.1.1/project_configuration/registries/covariate_sets.py +24 -0
  25. dml_dev-0.1.1/project_configuration/registries/filter_sets.py +11 -0
  26. dml_dev-0.1.1/project_configuration/registries/models.py +32 -0
  27. dml_dev-0.1.1/project_configuration/registries/programs.py +21 -0
  28. {dml_dev-0.1.0 → dml_dev-0.1.1}/pyproject.toml +9 -8
  29. dml_dev-0.1.0/MANIFEST.in +0 -4
  30. dml_dev-0.1.0/PKG-INFO +0 -35
  31. dml_dev-0.1.0/README.md +0 -8
  32. dml_dev-0.1.0/agent.md +0 -87
  33. dml_dev-0.1.0/config_template/__init__.py +0 -1
  34. dml_dev-0.1.0/config_template/build_spec.py +0 -15
  35. dml_dev-0.1.0/config_template/experiments/__init__.py +0 -2
  36. dml_dev-0.1.0/config_template/experiments/example_experiment.yaml +0 -7
  37. dml_dev-0.1.0/config_template/registries/__init__.py +0 -1
  38. dml_dev-0.1.0/config_template/registries/covariate_sets.py +0 -3
  39. dml_dev-0.1.0/config_template/registries/filter_sets.py +0 -4
  40. dml_dev-0.1.0/config_template/registries/models.py +0 -6
  41. dml_dev-0.1.0/config_template/registries/programs.py +0 -4
  42. dml_dev-0.1.0/dml_dev.egg-info/PKG-INFO +0 -35
  43. dml_dev-0.1.0/dml_dev.egg-info/SOURCES.txt +0 -29
  44. dml_dev-0.1.0/dml_dev.egg-info/entry_points.txt +0 -3
  45. dml_dev-0.1.0/dml_dev.egg-info/top_level.txt +0 -2
  46. dml_dev-0.1.0/project_code/pipeline/__init__.py +0 -2
  47. dml_dev-0.1.0/project_code/src/plotting.py +0 -253
  48. {dml_dev-0.1.0/project_code → dml_dev-0.1.1/dml_code}/__init__.py +0 -0
  49. {dml_dev-0.1.0/project_code → dml_dev-0.1.1/dml_code}/src/__init__.py +0 -0
  50. {dml_dev-0.1.0 → dml_dev-0.1.1}/dml_dev.egg-info/dependency_links.txt +0 -0
  51. {dml_dev-0.1.0 → dml_dev-0.1.1}/setup.cfg +0 -0
@@ -0,0 +1,4 @@
1
+ recursive-include dml_code *.py
2
+ recursive-include project_configuration *.py *.yaml
3
+ include README.md
4
+ include agent.md
dml_dev-0.1.1/PKG-INFO ADDED
@@ -0,0 +1,137 @@
1
+ Metadata-Version: 2.4
2
+ Name: dml-dev
3
+ Version: 0.1.1
4
+ Summary: DoubleML build, estimation, plotting, and utility pipelines.
5
+ Author: DML Pipeline Contributors
6
+ Keywords: administrative-data,causal-inference,doubleml,observational-data,program-evaluation
7
+ Classifier: Development Status :: 3 - Alpha
8
+ Classifier: Intended Audience :: Science/Research
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Programming Language :: Python :: 3.12
11
+ Classifier: Programming Language :: Python :: 3.13
12
+ Classifier: Topic :: Scientific/Engineering
13
+ Requires-Python: >=3.12
14
+ Description-Content-Type: text/markdown
15
+ Requires-Dist: doubleml
16
+ Requires-Dist: joblib
17
+ Requires-Dist: oi-tools[figures]
18
+ Requires-Dist: plotnine
19
+ Requires-Dist: polars
20
+ Requires-Dist: pyarrow
21
+ Requires-Dist: psutil
22
+ Requires-Dist: PyYAML
23
+ Requires-Dist: scikit-learn
24
+ Requires-Dist: threadpoolctl
25
+ Provides-Extra: dev
26
+ Requires-Dist: build; extra == "dev"
27
+ Requires-Dist: twine; extra == "dev"
28
+
29
+ # DML Pipeline
30
+
31
+ This repo is a small framework for running DoubleML on administrative-style
32
+ program data. It separates project-specific choices from reusable pipeline code:
33
+ you edit `project_configuration/`, then run the pipeline in `dml_code/`.
34
+
35
+ The repo is currently filled with a synthetic example so you can run the whole
36
+ flow before replacing it with real project data.
37
+
38
+ ## Mental Model
39
+
40
+ The workflow has two main steps:
41
+
42
+ 1. **Build an analysis dataset.** Start from a databank and program file,
43
+ join them, construct event-time variables, and write processed panels to
44
+ `data/build_output/`.
45
+ 2. **Estimate DML effects.** Read a YAML experiment, resolve its program,
46
+ covariates, filters, and models from the registries, then write logs to
47
+ `outputs/raw/`.
48
+
49
+ After estimation, scripts in `project_scripts/` can turn the raw logs into plots and tables.
50
+
51
+ ```text
52
+ project_configuration/ + data/build/
53
+ |
54
+ v
55
+ dml_code.pipeline.step1_build
56
+ |
57
+ v
58
+ data/build_output/
59
+ |
60
+ v
61
+ dml_code.pipeline.step2_estimate
62
+ |
63
+ v
64
+ outputs/raw/ -> outputs/plots/ and outputs/tables/
65
+ ```
66
+
67
+ ## Run The Example
68
+
69
+ ```bash
70
+ python project_scripts/generate_example.py
71
+ python -m dml_code.pipeline.step1_build example_program
72
+ python -m dml_code.pipeline.step2_estimate synthetic_example
73
+ python project_scripts/plot_example.py
74
+ ```
75
+
76
+ The first command creates synthetic input data in `data/build/`. Step 1 writes
77
+ processed panels to `data/build_output/`. Step 2 writes estimation and
78
+ prediction logs to `outputs/raw/`. The plotting script writes diagnostics to
79
+ `outputs/plots/` and `outputs/tables/`.
80
+
81
+ ## What You Edit
82
+
83
+ Most project setup happens in `project_configuration/`.
84
+
85
+ - `project_configuration/build_spec.py`: define the databank files, columns to carry through,
86
+ relative-time columns to generate, and any generated features created after
87
+ panel construction.
88
+ - `project_configuration/registries/programs.py`: define each program's source file,
89
+ treatment column, enrollment-year column, and program-specific columns.
90
+ - `project_configuration/registries/covariate_sets.py`: name reusable covariate lists and mark
91
+ categorical covariates for dummy encoding.
92
+ - `project_configuration/registries/filter_sets.py`: name reusable Polars filters for
93
+ estimation samples.
94
+ - `project_configuration/registries/models.py`: name outcome and propensity learners.
95
+ - `project_configuration/estimation_experiments/*.yaml`: choose combinations of programs, outcomes,
96
+ covariates, filters, models, and control sampling rates to estimate.
97
+
98
+ The pipeline code in `dml_code/` is meant to stay reusable.
99
+
100
+ - `dml_code/pipeline/`: runnable steps, `step1_build.py` and
101
+ `step2_estimate.py`.
102
+ - `dml_code/src/`: shared helpers for building, estimating, paths, outputs,
103
+ and logging.
104
+
105
+ `project_scripts/` is for ad hoc project work tied to particular runs:
106
+ generating example data, viewing outputs, making plots, running diagnostics,
107
+ and writing small experiment-specific analyses.
108
+
109
+ ## How To Add A Real Project
110
+
111
+ 1. Put source parquet files somewhere under `data/` or point `project_configuration/` at their
112
+ real locations.
113
+ 2. Update `project_configuration/build_spec.py` with the databank files and feature-generation
114
+ logic.
115
+ 3. Add program definitions in `project_configuration/registries/programs.py`.
116
+ 4. Add covariate sets, filters, and models in the registry files.
117
+ 5. Create or copy a YAML file in `project_configuration/estimation_experiments/`.
118
+ 6. Run step 1 for a program, then step 2 for an experiment.
119
+
120
+ Example:
121
+
122
+ ```bash
123
+ python -m dml_code.pipeline.step1_build my_program
124
+ python -m dml_code.pipeline.step2_estimate my_experiment
125
+ ```
126
+
127
+ Use `project_scripts/` for project-specific follow-up work: viewing outputs
128
+ from particular runs, making plots and tables, running diagnostics, robustness
129
+ checks, and other exploratory analyses.
130
+
131
+ ## Where Results Go
132
+
133
+ - `data/build/`: input data used by the example.
134
+ - `data/build_output/`: processed analysis datasets created by step 1.
135
+ - `outputs/raw/`: machine-readable estimation, prediction, and diagnostic logs.
136
+ - `outputs/plots/`: generated figures.
137
+ - `outputs/tables/`: generated tables.
@@ -0,0 +1,109 @@
1
+ # DML Pipeline
2
+
3
+ This repo is a small framework for running DoubleML on administrative-style
4
+ program data. It separates project-specific choices from reusable pipeline code:
5
+ you edit `project_configuration/`, then run the pipeline in `dml_code/`.
6
+
7
+ The repo is currently filled with a synthetic example so you can run the whole
8
+ flow before replacing it with real project data.
9
+
10
+ ## Mental Model
11
+
12
+ The workflow has two main steps:
13
+
14
+ 1. **Build an analysis dataset.** Start from a databank and program file,
15
+ join them, construct event-time variables, and write processed panels to
16
+ `data/build_output/`.
17
+ 2. **Estimate DML effects.** Read a YAML experiment, resolve its program,
18
+ covariates, filters, and models from the registries, then write logs to
19
+ `outputs/raw/`.
20
+
21
+ After estimation, scripts in `project_scripts/` can turn the raw logs into plots and tables.
22
+
23
+ ```text
24
+ project_configuration/ + data/build/
25
+ |
26
+ v
27
+ dml_code.pipeline.step1_build
28
+ |
29
+ v
30
+ data/build_output/
31
+ |
32
+ v
33
+ dml_code.pipeline.step2_estimate
34
+ |
35
+ v
36
+ outputs/raw/ -> outputs/plots/ and outputs/tables/
37
+ ```
38
+
39
+ ## Run The Example
40
+
41
+ ```bash
42
+ python project_scripts/generate_example.py
43
+ python -m dml_code.pipeline.step1_build example_program
44
+ python -m dml_code.pipeline.step2_estimate synthetic_example
45
+ python project_scripts/plot_example.py
46
+ ```
47
+
48
+ The first command creates synthetic input data in `data/build/`. Step 1 writes
49
+ processed panels to `data/build_output/`. Step 2 writes estimation and
50
+ prediction logs to `outputs/raw/`. The plotting script writes diagnostics to
51
+ `outputs/plots/` and `outputs/tables/`.
52
+
53
+ ## What You Edit
54
+
55
+ Most project setup happens in `project_configuration/`.
56
+
57
+ - `project_configuration/build_spec.py`: define the databank files, columns to carry through,
58
+ relative-time columns to generate, and any generated features created after
59
+ panel construction.
60
+ - `project_configuration/registries/programs.py`: define each program's source file,
61
+ treatment column, enrollment-year column, and program-specific columns.
62
+ - `project_configuration/registries/covariate_sets.py`: name reusable covariate lists and mark
63
+ categorical covariates for dummy encoding.
64
+ - `project_configuration/registries/filter_sets.py`: name reusable Polars filters for
65
+ estimation samples.
66
+ - `project_configuration/registries/models.py`: name outcome and propensity learners.
67
+ - `project_configuration/estimation_experiments/*.yaml`: choose combinations of programs, outcomes,
68
+ covariates, filters, models, and control sampling rates to estimate.
69
+
70
+ The pipeline code in `dml_code/` is meant to stay reusable.
71
+
72
+ - `dml_code/pipeline/`: runnable steps, `step1_build.py` and
73
+ `step2_estimate.py`.
74
+ - `dml_code/src/`: shared helpers for building, estimating, paths, outputs,
75
+ and logging.
76
+
77
+ `project_scripts/` is for ad hoc project work tied to particular runs:
78
+ generating example data, viewing outputs, making plots, running diagnostics,
79
+ and writing small experiment-specific analyses.
80
+
81
+ ## How To Add A Real Project
82
+
83
+ 1. Put source parquet files somewhere under `data/` or point `project_configuration/` at their
84
+ real locations.
85
+ 2. Update `project_configuration/build_spec.py` with the databank files and feature-generation
86
+ logic.
87
+ 3. Add program definitions in `project_configuration/registries/programs.py`.
88
+ 4. Add covariate sets, filters, and models in the registry files.
89
+ 5. Create or copy a YAML file in `project_configuration/estimation_experiments/`.
90
+ 6. Run step 1 for a program, then step 2 for an experiment.
91
+
92
+ Example:
93
+
94
+ ```bash
95
+ python -m dml_code.pipeline.step1_build my_program
96
+ python -m dml_code.pipeline.step2_estimate my_experiment
97
+ ```
98
+
99
+ Use `project_scripts/` for project-specific follow-up work: viewing outputs
100
+ from particular runs, making plots and tables, running diagnostics, robustness
101
+ checks, and other exploratory analyses.
102
+
103
+ ## Where Results Go
104
+
105
+ - `data/build/`: input data used by the example.
106
+ - `data/build_output/`: processed analysis datasets created by step 1.
107
+ - `outputs/raw/`: machine-readable estimation, prediction, and diagnostic logs.
108
+ - `outputs/plots/`: generated figures.
109
+ - `outputs/tables/`: generated tables.
dml_dev-0.1.1/agent.md ADDED
@@ -0,0 +1,78 @@
1
+ # Repository Guide
2
+
3
+ This repository contains a general-purpose Polars build pipeline and DoubleML
4
+ estimation pipeline for administrative observational data. The package exposes
5
+ the full implementation under `dml_code`, including executable pipeline
6
+ entrypoints and shared helper modules.
7
+
8
+ ## Runtime Paths
9
+
10
+ Runtime locations are defined in `dml_code/src/paths.py`. `LOCAL_DIR` is
11
+ resolved from the repository location, and data, project configuration, and
12
+ outputs are defined relative to it.
13
+
14
+ ## Main Pipeline
15
+
16
+ ### Build Processed Panels
17
+
18
+ `dml_code/pipeline/step1_build.py` builds program-specific processed parquet
19
+ files from a configured administrative data source and program registry entry.
20
+
21
+ The script:
22
+
23
+ 1. Loads source panel files from the build spec.
24
+ 2. Loads a selected program source definition.
25
+ 3. Joins program records to the source panel on the configured join key.
26
+ 4. Constructs relative-time panels for treated and eligible comparison records.
27
+ 5. Creates relative-time variables from calendar-time source columns.
28
+ 6. Applies configured post-panel feature transforms.
29
+ 7. Writes processed parquet files for downstream estimation.
30
+
31
+ CLI usage:
32
+
33
+ ```bash
34
+ python dml_code/pipeline/step1_build.py <program_pointer>
35
+ ```
36
+
37
+ ### Estimate Effects
38
+
39
+ `dml_code/pipeline/step2_estimate.py` runs DoubleML estimation from a YAML
40
+ experiment spec.
41
+
42
+ The script:
43
+
44
+ 1. Loads an experiment YAML file.
45
+ 2. Expands registry pointer lists into concrete runs.
46
+ 3. Loads processed parquet files for the selected program.
47
+ 4. Cleans missing values and applies configured filters.
48
+ 5. Selects treatment, outcome, covariates, and join key columns.
49
+ 6. Encodes configured categorical covariates.
50
+ 7. Fits DoubleML IRM with configured outcome and propensity models.
51
+ 8. Logs estimation summaries and row-level predictions.
52
+
53
+ CLI usage:
54
+
55
+ ```bash
56
+ python dml_code/pipeline/step2_estimate.py <experiment_name>
57
+ ```
58
+
59
+ ## Configuration
60
+
61
+ The repository includes a runnable example configuration under `project_configuration`.
62
+
63
+ Expected project configuration shape:
64
+
65
+ - `project_configuration/build_spec.py`: databank spec and generated feature transforms.
66
+ - `project_configuration/estimation_experiments/*.yaml`: experiment specs with registry pointer lists.
67
+ - `project_configuration/registries/programs.py`: program spec registry.
68
+ - `project_configuration/registries/covariate_sets.py`: covariate set registry.
69
+ - `project_configuration/registries/filter_sets.py`: filter expression registry.
70
+ - `project_configuration/registries/models.py`: outcome and propensity model registries.
71
+
72
+ ## Shared Modules
73
+
74
+ - `dml_code/src/paths.py`: path configuration and output path helpers.
75
+ - `dml_code/src/build_helpers.py`: build dataclasses and panel construction helpers.
76
+ - `dml_code/src/estimate_helpers.py`: experiment loading, run expansion, validation, and DoubleML fitting.
77
+ - `dml_code/src/outputs.py`: diagnostic plots and tables from estimation and prediction logs.
78
+ - `dml_code/src/utils.py`: logging, table export, timing, and resource helpers.
@@ -0,0 +1 @@
1
+ """Executable DML pipeline steps."""
@@ -12,12 +12,12 @@ from pathlib import Path
12
12
  LOCAL_DIR = Path(__file__).resolve().parents[2]
13
13
  sys.path.insert(0, str(LOCAL_DIR))
14
14
 
15
- from project_code.src.build_helpers import (
15
+ from dml_code.src.build_helpers import (
16
16
  backup_existing_output,
17
17
  build_cohort_file,
18
- get_post_panel_transforms,
18
+ get_generated_features,
19
19
  get_program_spec,
20
- get_source_data_spec,
20
+ get_databank_spec,
21
21
  time_elapsed,
22
22
  )
23
23
 
@@ -27,19 +27,19 @@ def main(program: str) -> None:
27
27
 
28
28
  start = time.time()
29
29
 
30
- source_data_spec = get_source_data_spec()
30
+ databank_spec = get_databank_spec()
31
31
  program_spec = get_program_spec(program)
32
- post_panel_transforms = get_post_panel_transforms()
32
+ generated_features = get_generated_features()
33
33
 
34
34
  backup_existing_output(program)
35
35
 
36
- for source_data_path in source_data_spec.paths:
36
+ for databank_path in databank_spec.paths:
37
37
  build_cohort_file(
38
- source_data_path=source_data_path,
38
+ databank_path=databank_path,
39
39
  program=program,
40
- source_data_spec=source_data_spec,
40
+ databank_spec=databank_spec,
41
41
  program_spec=program_spec,
42
- post_panel_transforms=post_panel_transforms,
42
+ generated_features=generated_features,
43
43
  )
44
44
 
45
45
  end = time.time()
@@ -19,14 +19,14 @@ if os.environ.get("NCPUS"):
19
19
  LOCAL_DIR = Path(__file__).resolve().parents[2]
20
20
  sys.path.insert(0, str(LOCAL_DIR))
21
21
 
22
- from project_code.src.estimate_helpers import (
22
+ from dml_code.src.estimate_helpers import (
23
23
  fit_doubleml_irm,
24
24
  get_experiment,
25
25
  prepare_estimation_data,
26
26
  unpack_runs,
27
27
  validate_runs,
28
28
  )
29
- from project_code.src.utils import log_process_resources, log_results, time_elapsed, trim_memory
29
+ from dml_code.src.utils import log_process_resources, log_results, time_elapsed, trim_memory
30
30
 
31
31
 
32
32
  def main(experiment_name: str) -> None:
@@ -39,7 +39,7 @@ def main(experiment_name: str) -> None:
39
39
  stop_resource_logging = log_process_resources(interval=30)
40
40
  try:
41
41
  for run_number, run in enumerate(runs, start=1):
42
- print(f"Starting run #{run_number} of {len(runs)} \n")
42
+ print(f"\nRun {run_number}/{len(runs)}: {run.program_name}, outcome={run.outcome}")
43
43
  start = time.time()
44
44
 
45
45
  df, x_cols, summary = prepare_estimation_data(run)
@@ -56,7 +56,7 @@ def main(experiment_name: str) -> None:
56
56
  estimation_run_time = time_elapsed(start_estimation, end)
57
57
  estimation_run_time_hours = (end - start_estimation) / (60 * 60)
58
58
 
59
- print("\n Starting logging...\n")
59
+ print("Writing estimation and prediction logs...")
60
60
  estimation_log = pl.DataFrame({
61
61
  "program": [run.program_name],
62
62
  "treatment": [run.treatment],
@@ -78,12 +78,12 @@ def main(experiment_name: str) -> None:
78
78
  "estimation_run_time": [estimation_run_time],
79
79
  "estimation_run_time_hours": [estimation_run_time_hours],
80
80
  "timestamp": [datetime.now()],
81
- "n_controls": [summary["n_controls"]],
82
- "n_unique_controls": [summary["n_unique_controls"]],
83
- "n_covariates": [summary["n_covariates"]],
84
- "n_treated": [summary["n_treated"]],
85
- "n_null_rows_dropped": [summary["n_null_rows_dropped"]],
86
- "n_rows": [summary["n_rows"]],
81
+ "num_controls": [summary["num_controls"]],
82
+ "num_unique_controls": [summary["num_unique_controls"]],
83
+ "num_covariates": [summary["num_covariates"]],
84
+ "num_treated": [summary["num_treated"]],
85
+ "num_null_rows_dropped": [summary["num_null_rows_dropped"]],
86
+ "num_rows": [summary["num_rows"]],
87
87
  "run_number": [run_number],
88
88
  })
89
89
  predictions_log = pl.DataFrame({
@@ -98,9 +98,10 @@ def main(experiment_name: str) -> None:
98
98
  log_results("estimation", estimation_log, experiment_name, run_number)
99
99
  log_results("predictions", predictions_log, experiment_name, run_number)
100
100
 
101
- print(f"""\n Run #{run_number} complete
102
- \n Estimation run time: {estimation_run_time}
103
- \n Total run time: {total_run_time}\n \n""")
101
+ print(
102
+ f"Run {run_number}/{len(runs)} complete. "
103
+ f"Estimation: {estimation_run_time}; total: {total_run_time}.\n"
104
+ )
104
105
 
105
106
  del dml_obj, df
106
107
  trim_memory()
@@ -1,5 +1,6 @@
1
1
  from collections.abc import Callable, Sequence
2
2
  from dataclasses import dataclass, field
3
+ import importlib
3
4
  from pathlib import Path
4
5
  import shutil
5
6
  import sys
@@ -8,8 +9,13 @@ import time
8
9
 
9
10
  import polars as pl
10
11
 
11
- from project_code.src.paths import CONFIG_DIR, processed_data_out_folder, processed_data_out_path
12
- from project_code.src.utils import time_elapsed, trim_memory
12
+ from dml_code.src.paths import (
13
+ CONFIG_DIR,
14
+ CONFIG_PACKAGE,
15
+ processed_data_out_folder,
16
+ processed_data_out_path,
17
+ )
18
+ from dml_code.src.utils import time_elapsed, trim_memory
13
19
 
14
20
  TREATMENT_COL = "treatment"
15
21
  OBSERVATION_COL = "observation_year"
@@ -26,8 +32,8 @@ class RelativeCol:
26
32
 
27
33
 
28
34
  @dataclass(frozen=True, kw_only=True)
29
- class BuildSource:
30
- """Input files and columns to carry from one side of the build join."""
35
+ class DatabankSpec:
36
+ """Shared input files and columns to carry into every program build."""
31
37
 
32
38
  paths: Sequence[Path]
33
39
  passthrough_cols: Sequence[pl.Expr]
@@ -36,7 +42,7 @@ class BuildSource:
36
42
 
37
43
 
38
44
  @dataclass(frozen=True, kw_only=True)
39
- class ProgramSource(BuildSource):
45
+ class ProgramSpec(DatabankSpec):
40
46
  """Program-specific source data and column mappings."""
41
47
 
42
48
  name: str
@@ -46,36 +52,36 @@ class ProgramSource(BuildSource):
46
52
 
47
53
  @dataclass(frozen=True, init=False)
48
54
  class BuildSpec:
49
- """Complete build recipe: source data, programs, and post-panel transforms."""
55
+ """Complete build recipe: databank, programs, and generated features."""
50
56
 
51
- source_data: BuildSource
52
- programs: dict[str, ProgramSource]
53
- post_panel_transforms: Sequence[Transform]
57
+ databank: DatabankSpec
58
+ programs: dict[str, ProgramSpec]
59
+ generated_features: Sequence[Transform]
54
60
 
55
61
  def __init__(
56
62
  self,
57
- source_data: BuildSource | None = None,
58
- programs: dict[str, ProgramSource] | None = None,
59
- post_panel_transforms: Sequence[Transform] = (),
63
+ databank: DatabankSpec | None = None,
64
+ programs: dict[str, ProgramSpec] | None = None,
65
+ generated_features: Sequence[Transform] = (),
60
66
  ):
61
- if source_data is None:
62
- raise ValueError("BuildSpec requires source_data")
67
+ if databank is None:
68
+ raise ValueError("BuildSpec requires databank")
63
69
 
64
- object.__setattr__(self, "source_data", source_data)
70
+ object.__setattr__(self, "databank", databank)
65
71
  object.__setattr__(self, "programs", programs or {})
66
- object.__setattr__(self, "post_panel_transforms", post_panel_transforms)
72
+ object.__setattr__(self, "generated_features", generated_features)
67
73
 
68
74
 
69
75
  def get_build_spec() -> BuildSpec:
70
76
  """Load the configured build recipe lazily to avoid import cycles."""
71
77
 
72
78
  sys.path.insert(0, str(CONFIG_DIR.parent))
73
- from config.build_spec import BUILD_SPEC
79
+ build_spec_module = importlib.import_module(f"{CONFIG_PACKAGE}.build_spec")
74
80
 
75
- return BUILD_SPEC
81
+ return build_spec_module.BUILD_SPEC
76
82
 
77
83
 
78
- def get_program_spec(program: str) -> ProgramSource:
84
+ def get_program_spec(program: str) -> ProgramSpec:
79
85
  """Return the configured source definition for one program."""
80
86
 
81
87
  try:
@@ -84,16 +90,16 @@ def get_program_spec(program: str) -> ProgramSource:
84
90
  raise ValueError(f"Unknown program: {program}") from e
85
91
 
86
92
 
87
- def get_source_data_spec() -> BuildSource:
88
- """Return the shared source data input definition."""
93
+ def get_databank_spec() -> DatabankSpec:
94
+ """Return the shared databank input definition."""
89
95
 
90
- return get_build_spec().source_data
96
+ return get_build_spec().databank
91
97
 
92
98
 
93
- def get_post_panel_transforms() -> Sequence[Transform]:
94
- """Return transforms applied after event-time panel construction."""
99
+ def get_generated_features() -> Sequence[Transform]:
100
+ """Return generated feature transforms applied after panel construction."""
95
101
 
96
- return get_build_spec().post_panel_transforms
102
+ return get_build_spec().generated_features
97
103
 
98
104
 
99
105
  def backup_existing_output(program: str) -> None:
@@ -110,7 +116,7 @@ def backup_existing_output(program: str) -> None:
110
116
 
111
117
 
112
118
 
113
- def load_program_lf(program_spec: ProgramSource) -> pl.LazyFrame:
119
+ def load_program_lf(program_spec: ProgramSpec) -> pl.LazyFrame:
114
120
  """Load treated program records and normalize key build columns."""
115
121
 
116
122
  return (
@@ -180,11 +186,11 @@ def apply_transforms(
180
186
 
181
187
 
182
188
  def build_cohort_file(
183
- source_data_path: Path,
189
+ databank_path: Path,
184
190
  program: str,
185
- source_data_spec: BuildSource,
186
- program_spec: ProgramSource,
187
- post_panel_transforms: Sequence[Transform],
191
+ databank_spec: DatabankSpec,
192
+ program_spec: ProgramSpec,
193
+ generated_features: Sequence[Transform],
188
194
  ) -> None:
189
195
  """Build and write one processed parquet file for one birth cohort.
190
196
 
@@ -193,21 +199,21 @@ def build_cohort_file(
193
199
  """
194
200
 
195
201
  start = time.time()
196
- cohort = int(source_data_path.stem.split("=")[1])
202
+ cohort = int(databank_path.stem.split("=")[1])
197
203
  print(f"\n \n Starting cohort {cohort}")
198
204
 
199
205
  # Temporary cohort window used to avoid scanning out-of-scope source files.
200
206
  if cohort < 1940 or cohort > 1995:
201
207
  return
202
208
 
203
- source_data_lf = pl.scan_parquet(source_data_path).with_columns(
204
- source_data_spec.join_key_col.alias(JOIN_KEY)
209
+ databank_lf = pl.scan_parquet(databank_path).with_columns(
210
+ databank_spec.join_key_col.alias(JOIN_KEY)
205
211
  )
206
212
  program_lf = load_program_lf(program_spec)
207
213
  treated_enrollment_years = get_treated_enrollment_years(program_lf)
208
214
 
209
215
  # Join once at calendar time, then slice into event-time panels below.
210
- merged_lf = source_data_lf.join(program_lf, on=JOIN_KEY, how="left")
216
+ merged_lf = databank_lf.join(program_lf, on=JOIN_KEY, how="left")
211
217
  merged_lf = merged_lf.with_columns(pl.col(TREATMENT_COL).fill_null(0))
212
218
 
213
219
  available_cols = set(merged_lf.collect_schema().names())
@@ -216,11 +222,11 @@ def build_cohort_file(
216
222
  pl.col(OBSERVATION_COL),
217
223
  pl.col(TREATMENT_COL),
218
224
  *program_spec.passthrough_cols,
219
- *source_data_spec.passthrough_cols,
225
+ *databank_spec.passthrough_cols,
220
226
  ]
221
227
  passthrough_cols_as_lag = [
222
228
  *program_spec.passthrough_cols_as_lag,
223
- *source_data_spec.passthrough_cols_as_lag,
229
+ *databank_spec.passthrough_cols_as_lag,
224
230
  ]
225
231
 
226
232
  missing_cols = set()
@@ -263,8 +269,8 @@ def build_cohort_file(
263
269
  [pl.scan_parquet(path) for path in cohort_panel_paths],
264
270
  how="vertical_relaxed",
265
271
  )
266
- # Add common post-panel features after all relative columns exist.
267
- result = apply_transforms(result, post_panel_transforms)
272
+ # Add generated features after all relative columns exist.
273
+ result = apply_transforms(result, generated_features)
268
274
 
269
275
  out_path = processed_data_out_path(program, cohort)
270
276
  result.sink_parquet(out_path, engine="streaming")
@@ -278,12 +284,12 @@ def build_cohort_file(
278
284
 
279
285
 
280
286
  def add_derived_columns(program: str) -> None:
281
- """Re-apply post-panel transforms to files that have already been built."""
287
+ """Re-apply generated feature transforms to files that have already been built."""
282
288
 
283
289
  folder = processed_data_out_folder(program)
284
- post_panel_transforms = get_post_panel_transforms()
290
+ generated_features = get_generated_features()
285
291
 
286
292
  for path in folder.iterdir():
287
293
  lf = pl.scan_parquet(path)
288
- lf = apply_transforms(lf, post_panel_transforms)
294
+ lf = apply_transforms(lf, generated_features)
289
295
  lf.sink_parquet(path, engine="streaming")