dml-dev 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. dml_dev-0.1.0/MANIFEST.in +4 -0
  2. dml_dev-0.1.0/PKG-INFO +35 -0
  3. dml_dev-0.1.0/README.md +8 -0
  4. dml_dev-0.1.0/agent.md +87 -0
  5. dml_dev-0.1.0/config_template/__init__.py +1 -0
  6. dml_dev-0.1.0/config_template/build_spec.py +15 -0
  7. dml_dev-0.1.0/config_template/experiments/__init__.py +2 -0
  8. dml_dev-0.1.0/config_template/experiments/example_experiment.yaml +7 -0
  9. dml_dev-0.1.0/config_template/registries/__init__.py +1 -0
  10. dml_dev-0.1.0/config_template/registries/covariate_sets.py +3 -0
  11. dml_dev-0.1.0/config_template/registries/filter_sets.py +4 -0
  12. dml_dev-0.1.0/config_template/registries/models.py +6 -0
  13. dml_dev-0.1.0/config_template/registries/programs.py +4 -0
  14. dml_dev-0.1.0/dml_dev.egg-info/PKG-INFO +35 -0
  15. dml_dev-0.1.0/dml_dev.egg-info/SOURCES.txt +29 -0
  16. dml_dev-0.1.0/dml_dev.egg-info/dependency_links.txt +1 -0
  17. dml_dev-0.1.0/dml_dev.egg-info/entry_points.txt +3 -0
  18. dml_dev-0.1.0/dml_dev.egg-info/requires.txt +13 -0
  19. dml_dev-0.1.0/dml_dev.egg-info/top_level.txt +2 -0
  20. dml_dev-0.1.0/project_code/__init__.py +2 -0
  21. dml_dev-0.1.0/project_code/pipeline/__init__.py +2 -0
  22. dml_dev-0.1.0/project_code/pipeline/build.py +61 -0
  23. dml_dev-0.1.0/project_code/pipeline/estimate.py +122 -0
  24. dml_dev-0.1.0/project_code/src/__init__.py +2 -0
  25. dml_dev-0.1.0/project_code/src/build_helpers.py +289 -0
  26. dml_dev-0.1.0/project_code/src/estimate_helpers.py +263 -0
  27. dml_dev-0.1.0/project_code/src/paths.py +35 -0
  28. dml_dev-0.1.0/project_code/src/plotting.py +253 -0
  29. dml_dev-0.1.0/project_code/src/utils.py +95 -0
  30. dml_dev-0.1.0/pyproject.toml +64 -0
  31. dml_dev-0.1.0/setup.cfg +4 -0
@@ -0,0 +1,4 @@
1
+ recursive-include project_code *.py
2
+ recursive-include config_template *.py *.yaml
3
+ include README.md
4
+ include agent.md
dml_dev-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,35 @@
1
+ Metadata-Version: 2.4
2
+ Name: dml-dev
3
+ Version: 0.1.0
4
+ Summary: DoubleML build, estimation, plotting, and utility pipelines.
5
+ Author: DML Pipeline Contributors
6
+ Keywords: administrative-data,causal-inference,doubleml,observational-data,program-evaluation
7
+ Classifier: Development Status :: 3 - Alpha
8
+ Classifier: Intended Audience :: Science/Research
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Programming Language :: Python :: 3.12
11
+ Classifier: Programming Language :: Python :: 3.13
12
+ Classifier: Topic :: Scientific/Engineering
13
+ Requires-Python: >=3.12
14
+ Description-Content-Type: text/markdown
15
+ Requires-Dist: doubleml
16
+ Requires-Dist: joblib
17
+ Requires-Dist: oi-tools[figures]
18
+ Requires-Dist: plotnine
19
+ Requires-Dist: polars
20
+ Requires-Dist: psutil
21
+ Requires-Dist: PyYAML
22
+ Requires-Dist: scikit-learn
23
+ Requires-Dist: threadpoolctl
24
+ Provides-Extra: dev
25
+ Requires-Dist: build; extra == "dev"
26
+ Requires-Dist: twine; extra == "dev"
27
+
28
+ # DML Pipeline
29
+
30
+ Reusable build, estimation, plotting, and utility code for applying DoubleML to
31
+ administrative observational data for program analysis.
32
+
33
+ The package includes the full implementation under `project_code`, plus an
34
+ empty `config_template` package that shows the required config shape without
35
+ shipping project configuration.
@@ -0,0 +1,8 @@
1
+ # DML Pipeline
2
+
3
+ Reusable build, estimation, plotting, and utility code for applying DoubleML to
4
+ administrative observational data for program analysis.
5
+
6
+ The package includes the full implementation under `project_code`, plus an
7
+ empty `config_template` package that shows the required config shape without
8
+ shipping project configuration.
dml_dev-0.1.0/agent.md ADDED
@@ -0,0 +1,87 @@
1
+ # Repository Guide
2
+
3
+ This repository contains a general-purpose Polars build pipeline and DoubleML
4
+ estimation pipeline for administrative observational data. The package exposes
5
+ the full implementation under `project_code`, including executable pipeline
6
+ entrypoints and shared helper modules.
7
+
8
+ ## Runtime Environment
9
+
10
+ Runtime locations are configured through environment variables in
11
+ `project_code/src/paths.py`.
12
+
13
+ - `DML_PIPELINE_ROOT_DIR`: project root used as the default base directory.
14
+ - `DML_PIPELINE_DATA_DIR`: source data directory.
15
+ - `DML_PIPELINE_LOCAL_DIR`: local working directory for generated data and logs.
16
+ - `DML_PIPELINE_CONFIG_DIR`: optional config directory location.
17
+
18
+ When an environment variable is not set, paths default to folders under the
19
+ current working directory.
20
+
21
+ ## Main Pipeline
22
+
23
+ ### Build Processed Panels
24
+
25
+ `project_code/pipeline/build.py` builds program-specific processed parquet
26
+ files from a configured administrative data source and program registry entry.
27
+
28
+ The script:
29
+
30
+ 1. Loads source panel files from the build spec.
31
+ 2. Loads a selected program source definition.
32
+ 3. Joins program records to the source panel on the configured join key.
33
+ 4. Constructs relative-time panels for treated and eligible comparison records.
34
+ 5. Creates relative-time variables from calendar-time source columns.
35
+ 6. Applies configured post-panel feature transforms.
36
+ 7. Writes processed parquet files for downstream estimation.
37
+
38
+ CLI usage:
39
+
40
+ ```bash
41
+ python project_code/pipeline/build.py <program_pointer>
42
+ ```
43
+
44
+ ### Estimate Effects
45
+
46
+ `project_code/pipeline/estimate.py` runs DoubleML estimation from a YAML
47
+ experiment spec.
48
+
49
+ The script:
50
+
51
+ 1. Loads an experiment YAML file.
52
+ 2. Expands registry pointer lists into concrete runs.
53
+ 3. Loads processed parquet files for the selected program.
54
+ 4. Cleans missing values and applies configured filters.
55
+ 5. Selects treatment, outcome, covariates, and join key columns.
56
+ 6. Encodes configured categorical covariates.
57
+ 7. Fits DoubleML IRM with configured outcome and propensity models.
58
+ 8. Logs estimation summaries and row-level predictions.
59
+
60
+ CLI usage:
61
+
62
+ ```bash
63
+ python project_code/pipeline/estimate.py <experiment_name>
64
+ ```
65
+
66
+ ## Configuration
67
+
68
+ Real project configuration is intentionally not packaged. The package includes
69
+ `config_template`, which shows the expected files and data types without filling
70
+ in project-specific values.
71
+
72
+ Expected config shape:
73
+
74
+ - `config/build_spec.py`: shared source data and post-panel transforms.
75
+ - `config/experiments/*.yaml`: experiment specs with registry pointer lists.
76
+ - `config/registries/programs.py`: program source registry.
77
+ - `config/registries/covariate_sets.py`: covariate set registry.
78
+ - `config/registries/filter_sets.py`: filter expression registry.
79
+ - `config/registries/models.py`: outcome and propensity model registries.
80
+
81
+ ## Shared Modules
82
+
83
+ - `project_code/src/paths.py`: path configuration and output path helpers.
84
+ - `project_code/src/build_helpers.py`: build dataclasses and panel construction helpers.
85
+ - `project_code/src/estimate_helpers.py`: experiment loading, run expansion, validation, and DoubleML fitting.
86
+ - `project_code/src/plotting.py`: diagnostic plots from estimation and prediction logs.
87
+ - `project_code/src/utils.py`: logging, table export, timing, and resource helpers.
@@ -0,0 +1 @@
1
+
@@ -0,0 +1,15 @@
1
"""Template build spec: copy into a real config and fill in project values."""

from project_code.src.build_helpers import BuildSource, BuildSpec


# Placeholder source-data definition. Real configs list the source panel
# paths plus the columns to pass through (optionally as lagged variables).
source_data = BuildSource(
    paths=[],
    passthrough_cols=[],
    passthrough_cols_as_lag=[],
)


# Top-level spec consumed by project_code/pipeline/build.py.
BUILD_SPEC = BuildSpec(
    source_data=source_data,
    programs={},  # program name -> program source definition
    post_panel_transforms=[],  # transforms applied after panel construction
)
@@ -0,0 +1,2 @@
1
+ """Example experiment YAML files."""
2
+
@@ -0,0 +1,7 @@
1
# Template experiment spec. Each key holds a list of registry pointers;
# project_code/pipeline/estimate.py expands these lists into concrete runs.
program_pointer: []
outcomes: []
covariate_set_pointer: []
filter_set_pointer: []
outcomes_model_pointer: []
propensity_model_pointer: []
num_controls_per_treat: []
@@ -0,0 +1,3 @@
1
# Registry mapping covariate-set name -> list of covariate column names.
COVARIATE_SET_REGISTRY: dict[str, list[str]] = {}

# Covariate columns to encode as categorical during estimation.
CATEGORICAL_COVARIATES: list[str] = []
@@ -0,0 +1,4 @@
1
import polars as pl


# Registry mapping filter-set name -> list of Polars filter expressions
# applied to the data before estimation.
FILTER_SET_REGISTRY: dict[str, list[pl.Expr]] = {}
@@ -0,0 +1,6 @@
1
from typing import Any


# Registry mapping model name -> outcome-model estimator. Estimators are
# expected to expose get_params() (scikit-learn style) — see estimate.py.
OUTCOMES_MODEL_REGISTRY: dict[str, Any] = {}

# Registry mapping model name -> propensity-model estimator (same contract).
PROPENSITY_MODEL_REGISTRY: dict[str, Any] = {}
@@ -0,0 +1,4 @@
1
from project_code.src.build_helpers import ProgramSource


# Registry mapping program name -> ProgramSource definition used by
# project_code/pipeline/build.py.
PROGRAM_REGISTRY: dict[str, ProgramSource] = {}
@@ -0,0 +1,35 @@
1
+ Metadata-Version: 2.4
2
+ Name: dml-dev
3
+ Version: 0.1.0
4
+ Summary: DoubleML build, estimation, plotting, and utility pipelines.
5
+ Author: DML Pipeline Contributors
6
+ Keywords: administrative-data,causal-inference,doubleml,observational-data,program-evaluation
7
+ Classifier: Development Status :: 3 - Alpha
8
+ Classifier: Intended Audience :: Science/Research
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Programming Language :: Python :: 3.12
11
+ Classifier: Programming Language :: Python :: 3.13
12
+ Classifier: Topic :: Scientific/Engineering
13
+ Requires-Python: >=3.12
14
+ Description-Content-Type: text/markdown
15
+ Requires-Dist: doubleml
16
+ Requires-Dist: joblib
17
+ Requires-Dist: oi-tools[figures]
18
+ Requires-Dist: plotnine
19
+ Requires-Dist: polars
20
+ Requires-Dist: psutil
21
+ Requires-Dist: PyYAML
22
+ Requires-Dist: scikit-learn
23
+ Requires-Dist: threadpoolctl
24
+ Provides-Extra: dev
25
+ Requires-Dist: build; extra == "dev"
26
+ Requires-Dist: twine; extra == "dev"
27
+
28
+ # DML Pipeline
29
+
30
+ Reusable build, estimation, plotting, and utility code for applying DoubleML to
31
+ administrative observational data for program analysis.
32
+
33
+ The package includes the full implementation under `project_code`, plus an
34
+ empty `config_template` package that shows the required config shape without
35
+ shipping project configuration.
@@ -0,0 +1,29 @@
1
+ MANIFEST.in
2
+ README.md
3
+ agent.md
4
+ pyproject.toml
5
+ config_template/__init__.py
6
+ config_template/build_spec.py
7
+ config_template/experiments/__init__.py
8
+ config_template/experiments/example_experiment.yaml
9
+ config_template/registries/__init__.py
10
+ config_template/registries/covariate_sets.py
11
+ config_template/registries/filter_sets.py
12
+ config_template/registries/models.py
13
+ config_template/registries/programs.py
14
+ dml_dev.egg-info/PKG-INFO
15
+ dml_dev.egg-info/SOURCES.txt
16
+ dml_dev.egg-info/dependency_links.txt
17
+ dml_dev.egg-info/entry_points.txt
18
+ dml_dev.egg-info/requires.txt
19
+ dml_dev.egg-info/top_level.txt
20
+ project_code/__init__.py
21
+ project_code/pipeline/__init__.py
22
+ project_code/pipeline/build.py
23
+ project_code/pipeline/estimate.py
24
+ project_code/src/__init__.py
25
+ project_code/src/build_helpers.py
26
+ project_code/src/estimate_helpers.py
27
+ project_code/src/paths.py
28
+ project_code/src/plotting.py
29
+ project_code/src/utils.py
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ dml-build = project_code.pipeline.build:cli
3
+ dml-estimate = project_code.pipeline.estimate:cli
@@ -0,0 +1,13 @@
1
+ doubleml
2
+ joblib
3
+ oi-tools[figures]
4
+ plotnine
5
+ polars
6
+ psutil
7
+ PyYAML
8
+ scikit-learn
9
+ threadpoolctl
10
+
11
+ [dev]
12
+ build
13
+ twine
@@ -0,0 +1,2 @@
1
+ config_template
2
+ project_code
@@ -0,0 +1,2 @@
1
+ """DML pipeline package."""
2
+
@@ -0,0 +1,2 @@
1
+ """Executable build and estimation pipelines."""
2
+
@@ -0,0 +1,61 @@
1
+ """Build processed cohort panels for a configured program.
2
+
3
+ This entrypoint loads the build spec, backs up any existing program output,
4
+ then writes one processed parquet file per cohort.
5
+ """
6
+
7
+ import argparse
8
+ import sys
9
+ import time
10
+ from pathlib import Path
11
+
12
# Make the repository root importable when this file is run as a script
# (two levels up from project_code/pipeline/).
LOCAL_DIR = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(LOCAL_DIR))
14
+
15
+ from project_code.src.build_helpers import (
16
+ backup_existing_output,
17
+ build_cohort_file,
18
+ get_post_panel_transforms,
19
+ get_program_spec,
20
+ get_source_data_spec,
21
+ time_elapsed,
22
+ )
23
+
24
+
25
def main(program: str) -> None:
    """Build processed cohort files for a single registered program.

    Loads the shared build spec and the program's registry entry, backs up
    any existing output for the program, then writes one processed parquet
    file per configured source data path.
    """

    started_at = time.time()

    # Resolve all configuration up front so a bad program name fails early.
    spec = get_source_data_spec()
    selected_program = get_program_spec(program)
    transforms = get_post_panel_transforms()

    backup_existing_output(program)

    # One cohort file per configured source path.
    for path in spec.paths:
        build_cohort_file(
            source_data_path=path,
            program=program,
            source_data_spec=spec,
            program_spec=selected_program,
            post_panel_transforms=transforms,
        )

    total_run_time = time_elapsed(started_at, time.time())
    print("\n Done")
    print(f"\n Total time: {total_run_time}")
49
+
50
+
51
def cli() -> None:
    """Parse the program name from argv and run the build pipeline."""

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("program")
    parsed = arg_parser.parse_args()
    main(program=parsed.program)


if __name__ == "__main__":
    cli()
@@ -0,0 +1,122 @@
1
+ """Run DoubleML estimation for a YAML experiment.
2
+
3
+ This entrypoint loads an experiment, expands its registry pointers into runs,
4
+ fits each run, and writes estimation and prediction logs.
5
+ """
6
+
7
import argparse
import os
import sys
import time
from datetime import datetime
from pathlib import Path

# Cap Polars' thread pool when the scheduler provides a CPU count (NCPUS).
# POLARS_MAX_THREADS is read when polars is first imported and its thread
# pool is created, so it must be set BEFORE the import below — previously
# this assignment came after `import polars` and had no effect.
if os.environ.get("NCPUS"):
    os.environ["POLARS_MAX_THREADS"] = os.environ["NCPUS"]

import polars as pl

# Make the repository root importable when this file is run as a script
# (two levels up from project_code/pipeline/).
LOCAL_DIR = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(LOCAL_DIR))
21
+
22
+ from project_code.src.estimate_helpers import (
23
+ fit_doubleml_irm,
24
+ get_experiment,
25
+ prepare_estimation_data,
26
+ unpack_runs,
27
+ validate_runs,
28
+ )
29
+ from project_code.src.utils import log_process_resources, log_results, time_elapsed, trim_memory
30
+
31
+
32
def _build_estimation_log(
    run,
    dml_obj,
    summary,
    run_number: int,
    total_run_time,
    estimation_run_time,
    estimation_run_time_hours: float,
) -> pl.DataFrame:
    """Build the one-row estimation log for a completed run.

    Captures the run configuration and model identifiers, the DoubleML
    point estimate, standard error, and nuisance losses, the data summary
    counts produced by prepare_estimation_data, and wall-clock timings.
    """
    return pl.DataFrame({
        "program": [run.program_name],
        "treatment": [run.treatment],
        "outcome": [run.outcome],
        "covariate_set_name": [run.covariate_set_pointer],
        "filter_set_name": [run.filter_set_pointer],
        "num_controls_per_treat": [run.num_controls_per_treat],
        "outcomes_model_name": [run.outcomes_model_pointer],
        "propensity_model_name": [run.propensity_model_pointer],
        "outcomes_model_class": [type(run.outcomes_model).__name__],
        "propensity_model_class": [type(run.propensity_model).__name__],
        "outcomes_model_params": [str(run.outcomes_model.get_params())],
        "propensity_model_params": [str(run.propensity_model.get_params())],
        "dml_estimate": [float(dml_obj.coef[0])],
        "dml_se": [float(dml_obj.se[0])],
        "dml_outcomes_loss": [float(dml_obj.nuisance_loss["ml_g0"][0][0])],
        "dml_prop_loss": [float(dml_obj.nuisance_loss["ml_m"][0][0])],
        "total_run_time": [total_run_time],
        "estimation_run_time": [estimation_run_time],
        "estimation_run_time_hours": [estimation_run_time_hours],
        "timestamp": [datetime.now()],
        "n_controls": [summary["n_controls"]],
        "n_unique_controls": [summary["n_unique_controls"]],
        "n_covariates": [summary["n_covariates"]],
        "n_treated": [summary["n_treated"]],
        "n_null_rows_dropped": [summary["n_null_rows_dropped"]],
        "n_rows": [summary["n_rows"]],
        "run_number": [run_number],
    })


def _build_predictions_log(df, run, dml_obj, run_number: int) -> pl.DataFrame:
    """Build the row-level predictions log: all input columns plus the
    observed outcome/treatment and the cross-fitted nuisance predictions."""
    return pl.DataFrame({
        **{col: df[col].to_numpy() for col in df.columns},
        "run_number": [run_number] * len(df),
        "true_outcomes": df[run.outcome],
        "true_propensity": df[run.treatment],
        "outcomes_predictions": dml_obj.predictions["ml_g0"][:, 0, 0],
        "propensity_predictions": dml_obj.predictions["ml_m"][:, 0, 0],
    })


def main(experiment_name: str) -> None:
    """Run all expanded estimation runs for one experiment YAML name."""

    experiment = get_experiment(experiment_name)
    runs = unpack_runs(experiment)
    validate_runs(runs)

    # Background resource logging; the returned handle's .set() stops it,
    # so it must be called even if a run raises.
    stop_resource_logging = log_process_resources(interval=30)
    try:
        for run_number, run in enumerate(runs, start=1):
            print(f"Starting run #{run_number} of {len(runs)} \n")
            start = time.time()

            df, x_cols, summary = prepare_estimation_data(run)

            start_estimation = time.time()
            dml_obj = fit_doubleml_irm(
                df=df,
                run=run,
                covariate_set_after_dummies=x_cols,
            )

            end = time.time()
            total_run_time = time_elapsed(start, end)
            estimation_run_time = time_elapsed(start_estimation, end)
            estimation_run_time_hours = (end - start_estimation) / (60 * 60)

            print("\n Starting logging...\n")
            estimation_log = _build_estimation_log(
                run=run,
                dml_obj=dml_obj,
                summary=summary,
                run_number=run_number,
                total_run_time=total_run_time,
                estimation_run_time=estimation_run_time,
                estimation_run_time_hours=estimation_run_time_hours,
            )
            predictions_log = _build_predictions_log(df, run, dml_obj, run_number)

            log_results("estimation", estimation_log, experiment_name, run_number)
            log_results("predictions", predictions_log, experiment_name, run_number)

            print(f"""\n Run #{run_number} complete
            \n Estimation run time: {estimation_run_time}
            \n Total run time: {total_run_time}\n \n""")

            # Release the largest objects before the next run's allocations.
            del dml_obj, df
            trim_memory()

    finally:
        stop_resource_logging.set()
110
+
111
+
112
def cli() -> None:
    """Parse the experiment name from argv and run estimation."""

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("experiment_name", type=str)
    parsed = arg_parser.parse_args()
    main(experiment_name=parsed.experiment_name)


if __name__ == "__main__":
    cli()
@@ -0,0 +1,2 @@
1
+ """Shared helpers for the DML pipelines."""
2
+