dml-dev 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
1
+
@@ -0,0 +1,15 @@
1
from project_code.src.build_helpers import BuildSource, BuildSpec


# Template: shared source-data inputs for the build join. Projects fill in
# parquet paths and the columns to carry through (see BuildSource).
source_data = BuildSource(
    paths=[],
    passthrough_cols=[],
    passthrough_cols_as_lag=[],
)


# Loaded lazily by build_helpers.get_build_spec() as
# `config.build_spec.BUILD_SPEC`.
BUILD_SPEC = BuildSpec(
    source_data=source_data,
    programs={},
    post_panel_transforms=[],
)
@@ -0,0 +1,2 @@
1
+ """Example experiment YAML files."""
2
+
@@ -0,0 +1,7 @@
1
# Experiment template: each key holds a LIST; the estimation pipeline expands
# the Cartesian product of all seven lists into concrete runs (see
# estimate_helpers.unpack_runs).
program_pointer: []           # keys into PROGRAM_REGISTRY
outcomes: []                  # outcome column names in the processed panels
covariate_set_pointer: []     # keys into COVARIATE_SET_REGISTRY
filter_set_pointer: []        # keys into FILTER_SET_REGISTRY
outcomes_model_pointer: []    # keys into OUTCOMES_MODEL_REGISTRY
propensity_model_pointer: []  # keys into PROPENSITY_MODEL_REGISTRY
num_controls_per_treat: []    # control-to-treated sampling ratios
@@ -0,0 +1 @@
1
+
@@ -0,0 +1,3 @@
1
# Maps covariate-set name -> list of covariate column names.
# Referenced by experiments via `covariate_set_pointer`.
COVARIATE_SET_REGISTRY: dict[str, list[str]] = {}

# Covariates that get one-hot encoded during estimation
# (see estimate_helpers.prepare_estimation_data).
CATEGORICAL_COVARIATES: list[str] = []
@@ -0,0 +1,4 @@
1
import polars as pl


# Maps filter-set name -> list of polars filter expressions applied to the
# panel before estimation. Referenced by experiments via `filter_set_pointer`.
FILTER_SET_REGISTRY: dict[str, list[pl.Expr]] = {}
@@ -0,0 +1,6 @@
1
from typing import Any


# Maps model name -> unfitted estimator for the outcomes nuisance model.
# Entries are cloned per run with sklearn.base.clone (see unpack_runs).
OUTCOMES_MODEL_REGISTRY: dict[str, Any] = {}

# Same, for the propensity (treatment) nuisance model.
PROPENSITY_MODEL_REGISTRY: dict[str, Any] = {}
@@ -0,0 +1,4 @@
1
from project_code.src.build_helpers import ProgramSource


# Maps program name -> ProgramSource describing its input files and column
# mappings. Referenced by experiments via `program_pointer`.
PROGRAM_REGISTRY: dict[str, ProgramSource] = {}
@@ -0,0 +1,35 @@
1
+ Metadata-Version: 2.4
2
+ Name: dml-dev
3
+ Version: 0.1.0
4
+ Summary: DoubleML build, estimation, plotting, and utility pipelines.
5
+ Author: DML Pipeline Contributors
6
+ Keywords: administrative-data,causal-inference,doubleml,observational-data,program-evaluation
7
+ Classifier: Development Status :: 3 - Alpha
8
+ Classifier: Intended Audience :: Science/Research
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Programming Language :: Python :: 3.12
11
+ Classifier: Programming Language :: Python :: 3.13
12
+ Classifier: Topic :: Scientific/Engineering
13
+ Requires-Python: >=3.12
14
+ Description-Content-Type: text/markdown
15
+ Requires-Dist: doubleml
16
+ Requires-Dist: joblib
17
+ Requires-Dist: oi-tools[figures]
18
+ Requires-Dist: plotnine
19
+ Requires-Dist: polars
20
+ Requires-Dist: psutil
21
+ Requires-Dist: PyYAML
22
+ Requires-Dist: scikit-learn
23
+ Requires-Dist: threadpoolctl
24
+ Provides-Extra: dev
25
+ Requires-Dist: build; extra == "dev"
26
+ Requires-Dist: twine; extra == "dev"
27
+
28
+ # DML Pipeline
29
+
30
+ Reusable build, estimation, plotting, and utility code for applying DoubleML to
31
+ administrative observational data for program analysis.
32
+
33
+ The package includes the full implementation under `project_code`, plus an
34
+ empty `config_template` package that shows the required config shape without
35
+ shipping project configuration.
@@ -0,0 +1,24 @@
1
+ config_template/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
2
+ config_template/build_spec.py,sha256=7bnTZuzDt4M6ahHFzIwn0SBAW_s9hNZNaZMcFZPbRfE,272
3
+ config_template/experiments/__init__.py,sha256=jjiIUqNMd8N7pbUmn-fhoN_86ZcZN6_KRFdjD41xuOA,38
4
+ config_template/experiments/example_experiment.yaml,sha256=OYhssBSzMgs6DYVqmQHB1q0hBphCU-O-Kr6J8vyXSkg,165
5
+ config_template/registries/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
6
+ config_template/registries/covariate_sets.py,sha256=7FLp5f8Q5rgxPCL1gDE-tiDLfkFou51p26y0fK9gir0,90
7
+ config_template/registries/filter_sets.py,sha256=FjAF97-VPV_rJpgSITuxPCCCJtQDNmxL7lAha79W2E0,73
8
+ config_template/registries/models.py,sha256=gXlBvLwoFahq-HK7g5aD03VRe_1VWMiCkDfb620ZzIg,118
9
+ config_template/registries/programs.py,sha256=nLWw4fjZvzO2nVDvXSOGjVaZ0wKZj7Rq_xdSz6eacqc,107
10
+ project_code/__init__.py,sha256=bwmrbow99Z6b3_gOA36dz_i-mUUYI4ZF3g7uRRaPa5U,29
11
+ project_code/pipeline/__init__.py,sha256=iD1tczg7zGZ-iJLEeLrTN7QOqSml16SDG_BryJmoKpY,50
12
+ project_code/pipeline/build.py,sha256=n4QdVUImJ96WtEM6urjpGv1sACFX-af6xB1az8kSVfI,1551
13
+ project_code/pipeline/estimate.py,sha256=4OlaSz2wqeJ7j82e8s3-8aEJmbJpQbaLs5nQ3_OwL8s,4689
14
+ project_code/src/__init__.py,sha256=fSqPxsQ4vSqFiIeUX87hnojKT7B1_W5N3GPOp0o6o4Y,45
15
+ project_code/src/build_helpers.py,sha256=kOV4Q6NZyMOuvAuprg5VkUgZes0Yv4rJfjrodMhPAY4,9106
16
+ project_code/src/estimate_helpers.py,sha256=fCU7UKLGYfsgkbruqbTa9gt3LaVRnmDPDqZLB83-al8,8527
17
+ project_code/src/paths.py,sha256=Msvtz2Iz19RF8m7xfsaNAiAbxAm9IowEHtK58ErFDY0,1102
18
+ project_code/src/plotting.py,sha256=ISsYGfanDuMqErOa8_pdDMgmfA9ZnewuwdYCwfmKLMM,8759
19
+ project_code/src/utils.py,sha256=cQOOzc3cQtWW-0652VvYuBSbDiNxVoh2PMEQKKnUtCQ,2479
20
+ dml_dev-0.1.0.dist-info/METADATA,sha256=NpZtmk8qEBl3u9otvkgM0cl7FkuE-dx1qf1ZdVyhr4s,1259
21
+ dml_dev-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
22
+ dml_dev-0.1.0.dist-info/entry_points.txt,sha256=fuHwWSJipTQJvn9tT4ATcRpnfOEjVop6e_tj0LGHUZg,112
23
+ dml_dev-0.1.0.dist-info/top_level.txt,sha256=3VjkRcifTL17eG8IuySDOo1bh_fIMXwF4DRP0_5JScU,29
24
+ dml_dev-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ dml-build = project_code.pipeline.build:cli
3
+ dml-estimate = project_code.pipeline.estimate:cli
@@ -0,0 +1,2 @@
1
+ config_template
2
+ project_code
@@ -0,0 +1,2 @@
1
+ """DML pipeline package."""
2
+
@@ -0,0 +1,2 @@
1
+ """Executable build and estimation pipelines."""
2
+
@@ -0,0 +1,61 @@
1
+ """Build processed cohort panels for a configured program.
2
+
3
+ This entrypoint loads the build spec, backs up any existing program output,
4
+ then writes one processed parquet file per cohort.
5
+ """
6
+
7
+ import argparse
8
+ import sys
9
+ import time
10
+ from pathlib import Path
11
+
12
+ LOCAL_DIR = Path(__file__).resolve().parents[2]
13
+ sys.path.insert(0, str(LOCAL_DIR))
14
+
15
+ from project_code.src.build_helpers import (
16
+ backup_existing_output,
17
+ build_cohort_file,
18
+ get_post_panel_transforms,
19
+ get_program_spec,
20
+ get_source_data_spec,
21
+ time_elapsed,
22
+ )
23
+
24
+
25
def main(program: str) -> None:
    """Run the build pipeline for one program name from the registry."""

    started_at = time.time()

    # Resolve the configured build recipe for this program.
    spec_source = get_source_data_spec()
    spec_program = get_program_spec(program)
    transforms = get_post_panel_transforms()

    # Preserve any previous output before overwriting.
    backup_existing_output(program)

    # One processed output file per configured source-data input file.
    for path in spec_source.paths:
        build_cohort_file(
            source_data_path=path,
            program=program,
            source_data_spec=spec_source,
            program_spec=spec_program,
            post_panel_transforms=transforms,
        )

    total_run_time = time_elapsed(started_at, time.time())
    print("\n Done")
    print(f"\n Total time: {total_run_time}")
49
+
50
+
51
def cli() -> None:
    """Command-line wrapper for package entrypoints."""

    # Single positional argument: the program name from the registry.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("program")
    parsed = arg_parser.parse_args()
    main(program=parsed.program)
58
+
59
+
60
+ if __name__ == "__main__":
61
+ cli()
@@ -0,0 +1,122 @@
1
+ """Run DoubleML estimation for a YAML experiment.
2
+
3
+ This entrypoint loads an experiment, expands its registry pointers into runs,
4
+ fits each run, and writes estimation and prediction logs.
5
+ """
6
+
7
+ import argparse
8
+ import os
9
+ import sys
10
+ import time
11
+ from datetime import datetime
12
+ from pathlib import Path
13
+
14
+ import polars as pl
15
+
16
+ if os.environ.get("NCPUS"):
17
+ os.environ["POLARS_MAX_THREADS"] = os.environ["NCPUS"]
18
+
19
+ LOCAL_DIR = Path(__file__).resolve().parents[2]
20
+ sys.path.insert(0, str(LOCAL_DIR))
21
+
22
+ from project_code.src.estimate_helpers import (
23
+ fit_doubleml_irm,
24
+ get_experiment,
25
+ prepare_estimation_data,
26
+ unpack_runs,
27
+ validate_runs,
28
+ )
29
+ from project_code.src.utils import log_process_resources, log_results, time_elapsed, trim_memory
30
+
31
+
32
def main(experiment_name: str) -> None:
    """Run all expanded estimation runs for one experiment YAML name."""

    experiment = get_experiment(experiment_name)
    runs = unpack_runs(experiment)
    validate_runs(runs)

    # Periodic resource logging; the returned object is an Event-like stop
    # signal (its .set() ends the logging in the finally block below).
    stop_resource_logging = log_process_resources(interval=30)
    try:
        for run_number, run in enumerate(runs, start=1):
            print(f"Starting run #{run_number} of {len(runs)} \n")
            start = time.time()

            df, x_cols, summary = prepare_estimation_data(run)

            # Time estimation separately from data preparation.
            start_estimation = time.time()
            dml_obj = fit_doubleml_irm(
                df=df,
                run=run,
                covariate_set_after_dummies=x_cols,
            )

            end = time.time()
            total_run_time = time_elapsed(start, end)
            estimation_run_time = time_elapsed(start_estimation, end)
            estimation_run_time_hours = (end - start_estimation) / (60 * 60)

            print("\n Starting logging...\n")
            # One-row frame: run configuration, fit results/losses, timings,
            # and sample-size summary.
            estimation_log = pl.DataFrame({
                "program": [run.program_name],
                "treatment": [run.treatment],
                "outcome": [run.outcome],
                "covariate_set_name": [run.covariate_set_pointer],
                "filter_set_name": [run.filter_set_pointer],
                "num_controls_per_treat": [run.num_controls_per_treat],
                "outcomes_model_name": [run.outcomes_model_pointer],
                "propensity_model_name": [run.propensity_model_pointer],
                "outcomes_model_class": [type(run.outcomes_model).__name__],
                "propensity_model_class": [type(run.propensity_model).__name__],
                "outcomes_model_params": [str(run.outcomes_model.get_params())],
                "propensity_model_params": [str(run.propensity_model.get_params())],
                "dml_estimate": [float(dml_obj.coef[0])],
                "dml_se": [float(dml_obj.se[0])],
                "dml_outcomes_loss": [float(dml_obj.nuisance_loss["ml_g0"][0][0])],
                "dml_prop_loss": [float(dml_obj.nuisance_loss["ml_m"][0][0])],
                "total_run_time": [total_run_time],
                "estimation_run_time": [estimation_run_time],
                "estimation_run_time_hours": [estimation_run_time_hours],
                "timestamp": [datetime.now()],
                "n_controls": [summary["n_controls"]],
                "n_unique_controls": [summary["n_unique_controls"]],
                "n_covariates": [summary["n_covariates"]],
                "n_treated": [summary["n_treated"]],
                "n_null_rows_dropped": [summary["n_null_rows_dropped"]],
                "n_rows": [summary["n_rows"]],
                "run_number": [run_number],
            })
            # Row-level frame: all model inputs plus cross-fitted nuisance
            # predictions. The [:, 0, 0] slice presumably selects the first
            # repetition / first treatment level — confirm against the
            # DoubleML predictions layout.
            predictions_log = pl.DataFrame({
                **{col: df[col].to_numpy() for col in df.columns},
                "run_number": [run_number] * len(df),
                "true_outcomes": df[run.outcome],
                "true_propensity": df[run.treatment],
                "outcomes_predictions": dml_obj.predictions["ml_g0"][:, 0, 0],
                "propensity_predictions": dml_obj.predictions["ml_m"][:, 0, 0],
            })

            log_results("estimation", estimation_log, experiment_name, run_number)
            log_results("predictions", predictions_log, experiment_name, run_number)

            print(f"""\n Run #{run_number} complete
            \n Estimation run time: {estimation_run_time}
            \n Total run time: {total_run_time}\n \n""")

            # Release the fitted object and modeling frame before the next run.
            del dml_obj, df
            trim_memory()

    finally:
        # Always stop the resource logger, even if a run raised.
        stop_resource_logging.set()
110
+
111
+
112
def cli() -> None:
    """Command-line wrapper for package entrypoints."""

    # Single positional argument: the experiment YAML name (without suffix).
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("experiment_name", type=str)
    parsed = arg_parser.parse_args()
    main(experiment_name=parsed.experiment_name)
119
+
120
+
121
+ if __name__ == "__main__":
122
+ cli()
@@ -0,0 +1,2 @@
1
+ """Shared helpers for the DML pipelines."""
2
+
@@ -0,0 +1,289 @@
1
+ from collections.abc import Callable, Sequence
2
+ from dataclasses import dataclass, field
3
+ from pathlib import Path
4
+ import shutil
5
+ import sys
6
+ import tempfile
7
+ import time
8
+
9
+ import polars as pl
10
+
11
+ from project_code.src.paths import CONFIG_DIR, processed_data_out_folder, processed_data_out_path
12
+ from project_code.src.utils import time_elapsed, trim_memory
13
+
14
# Canonical column names shared by the build and estimation pipelines.
TREATMENT_COL = "treatment"  # 1 = treated, 0 = control (filled in the join)
OBSERVATION_COL = "observation_year"  # enrollment / panel year
JOIN_KEY = "unit_id"  # key joining source data to program records
# A post-panel transform maps one LazyFrame to another.
Transform = Callable[[pl.LazyFrame], pl.LazyFrame]
18
+
19
+
20
@dataclass(frozen=True)
class RelativeCol:
    """Calendar-year columns to convert into event-time columns around enrollment."""

    # Column-name prefix; source columns are named f"{stem}_{calendar_year}"
    # (see build_lag_exprs).
    stem: str
    # Event-time offsets relative to the enrollment year (negative = lags).
    years: Sequence[int]
26
+
27
+
28
@dataclass(frozen=True, kw_only=True)
class BuildSource:
    """Input files and columns to carry from one side of the build join."""

    # Parquet files to scan for this side of the join.
    paths: Sequence[Path]
    # Columns copied through to the output unchanged.
    passthrough_cols: Sequence[pl.Expr]
    # Calendar-year columns re-expressed as event-time columns (see RelativeCol).
    passthrough_cols_as_lag: Sequence[RelativeCol]
    # Expression yielding the join key; defaults to the shared JOIN_KEY column.
    join_key_col: pl.Expr = field(default_factory=lambda: pl.col(JOIN_KEY))
36
+
37
+
38
@dataclass(frozen=True, kw_only=True)
class ProgramSource(BuildSource):
    """Program-specific source data and column mappings."""

    # Program name, used for output-folder naming and run labels.
    name: str
    # Expression yielding the treatment indicator (1 = treated).
    treatment_col: pl.Expr
    # Expression yielding the enrollment year (aliased to OBSERVATION_COL).
    enrollment_year_col: pl.Expr
45
+
46
+
47
@dataclass(frozen=True, init=False)
class BuildSpec:
    """Complete build recipe: source data, programs, and post-panel transforms."""

    source_data: BuildSource
    programs: dict[str, ProgramSource]
    post_panel_transforms: Sequence[Transform]

    def __init__(
        self,
        source_data: BuildSource | None = None,
        programs: dict[str, ProgramSource] | None = None,
        post_panel_transforms: Sequence[Transform] = (),
    ):
        # source_data is required; the None default only exists so config
        # files can pass every field by keyword.
        if source_data is None:
            raise ValueError("BuildSpec requires source_data")

        # The dataclass is frozen, so this custom __init__ must assign fields
        # via object.__setattr__ to bypass the immutability guard.
        object.__setattr__(self, "source_data", source_data)
        object.__setattr__(self, "programs", programs or {})
        object.__setattr__(self, "post_panel_transforms", post_panel_transforms)
67
+
68
+
69
def get_build_spec() -> BuildSpec:
    """Load the configured build recipe lazily to avoid import cycles.

    Importing `config.build_spec` at call time (not module import time) lets
    the config package import this module's dataclasses without a cycle.

    Returns:
        The `BUILD_SPEC` object defined by the external config package.
    """

    # Guard the insert: the original unconditionally prepended the path on
    # every call, growing sys.path with duplicates over repeated lookups.
    config_parent = str(CONFIG_DIR.parent)
    if config_parent not in sys.path:
        sys.path.insert(0, config_parent)
    from config.build_spec import BUILD_SPEC

    return BUILD_SPEC
76
+
77
+
78
def get_program_spec(program: str) -> ProgramSource:
    """Return the configured source definition for one program."""

    spec = get_build_spec()
    try:
        return spec.programs[program]
    except KeyError as err:
        # Re-raise with a friendlier message, keeping the original cause.
        raise ValueError(f"Unknown program: {program}") from err
85
+
86
+
87
def get_source_data_spec() -> BuildSource:
    """Return the shared source data input definition."""

    spec = get_build_spec()
    return spec.source_data
91
+
92
+
93
def get_post_panel_transforms() -> Sequence[Transform]:
    """Return transforms applied after event-time panel construction."""

    spec = get_build_spec()
    return spec.post_panel_transforms
97
+
98
+
99
def backup_existing_output(program: str) -> None:
    """Move existing build output aside before writing a fresh run."""

    current = processed_data_out_folder(program)
    if current.exists():
        # Keep at most one backup: drop any previous one, then rename.
        backup_dir = current.with_name(f"{current.name}_backup")
        if backup_dir.exists():
            shutil.rmtree(backup_dir)
        current.rename(backup_dir)
110
+
111
+
112
+
113
def load_program_lf(program_spec: ProgramSource) -> pl.LazyFrame:
    """Load treated program records and normalize key build columns."""

    return (
        pl.scan_parquet(program_spec.paths)
        .with_columns(
            # Normalize program-specific columns to the canonical names used
            # throughout the build.
            program_spec.join_key_col.alias(JOIN_KEY),
            program_spec.treatment_col.alias(TREATMENT_COL),
            program_spec.enrollment_year_col.alias(OBSERVATION_COL),
        )
        # Only treated rows come from the program side of the join; controls
        # appear as join misses (filled with 0 in build_cohort_file).
        .filter(pl.col(TREATMENT_COL) == 1)
    )
125
+
126
+
127
def get_treated_enrollment_years(program_lf: pl.LazyFrame) -> list[int]:
    """Collect enrollment years that define separate event-time panels."""

    # Distinct, non-null enrollment years, materialized as a series.
    years = (
        program_lf.select(OBSERVATION_COL)
        .drop_nulls()
        .unique()
        .collect()
        .to_series()
    )
    return years.sort().to_list()
139
+
140
+
141
def build_lag_exprs(
    passthrough_cols_as_lag: Sequence[RelativeCol],
    enrollment_year: int,
    available_cols: set[str],
) -> tuple[list[pl.Expr], set[str]]:
    """Build expressions that map calendar-year columns to event-time columns.

    Missing calendar-year inputs become null columns so every cohort panel has a
    stable schema.
    """

    exprs: list[pl.Expr] = []
    missing: set[str] = set()

    for rel_col in passthrough_cols_as_lag:
        for offset in rel_col.years:
            # Source column is keyed by calendar year; target by event time.
            source_name = f"{rel_col.stem}_{enrollment_year + offset}"
            label = f"{offset}" if offset >= 0 else f"L{abs(offset)}"
            target_name = f"{rel_col.stem}_{label}"

            if source_name in available_cols:
                exprs.append(pl.col(source_name).alias(target_name))
            else:
                # Record the gap and emit a null placeholder column.
                missing.add(source_name)
                exprs.append(pl.lit(None).alias(target_name))

    return exprs, missing
169
+
170
+
171
def apply_transforms(
    lf: pl.LazyFrame,
    transforms: Sequence[Transform],
) -> pl.LazyFrame:
    """Apply configured LazyFrame transforms in order."""

    result = lf
    for step in transforms:
        result = step(result)
    return result
180
+
181
+
182
def build_cohort_file(
    source_data_path: Path,
    program: str,
    source_data_spec: BuildSource,
    program_spec: ProgramSource,
    post_panel_transforms: Sequence[Transform],
) -> None:
    """Build and write one processed parquet file for one birth cohort.

    The output stacks one panel per treated enrollment year, with controls
    repeated into the panels where they are untreated or not-yet-treated.

    Args:
        source_data_path: One cohort's source parquet (stem like "...=1950").
        program: Program name, used for the output location.
        source_data_spec: Shared source-data inputs and passthrough columns.
        program_spec: Program-side inputs and column mappings.
        post_panel_transforms: Transforms applied after panel construction.
    """

    start = time.time()
    # Cohort year is encoded in the file stem after "=".
    cohort = int(source_data_path.stem.split("=")[1])
    print(f"\n \n Starting cohort {cohort}")

    # Temporary cohort window used to avoid scanning out-of-scope source files.
    if cohort < 1940 or cohort > 1995:
        return

    source_data_lf = pl.scan_parquet(source_data_path).with_columns(
        source_data_spec.join_key_col.alias(JOIN_KEY)
    )
    program_lf = load_program_lf(program_spec)
    treated_enrollment_years = get_treated_enrollment_years(program_lf)

    # Join once at calendar time, then slice into event-time panels below.
    merged_lf = source_data_lf.join(program_lf, on=JOIN_KEY, how="left")
    # Join misses are controls: treatment indicator defaults to 0.
    merged_lf = merged_lf.with_columns(pl.col(TREATMENT_COL).fill_null(0))

    available_cols = set(merged_lf.collect_schema().names())
    passthrough_cols = [
        pl.col(JOIN_KEY),
        pl.col(OBSERVATION_COL),
        pl.col(TREATMENT_COL),
        *program_spec.passthrough_cols,
        *source_data_spec.passthrough_cols,
    ]
    passthrough_cols_as_lag = [
        *program_spec.passthrough_cols_as_lag,
        *source_data_spec.passthrough_cols_as_lag,
    ]

    missing_cols = set()
    temp_dir = Path(tempfile.mkdtemp())
    cohort_panel_paths = []

    # FIX: the temp directory was previously never removed, leaking one
    # directory of intermediate parquet files per cohort. Clean it up once
    # the final output has been written (or on failure).
    try:
        for enrollment_year in treated_enrollment_years:
            # Keep treated observations from this enrollment year and controls that
            # are never treated or not yet treated.
            cohort_panel = merged_lf.filter(
                (pl.col(TREATMENT_COL) == 0) | (pl.col(OBSERVATION_COL) == enrollment_year)
            )
            cohort_panel = cohort_panel.with_columns(
                pl.col(OBSERVATION_COL).fill_null(enrollment_year)
            )

            # Build event-time columns from calendar-time source columns.
            lag_exprs, missing_lag_cols = build_lag_exprs(
                passthrough_cols_as_lag=passthrough_cols_as_lag,
                enrollment_year=enrollment_year,
                available_cols=available_cols,
            )
            missing_cols.update(missing_lag_cols)

            # Write each enrollment-year panel separately to keep memory bounded.
            temp_path = temp_dir / f"panel_{enrollment_year}.parquet"
            cohort_panel.select(
                *passthrough_cols,
                *lag_exprs,
            ).sink_parquet(temp_path, engine="streaming")
            cohort_panel_paths.append(temp_path)

            del cohort_panel
            trim_memory()

        if missing_cols:
            print(f"Warning - Lag construction is missing the following columns: {missing_cols}")

        result = pl.concat(
            [pl.scan_parquet(path) for path in cohort_panel_paths],
            how="vertical_relaxed",
        )
        # Add common post-panel features after all relative columns exist.
        result = apply_transforms(result, post_panel_transforms)

        out_path = processed_data_out_path(program, cohort)
        result.sink_parquet(out_path, engine="streaming")

        del result
        trim_memory()
    finally:
        # The temp panels are fully consumed by the sink above; drop them.
        shutil.rmtree(temp_dir, ignore_errors=True)

    end = time.time()
    cohort_run_time = time_elapsed(start, end)
    print(f"Cohort {cohort} build complete. \n Run time: {cohort_run_time}")
278
+
279
+
280
def add_derived_columns(program: str) -> None:
    """Re-apply post-panel transforms to files that have already been built.

    Each file is rewritten through a temporary sibling and then atomically
    replaced. FIX: the original sank parquet output directly onto the same
    path it was lazily scanning, which can truncate the input while it is
    still being read by the streaming engine.
    """

    folder = processed_data_out_folder(program)
    post_panel_transforms = get_post_panel_transforms()

    # Only touch parquet outputs; ignore any stray files in the folder.
    for path in sorted(folder.glob("*.parquet")):
        lf = apply_transforms(pl.scan_parquet(path), post_panel_transforms)
        tmp_path = path.with_name(path.name + ".tmp")
        lf.sink_parquet(tmp_path, engine="streaming")
        # Atomic swap: readers see either the old or the new file, never a
        # half-written one.
        tmp_path.replace(path)
@@ -0,0 +1,263 @@
1
+ from dataclasses import dataclass
2
+ from itertools import product
3
+ import re
4
+ import sys
5
+ from typing import Any
6
+
7
+ import doubleml as dml
8
+ import polars as pl
9
+ import yaml
10
+ from doubleml.utils import PSProcessorConfig
11
+ from joblib import parallel_backend
12
+ from sklearn.base import clone
13
+ from threadpoolctl import threadpool_limits
14
+
15
+ from project_code.src.build_helpers import TREATMENT_COL, JOIN_KEY
16
+ from project_code.src.paths import CONFIG_DIR, processed_data_out_folder
17
+ from project_code.src.utils import trim_memory
18
+
19
+
20
# Thread caps used in fit_doubleml_irm: each learner may use up to
# MODEL_THREADS native threads while N_JOBS_CV cross-fitting jobs run in
# parallel via joblib.
MODEL_THREADS = 8
N_JOBS_CV = 4

# Make the external `config` package importable for the lazy registry
# imports below (see unpack_runs / prepare_estimation_data).
sys.path.insert(0, str(CONFIG_DIR.parent))
24
+
25
+
26
@dataclass(frozen=True)
class Experiment:
    """Experiment pointers loaded from a YAML file.

    Every field is a list; `unpack_runs` expands the Cartesian product of all
    seven lists into concrete `Run`s.
    """

    # Keys into PROGRAM_REGISTRY.
    program_pointer: list[str]
    # Outcome column names in the processed panels.
    outcomes: list[str]
    # Keys into COVARIATE_SET_REGISTRY.
    covariate_set_pointer: list[str]
    # Keys into FILTER_SET_REGISTRY.
    filter_set_pointer: list[str]
    # Keys into OUTCOMES_MODEL_REGISTRY.
    outcomes_model_pointer: list[str]
    # Keys into PROPENSITY_MODEL_REGISTRY.
    propensity_model_pointer: list[str]
    # Control-to-treated sampling ratios (see sample_controls_per_treated).
    num_controls_per_treat: list[float]
37
+
38
+
39
@dataclass
class Run:
    """Concrete estimation run after resolving experiment pointers."""

    program_name: str
    join_key: str  # always JOIN_KEY
    treatment: str  # always TREATMENT_COL
    outcome: str
    # Resolved column names from COVARIATE_SET_REGISTRY.
    covariate_set: list[str]
    # Resolved filter expressions from FILTER_SET_REGISTRY.
    filter_set: list[Any]
    # Original registry keys, kept for labeling and logging.
    covariate_set_pointer: str
    filter_set_pointer: str
    outcomes_model_pointer: str
    propensity_model_pointer: str
    num_controls_per_treat: float
    # Per-run unfitted estimator clones (see unpack_runs).
    outcomes_model: Any
    propensity_model: Any
56
+
57
+
58
def get_experiment(name: str) -> Experiment:
    """Load one experiment YAML by name."""

    yaml_path = CONFIG_DIR / "experiments" / f"{name}.yaml"
    if not yaml_path.exists():
        raise ValueError(f"Unknown experiment: {name}")

    # An empty YAML file parses as None; substitute an empty mapping.
    with yaml_path.open() as f:
        contents = yaml.safe_load(f) or {}

    return Experiment(**contents)
69
+
70
def validate_runs(runs: list[Run]) -> None:
    """Validate that required data files and referenced columns exist.

    Problems across all runs are collected and raised together as one
    ValueError, so a misconfigured experiment fails with a full report.

    Raises:
        ValueError: listing every missing file, column, or filter reference.
    """

    # Local import avoids widening the module-level paths import; polars is
    # already imported at module scope (the original re-imported it here).
    from project_code.src.paths import processed_data_out_path

    errors = []

    def _schema_has_reference(schema: pl.Schema, column_reference: str) -> bool:
        # A reference is either a literal column name or (when it starts with
        # "^") a regex selector matched against every column in the schema.
        if column_reference in schema:
            return True
        if column_reference.startswith("^"):
            pattern = re.compile(column_reference)
            return any(pattern.fullmatch(col) for col in schema)
        return False

    for i, run in enumerate(runs):
        label = (
            f"Run #{i + 1} "
            f"({run.program_name}, {run.covariate_set_pointer}, {run.filter_set_pointer}, "
            f"{run.outcomes_model_pointer}, {run.propensity_model_pointer}, "
            f"{run.num_controls_per_treat} controls/treat)"
        )

        # Check processed data exists and required columns are present.
        # NOTE(review): cohort 1945 is used as a schema probe — confirm this
        # cohort file always exists for every program.
        data_path = processed_data_out_path(run.program_name, 1945)
        if not data_path.exists():
            errors.append(f"{label}: no processed data found for program '{run.program_name}'")
        else:
            schema = pl.scan_parquet(data_path).collect_schema()

            for var in [TREATMENT_COL, run.outcome] + run.covariate_set:
                if var not in schema:
                    errors.append(f"{label}: variable '{var}' not in schema")

            # Check column references in filter expressions
            for filter_entry in run.filter_set:
                for col in filter_entry.meta.root_names():
                    if not _schema_has_reference(schema, col):
                        errors.append(f"{label}: filter references column '{col}' not in schema")

    if errors:
        raise ValueError("Run validation failed:\n\n" + "\n".join(errors))
113
+
114
+
115
def unpack_runs(experiment: Experiment) -> list[Run]:
    """Resolve registry pointers and expand an experiment into concrete runs."""

    # Registries live in the external config package; imported lazily here
    # (sys.path is prepared at module import above).
    from config.registries.programs import PROGRAM_REGISTRY
    from config.registries.covariate_sets import COVARIATE_SET_REGISTRY
    from config.registries.filter_sets import FILTER_SET_REGISTRY
    from config.registries.models import OUTCOMES_MODEL_REGISTRY, PROPENSITY_MODEL_REGISTRY

    runs = []

    # Cartesian product of all pointer lists: one Run per combination.
    combos = product(
        experiment.program_pointer,
        experiment.outcomes,
        experiment.covariate_set_pointer,
        experiment.filter_set_pointer,
        experiment.outcomes_model_pointer,
        experiment.propensity_model_pointer,
        experiment.num_controls_per_treat,
    )

    for (
        program_pointer,
        out,
        cov_pointer,
        filt_pointer,
        outcomes_model_pointer,
        propensity_model_pointer,
        num_controls_per_treat,
    ) in combos:
        runs.append(
            Run(
                program_name=PROGRAM_REGISTRY[program_pointer].name,
                join_key=JOIN_KEY,
                treatment=TREATMENT_COL,
                outcome=out,
                covariate_set=COVARIATE_SET_REGISTRY[cov_pointer],
                filter_set=FILTER_SET_REGISTRY[filt_pointer],
                covariate_set_pointer=cov_pointer,
                filter_set_pointer=filt_pointer,
                outcomes_model_pointer=outcomes_model_pointer,
                propensity_model_pointer=propensity_model_pointer,
                num_controls_per_treat=num_controls_per_treat,
                # clone() gives each run its own unfitted copy of the
                # registry's estimator.
                outcomes_model=clone(OUTCOMES_MODEL_REGISTRY[outcomes_model_pointer]),
                propensity_model=clone(PROPENSITY_MODEL_REGISTRY[propensity_model_pointer]),
            )
        )

    return runs
163
+
164
+
165
def sample_controls_per_treated(lf: pl.LazyFrame, treatment_col: str, join_key: str, num_controls_per_treat: float) -> pl.LazyFrame:
    """Keep all treated rows plus a deterministic hash sample of controls."""

    is_treated = pl.col(treatment_col) == 1
    counts = lf.select(is_treated.sum(), (~is_treated).sum()).collect()
    n_treated, n_controls = counts.row(0)

    # Nothing to thin if there are no controls at all.
    if n_controls == 0:
        return lf

    # Hash-based thinning keeps the same control units across reruns.
    keep_prob = min(1.0, num_controls_per_treat * n_treated / n_controls)
    threshold = keep_prob * (2**64 - 1)
    return lf.filter(is_treated | (pl.col(join_key).hash() < threshold))
179
+
180
+
181
def prepare_estimation_data(run: Run):
    """Load, filter, sample, encode, and summarize modeling data for one run.

    Returns:
        Tuple of (pandas DataFrame for modeling, covariate names after
        one-hot encoding, summary dict of sample counts).
    """

    from config.registries.covariate_sets import CATEGORICAL_COVARIATES

    # All cohort files for this program, stacked lazily.
    folder_path = processed_data_out_folder(run.program_name)
    paths = sorted(folder_path.glob("*.parquet"))
    lf = pl.concat([pl.scan_parquet(path) for path in paths], how="vertical_relaxed")
    columns = [run.treatment, run.outcome, run.join_key] + run.covariate_set

    # NaN -> null first so drop_nulls removes both missing kinds in floats.
    lf = (
        lf.with_columns(pl.col(pl.Float32, pl.Float64).fill_nan(None))
        .drop_nulls(subset=columns)
    )

    # Configured filter expressions, applied in registry order.
    for filter_entry in run.filter_set:
        lf = lf.filter(filter_entry)

    lf = sample_controls_per_treated(
        lf=lf,
        treatment_col=run.treatment,
        join_key=run.join_key,
        num_controls_per_treat=run.num_controls_per_treat,
    )

    print("\n Starting collect...\n")
    lf = lf.select(columns).collect()

    print("\n Starting hot encoding...\n")
    cols_to_encode = [col for col in CATEGORICAL_COVARIATES if col in lf.columns]
    lf = lf.to_dummies(columns=cols_to_encode, drop_first=True)

    # Covariates are everything except the id, treatment, and outcome columns.
    x_cols = [x for x in lf.columns if x not in [run.join_key, run.treatment, run.outcome]]
    n_rows = lf.select(pl.len()).item()
    n_treated = lf.filter(pl.col(run.treatment) == 1).select(pl.len()).item()
    summary = {
        "n_rows": n_rows,
        "n_treated": n_treated,
        "n_controls": n_rows - n_treated,
        # Controls can repeat across enrollment-year panels; count distinct ids.
        "n_unique_controls": lf.filter(pl.col(run.treatment) == 0)
        .select(run.join_key)
        .unique()
        .select(pl.len())
        .item(),
        "n_covariates": len(x_cols),
        # NOTE(review): hard-coded 0 — rows removed by drop_nulls above are
        # not counted; confirm whether this field should track them.
        "n_null_rows_dropped": 0,
    }

    # Hand off to pandas for the DoubleML interface; free the polars frame.
    df = lf.to_pandas()
    del lf
    trim_memory()

    return df, x_cols, summary
234
+
235
+
236
def fit_doubleml_irm(df, run: Run, covariate_set_after_dummies: list[str]):
    """Fit DoubleML IRM and return the fitted DoubleML object.

    Uses the ATTE score with 5 folds and 3 repetitions; thread usage is
    capped via threadpoolctl and joblib so nested parallelism stays bounded.
    """

    print("\n Starting DML prep...\n")
    dml_data = dml.DoubleMLData(
        data=df,
        y_col=run.outcome,
        d_cols=[run.treatment],
        x_cols=covariate_set_after_dummies,
    )

    # Default-constructed propensity-score processor — presumably default
    # trimming/clipping behavior; confirm against the doubleml docs.
    ps_config = PSProcessorConfig()
    dml_obj = dml.DoubleMLIRM(
        obj_dml_data=dml_data,
        ml_g=run.outcomes_model,
        ml_m=run.propensity_model,
        score="ATTE",
        n_folds=5,
        n_rep=3,
        ps_processor_config=ps_config,
    )

    print("\n Starting DML fit...\n")
    # Cap native (OpenMP) threads per learner and joblib inner threads so
    # MODEL_THREADS * N_JOBS_CV does not oversubscribe the machine.
    with threadpool_limits(limits=MODEL_THREADS, user_api="openmp"):
        with parallel_backend("loky", inner_max_num_threads=MODEL_THREADS):
            dml_obj.fit(n_jobs_cv=N_JOBS_CV)

    return dml_obj
@@ -0,0 +1,35 @@
1
+ from pathlib import Path
2
+ import os
3
+
4
+
5
# Repository root; every path below derives from it. All directories are
# overridable via DML_PIPELINE_* environment variables for deployment.
ROOT_DIR = Path(os.environ.get("DML_PIPELINE_ROOT_DIR", Path.cwd()))

# === Data directories ===

DATA_DIR = Path(os.environ.get("DML_PIPELINE_DATA_DIR", ROOT_DIR / "data"))

# === Local working directory ===
LOCAL_DIR = Path(os.environ.get("DML_PIPELINE_LOCAL_DIR", ROOT_DIR))

# sub-directories
SRC_DIR = LOCAL_DIR / "src"
OUT_DIR = LOCAL_DIR / "outputs"
# Location of the user-supplied `config` package (build spec, registries,
# experiment YAMLs).
CONFIG_DIR = Path(os.environ.get("DML_PIPELINE_CONFIG_DIR", LOCAL_DIR / "config"))
18
+
19
def processed_data_out_folder(program: str) -> Path:
    """Return the folder holding processed build output for one program."""

    # `program` is already a string; the original wrapped it in a pointless
    # f-string and an intermediate variable.
    return LOCAL_DIR / "data" / "build" / program
22
+
23
def processed_data_out_path(program: str, cohort: int) -> Path:
    """Return one cohort's output parquet path, creating its folder."""

    out_file = (
        processed_data_out_folder(program)
        / f"{program}_panel_cohort={cohort}.parquet"
    )
    out_file.parent.mkdir(parents=True, exist_ok=True)
    return out_file
31
+
32
def get_log_out_path(result_type: str, experiment_name: str) -> Path:
    """Return one experiment's log parquet path, creating its folder."""

    log_path = OUT_DIR / "raw" / f"{experiment_name}" / f"log_{result_type}.parquet"
    log_path.parent.mkdir(parents=True, exist_ok=True)
    return log_path
@@ -0,0 +1,253 @@
1
+ import polars as pl
2
+
3
+ import plotnine as pn
4
+ from oi_tools.figures import (
5
+ OIColors,
6
+ save_figure,
7
+ scale_color_oi,
8
+ scale_fill_oi,
9
+ theme_oi,
10
+ )
11
+
12
+ from project_code.src.build_helpers import TREATMENT_COL
13
+ from project_code.src.paths import OUT_DIR, get_log_out_path
14
+
15
+
16
# Output folders for each diagnostic-plot family, all rooted under OUT_DIR.
PROPENSITY_PLOT_FOLDER = OUT_DIR / "plots" / "density_functions"
WEIGHTED_RESIDUAL_PLOT_FOLDER = OUT_DIR / "plots" / "weighted_residual_influence"
CALIBRATION_PLOT_FOLDER = OUT_DIR / "plots" / "calibration"
19
+
20
+
21
def plot_propensity_density(experiment_name: str, run_number: int):
    """Plot treated/control propensity score densities for one run."""
    predictions = pl.read_parquet(get_log_out_path("predictions", experiment_name))
    run_rows = predictions.filter(pl.col("run_number") == run_number)

    # Label rows by treatment arm for the legend.
    status_label = (
        pl.when(pl.col(TREATMENT_COL) == 1)
        .then(pl.lit("Treated"))
        .otherwise(pl.lit("Control"))
        .alias("treatment_status")
    )
    plot_df = run_rows.select(
        pl.col("propensity_predictions"),
        status_label,
    ).to_pandas()

    aesthetics = pn.aes(
        x="propensity_predictions",
        color="treatment_status",
        fill="treatment_status",
    )
    fig = (
        pn.ggplot(plot_df, aesthetics)
        + pn.geom_density(alpha=0.25)
        + pn.scale_x_continuous(limits=(0, 1))
        + scale_color_oi(name="")
        + scale_fill_oi(name="")
        + pn.labs(
            x="Estimated propensity score",
            y="Density",
            title="Propensity Score Density",
        )
        + theme_oi()
    )

    PROPENSITY_PLOT_FOLDER.mkdir(parents=True, exist_ok=True)
    save_figure(fig, PROPENSITY_PLOT_FOLDER / f"{experiment_name}_{run_number}")
    return fig
63
+
64
+
65
def _prediction_diagnostics(df: pl.DataFrame, outcome_col: str) -> pl.DataFrame:
    """Add ATT control weights and residual diagnostics to prediction rows."""
    propensity = pl.col("propensity_predictions")
    # Odds p/(1-p) are only defined on the open interval (0, 1); null elsewhere.
    att_weight = (
        pl.when(propensity.is_between(0, 1, closed="none"))
        .then(propensity / (1 - propensity))
        .otherwise(None)
        .alias("att_control_weight")
    )
    residual = (
        pl.col(outcome_col) - pl.col("outcomes_predictions")
    ).alias("outcome_residual")

    weighted = pl.col("att_control_weight") * pl.col("outcome_residual")
    return df.with_columns(att_weight, residual).with_columns(
        weighted.alias("weighted_residual_contribution"),
        weighted.abs().alias("abs_weighted_residual_contribution"),
    )
85
+
86
+
87
def _weighted_mean(df: pl.DataFrame, value_col: str, weight_col: str) -> float | None:
    """Return weighted mean, or None when all usable weight is zero/null."""
    sums = df.select(
        (pl.col(value_col) * pl.col(weight_col)).sum().alias("weighted_sum"),
        pl.col(weight_col).sum().alias("weight_sum"),
    )
    weighted_sum, weight_sum = sums.row(0)
    # A null or zero total weight means there is nothing meaningful to average.
    if weight_sum is None or weight_sum == 0:
        return None
    return weighted_sum / weight_sum
97
+
98
+
99
def _control_ess(controls: pl.DataFrame) -> float | None:
    """Effective sample size of ATT-weighted controls."""
    weight = pl.col("att_control_weight")
    total, total_sq = controls.select(
        weight.sum().alias("sum_w"),
        (weight**2).sum().alias("sum_w_sq"),
    ).row(0)
    if total_sq is None or total_sq == 0:
        return None
    # Kish effective sample size: (sum w)^2 / sum(w^2).
    return (total**2) / total_sq
109
+
110
+
111
def create_run_summary(experiment_name: str, run_number: int) -> pl.DataFrame:
    """Create a one-row table of ATT estimates, losses, samples, and diagnostics.

    Combines the estimation log (DML ATT/SE, nuisance losses, sample counts)
    with per-row prediction diagnostics: control effective sample size plus
    two "one-legged" ATT estimates (outcome-model only, IPW only) that help
    attribute how much of the DML estimate comes from the residual correction.

    Raises:
        ValueError: if the estimation log does not contain exactly one row
            for this run.
    """
    estimation_log = (
        pl.read_parquet(get_log_out_path("estimation", experiment_name))
        .filter(pl.col("run_number") == run_number)
    )
    if estimation_log.height != 1:
        raise ValueError(f"Expected one estimation log row, found {estimation_log.height}")

    predictions_log = (
        pl.read_parquet(get_log_out_path("predictions", experiment_name))
        .filter(pl.col("run_number") == run_number)
    )

    outcome_col = estimation_log["outcome"].item()
    diagnostics = _prediction_diagnostics(predictions_log, outcome_col)
    treated = diagnostics.filter(pl.col(TREATMENT_COL) == 1)
    controls = diagnostics.filter(pl.col(TREATMENT_COL) == 0)

    # Polars aggregations over an empty frame yield None; propagate None into
    # the derived columns instead of raising TypeError when a run happens to
    # have no treated rows (matches how _weighted_mean already reports None).
    mean_treated_outcome = treated.select(pl.col(outcome_col).mean()).item()
    weighted_control_outcome = _weighted_mean(controls, outcome_col, "att_control_weight")
    att_outcome_only = treated.select(
        (pl.col(outcome_col) - pl.col("outcomes_predictions")).mean()
    ).item()
    att_ipw_only = (
        None
        if mean_treated_outcome is None or weighted_control_outcome is None
        else mean_treated_outcome - weighted_control_outcome
    )
    att_dml = float(estimation_log["dml_estimate"].item())

    return pl.DataFrame({
        "att_dml": [att_dml],
        "se_dml": [float(estimation_log["dml_se"].item())],
        "outcome_model_loss": [float(estimation_log["dml_outcomes_loss"].item())],
        "propensity_model_loss": [float(estimation_log["dml_prop_loss"].item())],
        "n_treated": [int(estimation_log["n_treated"].item())],
        "n_controls": [int(estimation_log["n_controls"].item())],
        "n_total": [int(estimation_log["n_rows"].item())],
        "att_control_ess": [_control_ess(controls)],
        "att_outcome_only": [att_outcome_only],
        "att_ipw_only": [att_ipw_only],
        "att_residual_correction": [
            None if att_outcome_only is None else att_dml - att_outcome_only
        ],
    })
152
+
153
+
154
def plot_weighted_residual_influence(experiment_name: str, run_number: int):
    """Plot ATT-weighted control residuals for one run."""
    estimation_log = (
        pl.read_parquet(get_log_out_path("estimation", experiment_name))
        .filter(pl.col("run_number") == run_number)
    )
    if estimation_log.height != 1:
        raise ValueError(f"Expected one estimation log row, found {estimation_log.height}")

    outcome_col = estimation_log["outcome"].item()
    predictions_log = (
        pl.read_parquet(get_log_out_path("predictions", experiment_name))
        .filter(pl.col("run_number") == run_number)
    )
    # Drop rows where any of the plotted diagnostics is null (e.g. degenerate
    # propensity scores that produced no ATT weight).
    required_cols = [
        "att_control_weight",
        "outcome_residual",
        "abs_weighted_residual_contribution",
    ]
    controls = (
        _prediction_diagnostics(predictions_log, outcome_col)
        .filter(pl.col(TREATMENT_COL) == 0)
        .drop_nulls(subset=required_cols)
    )

    fig = (
        pn.ggplot(
            controls.to_pandas(),
            pn.aes(
                x="att_control_weight",
                y="outcome_residual",
                color="abs_weighted_residual_contribution",
            ),
        )
        + pn.geom_point(alpha=0.35)
        + pn.labs(
            x="ATT control weight",
            y="Outcome residual",
            color="Abs. weighted residual",
            title="Weighted Control Residual Influence",
        )
        + pn.scale_color_gradient(low=OIColors.BLUE, high=OIColors.RED)
        + theme_oi()
    )

    WEIGHTED_RESIDUAL_PLOT_FOLDER.mkdir(parents=True, exist_ok=True)
    save_figure(fig, WEIGHTED_RESIDUAL_PLOT_FOLDER / f"{experiment_name}_{run_number}")
    return fig
204
+
205
+
206
def calibration_plot(
    experiment_name: str,
    run_number: int,
    prediction_type: str,
    x_limits: tuple[float, float] | None = None,
):
    """Plot true values against nuisance predictions for controls in one run."""
    if prediction_type not in ("propensity", "outcomes"):
        raise ValueError("prediction_type must be either 'propensity' or 'outcomes'")

    pred_col = f"{prediction_type}_predictions"
    truth_col = f"true_{prediction_type}"

    # Restrict to control rows of the requested run.
    run_controls = (pl.col("run_number") == run_number) & (pl.col(TREATMENT_COL) == 0)
    plot_df = (
        pl.read_parquet(get_log_out_path("predictions", experiment_name))
        .filter(run_controls)
        .select(
            pl.col(pred_col).alias("prediction"),
            pl.col(truth_col).alias("actual"),
        )
        .drop_nulls()
        .to_pandas()
    )

    fig = (
        pn.ggplot(plot_df, pn.aes(x="prediction", y="actual"))
        + pn.geom_point(alpha=0.18, color=OIColors.BLUE)
        + pn.geom_smooth(se=True, color=OIColors.RED)
        + pn.labs(
            x=f"{prediction_type.title()} prediction",
            y=f"True {prediction_type}",
            title=f"{prediction_type.title()} Calibration",
        )
        + theme_oi()
    )
    if x_limits is not None:
        fig = fig + pn.scale_x_continuous(limits=x_limits)

    CALIBRATION_PLOT_FOLDER.mkdir(parents=True, exist_ok=True)
    save_figure(
        fig,
        CALIBRATION_PLOT_FOLDER / f"{experiment_name}_{run_number}_{prediction_type}",
    )
    return fig
@@ -0,0 +1,95 @@
1
+ import ctypes
2
+ import gc
3
+ import os
4
+ import sys
5
+ import threading
6
+
7
+ import polars as pl
8
+
9
+ from project_code.src.paths import OUT_DIR, get_log_out_path
10
+
11
+
12
def log_results(result_type: str,
                df: pl.DataFrame,
                experiment_name: str,
                run_number: int) -> None:
    """Persist one run's results to the experiment's parquet log.

    Run 1 starts a fresh log (deliberately overwriting any stale file left by
    a previous execution of the same experiment); later runs append by
    concatenating with the rows already on disk and rewriting the file.

    Raises:
        FileNotFoundError: if run_number > 1 but run 1 never wrote the log —
            runs must be executed in order, so this surfaces an ordering bug.
    """
    path = get_log_out_path(result_type, experiment_name)

    if run_number == 1:
        df.write_parquet(path)
    else:
        existing = pl.read_parquet(path)
        pl.concat([existing, df]).write_parquet(path)
27
+
28
+
29
def time_elapsed(start: float, end: float) -> str:
    """Format elapsed wall-clock time for logs.

    Rounds to whole seconds *before* splitting into minutes; the original
    formatted `elapsed % 60` with ``:.0f``, which could display "60 sec"
    (e.g. 119.6 s -> "1 min, 60 sec" instead of "2 min, 0 sec").
    """
    minutes, seconds = divmod(round(end - start), 60)
    return f"{minutes} min, {seconds:.0f} sec"
36
+
37
+
38
def trim_memory() -> None:
    """Ask Python and, on Linux, libc to release unused memory."""
    gc.collect()
    if not sys.platform.startswith("linux"):
        return
    # glibc-specific; guard the symbol lookup for other libc implementations.
    libc = ctypes.CDLL("libc.so.6")
    if hasattr(libc, "malloc_trim"):
        libc.malloc_trim(0)
46
+
47
+
48
def log_process_resources(interval: float = 30) -> threading.Event:
    """Start a daemon thread that prints CPU/memory/thread stats every *interval*
    seconds; set the returned event to stop it."""
    import psutil

    stop_event = threading.Event()
    proc = psutil.Process(os.getpid())

    def _monitor():
        # Prime cpu_percent so the first reading covers a real interval
        # rather than returning a meaningless 0.0.
        proc.cpu_percent(interval=None)
        # Event.wait returns True once the event is set — that ends the loop.
        while not stop_event.wait(interval):
            mem_gb = proc.memory_info().rss / 1e9
            print(
                f"CPU: {proc.cpu_percent(interval=None):.1f}% | "
                f"Memory: {mem_gb:.2f} GB | "
                f"Threads: {proc.num_threads()}",
                flush=True,
            )

    threading.Thread(target=_monitor, daemon=True).start()
    return stop_event
76
+
77
+
78
def pl_to_csv(df: pl.DataFrame, name: str) -> None:
    """Save a transposed Polars table with variable names on the left."""
    table_dir = OUT_DIR / "tables"
    table_dir.mkdir(parents=True, exist_ok=True)
    if not name.endswith(".csv"):
        name = f"{name}.csv"

    # One row per original column, observed values spread across columns —
    # this shape is easier to drop into LaTeX-style summary tables.
    value_labels = [f"value_{i}" for i in range(1, df.height + 1)]
    df.transpose(
        include_header=True,
        header_name="variable",
        column_names=value_labels,
    ).write_csv(table_dir / name)
94
+
95
+