dml-dev 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. dml_dev-0.1.0/MANIFEST.in +4 -0
  2. dml_dev-0.1.0/PKG-INFO +35 -0
  3. dml_dev-0.1.0/README.md +8 -0
  4. dml_dev-0.1.0/agent.md +87 -0
  5. dml_dev-0.1.0/config_template/__init__.py +1 -0
  6. dml_dev-0.1.0/config_template/build_spec.py +15 -0
  7. dml_dev-0.1.0/config_template/experiments/__init__.py +2 -0
  8. dml_dev-0.1.0/config_template/experiments/example_experiment.yaml +7 -0
  9. dml_dev-0.1.0/config_template/registries/__init__.py +1 -0
  10. dml_dev-0.1.0/config_template/registries/covariate_sets.py +3 -0
  11. dml_dev-0.1.0/config_template/registries/filter_sets.py +4 -0
  12. dml_dev-0.1.0/config_template/registries/models.py +6 -0
  13. dml_dev-0.1.0/config_template/registries/programs.py +4 -0
  14. dml_dev-0.1.0/dml_dev.egg-info/PKG-INFO +35 -0
  15. dml_dev-0.1.0/dml_dev.egg-info/SOURCES.txt +29 -0
  16. dml_dev-0.1.0/dml_dev.egg-info/dependency_links.txt +1 -0
  17. dml_dev-0.1.0/dml_dev.egg-info/entry_points.txt +3 -0
  18. dml_dev-0.1.0/dml_dev.egg-info/requires.txt +13 -0
  19. dml_dev-0.1.0/dml_dev.egg-info/top_level.txt +2 -0
  20. dml_dev-0.1.0/project_code/__init__.py +2 -0
  21. dml_dev-0.1.0/project_code/pipeline/__init__.py +2 -0
  22. dml_dev-0.1.0/project_code/pipeline/build.py +61 -0
  23. dml_dev-0.1.0/project_code/pipeline/estimate.py +122 -0
  24. dml_dev-0.1.0/project_code/src/__init__.py +2 -0
  25. dml_dev-0.1.0/project_code/src/build_helpers.py +289 -0
  26. dml_dev-0.1.0/project_code/src/estimate_helpers.py +263 -0
  27. dml_dev-0.1.0/project_code/src/paths.py +35 -0
  28. dml_dev-0.1.0/project_code/src/plotting.py +253 -0
  29. dml_dev-0.1.0/project_code/src/utils.py +95 -0
  30. dml_dev-0.1.0/pyproject.toml +64 -0
  31. dml_dev-0.1.0/setup.cfg +4 -0
@@ -0,0 +1,4 @@
1
+ recursive-include project_code *.py
2
+ recursive-include config_template *.py *.yaml
3
+ include README.md
4
+ include agent.md
dml_dev-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,35 @@
1
+ Metadata-Version: 2.4
2
+ Name: dml-dev
3
+ Version: 0.1.0
4
+ Summary: DoubleML build, estimation, plotting, and utility pipelines.
5
+ Author: DML Pipeline Contributors
6
+ Keywords: administrative-data,causal-inference,doubleml,observational-data,program-evaluation
7
+ Classifier: Development Status :: 3 - Alpha
8
+ Classifier: Intended Audience :: Science/Research
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Programming Language :: Python :: 3.12
11
+ Classifier: Programming Language :: Python :: 3.13
12
+ Classifier: Topic :: Scientific/Engineering
13
+ Requires-Python: >=3.12
14
+ Description-Content-Type: text/markdown
15
+ Requires-Dist: doubleml
16
+ Requires-Dist: joblib
17
+ Requires-Dist: oi-tools[figures]
18
+ Requires-Dist: plotnine
19
+ Requires-Dist: polars
20
+ Requires-Dist: psutil
21
+ Requires-Dist: PyYAML
22
+ Requires-Dist: scikit-learn
23
+ Requires-Dist: threadpoolctl
24
+ Provides-Extra: dev
25
+ Requires-Dist: build; extra == "dev"
26
+ Requires-Dist: twine; extra == "dev"
27
+
28
+ # DML Pipeline
29
+
30
+ Reusable build, estimation, plotting, and utility code for applying DoubleML to
31
+ administrative observational data for program analysis.
32
+
33
+ The package includes the full implementation under `project_code`, plus an
34
+ empty `config_template` package that shows the required config shape without
35
+ shipping project configuration.
@@ -0,0 +1,8 @@
1
+ # DML Pipeline
2
+
3
+ Reusable build, estimation, plotting, and utility code for applying DoubleML to
4
+ administrative observational data for program analysis.
5
+
6
+ The package includes the full implementation under `project_code`, plus an
7
+ empty `config_template` package that shows the required config shape without
8
+ shipping project configuration.
dml_dev-0.1.0/agent.md ADDED
@@ -0,0 +1,87 @@
1
+ # Repository Guide
2
+
3
+ This repository contains a general-purpose Polars build pipeline and DoubleML
4
+ estimation pipeline for administrative observational data. The package exposes
5
+ the full implementation under `project_code`, including executable pipeline
6
+ entrypoints and shared helper modules.
7
+
8
+ ## Runtime Environment
9
+
10
+ Runtime locations are configured through environment variables in
11
+ `project_code/src/paths.py`.
12
+
13
+ - `DML_PIPELINE_ROOT_DIR`: project root used as the default base directory.
14
+ - `DML_PIPELINE_DATA_DIR`: source data directory.
15
+ - `DML_PIPELINE_LOCAL_DIR`: local working directory for generated data and logs.
16
+ - `DML_PIPELINE_CONFIG_DIR`: optional config directory location.
17
+
18
+ When an environment variable is not set, paths default to folders under the
19
+ current working directory.
20
+
21
+ ## Main Pipeline
22
+
23
+ ### Build Processed Panels
24
+
25
+ `project_code/pipeline/build.py` builds program-specific processed parquet
26
+ files from a configured administrative data source and program registry entry.
27
+
28
+ The script:
29
+
30
+ 1. Loads source panel files from the build spec.
31
+ 2. Loads a selected program source definition.
32
+ 3. Joins program records to the source panel on the configured join key.
33
+ 4. Constructs relative-time panels for treated and eligible comparison records.
34
+ 5. Creates relative-time variables from calendar-time source columns.
35
+ 6. Applies configured post-panel feature transforms.
36
+ 7. Writes processed parquet files for downstream estimation.
37
+
38
+ CLI usage:
39
+
40
+ ```bash
41
+ python project_code/pipeline/build.py <program_pointer>
42
+ ```
43
+
44
+ ### Estimate Effects
45
+
46
+ `project_code/pipeline/estimate.py` runs DoubleML estimation from a YAML
47
+ experiment spec.
48
+
49
+ The script:
50
+
51
+ 1. Loads an experiment YAML file.
52
+ 2. Expands registry pointer lists into concrete runs.
53
+ 3. Loads processed parquet files for the selected program.
54
+ 4. Cleans missing values and applies configured filters.
55
+ 5. Selects treatment, outcome, covariates, and join key columns.
56
+ 6. Encodes configured categorical covariates.
57
+ 7. Fits DoubleML IRM with configured outcome and propensity models.
58
+ 8. Logs estimation summaries and row-level predictions.
59
+
60
+ CLI usage:
61
+
62
+ ```bash
63
+ python project_code/pipeline/estimate.py <experiment_name>
64
+ ```
65
+
66
+ ## Configuration
67
+
68
+ Real project configuration is intentionally not packaged. The package includes
69
+ `config_template`, which shows the expected files and data types without filling
70
+ in project-specific values.
71
+
72
+ Expected config shape:
73
+
74
+ - `config/build_spec.py`: shared source data and post-panel transforms.
75
+ - `config/experiments/*.yaml`: experiment specs with registry pointer lists.
76
+ - `config/registries/programs.py`: program source registry.
77
+ - `config/registries/covariate_sets.py`: covariate set registry.
78
+ - `config/registries/filter_sets.py`: filter expression registry.
79
+ - `config/registries/models.py`: outcome and propensity model registries.
80
+
81
+ ## Shared Modules
82
+
83
+ - `project_code/src/paths.py`: path configuration and output path helpers.
84
+ - `project_code/src/build_helpers.py`: build dataclasses and panel construction helpers.
85
+ - `project_code/src/estimate_helpers.py`: experiment loading, run expansion, validation, and DoubleML fitting.
86
+ - `project_code/src/plotting.py`: diagnostic plots from estimation and prediction logs.
87
+ - `project_code/src/utils.py`: logging, table export, timing, and resource helpers.
@@ -0,0 +1 @@
1
+
@@ -0,0 +1,15 @@
1
"""Template build spec: copy into a real config and fill in project values."""

from project_code.src.build_helpers import BuildSource, BuildSpec


# Placeholder source-data definition. Real configs list the source panel
# paths plus the columns to pass through (optionally as lagged variables).
source_data = BuildSource(
    paths=[],
    passthrough_cols=[],
    passthrough_cols_as_lag=[],
)


# Top-level spec consumed by project_code/pipeline/build.py.
BUILD_SPEC = BuildSpec(
    source_data=source_data,
    programs={},  # program name -> program source definition
    post_panel_transforms=[],  # transforms applied after panel construction
)
@@ -0,0 +1,2 @@
1
+ """Example experiment YAML files."""
2
+
@@ -0,0 +1,7 @@
1
# Template experiment spec. Each key holds a list of registry pointers;
# project_code/pipeline/estimate.py expands these lists into concrete runs.
program_pointer: []
outcomes: []
covariate_set_pointer: []
filter_set_pointer: []
outcomes_model_pointer: []
propensity_model_pointer: []
num_controls_per_treat: []
@@ -0,0 +1,3 @@
1
# Registry mapping covariate-set name -> list of covariate column names.
COVARIATE_SET_REGISTRY: dict[str, list[str]] = {}

# Covariate columns to encode as categorical during estimation.
CATEGORICAL_COVARIATES: list[str] = []
@@ -0,0 +1,4 @@
1
import polars as pl


# Registry mapping filter-set name -> list of Polars filter expressions
# applied to the data before estimation.
FILTER_SET_REGISTRY: dict[str, list[pl.Expr]] = {}
@@ -0,0 +1,6 @@
1
from typing import Any


# Registry mapping model name -> outcome-model estimator. Estimators are
# expected to expose get_params() (scikit-learn style) — see estimate.py.
OUTCOMES_MODEL_REGISTRY: dict[str, Any] = {}

# Registry mapping model name -> propensity-model estimator (same contract).
PROPENSITY_MODEL_REGISTRY: dict[str, Any] = {}
@@ -0,0 +1,4 @@
1
from project_code.src.build_helpers import ProgramSource


# Registry mapping program name -> ProgramSource definition used by
# project_code/pipeline/build.py.
PROGRAM_REGISTRY: dict[str, ProgramSource] = {}
@@ -0,0 +1,35 @@
1
+ Metadata-Version: 2.4
2
+ Name: dml-dev
3
+ Version: 0.1.0
4
+ Summary: DoubleML build, estimation, plotting, and utility pipelines.
5
+ Author: DML Pipeline Contributors
6
+ Keywords: administrative-data,causal-inference,doubleml,observational-data,program-evaluation
7
+ Classifier: Development Status :: 3 - Alpha
8
+ Classifier: Intended Audience :: Science/Research
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Programming Language :: Python :: 3.12
11
+ Classifier: Programming Language :: Python :: 3.13
12
+ Classifier: Topic :: Scientific/Engineering
13
+ Requires-Python: >=3.12
14
+ Description-Content-Type: text/markdown
15
+ Requires-Dist: doubleml
16
+ Requires-Dist: joblib
17
+ Requires-Dist: oi-tools[figures]
18
+ Requires-Dist: plotnine
19
+ Requires-Dist: polars
20
+ Requires-Dist: psutil
21
+ Requires-Dist: PyYAML
22
+ Requires-Dist: scikit-learn
23
+ Requires-Dist: threadpoolctl
24
+ Provides-Extra: dev
25
+ Requires-Dist: build; extra == "dev"
26
+ Requires-Dist: twine; extra == "dev"
27
+
28
+ # DML Pipeline
29
+
30
+ Reusable build, estimation, plotting, and utility code for applying DoubleML to
31
+ administrative observational data for program analysis.
32
+
33
+ The package includes the full implementation under `project_code`, plus an
34
+ empty `config_template` package that shows the required config shape without
35
+ shipping project configuration.
@@ -0,0 +1,29 @@
1
+ MANIFEST.in
2
+ README.md
3
+ agent.md
4
+ pyproject.toml
5
+ config_template/__init__.py
6
+ config_template/build_spec.py
7
+ config_template/experiments/__init__.py
8
+ config_template/experiments/example_experiment.yaml
9
+ config_template/registries/__init__.py
10
+ config_template/registries/covariate_sets.py
11
+ config_template/registries/filter_sets.py
12
+ config_template/registries/models.py
13
+ config_template/registries/programs.py
14
+ dml_dev.egg-info/PKG-INFO
15
+ dml_dev.egg-info/SOURCES.txt
16
+ dml_dev.egg-info/dependency_links.txt
17
+ dml_dev.egg-info/entry_points.txt
18
+ dml_dev.egg-info/requires.txt
19
+ dml_dev.egg-info/top_level.txt
20
+ project_code/__init__.py
21
+ project_code/pipeline/__init__.py
22
+ project_code/pipeline/build.py
23
+ project_code/pipeline/estimate.py
24
+ project_code/src/__init__.py
25
+ project_code/src/build_helpers.py
26
+ project_code/src/estimate_helpers.py
27
+ project_code/src/paths.py
28
+ project_code/src/plotting.py
29
+ project_code/src/utils.py
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ dml-build = project_code.pipeline.build:cli
3
+ dml-estimate = project_code.pipeline.estimate:cli
@@ -0,0 +1,13 @@
1
+ doubleml
2
+ joblib
3
+ oi-tools[figures]
4
+ plotnine
5
+ polars
6
+ psutil
7
+ PyYAML
8
+ scikit-learn
9
+ threadpoolctl
10
+
11
+ [dev]
12
+ build
13
+ twine
@@ -0,0 +1,2 @@
1
+ config_template
2
+ project_code
@@ -0,0 +1,2 @@
1
+ """DML pipeline package."""
2
+
@@ -0,0 +1,2 @@
1
+ """Executable build and estimation pipelines."""
2
+
@@ -0,0 +1,61 @@
1
+ """Build processed cohort panels for a configured program.
2
+
3
+ This entrypoint loads the build spec, backs up any existing program output,
4
+ then writes one processed parquet file per cohort.
5
+ """
6
+
7
+ import argparse
8
+ import sys
9
+ import time
10
+ from pathlib import Path
11
+
12
# Make the repository root importable when this file is run as a script
# (two levels up from project_code/pipeline/).
LOCAL_DIR = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(LOCAL_DIR))
14
+
15
+ from project_code.src.build_helpers import (
16
+ backup_existing_output,
17
+ build_cohort_file,
18
+ get_post_panel_transforms,
19
+ get_program_spec,
20
+ get_source_data_spec,
21
+ time_elapsed,
22
+ )
23
+
24
+
25
def main(program: str) -> None:
    """Build processed cohort files for a single registered program.

    Loads the shared build spec and the program's registry entry, backs up
    any existing output for the program, then writes one processed parquet
    file per configured source data path.
    """

    started_at = time.time()

    # Resolve all configuration up front so a bad program name fails early.
    spec = get_source_data_spec()
    selected_program = get_program_spec(program)
    transforms = get_post_panel_transforms()

    backup_existing_output(program)

    # One cohort file per configured source path.
    for path in spec.paths:
        build_cohort_file(
            source_data_path=path,
            program=program,
            source_data_spec=spec,
            program_spec=selected_program,
            post_panel_transforms=transforms,
        )

    total_run_time = time_elapsed(started_at, time.time())
    print("\n Done")
    print(f"\n Total time: {total_run_time}")
49
+
50
+
51
def cli() -> None:
    """Parse the program name from argv and run the build pipeline."""

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("program")
    parsed = arg_parser.parse_args()
    main(program=parsed.program)


if __name__ == "__main__":
    cli()
@@ -0,0 +1,122 @@
1
+ """Run DoubleML estimation for a YAML experiment.
2
+
3
+ This entrypoint loads an experiment, expands its registry pointers into runs,
4
+ fits each run, and writes estimation and prediction logs.
5
+ """
6
+
7
import argparse
import os
import sys
import time
from datetime import datetime
from pathlib import Path

# Cap Polars' thread pool when the scheduler provides a CPU count (NCPUS).
# POLARS_MAX_THREADS is read when polars is first imported and its thread
# pool is created, so it must be set BEFORE the import below — previously
# this assignment came after `import polars` and had no effect.
if os.environ.get("NCPUS"):
    os.environ["POLARS_MAX_THREADS"] = os.environ["NCPUS"]

import polars as pl

# Make the repository root importable when this file is run as a script
# (two levels up from project_code/pipeline/).
LOCAL_DIR = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(LOCAL_DIR))
21
+
22
+ from project_code.src.estimate_helpers import (
23
+ fit_doubleml_irm,
24
+ get_experiment,
25
+ prepare_estimation_data,
26
+ unpack_runs,
27
+ validate_runs,
28
+ )
29
+ from project_code.src.utils import log_process_resources, log_results, time_elapsed, trim_memory
30
+
31
+
32
def _build_estimation_log(
    run,
    dml_obj,
    summary,
    run_number: int,
    total_run_time,
    estimation_run_time,
    estimation_run_time_hours: float,
) -> pl.DataFrame:
    """Build the one-row estimation log for a completed run.

    Captures the run configuration and model identifiers, the DoubleML
    point estimate, standard error, and nuisance losses, the data summary
    counts produced by prepare_estimation_data, and wall-clock timings.
    """
    return pl.DataFrame({
        "program": [run.program_name],
        "treatment": [run.treatment],
        "outcome": [run.outcome],
        "covariate_set_name": [run.covariate_set_pointer],
        "filter_set_name": [run.filter_set_pointer],
        "num_controls_per_treat": [run.num_controls_per_treat],
        "outcomes_model_name": [run.outcomes_model_pointer],
        "propensity_model_name": [run.propensity_model_pointer],
        "outcomes_model_class": [type(run.outcomes_model).__name__],
        "propensity_model_class": [type(run.propensity_model).__name__],
        "outcomes_model_params": [str(run.outcomes_model.get_params())],
        "propensity_model_params": [str(run.propensity_model.get_params())],
        "dml_estimate": [float(dml_obj.coef[0])],
        "dml_se": [float(dml_obj.se[0])],
        "dml_outcomes_loss": [float(dml_obj.nuisance_loss["ml_g0"][0][0])],
        "dml_prop_loss": [float(dml_obj.nuisance_loss["ml_m"][0][0])],
        "total_run_time": [total_run_time],
        "estimation_run_time": [estimation_run_time],
        "estimation_run_time_hours": [estimation_run_time_hours],
        "timestamp": [datetime.now()],
        "n_controls": [summary["n_controls"]],
        "n_unique_controls": [summary["n_unique_controls"]],
        "n_covariates": [summary["n_covariates"]],
        "n_treated": [summary["n_treated"]],
        "n_null_rows_dropped": [summary["n_null_rows_dropped"]],
        "n_rows": [summary["n_rows"]],
        "run_number": [run_number],
    })


def _build_predictions_log(df, run, dml_obj, run_number: int) -> pl.DataFrame:
    """Build the row-level predictions log: all input columns plus the
    observed outcome/treatment and the cross-fitted nuisance predictions."""
    return pl.DataFrame({
        **{col: df[col].to_numpy() for col in df.columns},
        "run_number": [run_number] * len(df),
        "true_outcomes": df[run.outcome],
        "true_propensity": df[run.treatment],
        "outcomes_predictions": dml_obj.predictions["ml_g0"][:, 0, 0],
        "propensity_predictions": dml_obj.predictions["ml_m"][:, 0, 0],
    })


def main(experiment_name: str) -> None:
    """Run all expanded estimation runs for one experiment YAML name."""

    experiment = get_experiment(experiment_name)
    runs = unpack_runs(experiment)
    validate_runs(runs)

    # Background resource logging; the returned handle's .set() stops it,
    # so it must be called even if a run raises.
    stop_resource_logging = log_process_resources(interval=30)
    try:
        for run_number, run in enumerate(runs, start=1):
            print(f"Starting run #{run_number} of {len(runs)} \n")
            start = time.time()

            df, x_cols, summary = prepare_estimation_data(run)

            start_estimation = time.time()
            dml_obj = fit_doubleml_irm(
                df=df,
                run=run,
                covariate_set_after_dummies=x_cols,
            )

            end = time.time()
            total_run_time = time_elapsed(start, end)
            estimation_run_time = time_elapsed(start_estimation, end)
            estimation_run_time_hours = (end - start_estimation) / (60 * 60)

            print("\n Starting logging...\n")
            estimation_log = _build_estimation_log(
                run=run,
                dml_obj=dml_obj,
                summary=summary,
                run_number=run_number,
                total_run_time=total_run_time,
                estimation_run_time=estimation_run_time,
                estimation_run_time_hours=estimation_run_time_hours,
            )
            predictions_log = _build_predictions_log(df, run, dml_obj, run_number)

            log_results("estimation", estimation_log, experiment_name, run_number)
            log_results("predictions", predictions_log, experiment_name, run_number)

            print(f"""\n Run #{run_number} complete
            \n Estimation run time: {estimation_run_time}
            \n Total run time: {total_run_time}\n \n""")

            # Release the largest objects before the next run's allocations.
            del dml_obj, df
            trim_memory()

    finally:
        stop_resource_logging.set()
110
+
111
+
112
def cli() -> None:
    """Parse the experiment name from argv and run estimation."""

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("experiment_name", type=str)
    parsed = arg_parser.parse_args()
    main(experiment_name=parsed.experiment_name)


if __name__ == "__main__":
    cli()
@@ -0,0 +1,2 @@
1
+ """Shared helpers for the DML pipelines."""
2
+