kmds-featurization 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. kmds_featurization-0.1.0/PKG-INFO +178 -0
  2. kmds_featurization-0.1.0/README.md +154 -0
  3. kmds_featurization-0.1.0/pyproject.toml +43 -0
  4. kmds_featurization-0.1.0/setup.cfg +4 -0
  5. kmds_featurization-0.1.0/src/featurization/__init__.py +1 -0
  6. kmds_featurization-0.1.0/src/featurization/cli.py +131 -0
  7. kmds_featurization-0.1.0/src/featurization/core/__init__.py +1 -0
  8. kmds_featurization-0.1.0/src/featurization/core/data_loader.py +37 -0
  9. kmds_featurization-0.1.0/src/featurization/core/featurization_init.py +110 -0
  10. kmds_featurization-0.1.0/src/featurization/core/path_coordinator.py +243 -0
  11. kmds_featurization-0.1.0/src/featurization/core/sequential_pipeline_runner.py +190 -0
  12. kmds_featurization-0.1.0/src/featurization/py.typed +0 -0
  13. kmds_featurization-0.1.0/src/featurization/transforms/filters.py +31 -0
  14. kmds_featurization-0.1.0/src/featurization/transforms/geo.py +67 -0
  15. kmds_featurization-0.1.0/src/featurization/utils.py +49 -0
  16. kmds_featurization-0.1.0/src/kmds_featurization.egg-info/PKG-INFO +178 -0
  17. kmds_featurization-0.1.0/src/kmds_featurization.egg-info/SOURCES.txt +36 -0
  18. kmds_featurization-0.1.0/src/kmds_featurization.egg-info/dependency_links.txt +1 -0
  19. kmds_featurization-0.1.0/src/kmds_featurization.egg-info/entry_points.txt +2 -0
  20. kmds_featurization-0.1.0/src/kmds_featurization.egg-info/requires.txt +9 -0
  21. kmds_featurization-0.1.0/src/kmds_featurization.egg-info/top_level.txt +2 -0
  22. kmds_featurization-0.1.0/src/tabular/__init__.py +1 -0
  23. kmds_featurization-0.1.0/src/tabular/attribute_derivation.py +11 -0
  24. kmds_featurization-0.1.0/src/tabular/entity_tagging.py +61 -0
  25. kmds_featurization-0.1.0/src/tabular/feature_space.py +335 -0
  26. kmds_featurization-0.1.0/src/tabular/hierarchical_low_count_var_encoding.py +115 -0
  27. kmds_featurization-0.1.0/src/tabular/low_count_cat_var_encoding.py +46 -0
  28. kmds_featurization-0.1.0/src/tabular/merge_ops.py +24 -0
  29. kmds_featurization-0.1.0/src/tabular/modeling_filter.py +64 -0
  30. kmds_featurization-0.1.0/src/tabular/target_encoding.py +49 -0
  31. kmds_featurization-0.1.0/src/tabular/train_val_split.py +35 -0
  32. kmds_featurization-0.1.0/tests/test_feature_space.py +96 -0
  33. kmds_featurization-0.1.0/tests/test_initialization.py +78 -0
  34. kmds_featurization-0.1.0/tests/test_leakage_guards.py +68 -0
  35. kmds_featurization-0.1.0/tests/test_low_count_cat_var_encoding.py +52 -0
  36. kmds_featurization-0.1.0/tests/test_merge_ops.py +36 -0
  37. kmds_featurization-0.1.0/tests/test_sba_pipeline.py +113 -0
  38. kmds_featurization-0.1.0/tests/test_utils.py +44 -0
@@ -0,0 +1,178 @@
1
+ Metadata-Version: 2.4
2
+ Name: kmds-featurization
3
+ Version: 0.1.0
4
+ Summary: KMDS Featurization Service and Orchestration Pipeline
5
+ License-Expression: LicenseRef-Proprietary
6
+ Classifier: Programming Language :: Python :: 3
7
+ Classifier: Programming Language :: Python :: 3 :: Only
8
+ Classifier: Programming Language :: Python :: 3.10
9
+ Classifier: Programming Language :: Python :: 3.11
10
+ Classifier: Programming Language :: Python :: 3.12
11
+ Classifier: Programming Language :: Python :: 3.13
12
+ Classifier: Operating System :: OS Independent
13
+ Requires-Python: >=3.10
14
+ Description-Content-Type: text/markdown
15
+ Requires-Dist: pandas
16
+ Requires-Dist: numpy
17
+ Requires-Dist: pyyaml
18
+ Requires-Dist: scikit-learn
19
+ Requires-Dist: pgeocode
20
+ Requires-Dist: category-encoders
21
+ Requires-Dist: kneed
22
+ Requires-Dist: matplotlib
23
+ Requires-Dist: tabulate>=0.10.0
24
+
25
+ # KMDS Featurization
26
+
27
+ This repository provides a configurable, stage-based featurization engine for SBA modeling workflows.
28
+
29
+ The design goal is simple:
30
+ - keep stage logic understandable and composable
31
+ - keep orchestration/configuration centralized
32
+ - keep modeling flow leakage-safe (fit on train only, reuse on val/active)
33
+
34
+ ## What This Produces
35
+
36
+ The pipeline writes two CSV outputs:
37
+ - featurized_data.csv: consolidated engineered dataset (modeled + active partitions)
38
+ - model_ready_numeric_data.csv: numeric model-ready export from the final stage output
39
+
40
+ Additional diagnostic artifact:
41
+ - feature_selection_knee_curve.png: ranked feature-importance knee plot saved in the featurization output directory
42
+
43
+ For the current SBA flow, the model-ready dataset is:
44
+ - numeric/bool only
45
+ - train-fitted feature-selected
46
+ - schema-aligned across train/val/active
47
+ - persisted with index=False (no index artifact column)
48
+
49
+ ## Core Concepts
50
+
51
+ - Anchor index: record_id
52
+ - Stage contract: method(context, stage_cfg) -> DataFrame
53
+ - Waterfall behavior: each stage can shrink survivor rows by index
54
+ - Horizontal feature assembly: stage outputs are concatenated by index
55
+ - Controlled index expansion: only stages marked allow_new_indices may re-introduce rows
56
+
57
+ ## Pipeline Layout (Current Hybrid Design)
58
+
59
+ Front section (feature assembly):
60
+ 1. record_id_definition
61
+ 2. borrower_geo_coding
62
+ 3. prepare_categorical_data
63
+ 4. prepare_numerical_data
64
+ 5. merge_categorical_and_numerical
65
+ 6. merge_with_borrower_geo
66
+
67
+ Merge stage design:
68
+ - package component: src/tabular/merge_ops.py
69
+ - user wrappers: featurization_scripts/featurization.py
70
+ - merge key: record_id index
71
+
72
+ Leakage-safe modeling section:
73
+ 7. low_count_featurization_of_cat_vars
74
+ 8. hierarchical_low_count_var_encoding
75
+ 9. loan_status_recoding
76
+ 10. filter_modeling_universe
77
+ 11. stratified_train_val_split
78
+ 12. target_encode_categorical_vars
79
+ 13. harmonize_and_project_feature_space
80
+ 14. merge_modeled_and_active_partitions
81
+
82
+ Current encoding rule:
83
+ - if both raw and rarity-corrected categorical variants exist (x and x_rcs), only x_rcs is target-encoded
84
+
85
+ ## Tree-Based Feature Selection
86
+
87
+ Feature selection runs in harmonize_and_project_feature_space using train rows only.
88
+
89
+ Supported selector modes:
90
+ - threshold
91
+ - tree_ensemble
92
+
93
+ Supported tree models:
94
+ - gbm
95
+ - random_forest
96
+ - xgboost (optional dependency)
97
+
98
+ All selector choices are config-driven via featurizer_config.yaml and surfaced through PathCoordinator (no stage-level hardcoded constants).
99
+
100
+ Feature-count tuning for kneedle mode:
101
+ - FEATURE_SELECTION_TOP_K_MODE: kneedle
102
+ - FEATURE_SELECTION_TOP_K_MIN_RATIO: conservative default floor, e.g. 0.5
103
+ - FEATURE_SELECTION_MIN_FEATURE_COUNT: hard floor for retained features
104
+ - FEATURE_SELECTION_TARGET_FEATURE_COUNT: explicit count override when the curve is too aggressive
105
+ - FEATURE_SELECTION_REQUIRE_KNEEDLE: fail loudly if the knee cannot be determined
106
+
107
+ ## Repository Organization
108
+
109
+ - src/featurization/core: orchestration, configuration bootstrap, path resolution
110
+ - src/featurization/transforms: reusable transformation primitives
111
+ - src/tabular: reusable tabular feature modules (encoding, splitting, feature space)
112
+ - src/tabular/merge_ops.py: reusable index-aligned tabular merge helper
113
+ - tests: package-level smoke and behavior checks
114
+ - documents: architecture and configuration references
115
+
116
+ ## Package Component Buckets
117
+
118
+ The tabular package modules are intentionally split into two modeling buckets plus an assembly bucket:
119
+
120
+ - Row-selection components:
121
+ - src/tabular/modeling_filter.py
122
+ - src/tabular/train_val_split.py
123
+ - Purpose: decide which records participate in training and how records are partitioned.
124
+
125
+ - Column-selection components:
126
+ - src/tabular/feature_space.py
127
+ - src/tabular/target_encoding.py
128
+ - src/tabular/low_count_cat_var_encoding.py
129
+ - src/tabular/hierarchical_low_count_var_encoding.py
130
+ - Purpose: decide which feature columns are engineered, selected, encoded, and projected.
131
+
132
+ - Assembly components:
133
+ - src/tabular/merge_ops.py
134
+ - Purpose: index-aligned horizontal composition of prepared payloads.
135
+
136
+ ## CLI
137
+
138
+ Initialize config:
139
+
140
+ ```bash
141
+ featurization-cli init \
142
+ --working-dir /path/to/workspace \
143
+ --metadata-file sba_loans_metadata_table.csv \
144
+ --data-file sba_loans_user_cleaned.csv
145
+ ```
146
+
147
+ Run pipeline:
148
+
149
+ ```bash
150
+ featurization-cli run --working-dir /path/to/workspace
151
+ ```
152
+
153
+ Run smoke test in this repo:
154
+
155
+ ```bash
156
+ pytest -q tests/test_sba_pipeline.py
157
+ ```
158
+
159
+ ## How To Extend Safely
160
+
161
+ 1. Add reusable logic in src/tabular first whenever possible.
162
+ 2. Keep stage wrappers in workspace featurization_scripts/featurization.py thin and explicit.
163
+ 3. Add new tunables to:
164
+ - featurizer_config.yaml
165
+ - src/featurization/core/path_coordinator.py
166
+ - src/featurization/core/featurization_init.py
167
+ 4. Preserve leakage rules:
168
+ - fit artifacts on train only
169
+ - transform val/active using train-fitted artifacts
170
+ 5. Validate with tests after each change.
171
+
172
+ ## Recommended Read Order
173
+
174
+ 1. documents/sba_pipeline_featurization.md
175
+ 2. documents/config_blueprint.md
176
+ 3. documents/path_coordinator_function.md
177
+ 4. src/featurization/core/sequential_pipeline_runner.py
178
+ 5. src/tabular/feature_space.py
@@ -0,0 +1,154 @@
1
+ # KMDS Featurization
2
+
3
+ This repository provides a configurable, stage-based featurization engine for SBA modeling workflows.
4
+
5
+ The design goal is simple:
6
+ - keep stage logic understandable and composable
7
+ - keep orchestration/configuration centralized
8
+ - keep modeling flow leakage-safe (fit on train only, reuse on val/active)
9
+
10
+ ## What This Produces
11
+
12
+ The pipeline writes two CSV outputs:
13
+ - featurized_data.csv: consolidated engineered dataset (modeled + active partitions)
14
+ - model_ready_numeric_data.csv: numeric model-ready export from the final stage output
15
+
16
+ Additional diagnostic artifact:
17
+ - feature_selection_knee_curve.png: ranked feature-importance knee plot saved in the featurization output directory
18
+
19
+ For the current SBA flow, the model-ready dataset is:
20
+ - numeric/bool only
21
+ - train-fitted feature-selected
22
+ - schema-aligned across train/val/active
23
+ - persisted with index=False (no index artifact column)
24
+
25
+ ## Core Concepts
26
+
27
+ - Anchor index: record_id
28
+ - Stage contract: method(context, stage_cfg) -> DataFrame
29
+ - Waterfall behavior: each stage can shrink survivor rows by index
30
+ - Horizontal feature assembly: stage outputs are concatenated by index
31
+ - Controlled index expansion: only stages marked allow_new_indices may re-introduce rows
32
+
33
+ ## Pipeline Layout (Current Hybrid Design)
34
+
35
+ Front section (feature assembly):
36
+ 1. record_id_definition
37
+ 2. borrower_geo_coding
38
+ 3. prepare_categorical_data
39
+ 4. prepare_numerical_data
40
+ 5. merge_categorical_and_numerical
41
+ 6. merge_with_borrower_geo
42
+
43
+ Merge stage design:
44
+ - package component: src/tabular/merge_ops.py
45
+ - user wrappers: featurization_scripts/featurization.py
46
+ - merge key: record_id index
47
+
48
+ Leakage-safe modeling section:
49
+ 7. low_count_featurization_of_cat_vars
50
+ 8. hierarchical_low_count_var_encoding
51
+ 9. loan_status_recoding
52
+ 10. filter_modeling_universe
53
+ 11. stratified_train_val_split
54
+ 12. target_encode_categorical_vars
55
+ 13. harmonize_and_project_feature_space
56
+ 14. merge_modeled_and_active_partitions
57
+
58
+ Current encoding rule:
59
+ - if both raw and rarity-corrected categorical variants exist (x and x_rcs), only x_rcs is target-encoded
60
+
61
+ ## Tree-Based Feature Selection
62
+
63
+ Feature selection runs in harmonize_and_project_feature_space using train rows only.
64
+
65
+ Supported selector modes:
66
+ - threshold
67
+ - tree_ensemble
68
+
69
+ Supported tree models:
70
+ - gbm
71
+ - random_forest
72
+ - xgboost (optional dependency)
73
+
74
+ All selector choices are config-driven via featurizer_config.yaml and surfaced through PathCoordinator (no stage-level hardcoded constants).
75
+
76
+ Feature-count tuning for kneedle mode:
77
+ - FEATURE_SELECTION_TOP_K_MODE: kneedle
78
+ - FEATURE_SELECTION_TOP_K_MIN_RATIO: conservative default floor, e.g. 0.5
79
+ - FEATURE_SELECTION_MIN_FEATURE_COUNT: hard floor for retained features
80
+ - FEATURE_SELECTION_TARGET_FEATURE_COUNT: explicit count override when the curve is too aggressive
81
+ - FEATURE_SELECTION_REQUIRE_KNEEDLE: fail loudly if the knee cannot be determined
82
+
83
+ ## Repository Organization
84
+
85
+ - src/featurization/core: orchestration, configuration bootstrap, path resolution
86
+ - src/featurization/transforms: reusable transformation primitives
87
+ - src/tabular: reusable tabular feature modules (encoding, splitting, feature space)
88
+ - src/tabular/merge_ops.py: reusable index-aligned tabular merge helper
89
+ - tests: package-level smoke and behavior checks
90
+ - documents: architecture and configuration references
91
+
92
+ ## Package Component Buckets
93
+
94
+ The tabular package modules are intentionally split into two modeling buckets plus an assembly bucket:
95
+
96
+ - Row-selection components:
97
+ - src/tabular/modeling_filter.py
98
+ - src/tabular/train_val_split.py
99
+ - Purpose: decide which records participate in training and how records are partitioned.
100
+
101
+ - Column-selection components:
102
+ - src/tabular/feature_space.py
103
+ - src/tabular/target_encoding.py
104
+ - src/tabular/low_count_cat_var_encoding.py
105
+ - src/tabular/hierarchical_low_count_var_encoding.py
106
+ - Purpose: decide which feature columns are engineered, selected, encoded, and projected.
107
+
108
+ - Assembly components:
109
+ - src/tabular/merge_ops.py
110
+ - Purpose: index-aligned horizontal composition of prepared payloads.
111
+
112
+ ## CLI
113
+
114
+ Initialize config:
115
+
116
+ ```bash
117
+ featurization-cli init \
118
+ --working-dir /path/to/workspace \
119
+ --metadata-file sba_loans_metadata_table.csv \
120
+ --data-file sba_loans_user_cleaned.csv
121
+ ```
122
+
123
+ Run pipeline:
124
+
125
+ ```bash
126
+ featurization-cli run --working-dir /path/to/workspace
127
+ ```
128
+
129
+ Run smoke test in this repo:
130
+
131
+ ```bash
132
+ pytest -q tests/test_sba_pipeline.py
133
+ ```
134
+
135
+ ## How To Extend Safely
136
+
137
+ 1. Add reusable logic in src/tabular first whenever possible.
138
+ 2. Keep stage wrappers in workspace featurization_scripts/featurization.py thin and explicit.
139
+ 3. Add new tunables to:
140
+ - featurizer_config.yaml
141
+ - src/featurization/core/path_coordinator.py
142
+ - src/featurization/core/featurization_init.py
143
+ 4. Preserve leakage rules:
144
+ - fit artifacts on train only
145
+ - transform val/active using train-fitted artifacts
146
+ 5. Validate with tests after each change.
147
+
148
+ ## Recommended Read Order
149
+
150
+ 1. documents/sba_pipeline_featurization.md
151
+ 2. documents/config_blueprint.md
152
+ 3. documents/path_coordinator_function.md
153
+ 4. src/featurization/core/sequential_pipeline_runner.py
154
+ 5. src/tabular/feature_space.py
@@ -0,0 +1,43 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "kmds-featurization"
7
+ version = "0.1.0"
8
+ description = "KMDS Featurization Service and Orchestration Pipeline"
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = "LicenseRef-Proprietary"
12
+ classifiers = [
13
+ "Programming Language :: Python :: 3",
14
+ "Programming Language :: Python :: 3 :: Only",
15
+ "Programming Language :: Python :: 3.10",
16
+ "Programming Language :: Python :: 3.11",
17
+ "Programming Language :: Python :: 3.12",
18
+ "Programming Language :: Python :: 3.13",
19
+ "Operating System :: OS Independent",
20
+ ]
21
+ dependencies = [
22
+ "pandas",
23
+ "numpy",
24
+ "pyyaml",
25
+ "scikit-learn",
26
+ "pgeocode",
27
+ "category-encoders",
28
+ "kneed",
29
+ "matplotlib",
30
+ "tabulate>=0.10.0",
31
+ ]
32
+
33
+ [project.scripts]
34
+ featurization-cli = "featurization.cli:main"
35
+
36
+ [tool.setuptools]
37
+ package-dir = {"" = "src"}
38
+
39
+ [tool.setuptools.packages.find]
40
+ where = ["src"]
41
+
42
+ [tool.pytest.ini_options]
43
+ pythonpath = ["src"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1 @@
1
+ # KMDS Tabular Utilities Package
@@ -0,0 +1,131 @@
1
+ import argparse
2
+ import sys
3
+ import os
4
+ import yaml
5
+ from featurization.core.sequential_pipeline_runner import PipelineRunner
6
+ from featurization.core.featurization_init import initialize_config
7
+ from featurization.core.path_coordinator import PathCoordinator
8
+
9
def main():
    """Entry point for the ``featurization-cli`` command-line tool.

    Sub-commands:
      * ``init``      -- bootstrap the workspace config via ``initialize_config``.
      * ``add-stage`` -- accepted by the parser but not implemented here; it is
                         reported as an error instead of silently doing nothing.
      * ``check``     -- report whether a named stage is configured.
      * ``run``       -- load the config and call
                         ``PipelineRunner.accumulate_stages()``.

    With no sub-command the help text is printed and the process exits with
    status 0. A failing command prints a message and exits with status 1.
    """
    parser = argparse.ArgumentParser(
        description="Featurization Service Shell - kmds-data-helper Ecosystem"
    )
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # --- Init Command ---
    init_parser = subparsers.add_parser(
        "init", help="Initialize a featurization workspace configuration."
    )
    init_parser.add_argument(
        "--working-dir",
        required=True,
        help="Path to the workflow directory.",
    )
    init_parser.add_argument(
        "--metadata-file",
        required=True,
        help="The dd-parser-cleaner summary CSV filename.",
    )
    init_parser.add_argument(
        "--data-file",
        required=True,
        help="The cleaned data CSV filename.",
    )
    init_parser.add_argument(
        "--structural-type",
        choices=["cross-sectional", "longitudinal", "panel"],
        default="cross-sectional",
        help="The structural type of the dataset (cross-sectional, longitudinal, or panel).",
    )

    # --- Add-Stage Command ---
    add_parser = subparsers.add_parser("add-stage", help="Add a custom stage to the pipeline.")
    add_parser.add_argument("--name", required=True, help="Name of the stage.")
    add_parser.add_argument(
        "--entity", required=True, help="Entity category in metadata (e.g., geographical)."
    )
    add_parser.add_argument(
        "--sub-filter", required=True, help="Specific sub-filter group (e.g., Borrower)."
    )
    add_parser.add_argument("--config", default="featurizer_config.yaml", help="Config filename.")

    # --- Check Command ---
    check_parser = subparsers.add_parser("check", help="Check if a stage is configured.")
    check_parser.add_argument("--name", required=True, help="Stage name to verify.")
    check_parser.add_argument("--config", default="featurizer_config.yaml", help="Config filename.")

    # --- Run Command ---
    run_parser = subparsers.add_parser("run", help="Execute the featurization pipeline.")
    run_parser.add_argument(
        "--working-dir",
        type=str,
        required=True,
        help="Path to the active project working directory containing configurations and datasets.",
    )
    # NOTE(review): --target-col is parsed but never forwarded to PipelineRunner
    # by this function; confirm whether the runner should receive it.
    run_parser.add_argument(
        "--target-col",
        type=str,
        default="target",
        help="Name of the machine learning target prediction column.",
    )
    run_parser.add_argument(
        "--config",
        type=str,
        default="featurizer_config.yaml",
        help="Path to the environment config layout file.",
    )

    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        sys.exit(0)

    # sys.exit() raises SystemExit, which is not an Exception subclass, so the
    # explicit exits below pass straight through the generic handler.
    try:
        if args.command == "init":
            if not os.path.isdir(args.working_dir):
                print(f"❌ Error: Provided working directory does not exist: {args.working_dir}")
                sys.exit(1)
            initialize_config(
                args.working_dir,
                args.metadata_file,
                args.data_file,
                args.structural_type,
            )
            print("✅ Initialization Complete.")
            return

        if args.command == "add-stage":
            # The parser advertises this command but no handler exists; fail
            # loudly so callers do not assume a stage was added.
            print("❌ Error: The 'add-stage' command is not implemented yet.")
            sys.exit(1)

        if args.command == "check":
            if not os.path.exists(args.config):
                print(f"❌ Config file '{args.config}' not found.")
                sys.exit(1)
            with open(args.config, "r", encoding="utf-8") as f:
                # An empty YAML document loads as None; treat it as an empty mapping.
                config = yaml.safe_load(f) or {}
            working_dir = config.get("working_dir", os.getcwd())
            resolver = PathCoordinator(working_dir, config)

            if resolver.is_stage_configured(args.name):
                print(f"✅ Stage '{args.name}' is configured in the pipeline.")
            else:
                print(f"❌ Stage '{args.name}' is NOT configured.")
                sys.exit(1)
            return

        if args.command == "run":
            # Prefer the config inside the working directory; otherwise fall
            # back to the path exactly as given on the command line.
            config_path = os.path.join(args.working_dir, args.config)
            if not os.path.exists(config_path):
                config_path = args.config

            if not os.path.exists(config_path):
                print(f"❌ Error: Config file not found at {config_path}")
                sys.exit(1)

            with open(config_path, "r", encoding="utf-8") as f:
                config = yaml.safe_load(f)

            # Initialize the runner and execute accumulation
            runner = PipelineRunner(working_dir=args.working_dir, config=config)
            runner.accumulate_stages()
            # runner.accumulate_stages() now handles persistence internally via PathCoordinator
            return
    except Exception as e:
        print(f"💥 Pipeline Execution Failed: {str(e)}")
        sys.exit(1)
129
+
130
+ if __name__ == "__main__":
131
+ main()
@@ -0,0 +1 @@
1
+ # KMDS Core Infrastructure Package
@@ -0,0 +1,37 @@
1
+ import os
2
+ import pandas as pd
3
+ from featurization.utils import load_kmds_metadata
4
+
5
class KMDSDataLoader:
    """
    Package Component: single access point for the cleaned dataset and its metadata.

    Both frames are read on first access and cached on the instance, so later
    accesses return the already-loaded object without touching the disk again.
    File locations come from the ``resolver`` object passed to the constructor
    (its ``featurization_input_path`` and ``metadata_path`` attributes).
    """

    def __init__(self, resolver):
        # Caches start empty; each property fills its own slot on first use.
        self._metadata = None
        self._data = None
        self.resolver = resolver

    @property
    def data(self) -> pd.DataFrame:
        """Return the primary cleaned dataset, reading the CSV on first access.

        Raises:
            FileNotFoundError: if the resolver's input path does not exist.
        """
        if self._data is not None:
            return self._data

        source = self.resolver.featurization_input_path
        if not os.path.exists(source):
            raise FileNotFoundError(f"Source data not found at: {source}")

        print(f"📥 [Data Loader] Reading cleaned data: {os.path.basename(source)}")
        self._data = pd.read_csv(source, low_memory=False)
        return self._data

    @property
    def metadata(self) -> pd.DataFrame:
        """Return the KMDS metadata (data dictionary), loading it on first access.

        A missing metadata file is not an error: a warning is printed and an
        empty DataFrame is cached and returned.
        """
        if self._metadata is not None:
            return self._metadata

        source = self.resolver.metadata_path
        if os.path.exists(source):
            print(f"📥 [Data Loader] Reading metadata: {os.path.basename(source)}")
            self._metadata = load_kmds_metadata(source)
        else:
            print(f"⚠️ [Data Loader] Metadata missing at {source}. Returning empty frame.")
            self._metadata = pd.DataFrame()
        return self._metadata
@@ -0,0 +1,110 @@
1
+ import os
2
+ import yaml
3
+
4
def initialize_config(
    working_dir: str,
    metadata_file: str,
    data_file: str,
    structural_type: str = "cross-sectional",
    config_name: str = "featurizer_config.yaml",
) -> None:
    """
    Bootstraps a new featurization workspace by persisting key project anchors
    from dd-parser-cleaner into the featurizer_config.yaml.

    Anchor values supplied by the caller (``working_dir``, ``metadata_file``,
    ``data_file``, ``structural_type``) always overwrite what an existing
    config holds. Every other setting keeps its existing value when present
    and otherwise receives the default listed below. Keys in the existing
    config that this function does not know about are carried over unchanged.

    Args:
        working_dir: Workspace root; stored as an absolute path.
        metadata_file: Metadata CSV filename, stored under ``metadata_file``.
        data_file: Cleaned data CSV filename, stored under
            ``featurization_input_data``.
        structural_type: Dataset structural type stored in the config.
        config_name: Name of the YAML file written inside ``working_dir``.

    Raises:
        ValueError: if an existing config file does not contain a YAML mapping
            at its top level.

    Side effects:
        Creates the standard workspace directories under ``working_dir``,
        writes the config to ``<working_dir>/<config_name>``, and prints a
        short summary.
    """
    abs_working_dir = os.path.abspath(working_dir)
    config_path = os.path.join(abs_working_dir, config_name)

    # 1. Load existing config if available (check target path or current directory)
    existing_config = {}
    search_path = config_path if os.path.exists(config_path) else config_name
    if os.path.exists(search_path):
        with open(search_path, "r", encoding="utf-8") as f:
            existing_config = yaml.safe_load(f) or {}
        if not isinstance(existing_config, dict):
            raise ValueError(
                f"Existing config at {search_path} must be a YAML mapping, "
                f"got {type(existing_config).__name__}"
            )

    # Existing value if the key is present, otherwise the supplied default.
    keep = existing_config.get

    # 2. Define final config, preferring existing values for non-anchor settings.
    #    Key order is significant: the file is dumped with sort_keys=False.
    final_config = {
        "working_dir": abs_working_dir,
        "pipeline": keep("pipeline", []),  # New pipeline structure
        "metadata_file": metadata_file,
        "dd_cleaner_output_dir": keep("dd_cleaner_output_dir", "dd_cleaner"),
        # Anchor: the caller-supplied data file always wins, matching how
        # metadata_file is handled and what the summary below reports.
        "featurization_input_data": data_file,
        "featurization_output_dir": keep("featurization_output_dir", "featurization"),
        "quarantine_dir": keep("quarantine_dir", "featurization/quarantine"),
        "featurized_data_file": keep("featurized_data_file", "featurized_data.csv"),
        "model_ready_data_file": keep("model_ready_data_file", "model_ready_numeric_data.csv"),
        "feat_doc_directory": keep("feat_doc_directory", "featurization_docs"),
        "entity_assignment_output": keep("entity_assignment_output", "entity_assignments.md"),
        "script_dir": keep("script_dir", "featurization_scripts"),  # New script directory key
        "script_name": keep("script_name", "featurization.py"),  # New script file name key
        "country_code": keep("country_code", "us"),
        "structural_type": structural_type,
        "VALIDATION_SIZE": keep("VALIDATION_SIZE", 0.2),
        "FEATURE_SELECTION_MIN_NON_NULL_RATE": keep("FEATURE_SELECTION_MIN_NON_NULL_RATE", 0.01),
        "FEATURE_SELECTION_METHOD": keep("FEATURE_SELECTION_METHOD", "tree_ensemble"),
        "FEATURE_SELECTION_TOP_K": keep("FEATURE_SELECTION_TOP_K", 50),
        "FEATURE_SELECTION_TOP_K_MODE": keep("FEATURE_SELECTION_TOP_K_MODE", "fixed"),
        "FEATURE_SELECTION_TOP_K_MIN": keep("FEATURE_SELECTION_TOP_K_MIN", 1),
        "FEATURE_SELECTION_TOP_K_MIN_RATIO": keep("FEATURE_SELECTION_TOP_K_MIN_RATIO", 0.0),
        "FEATURE_SELECTION_MIN_FEATURE_COUNT": keep("FEATURE_SELECTION_MIN_FEATURE_COUNT", 0),
        "FEATURE_SELECTION_TOP_K_MAX": keep("FEATURE_SELECTION_TOP_K_MAX", 0),
        "FEATURE_SELECTION_TARGET_FEATURE_COUNT": keep("FEATURE_SELECTION_TARGET_FEATURE_COUNT", 0),
        "FEATURE_SELECTION_KNEEDLE_SENSITIVITY": keep("FEATURE_SELECTION_KNEEDLE_SENSITIVITY", 1.0),
        "FEATURE_SELECTION_KNEEDLE_CURVE": keep("FEATURE_SELECTION_KNEEDLE_CURVE", "convex"),
        "FEATURE_SELECTION_KNEEDLE_DIRECTION": keep(
            "FEATURE_SELECTION_KNEEDLE_DIRECTION", "decreasing"
        ),
        "FEATURE_SELECTION_REQUIRE_KNEEDLE": keep("FEATURE_SELECTION_REQUIRE_KNEEDLE", False),
        "FEATURE_SELECTION_IMPORTANCE_FLOOR": keep("FEATURE_SELECTION_IMPORTANCE_FLOOR", 0.0),
        "FEATURE_SELECTION_TREE_MODEL": keep("FEATURE_SELECTION_TREE_MODEL", "gbm"),
        "FEATURE_SELECTION_TREE_N_ESTIMATORS": keep("FEATURE_SELECTION_TREE_N_ESTIMATORS", 200),
        "FEATURE_SELECTION_TREE_LEARNING_RATE": keep("FEATURE_SELECTION_TREE_LEARNING_RATE", 0.05),
        "FEATURE_SELECTION_TREE_MAX_DEPTH": keep("FEATURE_SELECTION_TREE_MAX_DEPTH", 3),
        "FEATURE_SELECTION_TREE_SUBSAMPLE": keep("FEATURE_SELECTION_TREE_SUBSAMPLE", 0.8),
        "FEATURE_SELECTION_TREE_RANDOM_STATE": keep("FEATURE_SELECTION_TREE_RANDOM_STATE", 42),
        "MODEL_READY_NUMERIC_ONLY": keep("MODEL_READY_NUMERIC_ONLY", True),
    }

    # 3. Preserve any custom stage definitions or extra settings from the existing config
    for key, value in existing_config.items():
        if key not in final_config:
            final_config[key] = value

    # 4. Ensure standard directory structure exists relative to finalized working_dir
    # NOTE(review): the scripts folder is created under its fixed default name
    # even when "script_dir" is customised in the config -- confirm intent.
    required_dirs = [
        "data",
        "documents",
        "notebooks",
        "featurization_scripts",
        os.path.join("data", final_config["dd_cleaner_output_dir"]),
        os.path.join("data", final_config["featurization_output_dir"]),
        os.path.join("data", final_config["quarantine_dir"]),
        os.path.join("documents", final_config["feat_doc_directory"]),
    ]
    for folder in required_dirs:
        os.makedirs(os.path.join(abs_working_dir, folder), exist_ok=True)

    with open(config_path, "w", encoding="utf-8") as f:
        yaml.safe_dump(final_config, f, default_flow_style=False, sort_keys=False)

    print(f"✨ Workspace Initialized: {config_path}")
    print(f" - Metadata Anchor: {metadata_file}")
    print(f" - Cleaned Data Anchor: {data_file}")