kmds-featurization 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kmds_featurization-0.1.0/PKG-INFO +178 -0
- kmds_featurization-0.1.0/README.md +154 -0
- kmds_featurization-0.1.0/pyproject.toml +43 -0
- kmds_featurization-0.1.0/setup.cfg +4 -0
- kmds_featurization-0.1.0/src/featurization/__init__.py +1 -0
- kmds_featurization-0.1.0/src/featurization/cli.py +131 -0
- kmds_featurization-0.1.0/src/featurization/core/__init__.py +1 -0
- kmds_featurization-0.1.0/src/featurization/core/data_loader.py +37 -0
- kmds_featurization-0.1.0/src/featurization/core/featurization_init.py +110 -0
- kmds_featurization-0.1.0/src/featurization/core/path_coordinator.py +243 -0
- kmds_featurization-0.1.0/src/featurization/core/sequential_pipeline_runner.py +190 -0
- kmds_featurization-0.1.0/src/featurization/py.typed +0 -0
- kmds_featurization-0.1.0/src/featurization/transforms/filters.py +31 -0
- kmds_featurization-0.1.0/src/featurization/transforms/geo.py +67 -0
- kmds_featurization-0.1.0/src/featurization/utils.py +49 -0
- kmds_featurization-0.1.0/src/kmds_featurization.egg-info/PKG-INFO +178 -0
- kmds_featurization-0.1.0/src/kmds_featurization.egg-info/SOURCES.txt +36 -0
- kmds_featurization-0.1.0/src/kmds_featurization.egg-info/dependency_links.txt +1 -0
- kmds_featurization-0.1.0/src/kmds_featurization.egg-info/entry_points.txt +2 -0
- kmds_featurization-0.1.0/src/kmds_featurization.egg-info/requires.txt +9 -0
- kmds_featurization-0.1.0/src/kmds_featurization.egg-info/top_level.txt +2 -0
- kmds_featurization-0.1.0/src/tabular/__init__.py +1 -0
- kmds_featurization-0.1.0/src/tabular/attribute_derivation.py +11 -0
- kmds_featurization-0.1.0/src/tabular/entity_tagging.py +61 -0
- kmds_featurization-0.1.0/src/tabular/feature_space.py +335 -0
- kmds_featurization-0.1.0/src/tabular/hierarchical_low_count_var_encoding.py +115 -0
- kmds_featurization-0.1.0/src/tabular/low_count_cat_var_encoding.py +46 -0
- kmds_featurization-0.1.0/src/tabular/merge_ops.py +24 -0
- kmds_featurization-0.1.0/src/tabular/modeling_filter.py +64 -0
- kmds_featurization-0.1.0/src/tabular/target_encoding.py +49 -0
- kmds_featurization-0.1.0/src/tabular/train_val_split.py +35 -0
- kmds_featurization-0.1.0/tests/test_feature_space.py +96 -0
- kmds_featurization-0.1.0/tests/test_initialization.py +78 -0
- kmds_featurization-0.1.0/tests/test_leakage_guards.py +68 -0
- kmds_featurization-0.1.0/tests/test_low_count_cat_var_encoding.py +52 -0
- kmds_featurization-0.1.0/tests/test_merge_ops.py +36 -0
- kmds_featurization-0.1.0/tests/test_sba_pipeline.py +113 -0
- kmds_featurization-0.1.0/tests/test_utils.py +44 -0
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: kmds-featurization
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: KMDS Featurization Service and Orchestration Pipeline
|
|
5
|
+
License-Expression: LicenseRef-Proprietary
|
|
6
|
+
Classifier: Programming Language :: Python :: 3
|
|
7
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
8
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Requires-Python: >=3.10
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
Requires-Dist: pandas
|
|
16
|
+
Requires-Dist: numpy
|
|
17
|
+
Requires-Dist: pyyaml
|
|
18
|
+
Requires-Dist: scikit-learn
|
|
19
|
+
Requires-Dist: pgeocode
|
|
20
|
+
Requires-Dist: category-encoders
|
|
21
|
+
Requires-Dist: kneed
|
|
22
|
+
Requires-Dist: matplotlib
|
|
23
|
+
Requires-Dist: tabulate>=0.10.0
|
|
24
|
+
|
|
25
|
+
# KMDS Featurization
|
|
26
|
+
|
|
27
|
+
This repository provides a configurable, stage-based featurization engine for SBA modeling workflows.
|
|
28
|
+
|
|
29
|
+
The design goal is simple:
|
|
30
|
+
- keep stage logic understandable and composable
|
|
31
|
+
- keep orchestration/configuration centralized
|
|
32
|
+
- keep modeling flow leakage-safe (fit on train only, reuse on val/active)
|
|
33
|
+
|
|
34
|
+
## What This Produces
|
|
35
|
+
|
|
36
|
+
The pipeline writes two CSV outputs:
|
|
37
|
+
- featurized_data.csv: consolidated engineered dataset (modeled + active partitions)
|
|
38
|
+
- model_ready_numeric_data.csv: numeric model-ready export from the final stage output
|
|
39
|
+
|
|
40
|
+
Additional diagnostic artifact:
|
|
41
|
+
- feature_selection_knee_curve.png: ranked feature-importance knee plot saved in the featurization output directory
|
|
42
|
+
|
|
43
|
+
For the current SBA flow, the model-ready dataset is:
|
|
44
|
+
- numeric/bool only
|
|
45
|
+
- train-fitted feature-selected
|
|
46
|
+
- schema-aligned across train/val/active
|
|
47
|
+
- persisted with index=False (no index artifact column)
|
|
48
|
+
|
|
49
|
+
## Core Concepts
|
|
50
|
+
|
|
51
|
+
- Anchor index: record_id
|
|
52
|
+
- Stage contract: method(context, stage_cfg) -> DataFrame
|
|
53
|
+
- Waterfall behavior: each stage can shrink survivor rows by index
|
|
54
|
+
- Horizontal feature assembly: stage outputs are concatenated by index
|
|
55
|
+
- Controlled index expansion: only stages marked allow_new_indices may re-introduce rows
|
|
56
|
+
|
|
57
|
+
## Pipeline Layout (Current Hybrid Design)
|
|
58
|
+
|
|
59
|
+
Front section (feature assembly):
|
|
60
|
+
1. record_id_definition
|
|
61
|
+
2. borrower_geo_coding
|
|
62
|
+
3. prepare_categorical_data
|
|
63
|
+
4. prepare_numerical_data
|
|
64
|
+
5. merge_categorical_and_numerical
|
|
65
|
+
6. merge_with_borrower_geo
|
|
66
|
+
|
|
67
|
+
Merge stage design:
|
|
68
|
+
- package component: src/tabular/merge_ops.py
|
|
69
|
+
- user wrappers: featurization_scripts/featurization.py
|
|
70
|
+
- merge key: record_id index
|
|
71
|
+
|
|
72
|
+
Leakage-safe modeling section:
|
|
73
|
+
7. low_count_featurization_of_cat_vars
|
|
74
|
+
8. hierarchical_low_count_var_encoding
|
|
75
|
+
9. loan_status_recoding
|
|
76
|
+
10. filter_modeling_universe
|
|
77
|
+
11. stratified_train_val_split
|
|
78
|
+
12. target_encode_categorical_vars
|
|
79
|
+
13. harmonize_and_project_feature_space
|
|
80
|
+
14. merge_modeled_and_active_partitions
|
|
81
|
+
|
|
82
|
+
Current encoding rule:
|
|
83
|
+
- if both raw and rarity-corrected categorical variants exist (x and x_rcs), only x_rcs is target-encoded
|
|
84
|
+
|
|
85
|
+
## Tree-Based Feature Selection
|
|
86
|
+
|
|
87
|
+
Feature selection runs in harmonize_and_project_feature_space using train rows only.
|
|
88
|
+
|
|
89
|
+
Supported selector modes:
|
|
90
|
+
- threshold
|
|
91
|
+
- tree_ensemble
|
|
92
|
+
|
|
93
|
+
Supported tree models:
|
|
94
|
+
- gbm
|
|
95
|
+
- random_forest
|
|
96
|
+
- xgboost (optional dependency)
|
|
97
|
+
|
|
98
|
+
All selector choices are config-driven via featurizer_config.yaml and surfaced through PathCoordinator (no stage-level hardcoded constants).
|
|
99
|
+
|
|
100
|
+
Feature-count tuning for kneedle mode:
|
|
101
|
+
- FEATURE_SELECTION_TOP_K_MODE: kneedle
|
|
102
|
+
- FEATURE_SELECTION_TOP_K_MIN_RATIO: conservative default floor, e.g. 0.5
|
|
103
|
+
- FEATURE_SELECTION_MIN_FEATURE_COUNT: hard floor for retained features
|
|
104
|
+
- FEATURE_SELECTION_TARGET_FEATURE_COUNT: explicit count override when the curve is too aggressive
|
|
105
|
+
- FEATURE_SELECTION_REQUIRE_KNEEDLE: fail loudly if the knee cannot be determined
|
|
106
|
+
|
|
107
|
+
## Repository Organization
|
|
108
|
+
|
|
109
|
+
- src/featurization/core: orchestration, configuration bootstrap, path resolution
|
|
110
|
+
- src/featurization/transforms: reusable transformation primitives
|
|
111
|
+
- src/tabular: reusable tabular feature modules (encoding, splitting, feature space)
|
|
112
|
+
- src/tabular/merge_ops.py: reusable index-aligned tabular merge helper
|
|
113
|
+
- tests: package-level smoke and behavior checks
|
|
114
|
+
- documents: architecture and configuration references
|
|
115
|
+
|
|
116
|
+
## Package Component Buckets
|
|
117
|
+
|
|
118
|
+
The tabular package modules are intentionally split into three buckets (two modeling buckets plus one assembly bucket):
|
|
119
|
+
|
|
120
|
+
- Row-selection components:
|
|
121
|
+
- src/tabular/modeling_filter.py
|
|
122
|
+
- src/tabular/train_val_split.py
|
|
123
|
+
- Purpose: decide which records participate in training and how records are partitioned.
|
|
124
|
+
|
|
125
|
+
- Column-selection components:
|
|
126
|
+
- src/tabular/feature_space.py
|
|
127
|
+
- src/tabular/target_encoding.py
|
|
128
|
+
- src/tabular/low_count_cat_var_encoding.py
|
|
129
|
+
- src/tabular/hierarchical_low_count_var_encoding.py
|
|
130
|
+
- Purpose: decide which feature columns are engineered, selected, encoded, and projected.
|
|
131
|
+
|
|
132
|
+
- Assembly components:
|
|
133
|
+
- src/tabular/merge_ops.py
|
|
134
|
+
- Purpose: index-aligned horizontal composition of prepared payloads.
|
|
135
|
+
|
|
136
|
+
## CLI
|
|
137
|
+
|
|
138
|
+
Initialize config:
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
featurization-cli init \
|
|
142
|
+
--working-dir /path/to/workspace \
|
|
143
|
+
--metadata-file sba_loans_metadata_table.csv \
|
|
144
|
+
--data-file sba_loans_user_cleaned.csv
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
Run pipeline:
|
|
148
|
+
|
|
149
|
+
```bash
|
|
150
|
+
featurization-cli run --working-dir /path/to/workspace
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
Run smoke test in this repo:
|
|
154
|
+
|
|
155
|
+
```bash
|
|
156
|
+
pytest -q tests/test_sba_pipeline.py
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
## How To Extend Safely
|
|
160
|
+
|
|
161
|
+
1. Add reusable logic in src/tabular first whenever possible.
|
|
162
|
+
2. Keep stage wrappers in workspace featurization_scripts/featurization.py thin and explicit.
|
|
163
|
+
3. Add new tunables to:
|
|
164
|
+
- featurizer_config.yaml
|
|
165
|
+
- src/featurization/core/path_coordinator.py
|
|
166
|
+
- src/featurization/core/featurization_init.py
|
|
167
|
+
4. Preserve leakage rules:
|
|
168
|
+
- fit artifacts on train only
|
|
169
|
+
- transform val/active using train-fitted artifacts
|
|
170
|
+
5. Validate with tests after each change.
|
|
171
|
+
|
|
172
|
+
## Recommended Read Order
|
|
173
|
+
|
|
174
|
+
1. documents/sba_pipeline_featurization.md
|
|
175
|
+
2. documents/config_blueprint.md
|
|
176
|
+
3. documents/path_coordinator_function.md
|
|
177
|
+
4. src/featurization/core/sequential_pipeline_runner.py
|
|
178
|
+
5. src/tabular/feature_space.py
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
# KMDS Featurization
|
|
2
|
+
|
|
3
|
+
This repository provides a configurable, stage-based featurization engine for SBA modeling workflows.
|
|
4
|
+
|
|
5
|
+
The design goal is simple:
|
|
6
|
+
- keep stage logic understandable and composable
|
|
7
|
+
- keep orchestration/configuration centralized
|
|
8
|
+
- keep modeling flow leakage-safe (fit on train only, reuse on val/active)
|
|
9
|
+
|
|
10
|
+
## What This Produces
|
|
11
|
+
|
|
12
|
+
The pipeline writes two CSV outputs:
|
|
13
|
+
- featurized_data.csv: consolidated engineered dataset (modeled + active partitions)
|
|
14
|
+
- model_ready_numeric_data.csv: numeric model-ready export from the final stage output
|
|
15
|
+
|
|
16
|
+
Additional diagnostic artifact:
|
|
17
|
+
- feature_selection_knee_curve.png: ranked feature-importance knee plot saved in the featurization output directory
|
|
18
|
+
|
|
19
|
+
For the current SBA flow, the model-ready dataset is:
|
|
20
|
+
- numeric/bool only
|
|
21
|
+
- train-fitted feature-selected
|
|
22
|
+
- schema-aligned across train/val/active
|
|
23
|
+
- persisted with index=False (no index artifact column)
|
|
24
|
+
|
|
25
|
+
## Core Concepts
|
|
26
|
+
|
|
27
|
+
- Anchor index: record_id
|
|
28
|
+
- Stage contract: method(context, stage_cfg) -> DataFrame
|
|
29
|
+
- Waterfall behavior: each stage can shrink survivor rows by index
|
|
30
|
+
- Horizontal feature assembly: stage outputs are concatenated by index
|
|
31
|
+
- Controlled index expansion: only stages marked allow_new_indices may re-introduce rows
|
|
32
|
+
|
|
33
|
+
## Pipeline Layout (Current Hybrid Design)
|
|
34
|
+
|
|
35
|
+
Front section (feature assembly):
|
|
36
|
+
1. record_id_definition
|
|
37
|
+
2. borrower_geo_coding
|
|
38
|
+
3. prepare_categorical_data
|
|
39
|
+
4. prepare_numerical_data
|
|
40
|
+
5. merge_categorical_and_numerical
|
|
41
|
+
6. merge_with_borrower_geo
|
|
42
|
+
|
|
43
|
+
Merge stage design:
|
|
44
|
+
- package component: src/tabular/merge_ops.py
|
|
45
|
+
- user wrappers: featurization_scripts/featurization.py
|
|
46
|
+
- merge key: record_id index
|
|
47
|
+
|
|
48
|
+
Leakage-safe modeling section:
|
|
49
|
+
7. low_count_featurization_of_cat_vars
|
|
50
|
+
8. hierarchical_low_count_var_encoding
|
|
51
|
+
9. loan_status_recoding
|
|
52
|
+
10. filter_modeling_universe
|
|
53
|
+
11. stratified_train_val_split
|
|
54
|
+
12. target_encode_categorical_vars
|
|
55
|
+
13. harmonize_and_project_feature_space
|
|
56
|
+
14. merge_modeled_and_active_partitions
|
|
57
|
+
|
|
58
|
+
Current encoding rule:
|
|
59
|
+
- if both raw and rarity-corrected categorical variants exist (x and x_rcs), only x_rcs is target-encoded
|
|
60
|
+
|
|
61
|
+
## Tree-Based Feature Selection
|
|
62
|
+
|
|
63
|
+
Feature selection runs in harmonize_and_project_feature_space using train rows only.
|
|
64
|
+
|
|
65
|
+
Supported selector modes:
|
|
66
|
+
- threshold
|
|
67
|
+
- tree_ensemble
|
|
68
|
+
|
|
69
|
+
Supported tree models:
|
|
70
|
+
- gbm
|
|
71
|
+
- random_forest
|
|
72
|
+
- xgboost (optional dependency)
|
|
73
|
+
|
|
74
|
+
All selector choices are config-driven via featurizer_config.yaml and surfaced through PathCoordinator (no stage-level hardcoded constants).
|
|
75
|
+
|
|
76
|
+
Feature-count tuning for kneedle mode:
|
|
77
|
+
- FEATURE_SELECTION_TOP_K_MODE: kneedle
|
|
78
|
+
- FEATURE_SELECTION_TOP_K_MIN_RATIO: conservative default floor, e.g. 0.5
|
|
79
|
+
- FEATURE_SELECTION_MIN_FEATURE_COUNT: hard floor for retained features
|
|
80
|
+
- FEATURE_SELECTION_TARGET_FEATURE_COUNT: explicit count override when the curve is too aggressive
|
|
81
|
+
- FEATURE_SELECTION_REQUIRE_KNEEDLE: fail loudly if the knee cannot be determined
|
|
82
|
+
|
|
83
|
+
## Repository Organization
|
|
84
|
+
|
|
85
|
+
- src/featurization/core: orchestration, configuration bootstrap, path resolution
|
|
86
|
+
- src/featurization/transforms: reusable transformation primitives
|
|
87
|
+
- src/tabular: reusable tabular feature modules (encoding, splitting, feature space)
|
|
88
|
+
- src/tabular/merge_ops.py: reusable index-aligned tabular merge helper
|
|
89
|
+
- tests: package-level smoke and behavior checks
|
|
90
|
+
- documents: architecture and configuration references
|
|
91
|
+
|
|
92
|
+
## Package Component Buckets
|
|
93
|
+
|
|
94
|
+
The tabular package modules are intentionally split into three buckets (two modeling buckets plus one assembly bucket):
|
|
95
|
+
|
|
96
|
+
- Row-selection components:
|
|
97
|
+
- src/tabular/modeling_filter.py
|
|
98
|
+
- src/tabular/train_val_split.py
|
|
99
|
+
- Purpose: decide which records participate in training and how records are partitioned.
|
|
100
|
+
|
|
101
|
+
- Column-selection components:
|
|
102
|
+
- src/tabular/feature_space.py
|
|
103
|
+
- src/tabular/target_encoding.py
|
|
104
|
+
- src/tabular/low_count_cat_var_encoding.py
|
|
105
|
+
- src/tabular/hierarchical_low_count_var_encoding.py
|
|
106
|
+
- Purpose: decide which feature columns are engineered, selected, encoded, and projected.
|
|
107
|
+
|
|
108
|
+
- Assembly components:
|
|
109
|
+
- src/tabular/merge_ops.py
|
|
110
|
+
- Purpose: index-aligned horizontal composition of prepared payloads.
|
|
111
|
+
|
|
112
|
+
## CLI
|
|
113
|
+
|
|
114
|
+
Initialize config:
|
|
115
|
+
|
|
116
|
+
```bash
|
|
117
|
+
featurization-cli init \
|
|
118
|
+
--working-dir /path/to/workspace \
|
|
119
|
+
--metadata-file sba_loans_metadata_table.csv \
|
|
120
|
+
--data-file sba_loans_user_cleaned.csv
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
Run pipeline:
|
|
124
|
+
|
|
125
|
+
```bash
|
|
126
|
+
featurization-cli run --working-dir /path/to/workspace
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
Run smoke test in this repo:
|
|
130
|
+
|
|
131
|
+
```bash
|
|
132
|
+
pytest -q tests/test_sba_pipeline.py
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
## How To Extend Safely
|
|
136
|
+
|
|
137
|
+
1. Add reusable logic in src/tabular first whenever possible.
|
|
138
|
+
2. Keep stage wrappers in workspace featurization_scripts/featurization.py thin and explicit.
|
|
139
|
+
3. Add new tunables to:
|
|
140
|
+
- featurizer_config.yaml
|
|
141
|
+
- src/featurization/core/path_coordinator.py
|
|
142
|
+
- src/featurization/core/featurization_init.py
|
|
143
|
+
4. Preserve leakage rules:
|
|
144
|
+
- fit artifacts on train only
|
|
145
|
+
- transform val/active using train-fitted artifacts
|
|
146
|
+
5. Validate with tests after each change.
|
|
147
|
+
|
|
148
|
+
## Recommended Read Order
|
|
149
|
+
|
|
150
|
+
1. documents/sba_pipeline_featurization.md
|
|
151
|
+
2. documents/config_blueprint.md
|
|
152
|
+
3. documents/path_coordinator_function.md
|
|
153
|
+
4. src/featurization/core/sequential_pipeline_runner.py
|
|
154
|
+
5. src/tabular/feature_space.py
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "kmds-featurization"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "KMDS Featurization Service and Orchestration Pipeline"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = "LicenseRef-Proprietary"
|
|
12
|
+
classifiers = [
|
|
13
|
+
"Programming Language :: Python :: 3",
|
|
14
|
+
"Programming Language :: Python :: 3 :: Only",
|
|
15
|
+
"Programming Language :: Python :: 3.10",
|
|
16
|
+
"Programming Language :: Python :: 3.11",
|
|
17
|
+
"Programming Language :: Python :: 3.12",
|
|
18
|
+
"Programming Language :: Python :: 3.13",
|
|
19
|
+
"Operating System :: OS Independent",
|
|
20
|
+
]
|
|
21
|
+
dependencies = [
|
|
22
|
+
"pandas",
|
|
23
|
+
"numpy",
|
|
24
|
+
"pyyaml",
|
|
25
|
+
"scikit-learn",
|
|
26
|
+
"pgeocode",
|
|
27
|
+
"category-encoders",
|
|
28
|
+
"kneed",
|
|
29
|
+
"matplotlib",
|
|
30
|
+
"tabulate>=0.10.0",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
[project.scripts]
|
|
34
|
+
featurization-cli = "featurization.cli:main"
|
|
35
|
+
|
|
36
|
+
[tool.setuptools]
|
|
37
|
+
package-dir = {"" = "src"}
|
|
38
|
+
|
|
39
|
+
[tool.setuptools.packages.find]
|
|
40
|
+
where = ["src"]
|
|
41
|
+
|
|
42
|
+
[tool.pytest.ini_options]
|
|
43
|
+
pythonpath = ["src"]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# KMDS Tabular Utilities Package
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import sys
|
|
3
|
+
import os
|
|
4
|
+
import yaml
|
|
5
|
+
from featurization.core.sequential_pipeline_runner import PipelineRunner
|
|
6
|
+
from featurization.core.featurization_init import initialize_config
|
|
7
|
+
from featurization.core.path_coordinator import PathCoordinator
|
|
8
|
+
|
|
9
|
+
def main():
    """
    Command-line entry point for the featurization service.

    Sub-commands:
      - init:      validate the working directory and write the workspace config
                   through initialize_config.
      - add-stage: parsed for compatibility, but not implemented in this module;
                   it reports that and exits with status 1.
      - check:     load a config file and ask PathCoordinator whether the named
                   stage is configured (exit status 1 when it is not).
      - run:       load the config and execute PipelineRunner.accumulate_stages().

    With no sub-command the help text is printed and the process exits with 0.
    Any Exception raised while dispatching is reported and turned into exit
    status 1; the explicit sys.exit() calls are not intercepted because
    SystemExit does not derive from Exception.
    """
    parser = argparse.ArgumentParser(
        description="Featurization Service Shell - kmds-data-helper Ecosystem"
    )
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # --- Init Command ---
    init_parser = subparsers.add_parser("init", help="Initialize a featurization workspace configuration.")
    init_parser.add_argument(
        "--working-dir",
        required=True,
        help="Path to the workflow directory."
    )
    init_parser.add_argument(
        "--metadata-file",
        required=True,
        help="The dd-parser-cleaner summary CSV filename."
    )
    init_parser.add_argument(
        "--data-file",
        required=True,
        help="The cleaned data CSV filename."
    )
    init_parser.add_argument(
        "--structural-type",
        choices=["cross-sectional", "longitudinal", "panel"],
        default="cross-sectional",
        help="The structural type of the dataset (cross-sectional, longitudinal, or panel)."
    )

    # --- Add-Stage Command ---
    add_parser = subparsers.add_parser("add-stage", help="Add a custom stage to the pipeline.")
    add_parser.add_argument("--name", required=True, help="Name of the stage.")
    add_parser.add_argument("--entity", required=True, help="Entity category in metadata (e.g., geographical).")
    add_parser.add_argument("--sub-filter", required=True, help="Specific sub-filter group (e.g., Borrower).")
    add_parser.add_argument("--config", default="featurizer_config.yaml", help="Config filename.")

    # --- Check Command ---
    check_parser = subparsers.add_parser("check", help="Check if a stage is configured.")
    check_parser.add_argument("--name", required=True, help="Stage name to verify.")
    check_parser.add_argument("--config", default="featurizer_config.yaml", help="Config filename.")

    # --- Run Command ---
    run_parser = subparsers.add_parser("run", help="Execute the featurization pipeline.")
    run_parser.add_argument(
        "--working-dir",
        type=str,
        required=True,
        help="Path to the active project working directory containing configurations and datasets."
    )
    # NOTE(review): --target-col is parsed but not forwarded anywhere in this function.
    run_parser.add_argument(
        "--target-col",
        type=str,
        default="target",
        help="Name of the machine learning target prediction column."
    )
    run_parser.add_argument(
        "--config",
        type=str,
        default="featurizer_config.yaml",
        help="Path to the environment config layout file."
    )

    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        sys.exit(0)

    try:
        if args.command == "init":
            if not os.path.isdir(args.working_dir):
                print(f"❌ Error: Provided working directory does not exist: {args.working_dir}")
                sys.exit(1)
            initialize_config(
                args.working_dir,
                args.metadata_file,
                args.data_file,
                args.structural_type
            )
            print("✅ Initialization Complete.")
            return

        if args.command == "add-stage":
            # The sub-command is registered above but has no handler here.
            # Fail loudly instead of returning success without doing anything.
            print(f"❌ Command 'add-stage' is not implemented; stage '{args.name}' was NOT added.")
            sys.exit(1)

        if args.command == "check":
            if not os.path.exists(args.config):
                print(f"❌ Config file '{args.config}' not found.")
                sys.exit(1)
            with open(args.config, "r", encoding="utf-8") as f:
                # safe_load returns None for an empty document; fall back to an
                # empty mapping so the .get() below cannot hit None.
                config = yaml.safe_load(f) or {}
            working_dir = config.get("working_dir", os.getcwd())
            resolver = PathCoordinator(working_dir, config)

            if resolver.is_stage_configured(args.name):
                print(f"✅ Stage '{args.name}' is configured in the pipeline.")
            else:
                print(f"❌ Stage '{args.name}' is NOT configured.")
                sys.exit(1)
            return

        if args.command == "run":
            # Prefer the config inside the working directory, then fall back to
            # the path exactly as given on the command line.
            config_path = os.path.join(args.working_dir, args.config)
            if not os.path.exists(config_path):
                config_path = args.config

            if not os.path.exists(config_path):
                print(f"❌ Error: Config file not found at {config_path}")
                sys.exit(1)

            with open(config_path, "r", encoding="utf-8") as f:
                config = yaml.safe_load(f)

            # Initialize the runner and execute accumulation
            runner = PipelineRunner(working_dir=args.working_dir, config=config)
            runner.accumulate_stages()
            # runner.accumulate_stages() now handles persistence internally via PathCoordinator
            return
    except Exception as e:
        # Top-level boundary: report the failure and signal it via exit status.
        print(f"💥 Pipeline Execution Failed: {str(e)}")
        sys.exit(1)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# KMDS Core Infrastructure Package
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from featurization.utils import load_kmds_metadata
|
|
4
|
+
|
|
5
|
+
class KMDSDataLoader:
    """
    Package Component: single access point for the workspace's input frames.

    Both the cleaned dataset and the metadata table are read on first access
    and then cached on the instance, so repeated property reads do not hit
    the disk again. File locations come from the resolver passed in.
    """

    def __init__(self, resolver):
        # resolver must expose `featurization_input_path` and `metadata_path`.
        self.resolver = resolver
        self._data = None
        self._metadata = None

    @property
    def data(self) -> pd.DataFrame:
        """Return the cleaned dataset, reading the CSV on first access.

        Raises:
            FileNotFoundError: if the resolver's input path does not exist.
        """
        if self._data is not None:
            return self._data

        source_path = self.resolver.featurization_input_path
        if not os.path.exists(source_path):
            raise FileNotFoundError(f"Source data not found at: {source_path}")

        print(f"📥 [Data Loader] Reading cleaned data: {os.path.basename(source_path)}")
        self._data = pd.read_csv(source_path, low_memory=False)
        return self._data

    @property
    def metadata(self) -> pd.DataFrame:
        """Return the KMDS metadata (data dictionary), loading it on first access.

        A missing metadata file is not an error: a warning is printed and an
        empty DataFrame is cached and returned instead.
        """
        if self._metadata is not None:
            return self._metadata

        meta_path = self.resolver.metadata_path
        if os.path.exists(meta_path):
            print(f"📥 [Data Loader] Reading metadata: {os.path.basename(meta_path)}")
            self._metadata = load_kmds_metadata(meta_path)
        else:
            print(f"⚠️ [Data Loader] Metadata missing at {meta_path}. Returning empty frame.")
            self._metadata = pd.DataFrame()
        return self._metadata
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import yaml
|
|
3
|
+
|
|
4
|
+
def initialize_config(working_dir: str, metadata_file: str, data_file: str, structural_type: str = "cross-sectional", config_name: str = "featurizer_config.yaml") -> None:
    """
    Bootstrap (or refresh) a featurization workspace configuration.

    An existing config is looked up first at <working_dir>/<config_name> and,
    failing that, at <config_name> relative to the current directory. Its
    values are kept for every setting except `working_dir`, `metadata_file`
    and `structural_type`, which are always taken from the arguments. Keys
    present in the existing config but unknown here are carried over as-is.

    The standard directory layout is created under the working directory and
    the merged config is written to <working_dir>/<config_name>.

    Args:
        working_dir: Workspace root; stored as an absolute path.
        metadata_file: Metadata filename stored under `metadata_file`.
        data_file: Cleaned-data filename; used for `featurization_input_data`
            only when the existing config does not already define that key.
        structural_type: Dataset structural type stored in the config.
        config_name: Filename of the config inside the working directory.
    """
    abs_working_dir = os.path.abspath(working_dir)
    config_path = os.path.join(abs_working_dir, config_name)

    # 1. Load existing config if available (check target path or current directory)
    existing_config = {}
    search_path = config_path if os.path.exists(config_path) else config_name
    if os.path.exists(search_path):
        with open(search_path, "r", encoding="utf-8") as f:
            # safe_load returns None for an empty document.
            existing_config = yaml.safe_load(f) or {}

    # 2. Define final config, preferring existing values for non-anchor settings.
    #    Key order is significant: the file is dumped with sort_keys=False.
    final_config = {
        "working_dir": abs_working_dir,
        "pipeline": existing_config.get("pipeline", []),
        "metadata_file": metadata_file,
        "dd_cleaner_output_dir": existing_config.get("dd_cleaner_output_dir", "dd_cleaner"),
        "featurization_input_data": existing_config.get("featurization_input_data", data_file),
        "featurization_output_dir": existing_config.get("featurization_output_dir", "featurization"),
        "quarantine_dir": existing_config.get("quarantine_dir", "featurization/quarantine"),
        "featurized_data_file": existing_config.get("featurized_data_file", "featurized_data.csv"),
        "model_ready_data_file": existing_config.get("model_ready_data_file", "model_ready_numeric_data.csv"),
        "feat_doc_directory": existing_config.get("feat_doc_directory", "featurization_docs"),
        "entity_assignment_output": existing_config.get("entity_assignment_output", "entity_assignments.md"),
        "script_dir": existing_config.get("script_dir", "featurization_scripts"),
        "script_name": existing_config.get("script_name", "featurization.py"),
        "country_code": existing_config.get("country_code", "us"),
        "structural_type": structural_type,
        "VALIDATION_SIZE": existing_config.get("VALIDATION_SIZE", 0.2),
        "FEATURE_SELECTION_MIN_NON_NULL_RATE": existing_config.get(
            "FEATURE_SELECTION_MIN_NON_NULL_RATE", 0.01
        ),
        "FEATURE_SELECTION_METHOD": existing_config.get("FEATURE_SELECTION_METHOD", "tree_ensemble"),
        "FEATURE_SELECTION_TOP_K": existing_config.get("FEATURE_SELECTION_TOP_K", 50),
        "FEATURE_SELECTION_TOP_K_MODE": existing_config.get("FEATURE_SELECTION_TOP_K_MODE", "fixed"),
        "FEATURE_SELECTION_TOP_K_MIN": existing_config.get("FEATURE_SELECTION_TOP_K_MIN", 1),
        "FEATURE_SELECTION_TOP_K_MIN_RATIO": existing_config.get(
            "FEATURE_SELECTION_TOP_K_MIN_RATIO", 0.0
        ),
        "FEATURE_SELECTION_MIN_FEATURE_COUNT": existing_config.get(
            "FEATURE_SELECTION_MIN_FEATURE_COUNT", 0
        ),
        "FEATURE_SELECTION_TOP_K_MAX": existing_config.get("FEATURE_SELECTION_TOP_K_MAX", 0),
        "FEATURE_SELECTION_TARGET_FEATURE_COUNT": existing_config.get(
            "FEATURE_SELECTION_TARGET_FEATURE_COUNT", 0
        ),
        "FEATURE_SELECTION_KNEEDLE_SENSITIVITY": existing_config.get(
            "FEATURE_SELECTION_KNEEDLE_SENSITIVITY", 1.0
        ),
        "FEATURE_SELECTION_KNEEDLE_CURVE": existing_config.get(
            "FEATURE_SELECTION_KNEEDLE_CURVE", "convex"
        ),
        "FEATURE_SELECTION_KNEEDLE_DIRECTION": existing_config.get(
            "FEATURE_SELECTION_KNEEDLE_DIRECTION", "decreasing"
        ),
        "FEATURE_SELECTION_REQUIRE_KNEEDLE": existing_config.get(
            "FEATURE_SELECTION_REQUIRE_KNEEDLE", False
        ),
        "FEATURE_SELECTION_IMPORTANCE_FLOOR": existing_config.get(
            "FEATURE_SELECTION_IMPORTANCE_FLOOR", 0.0
        ),
        "FEATURE_SELECTION_TREE_MODEL": existing_config.get("FEATURE_SELECTION_TREE_MODEL", "gbm"),
        "FEATURE_SELECTION_TREE_N_ESTIMATORS": existing_config.get(
            "FEATURE_SELECTION_TREE_N_ESTIMATORS", 200
        ),
        "FEATURE_SELECTION_TREE_LEARNING_RATE": existing_config.get(
            "FEATURE_SELECTION_TREE_LEARNING_RATE", 0.05
        ),
        "FEATURE_SELECTION_TREE_MAX_DEPTH": existing_config.get("FEATURE_SELECTION_TREE_MAX_DEPTH", 3),
        "FEATURE_SELECTION_TREE_SUBSAMPLE": existing_config.get(
            "FEATURE_SELECTION_TREE_SUBSAMPLE", 0.8
        ),
        "FEATURE_SELECTION_TREE_RANDOM_STATE": existing_config.get(
            "FEATURE_SELECTION_TREE_RANDOM_STATE", 42
        ),
        "MODEL_READY_NUMERIC_ONLY": existing_config.get("MODEL_READY_NUMERIC_ONLY", True),
    }

    # 3. Preserve any custom stage definitions or extra settings from the existing config
    for key, value in existing_config.items():
        if key not in final_config:
            final_config[key] = value

    # 4. Ensure standard directory structure exists relative to finalized working_dir.
    #    The script directory follows the configured `script_dir` so that a
    #    customized value gets its folder created (default: featurization_scripts).
    required_dirs = [
        "data",
        "documents",
        "notebooks",
        final_config["script_dir"],
        os.path.join("data", final_config["dd_cleaner_output_dir"]),
        os.path.join("data", final_config["featurization_output_dir"]),
        os.path.join("data", final_config["quarantine_dir"]),
        os.path.join("documents", final_config["feat_doc_directory"]),
    ]
    for folder in required_dirs:
        os.makedirs(os.path.join(abs_working_dir, folder), exist_ok=True)

    with open(config_path, "w", encoding="utf-8") as f:
        yaml.safe_dump(final_config, f, default_flow_style=False, sort_keys=False)

    # Report what was actually written: an existing config's input-data entry
    # takes precedence over the data_file argument.
    stored_data_file = final_config["featurization_input_data"]
    print(f"✨ Workspace Initialized: {config_path}")
    print(f" - Metadata Anchor: {metadata_file}")
    print(f" - Cleaned Data Anchor: {stored_data_file}")
    if stored_data_file != data_file:
        print(f" - Note: existing featurization_input_data kept; '{data_file}' was not applied.")
|