tedcheck 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,270 @@
1
+ Metadata-Version: 2.4
2
+ Name: tedcheck
3
+ Version: 0.1.0
4
+ Summary: UMAP Segment Validation tool
5
+ Author-email: Tergel Munkhbaatar <tergelitu@example.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/tergelitu/tedcheck
8
+ Project-URL: Repository, https://github.com/tergelitu/tedcheck.git
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.8
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Requires-Python: >=3.8
17
+ Description-Content-Type: text/markdown
18
+
19
+ # TEDCHECK - UMAP Segment Validation Tool
20
+
21
+ A comprehensive Python package for UMAP-based customer segmentation visualization and validation.
22
+
23
+ ## Features
24
+
25
+ - **Flexible Configuration**: Customize column names, exclude/include features dynamically
26
+ - **UMAP Dimensionality Reduction**: 2D visualization of customer segments
27
+ - **Quality Metrics**: Calculate silhouette scores, feature importance, and purity metrics
28
+ - **Interactive Visualizations**: Plotly-based interactive charts
29
+ - **Preset Configurations**: Built-in presets for different use cases
30
+ - **Package-Ready**: Both terminal CLI and Python library usage
31
+
32
+ ## Installation
33
+
34
+ ```bash
35
+ pip install -e .
36
+ ```
37
+
38
+ ## Quick Start
39
+
40
+ ### Terminal Usage
41
+
42
+ ```bash
43
+ # Using default configuration
44
+ tedcheck data.csv
45
+
46
+ # With custom columns
47
+ tedcheck data.csv --user-id customer_id --segment-col tier --time-col month
48
+
49
+ # Using presets
50
+ tedcheck data.csv --preset ecommerce --metrics
51
+
52
+ # Custom configuration file
53
+ tedcheck data.csv --config my_config.json
54
+
55
+ # Skip time column if not available
56
+ tedcheck data.csv --skip-time
57
+ ```
58
+
59
+ ### Python Usage
60
+
61
+ ```python
62
+ from tedcheck import Config, apply_umap_reduction, calculate_umap_metrics
63
+ import pandas as pd
64
+
65
+ # Load data
66
+ df = pd.read_csv('data.csv')
67
+
68
+ # Create config
69
+ config = Config(
70
+ user_id_col='customer_id',
71
+ segment_col='tier',
72
+ skip_time=False
73
+ )
74
+
75
+ # Apply UMAP reduction
76
+ df_umap, embedding, num_cols = apply_umap_reduction(df, config=config)
77
+
78
+ # Calculate metrics
79
+ metrics = calculate_umap_metrics(df_umap, embedding, df, config=config)
80
+ ```
81
+
82
+ ## Configuration
83
+
84
+ ### Default Configuration
85
+
86
+ ```json
87
+ {
88
+ "user_id_col": "user_id",
89
+ "time_col": "base_month",
90
+ "segment_col": "segment",
91
+ "cluster_col": "cluster_kmeans",
92
+ "exclude_cols": ["user_id", "base_month", "segment"],
93
+ "include_cols": null,
94
+ "n_neighbors": 50,
95
+ "min_dist": 0.1,
96
+ "random_state": 42,
97
+ "skip_time": false
98
+ }
99
+ ```
100
+ ## Preset Configurations
101
+
102
+ ### Default (General Purpose)
103
+ ```bash
104
+ tedcheck data.csv --preset default
105
+ ```
106
+
107
+ ### E-commerce
108
+ ```bash
109
+ tedcheck data.csv --preset ecommerce
110
+ # Uses: customer_id, purchase_month, customer_tier
111
+ ```
112
+
113
+ ### SaaS
114
+ ```bash
115
+ tedcheck data.csv --preset saas
116
+ # Uses: account_id, billing_month, account_segment
117
+ ```
118
+
119
+ ## CLI Options
120
+
121
+ ```
122
+ Usage: tedcheck <csv_file> [OPTIONS]
123
+
124
+ Options:
125
+ --base-month <value> Filter by specific month
126
+ --user-id <col> User ID column name
127
+ --time-col <col> Time column name
128
+ --segment-col <col> Segment column name
129
+ --cluster-col <col> Cluster column name
130
+ --exclude-cols <col1,col2> Columns to exclude
131
+ --include-cols <col1,col2> Columns to include only
132
+ --n-neighbors <int> UMAP n_neighbors
133
+ --min-dist <float> UMAP min_dist
134
+ --metrics Calculate and save metrics
135
+ --config <json_file> Load config from JSON
136
+ --preset <name> Load preset (default, ecommerce, saas)
137
+ --skip-time Skip time column if not available
138
+ ```
139
+
140
+ ## Output Files
141
+
142
+ - `*_umap_results.csv` - UMAP coordinates with user IDs and segments
143
+ - `umap_segment_*.html` - Interactive visualizations by month (if time column exists)
144
+ - `umap_segment_all.html` - Single visualization (if no time column)
145
+ - `umap_metrics.json` - Quality metrics (with `--metrics` flag)
146
+ - `feature_importance.csv` - Feature importance scores (with `--metrics` flag)
147
+
148
+ ## Metrics Explained
149
+
150
+ ### Silhouette Score
151
+ - Range: -1 to 1
152
+ - 1: Well-separated clusters
153
+ - 0: Overlapping clusters
154
+ - -1: Incorrect assignment
155
+
156
+ ### Feature Importance
157
+ - Importance of each feature in UMAP dimensions
158
+ - Higher = More influential
159
+
160
+ ### Purity Metrics
161
+ - **Homogeneity**: Segmentation purity (0-1)
162
+ - **Completeness**: Cluster completeness (0-1)
163
+ - **V-Measure**: Harmonic mean of homogeneity and completeness (0-1)
164
+
165
+ ## Package Structure
166
+
167
+ ```
168
+ tedcheck/
169
+ ├── __init__.py # Package initialization
170
+ ├── config.py # Configuration class
171
+ ├── features.py # Core UMAP functions
172
+ ├── cli.py # Command-line interface
173
+ ├── utils.py # Utility functions
174
+ ├── exceptions.py # Custom exceptions
175
+ ├── logger.py # Logging setup
176
+ └── configs/ # Preset configurations
177
+ ├── default.json
178
+ ├── ecommerce.json
179
+ └── saas.json
180
+ ```
181
+
182
+ ## API Reference
183
+
184
+ ### `Config` Class
185
+ ```python
186
+ from tedcheck import Config
187
+
188
+ config = Config(
189
+ user_id_col='id',
190
+ segment_col='tier',
191
+ skip_time=True
192
+ )
193
+
194
+ # Validate columns
195
+ missing = config.validate_columns(df)
196
+
197
+ # Load from file
198
+ config = Config.from_json('config.json')
199
+
200
+ # Load preset
201
+ config = Config.from_preset('ecommerce')
202
+
203
+ # Save config
204
+ config.to_json('my_config.json')
205
+ ```
206
+
207
+ ### `apply_umap_reduction()` Function
208
+ ```python
209
+ from tedcheck import apply_umap_reduction, Config
210
+
211
+ df_umap, embedding, num_cols = apply_umap_reduction(
212
+ df,
213
+ config=config
214
+ )
215
+ ```
216
+
217
+ ### `calculate_umap_metrics()` Function
218
+ ```python
219
+ from tedcheck import calculate_umap_metrics
220
+
221
+ metrics = calculate_umap_metrics(
222
+ df_umap,
223
+ embedding,
224
+ df,
225
+ config=config
226
+ )
227
+
228
+ print(metrics['silhouette_avg'])
229
+ print(metrics['feature_importance'])
230
+ print(metrics['purity_metrics'])
231
+ ```
232
+
233
+ ## Troubleshooting
234
+
235
+ ### Missing Column Error
236
+ ```bash
237
+ # Check available columns
238
+ python -c "import pandas as pd; print(pd.read_csv('data.csv').columns.tolist())"
239
+
240
+ # Use --skip-time if time column doesn't exist
241
+ tedcheck data.csv --skip-time
242
+ ```
243
+
244
+ ### Wrong Column Names
245
+ ```bash
246
+ # Specify correct column names
247
+ tedcheck data.csv --user-id id --segment-col group --time-col month
248
+ ```
249
+
250
+ ### Memory Issues
251
+ ```bash
252
+ # Use --include-cols to select only important features
253
+ tedcheck data.csv --include-cols feature1,feature2,feature3
254
+ ```
255
+
256
+ ## Contributing
257
+
258
+ Contributions welcome! Please submit pull requests or issues on GitHub.
259
+
260
+ ## License
261
+
262
+ MIT License - See LICENSE file for details
263
+
264
+ ## Author
265
+
266
+ Tergel Munkhbaatar
267
+
268
+ ## Version
269
+
270
+ 0.1.0
@@ -0,0 +1,252 @@
1
+ # TEDCHECK - UMAP Segment Validation Tool
2
+
3
+ A comprehensive Python package for UMAP-based customer segmentation visualization and validation.
4
+
5
+ ## Features
6
+
7
+ - **Flexible Configuration**: Customize column names, exclude/include features dynamically
8
+ - **UMAP Dimensionality Reduction**: 2D visualization of customer segments
9
+ - **Quality Metrics**: Calculate silhouette scores, feature importance, and purity metrics
10
+ - **Interactive Visualizations**: Plotly-based interactive charts
11
+ - **Preset Configurations**: Built-in presets for different use cases
12
+ - **Package-Ready**: Both terminal CLI and Python library usage
13
+
14
+ ## Installation
15
+
16
+ ```bash
17
+ pip install -e .
18
+ ```
19
+
20
+ ## Quick Start
21
+
22
+ ### Terminal Usage
23
+
24
+ ```bash
25
+ # Using default configuration
26
+ tedcheck data.csv
27
+
28
+ # With custom columns
29
+ tedcheck data.csv --user-id customer_id --segment-col tier --time-col month
30
+
31
+ # Using presets
32
+ tedcheck data.csv --preset ecommerce --metrics
33
+
34
+ # Custom configuration file
35
+ tedcheck data.csv --config my_config.json
36
+
37
+ # Skip time column if not available
38
+ tedcheck data.csv --skip-time
39
+ ```
40
+
41
+ ### Python Usage
42
+
43
+ ```python
44
+ from tedcheck import Config, apply_umap_reduction, calculate_umap_metrics
45
+ import pandas as pd
46
+
47
+ # Load data
48
+ df = pd.read_csv('data.csv')
49
+
50
+ # Create config
51
+ config = Config(
52
+ user_id_col='customer_id',
53
+ segment_col='tier',
54
+ skip_time=False
55
+ )
56
+
57
+ # Apply UMAP reduction
58
+ df_umap, embedding, num_cols = apply_umap_reduction(df, config=config)
59
+
60
+ # Calculate metrics
61
+ metrics = calculate_umap_metrics(df_umap, embedding, df, config=config)
62
+ ```
63
+
64
+ ## Configuration
65
+
66
+ ### Default Configuration
67
+
68
+ ```json
69
+ {
70
+ "user_id_col": "user_id",
71
+ "time_col": "base_month",
72
+ "segment_col": "segment",
73
+ "cluster_col": "cluster_kmeans",
74
+ "exclude_cols": ["user_id", "base_month", "segment"],
75
+ "include_cols": null,
76
+ "n_neighbors": 50,
77
+ "min_dist": 0.1,
78
+ "random_state": 42,
79
+ "skip_time": false
80
+ }
81
+ ```
82
+ ## Preset Configurations
83
+
84
+ ### Default (General Purpose)
85
+ ```bash
86
+ tedcheck data.csv --preset default
87
+ ```
88
+
89
+ ### E-commerce
90
+ ```bash
91
+ tedcheck data.csv --preset ecommerce
92
+ # Uses: customer_id, purchase_month, customer_tier
93
+ ```
94
+
95
+ ### SaaS
96
+ ```bash
97
+ tedcheck data.csv --preset saas
98
+ # Uses: account_id, billing_month, account_segment
99
+ ```
100
+
101
+ ## CLI Options
102
+
103
+ ```
104
+ Usage: tedcheck <csv_file> [OPTIONS]
105
+
106
+ Options:
107
+ --base-month <value> Filter by specific month
108
+ --user-id <col> User ID column name
109
+ --time-col <col> Time column name
110
+ --segment-col <col> Segment column name
111
+ --cluster-col <col> Cluster column name
112
+ --exclude-cols <col1,col2> Columns to exclude
113
+ --include-cols <col1,col2> Columns to include only
114
+ --n-neighbors <int> UMAP n_neighbors
115
+ --min-dist <float> UMAP min_dist
116
+ --metrics Calculate and save metrics
117
+ --config <json_file> Load config from JSON
118
+ --preset <name> Load preset (default, ecommerce, saas)
119
+ --skip-time Skip time column if not available
120
+ ```
121
+
122
+ ## Output Files
123
+
124
+ - `*_umap_results.csv` - UMAP coordinates with user IDs and segments
125
+ - `umap_segment_*.html` - Interactive visualizations by month (if time column exists)
126
+ - `umap_segment_all.html` - Single visualization (if no time column)
127
+ - `umap_metrics.json` - Quality metrics (with `--metrics` flag)
128
+ - `feature_importance.csv` - Feature importance scores (with `--metrics` flag)
129
+
130
+ ## Metrics Explained
131
+
132
+ ### Silhouette Score
133
+ - Range: -1 to 1
134
+ - 1: Well-separated clusters
135
+ - 0: Overlapping clusters
136
+ - -1: Incorrect assignment
137
+
138
+ ### Feature Importance
139
+ - Importance of each feature in UMAP dimensions
140
+ - Higher = More influential
141
+
142
+ ### Purity Metrics
143
+ - **Homogeneity**: Segmentation purity (0-1)
144
+ - **Completeness**: Cluster completeness (0-1)
145
+ - **V-Measure**: Harmonic mean of homogeneity and completeness (0-1)
146
+
147
+ ## Package Structure
148
+
149
+ ```
150
+ tedcheck/
151
+ ├── __init__.py # Package initialization
152
+ ├── config.py # Configuration class
153
+ ├── features.py # Core UMAP functions
154
+ ├── cli.py # Command-line interface
155
+ ├── utils.py # Utility functions
156
+ ├── exceptions.py # Custom exceptions
157
+ ├── logger.py # Logging setup
158
+ └── configs/ # Preset configurations
159
+ ├── default.json
160
+ ├── ecommerce.json
161
+ └── saas.json
162
+ ```
163
+
164
+ ## API Reference
165
+
166
+ ### `Config` Class
167
+ ```python
168
+ from tedcheck import Config
169
+
170
+ config = Config(
171
+ user_id_col='id',
172
+ segment_col='tier',
173
+ skip_time=True
174
+ )
175
+
176
+ # Validate columns
177
+ missing = config.validate_columns(df)
178
+
179
+ # Load from file
180
+ config = Config.from_json('config.json')
181
+
182
+ # Load preset
183
+ config = Config.from_preset('ecommerce')
184
+
185
+ # Save config
186
+ config.to_json('my_config.json')
187
+ ```
188
+
189
+ ### `apply_umap_reduction()` Function
190
+ ```python
191
+ from tedcheck import apply_umap_reduction, Config
192
+
193
+ df_umap, embedding, num_cols = apply_umap_reduction(
194
+ df,
195
+ config=config
196
+ )
197
+ ```
198
+
199
+ ### `calculate_umap_metrics()` Function
200
+ ```python
201
+ from tedcheck import calculate_umap_metrics
202
+
203
+ metrics = calculate_umap_metrics(
204
+ df_umap,
205
+ embedding,
206
+ df,
207
+ config=config
208
+ )
209
+
210
+ print(metrics['silhouette_avg'])
211
+ print(metrics['feature_importance'])
212
+ print(metrics['purity_metrics'])
213
+ ```
214
+
215
+ ## Troubleshooting
216
+
217
+ ### Missing Column Error
218
+ ```bash
219
+ # Check available columns
220
+ python -c "import pandas as pd; print(pd.read_csv('data.csv').columns.tolist())"
221
+
222
+ # Use --skip-time if time column doesn't exist
223
+ tedcheck data.csv --skip-time
224
+ ```
225
+
226
+ ### Wrong Column Names
227
+ ```bash
228
+ # Specify correct column names
229
+ tedcheck data.csv --user-id id --segment-col group --time-col month
230
+ ```
231
+
232
+ ### Memory Issues
233
+ ```bash
234
+ # Use --include-cols to select only important features
235
+ tedcheck data.csv --include-cols feature1,feature2,feature3
236
+ ```
237
+
238
+ ## Contributing
239
+
240
+ Contributions welcome! Please submit pull requests or issues on GitHub.
241
+
242
+ ## License
243
+
244
+ MIT License - See LICENSE file for details
245
+
246
+ ## Author
247
+
248
+ Tergel Munkhbaatar
249
+
250
+ ## Version
251
+
252
+ 0.1.0
@@ -0,0 +1,28 @@
1
+ [build-system]
2
+ requires = ["setuptools>=45", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "tedcheck"
7
+ version = "0.1.0"
8
+ description = "UMAP Segment Validation tool"
9
+ readme = "README.md"
10
+ requires-python = ">=3.8"
11
+ license = {text = "MIT"}
12
+ authors = [{name = "Tergel Munkhbaatar", email = "tergelitu@example.com"}]
13
+ classifiers = [
14
+ "Development Status :: 3 - Alpha",
15
+ "Intended Audience :: Developers",
16
+ "License :: OSI Approved :: MIT License",
17
+ "Programming Language :: Python :: 3",
18
+ "Programming Language :: Python :: 3.8",
19
+ "Programming Language :: Python :: 3.9",
20
+ "Programming Language :: Python :: 3.10",
21
+ ]
22
+
23
+ [project.urls]
24
+ Homepage = "https://github.com/tergelitu/tedcheck"
25
+ Repository = "https://github.com/tergelitu/tedcheck.git"
26
+
27
+ [project.scripts]
28
+ tedcheck = "tedcheck.cli:main"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,40 @@
1
+ """TEDCHECK - UMAP Segment Validation Tool"""
2
+
3
+ __version__ = "0.1.0"
4
+ __author__ = "Tergel Munkhbaatar"
5
+
6
+ from .config import Config
7
+ from .features import (
8
+ apply_umap_reduction,
9
+ calculate_umap_metrics,
10
+ plot_umap_by_segment,
11
+ )
12
+ from .utils import (
13
+ get_color_pool,
14
+ assign_colors_to_segments,
15
+ )
16
+
17
+ from .exceptions import (
18
+ TEDCheckException,
19
+ ConfigError,
20
+ MissingColumnsError,
21
+ InvalidPresetError,
22
+ DataValidationError,
23
+ )
24
+ from .logger import setup_logger, get_logger
25
+
26
+ __all__ = [
27
+ "Config",
28
+ "apply_umap_reduction",
29
+ "calculate_umap_metrics",
30
+ "plot_umap_by_segment",
31
+ "get_color_pool",
32
+ "assign_colors_to_segments",
33
+ "TEDCheckException",
34
+ "ConfigError",
35
+ "MissingColumnsError",
36
+ "InvalidPresetError",
37
+ "DataValidationError",
38
+ "setup_logger",
39
+ "get_logger",
40
+ ]