summarystatpkg 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- summarystatpkg-0.1.0/PKG-INFO +168 -0
- summarystatpkg-0.1.0/README.md +142 -0
- summarystatpkg-0.1.0/pyproject.toml +41 -0
- summarystatpkg-0.1.0/setup.cfg +4 -0
- summarystatpkg-0.1.0/summarystatpkg.egg-info/PKG-INFO +168 -0
- summarystatpkg-0.1.0/summarystatpkg.egg-info/SOURCES.txt +7 -0
- summarystatpkg-0.1.0/summarystatpkg.egg-info/dependency_links.txt +1 -0
- summarystatpkg-0.1.0/summarystatpkg.egg-info/requires.txt +3 -0
- summarystatpkg-0.1.0/summarystatpkg.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: summarystatpkg
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A smart data-profiling library for pandas DataFrames — basic and advanced column metadata, heterogeneity detection, null-pattern analysis, and categorical correlation discovery.
|
|
5
|
+
Author: Subhajit Bhattacharyya
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/yourusername/summarystatpkg
|
|
8
|
+
Project-URL: Issues, https://github.com/yourusername/summarystatpkg/issues
|
|
9
|
+
Keywords: data profiling,summary statistics,pandas,EDA,metadata,data quality
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
20
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
21
|
+
Requires-Python: >=3.9
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
Requires-Dist: pandas>=1.5
|
|
24
|
+
Requires-Dist: numpy>=1.23
|
|
25
|
+
Requires-Dist: scikit-learn>=1.2
|
|
26
|
+
|
|
27
|
+
# summarystatpkg
|
|
28
|
+
|
|
29
|
+
A smart **data-profiling library** for pandas DataFrames. Goes well beyond `.describe()` — it detects datetime columns, analyses null patterns, flags heterogeneous columns, clusters mixed-format values, and discovers categorical correlations automatically.
|
|
30
|
+
|
|
31
|
+
---
|
|
32
|
+
|
|
33
|
+
## Installation
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
pip install summarystatpkg
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
## Quick Start
|
|
42
|
+
|
|
43
|
+
```python
|
|
44
|
+
import pandas as pd
|
|
45
|
+
from summarystatpkg import csv_metadata, advanced_csv_metadata
|
|
46
|
+
|
|
47
|
+
df = pd.read_csv("your_file.csv")
|
|
48
|
+
|
|
49
|
+
# ── Basic profiling ──────────────────────────────────────────
|
|
50
|
+
basic = csv_metadata(df)
|
|
51
|
+
# Returns a list of dicts, one per column
|
|
52
|
+
|
|
53
|
+
# ── Advanced profiling ───────────────────────────────────────
|
|
54
|
+
advanced = advanced_csv_metadata(df)
|
|
55
|
+
# Returns:
|
|
56
|
+
# advanced["columnMetadata"] → per-column analysis
|
|
57
|
+
# advanced["possibleCorrelation"] → detected column relationships
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
---
|
|
61
|
+
|
|
62
|
+
## What Each Function Does
|
|
63
|
+
|
|
64
|
+
### `csv_metadata(df)`
|
|
65
|
+
Basic column scanner. For every column it returns:
|
|
66
|
+
|
|
67
|
+
| Field | Description |
|
|
68
|
+
|---|---|
|
|
69
|
+
| `name` | Column name |
|
|
70
|
+
| `data_type` | pandas dtype |
|
|
71
|
+
| `notnullpercentage` | % of non-null rows |
|
|
72
|
+
| `uniquepercentage` | % unique values |
|
|
73
|
+
| `top_5_value_counts` | Most frequent values |
|
|
74
|
+
| `mean_value_count` | Mean (numeric) or mean frequency (object) |
|
|
75
|
+
| `std_dev_value_count` | Std dev of above |
|
|
76
|
+
| `max_value` / `min_value` | Range (numeric columns only) |
|
|
77
|
+
| `isdatetime` | Whether the column looks like a datetime |
|
|
78
|
+
|
|
79
|
+
---
|
|
80
|
+
|
|
81
|
+
### `advanced_csv_metadata(df)`
|
|
82
|
+
Smart profiler. Runs on up to 2,000 rows for performance. Per column it runs:
|
|
83
|
+
|
|
84
|
+
**Null pattern analysis** — are null (missing) values clustered or periodic?
|
|
85
|
+
```python
|
|
86
|
+
{
|
|
87
|
+
"has_clusters": True,
|
|
88
|
+
"periodic_pattern": False,
|
|
89
|
+
"common_gap": None
|
|
90
|
+
}
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
**Heterogeneity detection** — entropy + frequency variance score (0–1).
|
|
94
|
+
Scores above 0.5 trigger structural clustering.
|
|
95
|
+
|
|
96
|
+
**Structural clustering** — for heterogeneous columns, values are profiled
|
|
97
|
+
on 12 character-level features and clustered with KMeans. Returns stratified
|
|
98
|
+
sample values per cluster:
|
|
99
|
+
```python
|
|
100
|
+
{
|
|
101
|
+
"cluster_0": {
|
|
102
|
+
"sample_values": ["john@example.com", "alice@corp.io"],
|
|
103
|
+
"dominant_features": ["len_11_20", "has_at"]
|
|
104
|
+
},
|
|
105
|
+
"cluster_1": {
|
|
106
|
+
"sample_values": ["N/A", "unknown"],
|
|
107
|
+
"dominant_features": ["len_0_10"]
|
|
108
|
+
}
|
|
109
|
+
}
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
**Correlation detection** — scans all categorical column pairs for
|
|
113
|
+
`one-to-one` or `many-to-one` relationships:
|
|
114
|
+
```python
|
|
115
|
+
{
|
|
116
|
+
"country_code->country_name": "one-to-one",
|
|
117
|
+
"store_id->region": "many-to-one"
|
|
118
|
+
}
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
---
|
|
122
|
+
|
|
123
|
+
### Individual utility functions
|
|
124
|
+
|
|
125
|
+
```python
|
|
126
|
+
from summarystatpkg import (
|
|
127
|
+
is_datetime_column, # series → bool
|
|
128
|
+
null_clustering_analysis, # (df, col) → dict
|
|
129
|
+
entropy_based_detection, # (df, col) → float [0–1]
|
|
130
|
+
feature_based_clustering, # (df, col) → dict
|
|
131
|
+
detect_correlations_optimized, # df → dict
|
|
132
|
+
)
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
---
|
|
136
|
+
|
|
137
|
+
## Example Output
|
|
138
|
+
|
|
139
|
+
```python
|
|
140
|
+
import pandas as pd
|
|
141
|
+
from summarystatpkg import advanced_csv_metadata
|
|
142
|
+
|
|
143
|
+
df = pd.DataFrame({
|
|
144
|
+
"email": ["a@b.com", "c@d.org", None, "e@f.net"],
|
|
145
|
+
"country": ["US", "UK", "US", "DE"],
|
|
146
|
+
"country_name": ["United States", "United Kingdom", "United States", "Germany"],
|
|
147
|
+
"score": [10, 20, 30, 40],
|
|
148
|
+
})
|
|
149
|
+
|
|
150
|
+
result = advanced_csv_metadata(df)
|
|
151
|
+
print(result["possibleCorrelation"])
|
|
152
|
+
# {'country->country_name': 'one-to-one'}
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
---
|
|
156
|
+
|
|
157
|
+
## Requirements
|
|
158
|
+
|
|
159
|
+
- Python ≥ 3.9
|
|
160
|
+
- pandas ≥ 1.5
|
|
161
|
+
- numpy ≥ 1.23
|
|
162
|
+
- scikit-learn ≥ 1.2
|
|
163
|
+
|
|
164
|
+
---
|
|
165
|
+
|
|
166
|
+
## License
|
|
167
|
+
|
|
168
|
+
MIT
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
# summarystatpkg
|
|
2
|
+
|
|
3
|
+
A smart **data-profiling library** for pandas DataFrames. Goes well beyond `.describe()` — it detects datetime columns, analyses null patterns, flags heterogeneous columns, clusters mixed-format values, and discovers categorical correlations automatically.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## Installation
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install summarystatpkg
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
---
|
|
14
|
+
|
|
15
|
+
## Quick Start
|
|
16
|
+
|
|
17
|
+
```python
|
|
18
|
+
import pandas as pd
|
|
19
|
+
from summarystatpkg import csv_metadata, advanced_csv_metadata
|
|
20
|
+
|
|
21
|
+
df = pd.read_csv("your_file.csv")
|
|
22
|
+
|
|
23
|
+
# ── Basic profiling ──────────────────────────────────────────
|
|
24
|
+
basic = csv_metadata(df)
|
|
25
|
+
# Returns a list of dicts, one per column
|
|
26
|
+
|
|
27
|
+
# ── Advanced profiling ───────────────────────────────────────
|
|
28
|
+
advanced = advanced_csv_metadata(df)
|
|
29
|
+
# Returns:
|
|
30
|
+
# advanced["columnMetadata"] → per-column analysis
|
|
31
|
+
# advanced["possibleCorrelation"] → detected column relationships
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## What Each Function Does
|
|
37
|
+
|
|
38
|
+
### `csv_metadata(df)`
|
|
39
|
+
Basic column scanner. For every column it returns:
|
|
40
|
+
|
|
41
|
+
| Field | Description |
|
|
42
|
+
|---|---|
|
|
43
|
+
| `name` | Column name |
|
|
44
|
+
| `data_type` | pandas dtype |
|
|
45
|
+
| `notnullpercentage` | % of non-null rows |
|
|
46
|
+
| `uniquepercentage` | % unique values |
|
|
47
|
+
| `top_5_value_counts` | Most frequent values |
|
|
48
|
+
| `mean_value_count` | Mean (numeric) or mean frequency (object) |
|
|
49
|
+
| `std_dev_value_count` | Std dev of above |
|
|
50
|
+
| `max_value` / `min_value` | Range (numeric columns only) |
|
|
51
|
+
| `isdatetime` | Whether the column looks like a datetime |
|
|
52
|
+
|
|
53
|
+
---
|
|
54
|
+
|
|
55
|
+
### `advanced_csv_metadata(df)`
|
|
56
|
+
Smart profiler. Runs on up to 2,000 rows for performance. Per column it runs:
|
|
57
|
+
|
|
58
|
+
**Null pattern analysis** — are null (missing) values clustered or periodic?
|
|
59
|
+
```python
|
|
60
|
+
{
|
|
61
|
+
"has_clusters": True,
|
|
62
|
+
"periodic_pattern": False,
|
|
63
|
+
"common_gap": None
|
|
64
|
+
}
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
**Heterogeneity detection** — entropy + frequency variance score (0–1).
|
|
68
|
+
Scores above 0.5 trigger structural clustering.
|
|
69
|
+
|
|
70
|
+
**Structural clustering** — for heterogeneous columns, values are profiled
|
|
71
|
+
on 12 character-level features and clustered with KMeans. Returns stratified
|
|
72
|
+
sample values per cluster:
|
|
73
|
+
```python
|
|
74
|
+
{
|
|
75
|
+
"cluster_0": {
|
|
76
|
+
"sample_values": ["john@example.com", "alice@corp.io"],
|
|
77
|
+
"dominant_features": ["len_11_20", "has_at"]
|
|
78
|
+
},
|
|
79
|
+
"cluster_1": {
|
|
80
|
+
"sample_values": ["N/A", "unknown"],
|
|
81
|
+
"dominant_features": ["len_0_10"]
|
|
82
|
+
}
|
|
83
|
+
}
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
**Correlation detection** — scans all categorical column pairs for
|
|
87
|
+
`one-to-one` or `many-to-one` relationships:
|
|
88
|
+
```python
|
|
89
|
+
{
|
|
90
|
+
"country_code->country_name": "one-to-one",
|
|
91
|
+
"store_id->region": "many-to-one"
|
|
92
|
+
}
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
---
|
|
96
|
+
|
|
97
|
+
### Individual utility functions
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
from summarystatpkg import (
|
|
101
|
+
is_datetime_column, # series → bool
|
|
102
|
+
null_clustering_analysis, # (df, col) → dict
|
|
103
|
+
entropy_based_detection, # (df, col) → float [0–1]
|
|
104
|
+
feature_based_clustering, # (df, col) → dict
|
|
105
|
+
detect_correlations_optimized, # df → dict
|
|
106
|
+
)
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
---
|
|
110
|
+
|
|
111
|
+
## Example Output
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
import pandas as pd
|
|
115
|
+
from summarystatpkg import advanced_csv_metadata
|
|
116
|
+
|
|
117
|
+
df = pd.DataFrame({
|
|
118
|
+
"email": ["a@b.com", "c@d.org", None, "e@f.net"],
|
|
119
|
+
"country": ["US", "UK", "US", "DE"],
|
|
120
|
+
"country_name": ["United States", "United Kingdom", "United States", "Germany"],
|
|
121
|
+
"score": [10, 20, 30, 40],
|
|
122
|
+
})
|
|
123
|
+
|
|
124
|
+
result = advanced_csv_metadata(df)
|
|
125
|
+
print(result["possibleCorrelation"])
|
|
126
|
+
# {'country->country_name': 'one-to-one'}
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
---
|
|
130
|
+
|
|
131
|
+
## Requirements
|
|
132
|
+
|
|
133
|
+
- Python ≥ 3.9
|
|
134
|
+
- pandas ≥ 1.5
|
|
135
|
+
- numpy ≥ 1.23
|
|
136
|
+
- scikit-learn ≥ 1.2
|
|
137
|
+
|
|
138
|
+
---
|
|
139
|
+
|
|
140
|
+
## License
|
|
141
|
+
|
|
142
|
+
MIT
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "summarystatpkg"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "A smart data-profiling library for pandas DataFrames — basic and advanced column metadata, heterogeneity detection, null-pattern analysis, and categorical correlation discovery."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = { text = "MIT" }
|
|
11
|
+
authors = [
|
|
12
|
+
{ name = "Subhajit Bhattacharyya" }
|
|
13
|
+
]
|
|
14
|
+
keywords = ["data profiling", "summary statistics", "pandas", "EDA", "metadata", "data quality"]
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Development Status :: 3 - Alpha",
|
|
17
|
+
"Intended Audience :: Developers",
|
|
18
|
+
"Intended Audience :: Science/Research",
|
|
19
|
+
"License :: OSI Approved :: MIT License",
|
|
20
|
+
"Programming Language :: Python :: 3",
|
|
21
|
+
"Programming Language :: Python :: 3.9",
|
|
22
|
+
"Programming Language :: Python :: 3.10",
|
|
23
|
+
"Programming Language :: Python :: 3.11",
|
|
24
|
+
"Programming Language :: Python :: 3.12",
|
|
25
|
+
"Topic :: Scientific/Engineering :: Information Analysis",
|
|
26
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
27
|
+
]
|
|
28
|
+
requires-python = ">=3.9"
|
|
29
|
+
dependencies = [
|
|
30
|
+
"pandas>=1.5",
|
|
31
|
+
"numpy>=1.23",
|
|
32
|
+
"scikit-learn>=1.2",
|
|
33
|
+
]
|
|
34
|
+
|
|
35
|
+
[project.urls]
|
|
36
|
+
Homepage = "https://github.com/yourusername/summarystatpkg"
|
|
37
|
+
Issues = "https://github.com/yourusername/summarystatpkg/issues"
|
|
38
|
+
|
|
39
|
+
[tool.setuptools.packages.find]
|
|
40
|
+
where = ["."]
|
|
41
|
+
include = ["summarystatpkg*"]
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: summarystatpkg
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A smart data-profiling library for pandas DataFrames — basic and advanced column metadata, heterogeneity detection, null-pattern analysis, and categorical correlation discovery.
|
|
5
|
+
Author: Subhajit Bhattacharyya
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/yourusername/summarystatpkg
|
|
8
|
+
Project-URL: Issues, https://github.com/yourusername/summarystatpkg/issues
|
|
9
|
+
Keywords: data profiling,summary statistics,pandas,EDA,metadata,data quality
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
20
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
21
|
+
Requires-Python: >=3.9
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
Requires-Dist: pandas>=1.5
|
|
24
|
+
Requires-Dist: numpy>=1.23
|
|
25
|
+
Requires-Dist: scikit-learn>=1.2
|
|
26
|
+
|
|
27
|
+
# summarystatpkg
|
|
28
|
+
|
|
29
|
+
A smart **data-profiling library** for pandas DataFrames. Goes well beyond `.describe()` — it detects datetime columns, analyses null patterns, flags heterogeneous columns, clusters mixed-format values, and discovers categorical correlations automatically.
|
|
30
|
+
|
|
31
|
+
---
|
|
32
|
+
|
|
33
|
+
## Installation
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
pip install summarystatpkg
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
## Quick Start
|
|
42
|
+
|
|
43
|
+
```python
|
|
44
|
+
import pandas as pd
|
|
45
|
+
from summarystatpkg import csv_metadata, advanced_csv_metadata
|
|
46
|
+
|
|
47
|
+
df = pd.read_csv("your_file.csv")
|
|
48
|
+
|
|
49
|
+
# ── Basic profiling ──────────────────────────────────────────
|
|
50
|
+
basic = csv_metadata(df)
|
|
51
|
+
# Returns a list of dicts, one per column
|
|
52
|
+
|
|
53
|
+
# ── Advanced profiling ───────────────────────────────────────
|
|
54
|
+
advanced = advanced_csv_metadata(df)
|
|
55
|
+
# Returns:
|
|
56
|
+
# advanced["columnMetadata"] → per-column analysis
|
|
57
|
+
# advanced["possibleCorrelation"] → detected column relationships
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
---
|
|
61
|
+
|
|
62
|
+
## What Each Function Does
|
|
63
|
+
|
|
64
|
+
### `csv_metadata(df)`
|
|
65
|
+
Basic column scanner. For every column it returns:
|
|
66
|
+
|
|
67
|
+
| Field | Description |
|
|
68
|
+
|---|---|
|
|
69
|
+
| `name` | Column name |
|
|
70
|
+
| `data_type` | pandas dtype |
|
|
71
|
+
| `notnullpercentage` | % of non-null rows |
|
|
72
|
+
| `uniquepercentage` | % unique values |
|
|
73
|
+
| `top_5_value_counts` | Most frequent values |
|
|
74
|
+
| `mean_value_count` | Mean (numeric) or mean frequency (object) |
|
|
75
|
+
| `std_dev_value_count` | Std dev of above |
|
|
76
|
+
| `max_value` / `min_value` | Range (numeric columns only) |
|
|
77
|
+
| `isdatetime` | Whether the column looks like a datetime |
|
|
78
|
+
|
|
79
|
+
---
|
|
80
|
+
|
|
81
|
+
### `advanced_csv_metadata(df)`
|
|
82
|
+
Smart profiler. Runs on up to 2,000 rows for performance. Per column it runs:
|
|
83
|
+
|
|
84
|
+
**Null pattern analysis** — are null (missing) values clustered or periodic?
|
|
85
|
+
```python
|
|
86
|
+
{
|
|
87
|
+
"has_clusters": True,
|
|
88
|
+
"periodic_pattern": False,
|
|
89
|
+
"common_gap": None
|
|
90
|
+
}
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
**Heterogeneity detection** — entropy + frequency variance score (0–1).
|
|
94
|
+
Scores above 0.5 trigger structural clustering.
|
|
95
|
+
|
|
96
|
+
**Structural clustering** — for heterogeneous columns, values are profiled
|
|
97
|
+
on 12 character-level features and clustered with KMeans. Returns stratified
|
|
98
|
+
sample values per cluster:
|
|
99
|
+
```python
|
|
100
|
+
{
|
|
101
|
+
"cluster_0": {
|
|
102
|
+
"sample_values": ["john@example.com", "alice@corp.io"],
|
|
103
|
+
"dominant_features": ["len_11_20", "has_at"]
|
|
104
|
+
},
|
|
105
|
+
"cluster_1": {
|
|
106
|
+
"sample_values": ["N/A", "unknown"],
|
|
107
|
+
"dominant_features": ["len_0_10"]
|
|
108
|
+
}
|
|
109
|
+
}
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
**Correlation detection** — scans all categorical column pairs for
|
|
113
|
+
`one-to-one` or `many-to-one` relationships:
|
|
114
|
+
```python
|
|
115
|
+
{
|
|
116
|
+
"country_code->country_name": "one-to-one",
|
|
117
|
+
"store_id->region": "many-to-one"
|
|
118
|
+
}
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
---
|
|
122
|
+
|
|
123
|
+
### Individual utility functions
|
|
124
|
+
|
|
125
|
+
```python
|
|
126
|
+
from summarystatpkg import (
|
|
127
|
+
is_datetime_column, # series → bool
|
|
128
|
+
null_clustering_analysis, # (df, col) → dict
|
|
129
|
+
entropy_based_detection, # (df, col) → float [0–1]
|
|
130
|
+
feature_based_clustering, # (df, col) → dict
|
|
131
|
+
detect_correlations_optimized, # df → dict
|
|
132
|
+
)
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
---
|
|
136
|
+
|
|
137
|
+
## Example Output
|
|
138
|
+
|
|
139
|
+
```python
|
|
140
|
+
import pandas as pd
|
|
141
|
+
from summarystatpkg import advanced_csv_metadata
|
|
142
|
+
|
|
143
|
+
df = pd.DataFrame({
|
|
144
|
+
"email": ["a@b.com", "c@d.org", None, "e@f.net"],
|
|
145
|
+
"country": ["US", "UK", "US", "DE"],
|
|
146
|
+
"country_name": ["United States", "United Kingdom", "United States", "Germany"],
|
|
147
|
+
"score": [10, 20, 30, 40],
|
|
148
|
+
})
|
|
149
|
+
|
|
150
|
+
result = advanced_csv_metadata(df)
|
|
151
|
+
print(result["possibleCorrelation"])
|
|
152
|
+
# {'country->country_name': 'one-to-one'}
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
---
|
|
156
|
+
|
|
157
|
+
## Requirements
|
|
158
|
+
|
|
159
|
+
- Python ≥ 3.9
|
|
160
|
+
- pandas ≥ 1.5
|
|
161
|
+
- numpy ≥ 1.23
|
|
162
|
+
- scikit-learn ≥ 1.2
|
|
163
|
+
|
|
164
|
+
---
|
|
165
|
+
|
|
166
|
+
## License
|
|
167
|
+
|
|
168
|
+
MIT
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|