additory 0.1.0a3__py3-none-any.whl → 0.1.1a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- additory/__init__.py +58 -14
- additory/common/__init__.py +31 -147
- additory/common/column_selector.py +255 -0
- additory/common/distributions.py +286 -613
- additory/common/extractors.py +313 -0
- additory/common/knn_imputation.py +332 -0
- additory/common/result.py +380 -0
- additory/common/strategy_parser.py +243 -0
- additory/common/unit_conversions.py +338 -0
- additory/common/validation.py +283 -103
- additory/core/__init__.py +34 -22
- additory/core/backend.py +258 -0
- additory/core/config.py +177 -305
- additory/core/logging.py +230 -24
- additory/core/memory_manager.py +157 -495
- additory/expressions/__init__.py +2 -23
- additory/expressions/compiler.py +457 -0
- additory/expressions/engine.py +264 -487
- additory/expressions/integrity.py +179 -0
- additory/expressions/loader.py +263 -0
- additory/expressions/parser.py +363 -167
- additory/expressions/resolver.py +274 -0
- additory/functions/__init__.py +1 -0
- additory/functions/analyze/__init__.py +144 -0
- additory/functions/analyze/cardinality.py +58 -0
- additory/functions/analyze/correlations.py +66 -0
- additory/functions/analyze/distributions.py +53 -0
- additory/functions/analyze/duplicates.py +49 -0
- additory/functions/analyze/features.py +61 -0
- additory/functions/analyze/imputation.py +66 -0
- additory/functions/analyze/outliers.py +65 -0
- additory/functions/analyze/patterns.py +65 -0
- additory/functions/analyze/presets.py +72 -0
- additory/functions/analyze/quality.py +59 -0
- additory/functions/analyze/timeseries.py +53 -0
- additory/functions/analyze/types.py +45 -0
- additory/functions/expressions/__init__.py +161 -0
- additory/functions/snapshot/__init__.py +82 -0
- additory/functions/snapshot/filter.py +119 -0
- additory/functions/synthetic/__init__.py +113 -0
- additory/functions/synthetic/mode_detector.py +47 -0
- additory/functions/synthetic/strategies/__init__.py +1 -0
- additory/functions/synthetic/strategies/advanced.py +35 -0
- additory/functions/synthetic/strategies/augmentative.py +160 -0
- additory/functions/synthetic/strategies/generative.py +168 -0
- additory/functions/synthetic/strategies/presets.py +116 -0
- additory/functions/to/__init__.py +188 -0
- additory/functions/to/lookup.py +351 -0
- additory/functions/to/merge.py +189 -0
- additory/functions/to/sort.py +91 -0
- additory/functions/to/summarize.py +170 -0
- additory/functions/transform/__init__.py +140 -0
- additory/functions/transform/datetime.py +79 -0
- additory/functions/transform/extract.py +85 -0
- additory/functions/transform/harmonize.py +105 -0
- additory/functions/transform/knn.py +62 -0
- additory/functions/transform/onehotencoding.py +68 -0
- additory/functions/transform/transpose.py +42 -0
- additory-0.1.1a1.dist-info/METADATA +83 -0
- additory-0.1.1a1.dist-info/RECORD +62 -0
- additory/analysis/__init__.py +0 -48
- additory/analysis/cardinality.py +0 -126
- additory/analysis/correlations.py +0 -124
- additory/analysis/distributions.py +0 -376
- additory/analysis/quality.py +0 -158
- additory/analysis/scan.py +0 -400
- additory/common/backend.py +0 -371
- additory/common/column_utils.py +0 -191
- additory/common/exceptions.py +0 -62
- additory/common/lists.py +0 -229
- additory/common/patterns.py +0 -240
- additory/common/resolver.py +0 -567
- additory/common/sample_data.py +0 -182
- additory/core/ast_builder.py +0 -165
- additory/core/backends/__init__.py +0 -23
- additory/core/backends/arrow_bridge.py +0 -483
- additory/core/backends/cudf_bridge.py +0 -355
- additory/core/column_positioning.py +0 -358
- additory/core/compiler_polars.py +0 -166
- additory/core/enhanced_cache_manager.py +0 -1119
- additory/core/enhanced_matchers.py +0 -473
- additory/core/enhanced_version_manager.py +0 -325
- additory/core/executor.py +0 -59
- additory/core/integrity_manager.py +0 -477
- additory/core/loader.py +0 -190
- additory/core/namespace_manager.py +0 -657
- additory/core/parser.py +0 -176
- additory/core/polars_expression_engine.py +0 -601
- additory/core/registry.py +0 -176
- additory/core/sample_data_manager.py +0 -492
- additory/core/user_namespace.py +0 -751
- additory/core/validator.py +0 -27
- additory/dynamic_api.py +0 -304
- additory/expressions/proxy.py +0 -549
- additory/expressions/registry.py +0 -313
- additory/expressions/samples.py +0 -492
- additory/synthetic/__init__.py +0 -13
- additory/synthetic/column_name_resolver.py +0 -149
- additory/synthetic/distributions.py +0 -22
- additory/synthetic/forecast.py +0 -1132
- additory/synthetic/linked_list_parser.py +0 -415
- additory/synthetic/namespace_lookup.py +0 -129
- additory/synthetic/smote.py +0 -320
- additory/synthetic/strategies.py +0 -850
- additory/synthetic/synthesizer.py +0 -713
- additory/utilities/__init__.py +0 -53
- additory/utilities/encoding.py +0 -600
- additory/utilities/games.py +0 -300
- additory/utilities/keys.py +0 -8
- additory/utilities/lookup.py +0 -103
- additory/utilities/matchers.py +0 -216
- additory/utilities/resolvers.py +0 -286
- additory/utilities/settings.py +0 -167
- additory/utilities/units.py +0 -749
- additory/utilities/validators.py +0 -153
- additory-0.1.0a3.dist-info/METADATA +0 -288
- additory-0.1.0a3.dist-info/RECORD +0 -71
- additory-0.1.0a3.dist-info/licenses/LICENSE +0 -21
- {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
- {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
"""
|
|
2
|
+
One-hot encode categorical columns.
|
|
3
|
+
|
|
4
|
+
This module provides one-hot encoding functionality for the transform function.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import polars as pl
|
|
8
|
+
from typing import List, Union
|
|
9
|
+
|
|
10
|
+
from additory.common.validation import validate_dataframe, validate_not_empty
|
|
11
|
+
from additory.core.logging import Logger
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def perform_onehotencoding(
    df: pl.DataFrame,
    columns: Union[str, List[str]]
) -> pl.DataFrame:
    """
    One-hot encode categorical columns.

    Each selected column is replaced by one Int8 indicator column per
    unique non-null value, named ``{column}_{value}``.

    Args:
        df: Input DataFrame
        columns: Column(s) to encode; a single name or a list of names

    Returns:
        DataFrame with one-hot encoded columns

    Raises:
        ValueError: If any requested column is not present in ``df``.

    Example:
        >>> # Before: ['category'] = ['A', 'B', 'C']
        >>> # After: ['category_A', 'category_B', 'category_C'] = [1, 0, 0]
        >>> result = perform_onehotencoding(df, columns=['category'])
    """
    logger = Logger()

    # Validate
    validate_dataframe(df)
    validate_not_empty(df)

    # Normalize to list
    columns_list = [columns] if isinstance(columns, str) else columns

    # Validate columns exist (fail fast before any work is done)
    missing = [col for col in columns_list if col not in df.columns]
    if missing:
        raise ValueError(f"Columns not found: {missing}")

    logger.info(f"One-hot encoding {len(columns_list)} columns")

    result = df

    # Encode each column. Build every indicator expression first, then apply
    # them in a single with_columns call per source column -- one DataFrame
    # rebuild per column instead of one per unique value. Column order and
    # values are identical to applying the expressions one at a time.
    for col in columns_list:
        # Unique non-null values, sorted so output column order is stable
        unique_vals = df[col].unique().drop_nulls().sort().to_list()

        indicator_exprs = [
            (pl.col(col) == val).cast(pl.Int8).alias(f"{col}_{val}")
            for val in unique_vals
        ]
        if indicator_exprs:
            result = result.with_columns(indicator_exprs)

        # Drop original column
        result = result.drop(col)

    logger.info(f"One-hot encoding complete: {len(result.columns)} columns")

    return result
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Transpose DataFrame (rows ↔ columns).
|
|
3
|
+
|
|
4
|
+
This module provides transpose functionality for the transform function.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import polars as pl
|
|
8
|
+
from additory.common.validation import validate_dataframe, validate_not_empty
|
|
9
|
+
from additory.core.logging import Logger
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def perform_transpose(df: pl.DataFrame) -> pl.DataFrame:
    """
    Transpose a DataFrame, swapping its rows with its columns.

    The original column names are retained as the first column of the
    output, named ``'column'``.

    Args:
        df: Input DataFrame

    Returns:
        Transposed DataFrame

    Example:
        >>> # Before: 3 rows × 4 columns
        >>> # After: 4 rows × 3 columns
        >>> result = perform_transpose(df)
    """
    logger = Logger()

    # Input checks
    validate_dataframe(df)
    validate_not_empty(df)

    logger.info(f"Transposing DataFrame: {df.shape[0]} rows × {df.shape[1]} columns")

    # Polars' transpose swaps rows and columns; include_header=True keeps
    # the original column names as a leading 'column' column.
    transposed = df.transpose(include_header=True, header_name='column')

    n_rows, n_cols = transposed.shape
    logger.info(f"Transpose complete: {n_rows} rows × {n_cols} columns")

    return transposed
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: additory
|
|
3
|
+
Version: 0.1.1a1
|
|
4
|
+
Summary: Data augmentation library with Polars backend
|
|
5
|
+
Author-email: Additory Team <team@additory.dev>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/additory/additory
|
|
8
|
+
Project-URL: Documentation, https://additory.readthedocs.io
|
|
9
|
+
Project-URL: Repository, https://github.com/additory/additory
|
|
10
|
+
Keywords: data,augmentation,polars,dataframe,synthetic
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Requires-Python: >=3.8
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
Requires-Dist: polars>=0.20.0
|
|
23
|
+
Requires-Dist: pandas>=2.0.0
|
|
24
|
+
Requires-Dist: numpy>=1.24.0
|
|
25
|
+
Provides-Extra: dev
|
|
26
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
27
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
28
|
+
Requires-Dist: black>=23.0.0; extra == "dev"
|
|
29
|
+
Requires-Dist: mypy>=1.0.0; extra == "dev"
|
|
30
|
+
Requires-Dist: hypothesis>=6.0.0; extra == "dev"
|
|
31
|
+
Provides-Extra: gpu
|
|
32
|
+
Requires-Dist: cudf>=23.0.0; extra == "gpu"
|
|
33
|
+
|
|
34
|
+
# Additory v0.1.1a1
|
|
35
|
+
|
|
36
|
+
Data augmentation library with Polars backend.
|
|
37
|
+
|
|
38
|
+
## Installation
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
pip install additory
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Quick Start
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
import additory
|
|
48
|
+
import polars as pl
|
|
49
|
+
|
|
50
|
+
# Add columns from reference DataFrame
|
|
51
|
+
result = additory.add.to(df, reference_df, on='id', bring='price')
|
|
52
|
+
|
|
53
|
+
# Transform columns
|
|
54
|
+
result = additory.add.transform(df, mode='onehotencoding', columns=['category'])
|
|
55
|
+
|
|
56
|
+
# Filter data
|
|
57
|
+
result = additory.add.snapshot(df, where='age > 18')
|
|
58
|
+
|
|
59
|
+
# Generate synthetic data
|
|
60
|
+
result = additory.add.synthetic(df, rows=1000)
|
|
61
|
+
|
|
62
|
+
# Analyze data
|
|
63
|
+
result = additory.add.analyze(df, preset='quick')
|
|
64
|
+
|
|
65
|
+
# Evaluate expressions
|
|
66
|
+
result = additory.add.expressions(df, 'inbuilt:bmi')
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Features
|
|
70
|
+
|
|
71
|
+
- **Blazing Fast**: Built on Polars for maximum performance
|
|
72
|
+
- **Simple API**: Clean, intuitive API with `additory.add.function()` pattern
|
|
73
|
+
- **Flexible**: Works with Polars, pandas, and cuDF
|
|
74
|
+
- **Comprehensive**: 6 main functions covering all data augmentation needs
|
|
75
|
+
- **Well Tested**: 1,023 tests with 90% coverage
|
|
76
|
+
|
|
77
|
+
## Documentation
|
|
78
|
+
|
|
79
|
+
Visit [https://additory.dev](https://additory.dev) for full documentation.
|
|
80
|
+
|
|
81
|
+
## License
|
|
82
|
+
|
|
83
|
+
MIT
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
additory/__init__.py,sha256=Ogj8nwpu_ERDNTNVGVxCLpI2O8LaPRI6YD9VC3mugVM,1810
|
|
2
|
+
additory/common/__init__.py,sha256=b70Ksk5yOw5Lb1hyO3m0iEzoMGnBnxKAHkof9s1B7yw,957
|
|
3
|
+
additory/common/column_selector.py,sha256=lWoK7sbrUy9m7F9Ya3zKjAvt7LCqJDjWfqv_qblAwqQ,6944
|
|
4
|
+
additory/common/distributions.py,sha256=aE8oXUXKdtJooeLiRALV2l40lMkxRghvb4bIW30b4mM,11245
|
|
5
|
+
additory/common/extractors.py,sha256=j10qoEXFWlKfCo0J7HfTC9dbE6eJefFUPaMAtIuQ_No,10756
|
|
6
|
+
additory/common/knn_imputation.py,sha256=Z0g5MAc88_yqDEph-pP20k8Njw83CA7RmFErTF478Lo,10756
|
|
7
|
+
additory/common/result.py,sha256=ug68SLnP9qS4qayYhWpFRvSfbz-hJV_GxrrFPLojdqE,12221
|
|
8
|
+
additory/common/strategy_parser.py,sha256=PE6kaW8JNOGTPOdxbGfCp8LBvK03USfInAt-C1LiOSw,8226
|
|
9
|
+
additory/common/unit_conversions.py,sha256=H3gaJTFFeIQJK3B7Xo23TX4cA0HfuPAWa79tuT5MciQ,9558
|
|
10
|
+
additory/common/validation.py,sha256=oOnPR_kQ8Eh5oWyd9z33UglYjgMy0HXV8HOBFqJdV94,9862
|
|
11
|
+
additory/core/__init__.py,sha256=ZOsJigoh78Mx_KlOPO-HquIYTyZK9jXQEw9AGFFg9us,885
|
|
12
|
+
additory/core/backend.py,sha256=_EknNteiK5gNJXmWrrTQRMYWQu5eCQEbfkZOL-uexio,6407
|
|
13
|
+
additory/core/config.py,sha256=Rs9A8wMPFcK0-n07L1I28jVCMJwO8_yUaO4k6t6OOHs,5826
|
|
14
|
+
additory/core/logging.py,sha256=o5mjEQywAghOO0XdMpyHxOcx-m-RHh7YJo4N0XjeRHI,6811
|
|
15
|
+
additory/core/memory_manager.py,sha256=xl9ROwHq55th5KAXbC2bf4l5z-6a1dARz1ccUS8g8zA,5898
|
|
16
|
+
additory/expressions/__init__.py,sha256=V02W3bbG4Nslh3hpCGZvmp6t6g8_Qp2Y8I6KrcaKF7M,99
|
|
17
|
+
additory/expressions/compiler.py,sha256=cj97jFQpOLkn-7KAqZHXJ0em3CM1rd0a9njkJ2CXDPA,15774
|
|
18
|
+
additory/expressions/engine.py,sha256=FtM0oB0RI06Zx1tsX68wAKx8uutzHrgxO-bnuQBTdz8,10397
|
|
19
|
+
additory/expressions/integrity.py,sha256=c52D4GrDAfEZhZAbJuzmkAZ-r-VljtrnYV_PTd2WCio,4769
|
|
20
|
+
additory/expressions/loader.py,sha256=gx9SVTquQJjqUVDHkDmk04UglPGdto0oehiDk5Xr_F4,7694
|
|
21
|
+
additory/expressions/parser.py,sha256=_nFam3o-O3wNLptb--XLFkKVACE1mnC_dW6NiVBGwNQ,12907
|
|
22
|
+
additory/expressions/resolver.py,sha256=AblxnXrSQvaSQYzNKBE-VdJC9l0kyj1nQhUd3KYMB78,8021
|
|
23
|
+
additory/functions/__init__.py,sha256=ZBYLMHRabOvOGTI1ORTRzYJcAGuYWJJiwiDU_P8xt8Y,25
|
|
24
|
+
additory/functions/analyze/__init__.py,sha256=OZipi8VCr45Fnd-t3cVR85JhEn6_O4oy3RIVmYDE3Yk,4645
|
|
25
|
+
additory/functions/analyze/cardinality.py,sha256=_u4NUGs_FjtYZlO5Wdf1KZLkC2E-NXjPf7WbC6jUdh4,2151
|
|
26
|
+
additory/functions/analyze/correlations.py,sha256=jnZrN05HG4bqR9b_0gQmytx4G2CjiZ_k5sAcK_x5A1o,2159
|
|
27
|
+
additory/functions/analyze/distributions.py,sha256=lPHPM64RolwJQ_AhhtdZBCsxguzkAfJQDme1JL2sx2U,1880
|
|
28
|
+
additory/functions/analyze/duplicates.py,sha256=klL4Agi_iyqU7puQu5pK9ljTemN_A32fPFkTH6pWeYM,1486
|
|
29
|
+
additory/functions/analyze/features.py,sha256=Gx6irn2y6C5Ld-Evdsu2YTc_GKvSzwlVgmEkFOrcFJw,1968
|
|
30
|
+
additory/functions/analyze/imputation.py,sha256=3aFzGsGbLcB5vzqVUh0N4i85S4wQ5TpF-C-jvRlGsys,2050
|
|
31
|
+
additory/functions/analyze/outliers.py,sha256=uPEy7pu4TcAmY1CD6thytbL21QHMAEqIMTbqkmwQtYE,2078
|
|
32
|
+
additory/functions/analyze/patterns.py,sha256=gLGxTn1g0z5W2aHuXXWRGrffA2q0a9NPi79RaaWlXcc,1937
|
|
33
|
+
additory/functions/analyze/presets.py,sha256=KBNTlPo5A85hxOEsw0pWpisDbWOxyNOGbIh86qDzhAc,1740
|
|
34
|
+
additory/functions/analyze/quality.py,sha256=0l6rqTEowrKpXvFD1C-2Fh6GbryJFk3o9zxZ3eNPWFE,1886
|
|
35
|
+
additory/functions/analyze/timeseries.py,sha256=-qgBBieqKzvgXK20Zn5VR8NtCgDUg99yIhRQvAZK3do,1606
|
|
36
|
+
additory/functions/analyze/types.py,sha256=8CK7XyzWgkjm1U-_8zyqmUV3xbeSp69ZUsItV6ffyoQ,1534
|
|
37
|
+
additory/functions/expressions/__init__.py,sha256=vwjnlSojjv-ommwXlzztQecRtZj1EViUNrDtbYSOIKM,4499
|
|
38
|
+
additory/functions/snapshot/__init__.py,sha256=kjp53VzQ1hf9r_WJCRZ9PyaU35YFwzcpV0b1VFWdwLs,2525
|
|
39
|
+
additory/functions/snapshot/filter.py,sha256=IWSUpX945KGCIJ6xUmoUcD86EHEqLKKC8_LHHxTyUig,3136
|
|
40
|
+
additory/functions/synthetic/__init__.py,sha256=J67FkrgfebEWc0abwq3sz0Aj3UY0m4mTDn7mbxXrGfg,3728
|
|
41
|
+
additory/functions/synthetic/mode_detector.py,sha256=3IueJBSaMmGAEmVvZBmRlLHCvu52sI4edE0HsqbAIvw,1332
|
|
42
|
+
additory/functions/synthetic/strategies/__init__.py,sha256=XjLa6W6DSHK1Z4x1LhzpfAi5BYMVmksYCABpuqcRPM8,30
|
|
43
|
+
additory/functions/synthetic/strategies/advanced.py,sha256=I4mZiHWGKRTwTDCQhdE5xvqW-CC9y79Br_fMUeL7uQ8,901
|
|
44
|
+
additory/functions/synthetic/strategies/augmentative.py,sha256=H6T2FFyYBGhv58VmLVf9LS1-2_kih4lMJi8VH4cqzeM,4862
|
|
45
|
+
additory/functions/synthetic/strategies/generative.py,sha256=VZNnCgkzAuTZVjcDH7G2UaDJsWmnGAouWyWIlJ_1vhc,4967
|
|
46
|
+
additory/functions/synthetic/strategies/presets.py,sha256=FxeBAo2KYXb-n2QGf5nW8amNDA7tSSVzo1fzCE6HzF0,2917
|
|
47
|
+
additory/functions/to/__init__.py,sha256=XlG_qayorYwSceKKJy95CiCyiG067WUUwQjSNNVmfYM,6217
|
|
48
|
+
additory/functions/to/lookup.py,sha256=csVndLKcSYqSlKHqA4MzPE5-7E1Yxv-WKAfclDuen4o,10126
|
|
49
|
+
additory/functions/to/merge.py,sha256=Y-OcMT3J9Wubz8rGNjiPwG2CjRGEGOzVMeeG_A_LbR0,5320
|
|
50
|
+
additory/functions/to/sort.py,sha256=QVSSVWdYx9EpFhVjZAoCL_cUd2b6ASMS2CjlNzhhJ7w,2506
|
|
51
|
+
additory/functions/to/summarize.py,sha256=1l8k8Wjl4W6VDoGHcHz6YoQsTqiAHkWwvwlPiWsEK1k,4816
|
|
52
|
+
additory/functions/transform/__init__.py,sha256=5XuEyn9LsS8pyxntEnzV8KF2YBernRw7bXkjf9owp-o,5468
|
|
53
|
+
additory/functions/transform/datetime.py,sha256=LUeh7kwzdYfw8veaQE_U1IXM0P-DKpRPJBHhVRnu2RU,2214
|
|
54
|
+
additory/functions/transform/extract.py,sha256=XF1ZA0Pdnxri1sh0J6OJDUQwXK5V4FkhJqDdUI0fBUg,2616
|
|
55
|
+
additory/functions/transform/harmonize.py,sha256=TY0ZYedhGh95GVpbG1-EgauT2e__amBKZkGDeNf9nVk,3150
|
|
56
|
+
additory/functions/transform/knn.py,sha256=wWay5L8Vgg8nDf_HwzJ7PSa27RdsvfvKLUqYw1125Gk,1676
|
|
57
|
+
additory/functions/transform/onehotencoding.py,sha256=LibYOfphxrESKNNm-XTByBkjOhaKeitTfvO5mVVC8Rc,1891
|
|
58
|
+
additory/functions/transform/transpose.py,sha256=Jy-50GgQbBsidBHY-F9YIBmwai1bPSSNzkN5X8n-sUk,1177
|
|
59
|
+
additory-0.1.1a1.dist-info/METADATA,sha256=R2A_EU0OTYn5OFJUrSPQfrDgAzs8YWta0Z0TY6b5jn4,2432
|
|
60
|
+
additory-0.1.1a1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
61
|
+
additory-0.1.1a1.dist-info/top_level.txt,sha256=4zphwXiI6HEl40fdjMXoUp9JNIqQ-tgYWeo3zqKqvEk,9
|
|
62
|
+
additory-0.1.1a1.dist-info/RECORD,,
|
additory/analysis/__init__.py
DELETED
|
@@ -1,48 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Analysis Module for Data Profiling
|
|
3
|
-
|
|
4
|
-
Provides comprehensive data analysis capabilities:
|
|
5
|
-
- Distribution detection and fitting
|
|
6
|
-
- Correlation analysis
|
|
7
|
-
- Cardinality analysis
|
|
8
|
-
- Data quality metrics
|
|
9
|
-
- Data profiling and scanning
|
|
10
|
-
"""
|
|
11
|
-
|
|
12
|
-
from additory.analysis.distributions import (
|
|
13
|
-
detect_distributions,
|
|
14
|
-
fit_distribution,
|
|
15
|
-
DistributionFit
|
|
16
|
-
)
|
|
17
|
-
from additory.analysis.correlations import (
|
|
18
|
-
calculate_correlations,
|
|
19
|
-
CorrelationResult
|
|
20
|
-
)
|
|
21
|
-
from additory.analysis.cardinality import (
|
|
22
|
-
analyze_cardinality,
|
|
23
|
-
CardinalityInfo
|
|
24
|
-
)
|
|
25
|
-
from additory.analysis.quality import (
|
|
26
|
-
analyze_quality,
|
|
27
|
-
QualityMetrics
|
|
28
|
-
)
|
|
29
|
-
from additory.analysis.scan import (
|
|
30
|
-
scan,
|
|
31
|
-
ScanResult,
|
|
32
|
-
ColumnInfo
|
|
33
|
-
)
|
|
34
|
-
|
|
35
|
-
__all__ = [
|
|
36
|
-
'detect_distributions',
|
|
37
|
-
'fit_distribution',
|
|
38
|
-
'DistributionFit',
|
|
39
|
-
'calculate_correlations',
|
|
40
|
-
'CorrelationResult',
|
|
41
|
-
'analyze_cardinality',
|
|
42
|
-
'CardinalityInfo',
|
|
43
|
-
'analyze_quality',
|
|
44
|
-
'QualityMetrics',
|
|
45
|
-
'scan',
|
|
46
|
-
'ScanResult',
|
|
47
|
-
'ColumnInfo',
|
|
48
|
-
]
|
additory/analysis/cardinality.py
DELETED
|
@@ -1,126 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Cardinality Analysis
|
|
3
|
-
|
|
4
|
-
Analyzes unique values and cardinality of columns.
|
|
5
|
-
"""
|
|
6
|
-
|
|
7
|
-
from dataclasses import dataclass
|
|
8
|
-
from typing import List, Any, Dict
|
|
9
|
-
import polars as pl
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
@dataclass
class CardinalityInfo:
    """Cardinality information for a column."""
    unique_count: int            # distinct non-null values
    total_count: int             # total non-null values
    ratio: float                 # unique_count / total_count
    top_values: List[tuple]      # [(value, count), ...] most frequent first
    classification: str          # 'constant', 'low', 'medium', 'high'

    def __repr__(self) -> str:
        # Compact summary: distinct count, uniqueness ratio, and class.
        return f"CardinalityInfo(unique={self.unique_count}, ratio={self.ratio:.2%}, class='{self.classification}')"
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
def classify_cardinality(ratio: float, unique_count: int) -> str:
    """
    Classify cardinality based on ratio and unique count.

    Args:
        ratio: Unique count / total count
        unique_count: Number of unique values

    Returns:
        Classification: 'constant', 'low', 'medium', 'high'
    """
    # A single distinct value is constant regardless of the ratio.
    if unique_count == 1:
        return 'constant'
    # Otherwise bucket purely on the uniqueness ratio.
    if ratio >= 0.5:
        return 'high'
    if ratio >= 0.1:
        return 'medium'
    return 'low'
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
def analyze_cardinality(
    df: pl.DataFrame,
    column: str,
    top_n: int = 10
) -> CardinalityInfo:
    """
    Analyze cardinality of a column.

    Args:
        df: Polars DataFrame
        column: Column name
        top_n: Number of top values to return

    Returns:
        CardinalityInfo object
    """
    series = df[column]

    # Total count excludes nulls.
    total_count = series.count()

    # An empty / all-null column has nothing to analyze.
    if total_count == 0:
        return CardinalityInfo(
            unique_count=0,
            total_count=0,
            ratio=0.0,
            top_values=[],
            classification='constant',
        )

    # Distinct non-null values.
    unique_count = series.drop_nulls().n_unique()

    # Uniqueness ratio.
    ratio = unique_count / total_count if total_count > 0 else 0.0

    # Most frequent values, largest count first.
    value_counts = (
        df
        .group_by(column)
        .agg(pl.len().alias('count'))
        .sort('count', descending=True)
        .head(top_n)
    )
    top_values = [
        (row[column], row['count'])
        for row in value_counts.iter_rows(named=True)
    ]

    return CardinalityInfo(
        unique_count=unique_count,
        total_count=total_count,
        ratio=ratio,
        top_values=top_values,
        classification=classify_cardinality(ratio, unique_count),
    )
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
def analyze_all_cardinality(
    df: pl.DataFrame,
    top_n: int = 10
) -> Dict[str, CardinalityInfo]:
    """
    Analyze cardinality for all columns.

    Args:
        df: Polars DataFrame
        top_n: Number of top values to return per column

    Returns:
        Dictionary mapping column names to CardinalityInfo
    """
    results: Dict[str, CardinalityInfo] = {}
    for column_name in df.columns:
        results[column_name] = analyze_cardinality(df, column_name, top_n)
    return results
|
|
@@ -1,124 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Correlation Analysis
|
|
3
|
-
|
|
4
|
-
Calculates correlations between numeric columns.
|
|
5
|
-
"""
|
|
6
|
-
|
|
7
|
-
from dataclasses import dataclass
|
|
8
|
-
from typing import Dict, List, Tuple
|
|
9
|
-
import numpy as np
|
|
10
|
-
import polars as pl
|
|
11
|
-
from scipy import stats
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
@dataclass
class CorrelationResult:
    """Result of correlation analysis between two columns.

    One instance represents a single correlation measurement between one
    pair of columns using one method.
    """
    column1: str  # first column of the pair
    column2: str  # second column of the pair
    correlation: float  # correlation coefficient
    method: str  # method used, e.g. 'pearson', 'spearman', 'kendall'
    p_value: float = 0.0  # significance; defaults to 0.0 when not computed
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
def calculate_correlations(
    df: pl.DataFrame,
    columns: List[str],
    methods: "List[str] | None" = None,
    threshold: float = 0.0
) -> List[CorrelationResult]:
    """
    Calculate correlations between numeric columns with optimized batch processing.

    Args:
        df: Polars DataFrame
        columns: List of numeric column names
        methods: Correlation methods to calculate ('pearson', 'spearman',
            'kendall'). Defaults to ['pearson', 'spearman'].
        threshold: Minimum absolute correlation threshold to report

    Returns:
        List of CorrelationResult objects (changed from single object for scan.py compatibility)
    """
    from concurrent.futures import ThreadPoolExecutor, as_completed
    import itertools

    # Avoid the mutable-default-argument pitfall: resolve the default per call.
    if methods is None:
        methods = ['pearson', 'spearman']

    if len(columns) < 2:
        return []

    # Pre-extract all data as numpy arrays once, instead of per pair
    data_arrays = {col: df[col].to_numpy() for col in columns}

    # Generate all unordered column pairs
    column_pairs = list(itertools.combinations(columns, 2))

    results: List[CorrelationResult] = []

    # Dispatch table: method name -> scipy correlation function.
    method_funcs = {
        'pearson': stats.pearsonr,
        'spearman': stats.spearmanr,
        'kendall': stats.kendalltau,
    }

    def calculate_pair_correlations(pair):
        """Calculate correlations for a single pair of columns."""
        col1, col2 = pair
        arr1 = data_arrays[col1]
        arr2 = data_arrays[col2]

        # Keep only rows where both values are present (non-NaN)
        mask = ~(np.isnan(arr1) | np.isnan(arr2))
        arr1_clean = arr1[mask]
        arr2_clean = arr2[mask]

        # Too few points for a meaningful correlation
        if len(arr1_clean) < 3:
            return None

        pair_results = {}

        for method in methods:
            corr_func = method_funcs.get(method)
            if corr_func is None:
                # Unknown method name: skip silently (matches prior behavior)
                continue
            try:
                corr, p_value = corr_func(arr1_clean, arr2_clean)
            except Exception:
                # scipy can raise on degenerate input (e.g. constant column);
                # skip this method for this pair, as before
                continue

            # Only include if above threshold
            if abs(corr) >= threshold:
                pair_results[method] = {
                    'correlation': float(corr),
                    'p_value': float(p_value),
                }

        return (col1, col2, pair_results) if pair_results else None

    # Use ThreadPoolExecutor for parallel processing of correlation pairs
    with ThreadPoolExecutor(max_workers=min(4, len(column_pairs))) as executor:
        # Submit all pair processing tasks
        future_to_pair = {
            executor.submit(calculate_pair_correlations, pair): pair
            for pair in column_pairs
        }

        # Collect results as they complete
        for future in as_completed(future_to_pair):
            outcome = future.result()
            if outcome is None:
                continue
            col1, col2, pair_results = outcome

            # Create CorrelationResult objects for each method
            for method, corr_data in pair_results.items():
                results.append(CorrelationResult(
                    column1=col1,
                    column2=col2,
                    correlation=corr_data['correlation'],
                    method=method,
                    p_value=corr_data['p_value'],
                ))

    return results
|