additory-0.1.0a4-py3-none-any.whl → additory-0.1.1a1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121)
  1. additory/__init__.py +58 -14
  2. additory/common/__init__.py +31 -147
  3. additory/common/column_selector.py +255 -0
  4. additory/common/distributions.py +286 -613
  5. additory/common/extractors.py +313 -0
  6. additory/common/knn_imputation.py +332 -0
  7. additory/common/result.py +380 -0
  8. additory/common/strategy_parser.py +243 -0
  9. additory/common/unit_conversions.py +338 -0
  10. additory/common/validation.py +283 -103
  11. additory/core/__init__.py +34 -22
  12. additory/core/backend.py +258 -0
  13. additory/core/config.py +177 -305
  14. additory/core/logging.py +230 -24
  15. additory/core/memory_manager.py +157 -495
  16. additory/expressions/__init__.py +2 -23
  17. additory/expressions/compiler.py +457 -0
  18. additory/expressions/engine.py +264 -487
  19. additory/expressions/integrity.py +179 -0
  20. additory/expressions/loader.py +263 -0
  21. additory/expressions/parser.py +363 -167
  22. additory/expressions/resolver.py +274 -0
  23. additory/functions/__init__.py +1 -0
  24. additory/functions/analyze/__init__.py +144 -0
  25. additory/functions/analyze/cardinality.py +58 -0
  26. additory/functions/analyze/correlations.py +66 -0
  27. additory/functions/analyze/distributions.py +53 -0
  28. additory/functions/analyze/duplicates.py +49 -0
  29. additory/functions/analyze/features.py +61 -0
  30. additory/functions/analyze/imputation.py +66 -0
  31. additory/functions/analyze/outliers.py +65 -0
  32. additory/functions/analyze/patterns.py +65 -0
  33. additory/functions/analyze/presets.py +72 -0
  34. additory/functions/analyze/quality.py +59 -0
  35. additory/functions/analyze/timeseries.py +53 -0
  36. additory/functions/analyze/types.py +45 -0
  37. additory/functions/expressions/__init__.py +161 -0
  38. additory/functions/snapshot/__init__.py +82 -0
  39. additory/functions/snapshot/filter.py +119 -0
  40. additory/functions/synthetic/__init__.py +113 -0
  41. additory/functions/synthetic/mode_detector.py +47 -0
  42. additory/functions/synthetic/strategies/__init__.py +1 -0
  43. additory/functions/synthetic/strategies/advanced.py +35 -0
  44. additory/functions/synthetic/strategies/augmentative.py +160 -0
  45. additory/functions/synthetic/strategies/generative.py +168 -0
  46. additory/functions/synthetic/strategies/presets.py +116 -0
  47. additory/functions/to/__init__.py +188 -0
  48. additory/functions/to/lookup.py +351 -0
  49. additory/functions/to/merge.py +189 -0
  50. additory/functions/to/sort.py +91 -0
  51. additory/functions/to/summarize.py +170 -0
  52. additory/functions/transform/__init__.py +140 -0
  53. additory/functions/transform/datetime.py +79 -0
  54. additory/functions/transform/extract.py +85 -0
  55. additory/functions/transform/harmonize.py +105 -0
  56. additory/functions/transform/knn.py +62 -0
  57. additory/functions/transform/onehotencoding.py +68 -0
  58. additory/functions/transform/transpose.py +42 -0
  59. additory-0.1.1a1.dist-info/METADATA +83 -0
  60. additory-0.1.1a1.dist-info/RECORD +62 -0
  61. additory/analysis/__init__.py +0 -48
  62. additory/analysis/cardinality.py +0 -126
  63. additory/analysis/correlations.py +0 -124
  64. additory/analysis/distributions.py +0 -376
  65. additory/analysis/quality.py +0 -158
  66. additory/analysis/scan.py +0 -400
  67. additory/common/backend.py +0 -371
  68. additory/common/column_utils.py +0 -191
  69. additory/common/exceptions.py +0 -62
  70. additory/common/lists.py +0 -229
  71. additory/common/patterns.py +0 -240
  72. additory/common/resolver.py +0 -567
  73. additory/common/sample_data.py +0 -182
  74. additory/core/ast_builder.py +0 -165
  75. additory/core/backends/__init__.py +0 -23
  76. additory/core/backends/arrow_bridge.py +0 -483
  77. additory/core/backends/cudf_bridge.py +0 -355
  78. additory/core/column_positioning.py +0 -358
  79. additory/core/compiler_polars.py +0 -166
  80. additory/core/enhanced_cache_manager.py +0 -1119
  81. additory/core/enhanced_matchers.py +0 -473
  82. additory/core/enhanced_version_manager.py +0 -325
  83. additory/core/executor.py +0 -59
  84. additory/core/integrity_manager.py +0 -477
  85. additory/core/loader.py +0 -190
  86. additory/core/namespace_manager.py +0 -657
  87. additory/core/parser.py +0 -176
  88. additory/core/polars_expression_engine.py +0 -601
  89. additory/core/registry.py +0 -177
  90. additory/core/sample_data_manager.py +0 -492
  91. additory/core/user_namespace.py +0 -751
  92. additory/core/validator.py +0 -27
  93. additory/dynamic_api.py +0 -352
  94. additory/expressions/proxy.py +0 -549
  95. additory/expressions/registry.py +0 -313
  96. additory/expressions/samples.py +0 -492
  97. additory/synthetic/__init__.py +0 -13
  98. additory/synthetic/column_name_resolver.py +0 -149
  99. additory/synthetic/deduce.py +0 -259
  100. additory/synthetic/distributions.py +0 -22
  101. additory/synthetic/forecast.py +0 -1132
  102. additory/synthetic/linked_list_parser.py +0 -415
  103. additory/synthetic/namespace_lookup.py +0 -129
  104. additory/synthetic/smote.py +0 -320
  105. additory/synthetic/strategies.py +0 -926
  106. additory/synthetic/synthesizer.py +0 -713
  107. additory/utilities/__init__.py +0 -53
  108. additory/utilities/encoding.py +0 -600
  109. additory/utilities/games.py +0 -300
  110. additory/utilities/keys.py +0 -8
  111. additory/utilities/lookup.py +0 -103
  112. additory/utilities/matchers.py +0 -216
  113. additory/utilities/resolvers.py +0 -286
  114. additory/utilities/settings.py +0 -167
  115. additory/utilities/units.py +0 -749
  116. additory/utilities/validators.py +0 -153
  117. additory-0.1.0a4.dist-info/METADATA +0 -311
  118. additory-0.1.0a4.dist-info/RECORD +0 -72
  119. additory-0.1.0a4.dist-info/licenses/LICENSE +0 -21
  120. {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
  121. {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
additory/functions/transform/onehotencoding.py (new file)
@@ -0,0 +1,68 @@
+ """
+ One-hot encode categorical columns.
+
+ This module provides one-hot encoding functionality for the transform function.
+ """
+
+ import polars as pl
+ from typing import List, Union
+
+ from additory.common.validation import validate_dataframe, validate_not_empty
+ from additory.core.logging import Logger
+
+
+ def perform_onehotencoding(
+     df: pl.DataFrame,
+     columns: Union[str, List[str]]
+ ) -> pl.DataFrame:
+     """
+     One-hot encode categorical columns.
+
+     Args:
+         df: Input DataFrame
+         columns: Column(s) to encode
+
+     Returns:
+         DataFrame with one-hot encoded columns
+
+     Example:
+         >>> # Before: ['category'] = ['A', 'B', 'C']
+         >>> # After: ['category_A', 'category_B', 'category_C'] = [1, 0, 0]
+         >>> result = perform_onehotencoding(df, columns=['category'])
+     """
+     logger = Logger()
+
+     # Validate
+     validate_dataframe(df)
+     validate_not_empty(df)
+
+     # Normalize to list
+     columns_list = [columns] if isinstance(columns, str) else columns
+
+     # Validate columns exist
+     missing = [col for col in columns_list if col not in df.columns]
+     if missing:
+         raise ValueError(f"Columns not found: {missing}")
+
+     logger.info(f"One-hot encoding {len(columns_list)} columns")
+
+     result = df
+
+     # Encode each column
+     for col in columns_list:
+         # Get unique values
+         unique_vals = df[col].unique().drop_nulls().sort().to_list()
+
+         # Create binary columns for each unique value
+         for val in unique_vals:
+             new_col_name = f"{col}_{val}"
+             result = result.with_columns(
+                 (pl.col(col) == val).cast(pl.Int8).alias(new_col_name)
+             )
+
+         # Drop original column
+         result = result.drop(col)
+
+     logger.info(f"One-hot encoding complete: {len(result.columns)} columns")
+
+     return result
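A minimal usage sketch of the new module (assuming additory 0.1.1a1 and polars are installed; the sample frame is hypothetical):

```python
import polars as pl
from additory.functions.transform.onehotencoding import perform_onehotencoding

# Hypothetical sample data: one categorical column plus a passthrough column.
df = pl.DataFrame({"category": ["A", "B", "C", "A"], "value": [1, 2, 3, 4]})

result = perform_onehotencoding(df, columns="category")
# 'category' is replaced by one Int8 indicator column per unique non-null value.
print(result.columns)  # ['value', 'category_A', 'category_B', 'category_C']
```

Polars' built-in `DataFrame.to_dummies` produces a comparable result in one call; the hand-rolled loop above additionally sorts the unique values and pins the indicator dtype to Int8.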
additory/functions/transform/transpose.py (new file)
@@ -0,0 +1,42 @@
+ """
+ Transpose DataFrame (rows ↔ columns).
+
+ This module provides transpose functionality for the transform function.
+ """
+
+ import polars as pl
+ from additory.common.validation import validate_dataframe, validate_not_empty
+ from additory.core.logging import Logger
+
+
+ def perform_transpose(df: pl.DataFrame) -> pl.DataFrame:
+     """
+     Transpose DataFrame (swap rows and columns).
+
+     Args:
+         df: Input DataFrame
+
+     Returns:
+         Transposed DataFrame
+
+     Example:
+         >>> # Before: 3 rows × 4 columns
+         >>> # After: 4 rows × 4 columns ('column' header column + 3 value columns)
+         >>> result = perform_transpose(df)
+     """
+     logger = Logger()
+
+     # Validate
+     validate_dataframe(df)
+     validate_not_empty(df)
+
+     logger.info(f"Transposing DataFrame: {df.shape[0]} rows × {df.shape[1]} columns")
+
+     # Transpose using Polars
+     # The transpose method in Polars swaps rows and columns
+     # include_header=True keeps the original column names as the first column
+     result = df.transpose(include_header=True, header_name='column')
+
+     logger.info(f"Transpose complete: {result.shape[0]} rows × {result.shape[1]} columns")
+
+     return result
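Similarly, a sketch for the new transpose module (same installation assumptions; the frame is hypothetical):

```python
import polars as pl
from additory.functions.transform.transpose import perform_transpose

# Hypothetical sample frame: 2 rows × 3 columns.
df = pl.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]})

result = perform_transpose(df)
# include_header=True keeps the original column names in a leading 'column'
# column, so a 2×3 frame comes back as 3 rows × 3 columns (header + 2 value columns).
print(result)
```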
additory-0.1.1a1.dist-info/METADATA (new file)
@@ -0,0 +1,83 @@
+ Metadata-Version: 2.4
+ Name: additory
+ Version: 0.1.1a1
+ Summary: Data augmentation library with Polars backend
+ Author-email: Additory Team <team@additory.dev>
+ License: MIT
+ Project-URL: Homepage, https://github.com/additory/additory
+ Project-URL: Documentation, https://additory.readthedocs.io
+ Project-URL: Repository, https://github.com/additory/additory
+ Keywords: data,augmentation,polars,dataframe,synthetic
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Developers
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+ Requires-Dist: polars>=0.20.0
+ Requires-Dist: pandas>=2.0.0
+ Requires-Dist: numpy>=1.24.0
+ Provides-Extra: dev
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
+ Requires-Dist: black>=23.0.0; extra == "dev"
+ Requires-Dist: mypy>=1.0.0; extra == "dev"
+ Requires-Dist: hypothesis>=6.0.0; extra == "dev"
+ Provides-Extra: gpu
+ Requires-Dist: cudf>=23.0.0; extra == "gpu"
+
+ # Additory v0.1.1a1
+
+ Data augmentation library with Polars backend.
+
+ ## Installation
+
+ ```bash
+ pip install additory
+ ```
+
+ ## Quick Start
+
+ ```python
+ import additory
+ import polars as pl
+
+ # Add columns from reference DataFrame
+ result = additory.add.to(df, reference_df, on='id', bring='price')
+
+ # Transform columns
+ result = additory.add.transform(df, mode='onehotencoding', columns=['category'])
+
+ # Filter data
+ result = additory.add.snapshot(df, where='age > 18')
+
+ # Generate synthetic data
+ result = additory.add.synthetic(df, rows=1000)
+
+ # Analyze data
+ result = additory.add.analyze(df, preset='quick')
+
+ # Evaluate expressions
+ result = additory.add.expressions(df, 'inbuilt:bmi')
+ ```
+
+ ## Features
+
+ - **Blazing Fast**: Built on Polars for maximum performance
+ - **Simple API**: Clean, intuitive API with `additory.add.function()` pattern
+ - **Flexible**: Works with Polars, pandas, and cuDF
+ - **Comprehensive**: 6 main functions covering all data augmentation needs
+ - **Well Tested**: 1,023 tests with 90% coverage
+
+ ## Documentation
+
+ Visit [https://additory.dev](https://additory.dev) for full documentation.
+
+ ## License
+
+ MIT
additory-0.1.1a1.dist-info/RECORD (new file)
@@ -0,0 +1,62 @@
+ additory/__init__.py,sha256=Ogj8nwpu_ERDNTNVGVxCLpI2O8LaPRI6YD9VC3mugVM,1810
+ additory/common/__init__.py,sha256=b70Ksk5yOw5Lb1hyO3m0iEzoMGnBnxKAHkof9s1B7yw,957
+ additory/common/column_selector.py,sha256=lWoK7sbrUy9m7F9Ya3zKjAvt7LCqJDjWfqv_qblAwqQ,6944
+ additory/common/distributions.py,sha256=aE8oXUXKdtJooeLiRALV2l40lMkxRghvb4bIW30b4mM,11245
+ additory/common/extractors.py,sha256=j10qoEXFWlKfCo0J7HfTC9dbE6eJefFUPaMAtIuQ_No,10756
+ additory/common/knn_imputation.py,sha256=Z0g5MAc88_yqDEph-pP20k8Njw83CA7RmFErTF478Lo,10756
+ additory/common/result.py,sha256=ug68SLnP9qS4qayYhWpFRvSfbz-hJV_GxrrFPLojdqE,12221
+ additory/common/strategy_parser.py,sha256=PE6kaW8JNOGTPOdxbGfCp8LBvK03USfInAt-C1LiOSw,8226
+ additory/common/unit_conversions.py,sha256=H3gaJTFFeIQJK3B7Xo23TX4cA0HfuPAWa79tuT5MciQ,9558
+ additory/common/validation.py,sha256=oOnPR_kQ8Eh5oWyd9z33UglYjgMy0HXV8HOBFqJdV94,9862
+ additory/core/__init__.py,sha256=ZOsJigoh78Mx_KlOPO-HquIYTyZK9jXQEw9AGFFg9us,885
+ additory/core/backend.py,sha256=_EknNteiK5gNJXmWrrTQRMYWQu5eCQEbfkZOL-uexio,6407
+ additory/core/config.py,sha256=Rs9A8wMPFcK0-n07L1I28jVCMJwO8_yUaO4k6t6OOHs,5826
+ additory/core/logging.py,sha256=o5mjEQywAghOO0XdMpyHxOcx-m-RHh7YJo4N0XjeRHI,6811
+ additory/core/memory_manager.py,sha256=xl9ROwHq55th5KAXbC2bf4l5z-6a1dARz1ccUS8g8zA,5898
+ additory/expressions/__init__.py,sha256=V02W3bbG4Nslh3hpCGZvmp6t6g8_Qp2Y8I6KrcaKF7M,99
+ additory/expressions/compiler.py,sha256=cj97jFQpOLkn-7KAqZHXJ0em3CM1rd0a9njkJ2CXDPA,15774
+ additory/expressions/engine.py,sha256=FtM0oB0RI06Zx1tsX68wAKx8uutzHrgxO-bnuQBTdz8,10397
+ additory/expressions/integrity.py,sha256=c52D4GrDAfEZhZAbJuzmkAZ-r-VljtrnYV_PTd2WCio,4769
+ additory/expressions/loader.py,sha256=gx9SVTquQJjqUVDHkDmk04UglPGdto0oehiDk5Xr_F4,7694
+ additory/expressions/parser.py,sha256=_nFam3o-O3wNLptb--XLFkKVACE1mnC_dW6NiVBGwNQ,12907
+ additory/expressions/resolver.py,sha256=AblxnXrSQvaSQYzNKBE-VdJC9l0kyj1nQhUd3KYMB78,8021
+ additory/functions/__init__.py,sha256=ZBYLMHRabOvOGTI1ORTRzYJcAGuYWJJiwiDU_P8xt8Y,25
+ additory/functions/analyze/__init__.py,sha256=OZipi8VCr45Fnd-t3cVR85JhEn6_O4oy3RIVmYDE3Yk,4645
+ additory/functions/analyze/cardinality.py,sha256=_u4NUGs_FjtYZlO5Wdf1KZLkC2E-NXjPf7WbC6jUdh4,2151
+ additory/functions/analyze/correlations.py,sha256=jnZrN05HG4bqR9b_0gQmytx4G2CjiZ_k5sAcK_x5A1o,2159
+ additory/functions/analyze/distributions.py,sha256=lPHPM64RolwJQ_AhhtdZBCsxguzkAfJQDme1JL2sx2U,1880
+ additory/functions/analyze/duplicates.py,sha256=klL4Agi_iyqU7puQu5pK9ljTemN_A32fPFkTH6pWeYM,1486
+ additory/functions/analyze/features.py,sha256=Gx6irn2y6C5Ld-Evdsu2YTc_GKvSzwlVgmEkFOrcFJw,1968
+ additory/functions/analyze/imputation.py,sha256=3aFzGsGbLcB5vzqVUh0N4i85S4wQ5TpF-C-jvRlGsys,2050
+ additory/functions/analyze/outliers.py,sha256=uPEy7pu4TcAmY1CD6thytbL21QHMAEqIMTbqkmwQtYE,2078
+ additory/functions/analyze/patterns.py,sha256=gLGxTn1g0z5W2aHuXXWRGrffA2q0a9NPi79RaaWlXcc,1937
+ additory/functions/analyze/presets.py,sha256=KBNTlPo5A85hxOEsw0pWpisDbWOxyNOGbIh86qDzhAc,1740
+ additory/functions/analyze/quality.py,sha256=0l6rqTEowrKpXvFD1C-2Fh6GbryJFk3o9zxZ3eNPWFE,1886
+ additory/functions/analyze/timeseries.py,sha256=-qgBBieqKzvgXK20Zn5VR8NtCgDUg99yIhRQvAZK3do,1606
+ additory/functions/analyze/types.py,sha256=8CK7XyzWgkjm1U-_8zyqmUV3xbeSp69ZUsItV6ffyoQ,1534
+ additory/functions/expressions/__init__.py,sha256=vwjnlSojjv-ommwXlzztQecRtZj1EViUNrDtbYSOIKM,4499
+ additory/functions/snapshot/__init__.py,sha256=kjp53VzQ1hf9r_WJCRZ9PyaU35YFwzcpV0b1VFWdwLs,2525
+ additory/functions/snapshot/filter.py,sha256=IWSUpX945KGCIJ6xUmoUcD86EHEqLKKC8_LHHxTyUig,3136
+ additory/functions/synthetic/__init__.py,sha256=J67FkrgfebEWc0abwq3sz0Aj3UY0m4mTDn7mbxXrGfg,3728
+ additory/functions/synthetic/mode_detector.py,sha256=3IueJBSaMmGAEmVvZBmRlLHCvu52sI4edE0HsqbAIvw,1332
+ additory/functions/synthetic/strategies/__init__.py,sha256=XjLa6W6DSHK1Z4x1LhzpfAi5BYMVmksYCABpuqcRPM8,30
+ additory/functions/synthetic/strategies/advanced.py,sha256=I4mZiHWGKRTwTDCQhdE5xvqW-CC9y79Br_fMUeL7uQ8,901
+ additory/functions/synthetic/strategies/augmentative.py,sha256=H6T2FFyYBGhv58VmLVf9LS1-2_kih4lMJi8VH4cqzeM,4862
+ additory/functions/synthetic/strategies/generative.py,sha256=VZNnCgkzAuTZVjcDH7G2UaDJsWmnGAouWyWIlJ_1vhc,4967
+ additory/functions/synthetic/strategies/presets.py,sha256=FxeBAo2KYXb-n2QGf5nW8amNDA7tSSVzo1fzCE6HzF0,2917
+ additory/functions/to/__init__.py,sha256=XlG_qayorYwSceKKJy95CiCyiG067WUUwQjSNNVmfYM,6217
+ additory/functions/to/lookup.py,sha256=csVndLKcSYqSlKHqA4MzPE5-7E1Yxv-WKAfclDuen4o,10126
+ additory/functions/to/merge.py,sha256=Y-OcMT3J9Wubz8rGNjiPwG2CjRGEGOzVMeeG_A_LbR0,5320
+ additory/functions/to/sort.py,sha256=QVSSVWdYx9EpFhVjZAoCL_cUd2b6ASMS2CjlNzhhJ7w,2506
+ additory/functions/to/summarize.py,sha256=1l8k8Wjl4W6VDoGHcHz6YoQsTqiAHkWwvwlPiWsEK1k,4816
+ additory/functions/transform/__init__.py,sha256=5XuEyn9LsS8pyxntEnzV8KF2YBernRw7bXkjf9owp-o,5468
+ additory/functions/transform/datetime.py,sha256=LUeh7kwzdYfw8veaQE_U1IXM0P-DKpRPJBHhVRnu2RU,2214
+ additory/functions/transform/extract.py,sha256=XF1ZA0Pdnxri1sh0J6OJDUQwXK5V4FkhJqDdUI0fBUg,2616
+ additory/functions/transform/harmonize.py,sha256=TY0ZYedhGh95GVpbG1-EgauT2e__amBKZkGDeNf9nVk,3150
+ additory/functions/transform/knn.py,sha256=wWay5L8Vgg8nDf_HwzJ7PSa27RdsvfvKLUqYw1125Gk,1676
+ additory/functions/transform/onehotencoding.py,sha256=LibYOfphxrESKNNm-XTByBkjOhaKeitTfvO5mVVC8Rc,1891
+ additory/functions/transform/transpose.py,sha256=Jy-50GgQbBsidBHY-F9YIBmwai1bPSSNzkN5X8n-sUk,1177
+ additory-0.1.1a1.dist-info/METADATA,sha256=R2A_EU0OTYn5OFJUrSPQfrDgAzs8YWta0Z0TY6b5jn4,2432
+ additory-0.1.1a1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+ additory-0.1.1a1.dist-info/top_level.txt,sha256=4zphwXiI6HEl40fdjMXoUp9JNIqQ-tgYWeo3zqKqvEk,9
+ additory-0.1.1a1.dist-info/RECORD,,
additory/analysis/__init__.py (removed)
@@ -1,48 +0,0 @@
- """
- Analysis Module for Data Profiling
-
- Provides comprehensive data analysis capabilities:
- - Distribution detection and fitting
- - Correlation analysis
- - Cardinality analysis
- - Data quality metrics
- - Data profiling and scanning
- """
-
- from additory.analysis.distributions import (
-     detect_distributions,
-     fit_distribution,
-     DistributionFit
- )
- from additory.analysis.correlations import (
-     calculate_correlations,
-     CorrelationResult
- )
- from additory.analysis.cardinality import (
-     analyze_cardinality,
-     CardinalityInfo
- )
- from additory.analysis.quality import (
-     analyze_quality,
-     QualityMetrics
- )
- from additory.analysis.scan import (
-     scan,
-     ScanResult,
-     ColumnInfo
- )
-
- __all__ = [
-     'detect_distributions',
-     'fit_distribution',
-     'DistributionFit',
-     'calculate_correlations',
-     'CorrelationResult',
-     'analyze_cardinality',
-     'CardinalityInfo',
-     'analyze_quality',
-     'QualityMetrics',
-     'scan',
-     'ScanResult',
-     'ColumnInfo',
- ]
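The standalone `additory.analysis` package is removed in 0.1.1a1; judging by the Quick Start in the new METADATA above, profiling now goes through the `additory.add.analyze` entry point. A hedged before/after sketch (the `preset='quick'` argument comes from that Quick Start; the shape of the returned result is not shown in this diff):

```python
import polars as pl
import additory

df = pl.DataFrame({"x": [1, 2, 2, 3], "y": [1.0, 2.0, 4.0, 8.0]})

# 0.1.0a4 (removed API):
#   from additory.analysis import analyze_cardinality, calculate_correlations
#   info = analyze_cardinality(df, "x")

# 0.1.1a1 (per the new METADATA's Quick Start):
result = additory.add.analyze(df, preset="quick")
```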
additory/analysis/cardinality.py (removed)
@@ -1,126 +0,0 @@
- """
- Cardinality Analysis
-
- Analyzes unique values and cardinality of columns.
- """
-
- from dataclasses import dataclass
- from typing import List, Any, Dict
- import polars as pl
-
-
- @dataclass
- class CardinalityInfo:
-     """Cardinality information for a column."""
-     unique_count: int
-     total_count: int
-     ratio: float
-     top_values: List[tuple]  # [(value, count), ...]
-     classification: str  # 'constant', 'low', 'medium', 'high'
-
-     def __repr__(self) -> str:
-         return (
-             f"CardinalityInfo(unique={self.unique_count}, "
-             f"ratio={self.ratio:.2%}, class='{self.classification}')"
-         )
-
-
- def classify_cardinality(ratio: float, unique_count: int) -> str:
-     """
-     Classify cardinality based on ratio and unique count.
-
-     Args:
-         ratio: Unique count / total count
-         unique_count: Number of unique values
-
-     Returns:
-         Classification: 'constant', 'low', 'medium', 'high'
-     """
-     if unique_count == 1:
-         return 'constant'
-     elif ratio >= 0.5:
-         return 'high'
-     elif ratio >= 0.1:
-         return 'medium'
-     else:
-         return 'low'
-
-
- def analyze_cardinality(
-     df: pl.DataFrame,
-     column: str,
-     top_n: int = 10
- ) -> CardinalityInfo:
-     """
-     Analyze cardinality of a column.
-
-     Args:
-         df: Polars DataFrame
-         column: Column name
-         top_n: Number of top values to return
-
-     Returns:
-         CardinalityInfo object
-     """
-     # Get total count (excluding nulls)
-     total_count = df[column].count()
-
-     if total_count == 0:
-         return CardinalityInfo(
-             unique_count=0,
-             total_count=0,
-             ratio=0.0,
-             top_values=[],
-             classification='constant'
-         )
-
-     # Get unique count (excluding nulls)
-     unique_count = df[column].drop_nulls().n_unique()
-
-     # Calculate ratio
-     ratio = unique_count / total_count if total_count > 0 else 0.0
-
-     # Get top values
-     value_counts = (
-         df
-         .group_by(column)
-         .agg(pl.len().alias('count'))
-         .sort('count', descending=True)
-         .head(top_n)
-     )
-
-     top_values = [
-         (row[column], row['count'])
-         for row in value_counts.iter_rows(named=True)
-     ]
-
-     # Classify
-     classification = classify_cardinality(ratio, unique_count)
-
-     return CardinalityInfo(
-         unique_count=unique_count,
-         total_count=total_count,
-         ratio=ratio,
-         top_values=top_values,
-         classification=classification
-     )
-
-
- def analyze_all_cardinality(
-     df: pl.DataFrame,
-     top_n: int = 10
- ) -> Dict[str, CardinalityInfo]:
-     """
-     Analyze cardinality for all columns.
-
-     Args:
-         df: Polars DataFrame
-         top_n: Number of top values to return per column
-
-     Returns:
-         Dictionary mapping column names to CardinalityInfo
-     """
-     return {
-         col: analyze_cardinality(df, col, top_n)
-         for col in df.columns
-     }
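The thresholds in the removed `classify_cardinality` are easy to misread at a glance, so here is its logic restated verbatim from the diff above, with worked examples (the row counts are hypothetical):

```python
def classify_cardinality(ratio: float, unique_count: int) -> str:
    # Thresholds copied from the removed additory/analysis/cardinality.py:
    #   unique_count == 1   -> 'constant'
    #   ratio >= 0.5        -> 'high'
    #   0.1 <= ratio < 0.5  -> 'medium'
    #   ratio < 0.1         -> 'low'
    if unique_count == 1:
        return 'constant'
    elif ratio >= 0.5:
        return 'high'
    elif ratio >= 0.1:
        return 'medium'
    return 'low'

# Worked examples: 120 unique values over 1,000 non-null rows gives
# ratio 0.12 -> 'medium'; an ID-like column with 980 uniques -> 'high'.
assert classify_cardinality(120 / 1000, 120) == 'medium'
assert classify_cardinality(980 / 1000, 980) == 'high'
assert classify_cardinality(1 / 1000, 1) == 'constant'
```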
additory/analysis/correlations.py (removed)
@@ -1,124 +0,0 @@
- """
- Correlation Analysis
-
- Calculates correlations between numeric columns.
- """
-
- from dataclasses import dataclass
- from typing import Dict, List, Tuple
- import numpy as np
- import polars as pl
- from scipy import stats
-
-
- @dataclass
- class CorrelationResult:
-     """Result of correlation analysis between two columns."""
-     column1: str
-     column2: str
-     correlation: float
-     method: str
-     p_value: float = 0.0
-
-
- def calculate_correlations(
-     df: pl.DataFrame,
-     columns: List[str],
-     methods: List[str] = ['pearson', 'spearman'],
-     threshold: float = 0.0
- ) -> List[CorrelationResult]:
-     """
-     Calculate correlations between numeric columns with optimized batch processing.
-
-     Args:
-         df: Polars DataFrame
-         columns: List of numeric column names
-         methods: Correlation methods to calculate
-         threshold: Minimum correlation threshold to report
-
-     Returns:
-         List of CorrelationResult objects (changed from single object for scan.py compatibility)
-     """
-     from concurrent.futures import ThreadPoolExecutor, as_completed
-     import itertools
-
-     if len(columns) < 2:
-         return []
-
-     # Pre-extract all data as numpy arrays for efficiency
-     data_arrays = {}
-     for col in columns:
-         arr = df[col].to_numpy()
-         data_arrays[col] = arr
-
-     # Generate all column pairs
-     column_pairs = list(itertools.combinations(columns, 2))
-
-     results = []
-
-     def calculate_pair_correlations(pair):
-         """Calculate correlations for a single pair of columns."""
-         col1, col2 = pair
-         arr1 = data_arrays[col1]
-         arr2 = data_arrays[col2]
-
-         # Get common non-NaN indices
-         mask = ~(np.isnan(arr1) | np.isnan(arr2))
-         arr1_clean = arr1[mask]
-         arr2_clean = arr2[mask]
-
-         if len(arr1_clean) < 3:
-             return None
-
-         pair_results = {}
-
-         # Calculate all requested methods for this pair
-         for method in methods:
-             try:
-                 if method == 'pearson':
-                     corr, p_value = stats.pearsonr(arr1_clean, arr2_clean)
-                 elif method == 'spearman':
-                     corr, p_value = stats.spearmanr(arr1_clean, arr2_clean)
-                 elif method == 'kendall':
-                     corr, p_value = stats.kendalltau(arr1_clean, arr2_clean)
-                 else:
-                     continue
-
-                 # Only include if above threshold
-                 if abs(corr) >= threshold:
-                     pair_results[method] = {
-                         'correlation': float(corr),
-                         'p_value': float(p_value)
-                     }
-             except Exception:
-                 continue
-
-         if pair_results:
-             return (col1, col2, pair_results)
-         return None
-
-     # Use ThreadPoolExecutor for parallel processing of correlation pairs
-     with ThreadPoolExecutor(max_workers=min(4, len(column_pairs))) as executor:
-         # Submit all pair processing tasks
-         future_to_pair = {
-             executor.submit(calculate_pair_correlations, pair): pair
-             for pair in column_pairs
-         }
-
-         # Collect results as they complete
-         for future in as_completed(future_to_pair):
-             result = future.result()
-             if result is not None:
-                 col1, col2, pair_results = result
-
-                 # Create CorrelationResult objects for each method
-                 for method, corr_data in pair_results.items():
-                     results.append(CorrelationResult(
-                         column1=col1,
-                         column2=col2,
-                         correlation=corr_data['correlation'],
-                         method=method,
-                         p_value=corr_data['p_value']
-                     ))
-
-     return results
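For reference, the per-pair core of the removed helper as a standalone sketch (numpy and scipy as in the original imports; the sample arrays are hypothetical):

```python
import numpy as np
from scipy import stats

x = np.array([1.0, 2.0, 3.0, 4.0, np.nan])
y = np.array([2.1, 3.9, 6.2, 8.1, 10.0])

# Mirror the removed helper: drop positions where either side is NaN,
# require at least 3 remaining points, then compute each method.
mask = ~(np.isnan(x) | np.isnan(y))
xc, yc = x[mask], y[mask]
if len(xc) >= 3:
    pearson_r, pearson_p = stats.pearsonr(xc, yc)
    spearman_r, spearman_p = stats.spearmanr(xc, yc)
    print(f"pearson={pearson_r:.3f} (p={pearson_p:.3g}), "
          f"spearman={spearman_r:.3f} (p={spearman_p:.3g})")
```

The ThreadPoolExecutor fan-out in the removed code parallelized this computation over all column pairs; note that `np.isnan` assumes float arrays, so non-float columns would have needed casting first.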