additory 0.1.0a4__tar.gz → 0.1.1a1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (164) hide show
  1. additory-0.1.1a1/PKG-INFO +83 -0
  2. additory-0.1.1a1/README.md +50 -0
  3. additory-0.1.1a1/additory/__init__.py +63 -0
  4. additory-0.1.1a1/additory/common/__init__.py +41 -0
  5. additory-0.1.1a1/additory/common/column_selector.py +255 -0
  6. additory-0.1.1a1/additory/common/distributions.py +410 -0
  7. additory-0.1.1a1/additory/common/extractors.py +313 -0
  8. additory-0.1.1a1/additory/common/knn_imputation.py +332 -0
  9. additory-0.1.1a1/additory/common/result.py +380 -0
  10. additory-0.1.1a1/additory/common/strategy_parser.py +243 -0
  11. additory-0.1.1a1/additory/common/unit_conversions.py +338 -0
  12. additory-0.1.1a1/additory/common/validation.py +377 -0
  13. additory-0.1.1a1/additory/core/__init__.py +39 -0
  14. additory-0.1.1a1/additory/core/backend.py +258 -0
  15. additory-0.1.1a1/additory/core/config.py +214 -0
  16. additory-0.1.1a1/additory/core/logging.py +230 -0
  17. additory-0.1.1a1/additory/core/memory_manager.py +209 -0
  18. additory-0.1.1a1/additory/expressions/__init__.py +5 -0
  19. additory-0.1.1a1/additory/expressions/compiler.py +457 -0
  20. additory-0.1.1a1/additory/expressions/engine.py +328 -0
  21. additory-0.1.1a1/additory/expressions/integrity.py +179 -0
  22. additory-0.1.1a1/additory/expressions/loader.py +263 -0
  23. additory-0.1.1a1/additory/expressions/parser.py +372 -0
  24. additory-0.1.1a1/additory/expressions/resolver.py +274 -0
  25. additory-0.1.1a1/additory/functions/__init__.py +1 -0
  26. additory-0.1.1a1/additory/functions/analyze/__init__.py +144 -0
  27. additory-0.1.1a1/additory/functions/analyze/cardinality.py +58 -0
  28. additory-0.1.1a1/additory/functions/analyze/correlations.py +66 -0
  29. additory-0.1.1a1/additory/functions/analyze/distributions.py +53 -0
  30. additory-0.1.1a1/additory/functions/analyze/duplicates.py +49 -0
  31. additory-0.1.1a1/additory/functions/analyze/features.py +61 -0
  32. additory-0.1.1a1/additory/functions/analyze/imputation.py +66 -0
  33. additory-0.1.1a1/additory/functions/analyze/outliers.py +65 -0
  34. additory-0.1.1a1/additory/functions/analyze/patterns.py +65 -0
  35. additory-0.1.1a1/additory/functions/analyze/presets.py +72 -0
  36. additory-0.1.1a1/additory/functions/analyze/quality.py +59 -0
  37. additory-0.1.1a1/additory/functions/analyze/timeseries.py +53 -0
  38. additory-0.1.1a1/additory/functions/analyze/types.py +45 -0
  39. additory-0.1.1a1/additory/functions/expressions/__init__.py +161 -0
  40. additory-0.1.1a1/additory/functions/snapshot/__init__.py +82 -0
  41. additory-0.1.1a1/additory/functions/snapshot/filter.py +119 -0
  42. additory-0.1.1a1/additory/functions/synthetic/__init__.py +113 -0
  43. additory-0.1.1a1/additory/functions/synthetic/mode_detector.py +47 -0
  44. additory-0.1.1a1/additory/functions/synthetic/strategies/__init__.py +1 -0
  45. additory-0.1.1a1/additory/functions/synthetic/strategies/advanced.py +35 -0
  46. additory-0.1.1a1/additory/functions/synthetic/strategies/augmentative.py +160 -0
  47. additory-0.1.1a1/additory/functions/synthetic/strategies/generative.py +168 -0
  48. additory-0.1.1a1/additory/functions/synthetic/strategies/presets.py +116 -0
  49. additory-0.1.1a1/additory/functions/to/__init__.py +188 -0
  50. additory-0.1.1a1/additory/functions/to/lookup.py +351 -0
  51. additory-0.1.1a1/additory/functions/to/merge.py +189 -0
  52. additory-0.1.1a1/additory/functions/to/sort.py +91 -0
  53. additory-0.1.1a1/additory/functions/to/summarize.py +170 -0
  54. additory-0.1.1a1/additory/functions/transform/__init__.py +140 -0
  55. additory-0.1.1a1/additory/functions/transform/datetime.py +79 -0
  56. additory-0.1.1a1/additory/functions/transform/extract.py +85 -0
  57. additory-0.1.1a1/additory/functions/transform/harmonize.py +105 -0
  58. additory-0.1.1a1/additory/functions/transform/knn.py +62 -0
  59. additory-0.1.1a1/additory/functions/transform/onehotencoding.py +68 -0
  60. additory-0.1.1a1/additory/functions/transform/transpose.py +42 -0
  61. additory-0.1.1a1/additory.egg-info/PKG-INFO +83 -0
  62. additory-0.1.1a1/additory.egg-info/SOURCES.txt +65 -0
  63. additory-0.1.1a1/additory.egg-info/dependency_links.txt +1 -0
  64. additory-0.1.1a1/additory.egg-info/requires.txt +13 -0
  65. additory-0.1.1a1/additory.egg-info/top_level.txt +1 -0
  66. additory-0.1.1a1/pyproject.toml +79 -0
  67. additory-0.1.0a4/LICENSE +0 -21
  68. additory-0.1.0a4/PKG-INFO +0 -311
  69. additory-0.1.0a4/README.md +0 -276
  70. additory-0.1.0a4/additory/__init__.py +0 -19
  71. additory-0.1.0a4/additory/analysis/__init__.py +0 -48
  72. additory-0.1.0a4/additory/analysis/cardinality.py +0 -126
  73. additory-0.1.0a4/additory/analysis/correlations.py +0 -124
  74. additory-0.1.0a4/additory/analysis/distributions.py +0 -376
  75. additory-0.1.0a4/additory/analysis/quality.py +0 -158
  76. additory-0.1.0a4/additory/analysis/scan.py +0 -400
  77. additory-0.1.0a4/additory/common/__init__.py +0 -157
  78. additory-0.1.0a4/additory/common/backend.py +0 -371
  79. additory-0.1.0a4/additory/common/column_utils.py +0 -191
  80. additory-0.1.0a4/additory/common/distributions.py +0 -737
  81. additory-0.1.0a4/additory/common/exceptions.py +0 -62
  82. additory-0.1.0a4/additory/common/lists.py +0 -229
  83. additory-0.1.0a4/additory/common/patterns.py +0 -240
  84. additory-0.1.0a4/additory/common/resolver.py +0 -567
  85. additory-0.1.0a4/additory/common/sample_data.py +0 -182
  86. additory-0.1.0a4/additory/common/validation.py +0 -197
  87. additory-0.1.0a4/additory/core/__init__.py +0 -27
  88. additory-0.1.0a4/additory/core/ast_builder.py +0 -165
  89. additory-0.1.0a4/additory/core/backends/__init__.py +0 -23
  90. additory-0.1.0a4/additory/core/backends/arrow_bridge.py +0 -483
  91. additory-0.1.0a4/additory/core/backends/cudf_bridge.py +0 -355
  92. additory-0.1.0a4/additory/core/column_positioning.py +0 -358
  93. additory-0.1.0a4/additory/core/compiler_polars.py +0 -166
  94. additory-0.1.0a4/additory/core/config.py +0 -342
  95. additory-0.1.0a4/additory/core/enhanced_cache_manager.py +0 -1119
  96. additory-0.1.0a4/additory/core/enhanced_matchers.py +0 -473
  97. additory-0.1.0a4/additory/core/enhanced_version_manager.py +0 -325
  98. additory-0.1.0a4/additory/core/executor.py +0 -59
  99. additory-0.1.0a4/additory/core/integrity_manager.py +0 -477
  100. additory-0.1.0a4/additory/core/loader.py +0 -190
  101. additory-0.1.0a4/additory/core/logging.py +0 -24
  102. additory-0.1.0a4/additory/core/memory_manager.py +0 -547
  103. additory-0.1.0a4/additory/core/namespace_manager.py +0 -657
  104. additory-0.1.0a4/additory/core/parser.py +0 -176
  105. additory-0.1.0a4/additory/core/polars_expression_engine.py +0 -601
  106. additory-0.1.0a4/additory/core/registry.py +0 -177
  107. additory-0.1.0a4/additory/core/sample_data_manager.py +0 -492
  108. additory-0.1.0a4/additory/core/user_namespace.py +0 -751
  109. additory-0.1.0a4/additory/core/validator.py +0 -27
  110. additory-0.1.0a4/additory/dynamic_api.py +0 -352
  111. additory-0.1.0a4/additory/expressions/__init__.py +0 -26
  112. additory-0.1.0a4/additory/expressions/engine.py +0 -551
  113. additory-0.1.0a4/additory/expressions/parser.py +0 -176
  114. additory-0.1.0a4/additory/expressions/proxy.py +0 -549
  115. additory-0.1.0a4/additory/expressions/registry.py +0 -313
  116. additory-0.1.0a4/additory/expressions/samples.py +0 -492
  117. additory-0.1.0a4/additory/synthetic/__init__.py +0 -13
  118. additory-0.1.0a4/additory/synthetic/column_name_resolver.py +0 -149
  119. additory-0.1.0a4/additory/synthetic/deduce.py +0 -259
  120. additory-0.1.0a4/additory/synthetic/distributions.py +0 -22
  121. additory-0.1.0a4/additory/synthetic/forecast.py +0 -1132
  122. additory-0.1.0a4/additory/synthetic/linked_list_parser.py +0 -415
  123. additory-0.1.0a4/additory/synthetic/namespace_lookup.py +0 -129
  124. additory-0.1.0a4/additory/synthetic/smote.py +0 -320
  125. additory-0.1.0a4/additory/synthetic/strategies.py +0 -926
  126. additory-0.1.0a4/additory/synthetic/synthesizer.py +0 -713
  127. additory-0.1.0a4/additory/utilities/__init__.py +0 -53
  128. additory-0.1.0a4/additory/utilities/encoding.py +0 -600
  129. additory-0.1.0a4/additory/utilities/games.py +0 -300
  130. additory-0.1.0a4/additory/utilities/keys.py +0 -8
  131. additory-0.1.0a4/additory/utilities/lookup.py +0 -103
  132. additory-0.1.0a4/additory/utilities/matchers.py +0 -216
  133. additory-0.1.0a4/additory/utilities/resolvers.py +0 -286
  134. additory-0.1.0a4/additory/utilities/settings.py +0 -167
  135. additory-0.1.0a4/additory/utilities/units.py +0 -749
  136. additory-0.1.0a4/additory/utilities/validators.py +0 -153
  137. additory-0.1.0a4/additory.egg-info/SOURCES.txt +0 -95
  138. additory-0.1.0a4/documentation/V0.1.0/add_deduce_function.html +0 -759
  139. additory-0.1.0a4/documentation/V0.1.0/add_harmonize_units_function.html +0 -564
  140. additory-0.1.0a4/documentation/V0.1.0/add_onehotencoding_function.html +0 -533
  141. additory-0.1.0a4/documentation/V0.1.0/add_scan_function.html +0 -701
  142. additory-0.1.0a4/documentation/V0.1.0/add_synthetic_function.html +0 -596
  143. additory-0.1.0a4/documentation/V0.1.0/add_to_function.html +0 -707
  144. additory-0.1.0a4/documentation/V0.1.0/expressions.html +0 -597
  145. additory-0.1.0a4/pyproject.toml +0 -48
  146. additory-0.1.0a4/reference/expressions_definitions/age_category_0.1.add +0 -84
  147. additory-0.1.0a4/reference/expressions_definitions/blood_pressure_category_0.1.add +0 -98
  148. additory-0.1.0a4/reference/expressions_definitions/bmi2_0.1.add +0 -41
  149. additory-0.1.0a4/reference/expressions_definitions/bmi3_0.1.add +0 -41
  150. additory-0.1.0a4/reference/expressions_definitions/bmi_0.1.add +0 -77
  151. additory-0.1.0a4/reference/expressions_definitions/bmr_0.1.add +0 -117
  152. additory-0.1.0a4/reference/expressions_definitions/body_fat_percentage_0.1.add +0 -132
  153. additory-0.1.0a4/reference/expressions_definitions/bsa_0.1.add +0 -85
  154. additory-0.1.0a4/reference/expressions_definitions/cholesterol_ratio_0.1.add +0 -85
  155. additory-0.1.0a4/reference/expressions_definitions/fitness_score_0.1.add +0 -129
  156. additory-0.1.0a4/reference/expressions_definitions/ideal_body_weight_0.1.add +0 -88
  157. additory-0.1.0a4/reference/expressions_definitions/manifest.json +0 -35
  158. additory-0.1.0a4/reference/expressions_definitions/waist_hip_ratio_0.1.add +0 -85
  159. additory-0.1.0a4/user_expressions/bmi1_0.1.add +0 -41
  160. additory-0.1.0a4/user_expressions/bmi2_0.1.add +0 -41
  161. additory-0.1.0a4/user_expressions/bmi3_0.1.add +0 -26
  162. additory-0.1.0a4/user_expressions/bmi_0.1.add +0 -26
  163. additory-0.1.0a4/user_expressions/manifest.json +0 -22
  164. {additory-0.1.0a4 → additory-0.1.1a1}/setup.cfg +0 -0
@@ -0,0 +1,83 @@
1
+ Metadata-Version: 2.4
2
+ Name: additory
3
+ Version: 0.1.1a1
4
+ Summary: Data augmentation library with Polars backend
5
+ Author-email: Additory Team <team@additory.dev>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/additory/additory
8
+ Project-URL: Documentation, https://additory.readthedocs.io
9
+ Project-URL: Repository, https://github.com/additory/additory
10
+ Keywords: data,augmentation,polars,dataframe,synthetic
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.8
16
+ Classifier: Programming Language :: Python :: 3.9
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Requires-Python: >=3.8
21
+ Description-Content-Type: text/markdown
22
+ Requires-Dist: polars>=0.20.0
23
+ Requires-Dist: pandas>=2.0.0
24
+ Requires-Dist: numpy>=1.24.0
25
+ Provides-Extra: dev
26
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
27
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
28
+ Requires-Dist: black>=23.0.0; extra == "dev"
29
+ Requires-Dist: mypy>=1.0.0; extra == "dev"
30
+ Requires-Dist: hypothesis>=6.0.0; extra == "dev"
31
+ Provides-Extra: gpu
32
+ Requires-Dist: cudf>=23.0.0; extra == "gpu"
33
+
34
+ # Additory v0.1.1a1
35
+
36
+ Data augmentation library with Polars backend.
37
+
38
+ ## Installation
39
+
40
+ ```bash
41
+ pip install additory
42
+ ```
43
+
44
+ ## Quick Start
45
+
46
+ ```python
47
+ import additory
48
+ import polars as pl
49
+
50
+ # Add columns from reference DataFrame
51
+ result = additory.add.to(df, reference_df, on='id', bring='price')
52
+
53
+ # Transform columns
54
+ result = additory.add.transform(df, mode='onehotencoding', columns=['category'])
55
+
56
+ # Filter data
57
+ result = additory.add.snapshot(df, where='age > 18')
58
+
59
+ # Generate synthetic data
60
+ result = additory.add.synthetic(df, rows=1000)
61
+
62
+ # Analyze data
63
+ result = additory.add.analyze(df, preset='quick')
64
+
65
+ # Evaluate expressions
66
+ result = additory.add.expressions(df, 'inbuilt:bmi')
67
+ ```
68
+
69
+ ## Features
70
+
71
+ - **Blazing Fast**: Built on Polars for maximum performance
72
+ - **Simple API**: Clean, intuitive API with `additory.add.function()` pattern
73
+ - **Flexible**: Works with Polars, pandas, and cuDF
74
+ - **Comprehensive**: 6 main functions covering all data augmentation needs
75
+ - **Well Tested**: 1,023 tests with 90% coverage
76
+
77
+ ## Documentation
78
+
79
+ Visit [https://additory.dev](https://additory.dev) for full documentation.
80
+
81
+ ## License
82
+
83
+ MIT
@@ -0,0 +1,50 @@
1
+ # Additory v0.1.1a1
2
+
3
+ Data augmentation library with Polars backend.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install additory
9
+ ```
10
+
11
+ ## Quick Start
12
+
13
+ ```python
14
+ import additory
15
+ import polars as pl
16
+
17
+ # Add columns from reference DataFrame
18
+ result = additory.add.to(df, reference_df, on='id', bring='price')
19
+
20
+ # Transform columns
21
+ result = additory.add.transform(df, mode='onehotencoding', columns=['category'])
22
+
23
+ # Filter data
24
+ result = additory.add.snapshot(df, where='age > 18')
25
+
26
+ # Generate synthetic data
27
+ result = additory.add.synthetic(df, rows=1000)
28
+
29
+ # Analyze data
30
+ result = additory.add.analyze(df, preset='quick')
31
+
32
+ # Evaluate expressions
33
+ result = additory.add.expressions(df, 'inbuilt:bmi')
34
+ ```
35
+
36
+ ## Features
37
+
38
+ - **Blazing Fast**: Built on Polars for maximum performance
39
+ - **Simple API**: Clean, intuitive API with `additory.add.function()` pattern
40
+ - **Flexible**: Works with Polars, pandas, and cuDF
41
+ - **Comprehensive**: 6 main functions covering all data augmentation needs
42
+ - **Well Tested**: 1,023 tests with 90% coverage
43
+
44
+ ## Documentation
45
+
46
+ Visit [https://additory.dev](https://additory.dev) for full documentation.
47
+
48
+ ## License
49
+
50
+ MIT
@@ -0,0 +1,63 @@
1
+ """
2
+ Additory v0.1.1 - Data Augmentation Library
3
+
4
+ A Polars-first data augmentation library with 6 main functions:
5
+ - to: Add columns from other DataFrames
6
+ - transform: Transform columns (transpose, encode, extract, etc.)
7
+ - snapshot: Filter and select data
8
+ - synthetic: Generate synthetic data
9
+ - analyze: Analyze data quality and patterns
10
+ - expressions: Evaluate expressions and add computed columns
11
+
12
+ Usage:
13
+ import additory
14
+
15
+ # Add columns
16
+ result = additory.add.to(df, reference_df, on='id', bring='price')
17
+
18
+ # Transform columns
19
+ result = additory.add.transform(df, mode='onehotencoding', columns=['category'])
20
+
21
+ # Filter data
22
+ result = additory.add.snapshot(df, where='age > 18')
23
+
24
+ # Generate synthetic data
25
+ result = additory.add.synthetic(df, rows=1000)
26
+
27
+ # Analyze data
28
+ result = additory.add.analyze(df, preset='quick')
29
+
30
+ # Evaluate expressions
31
+ result = additory.add.expressions(df, 'inbuilt:bmi', 'age * 12')
32
+ """
33
+
34
+ from types import SimpleNamespace
35
+
36
+ # Import main functions
37
+ from additory.functions.to import to
38
+ from additory.functions.transform import transform
39
+ from additory.functions.snapshot import snapshot
40
+ from additory.functions.synthetic import synthetic
41
+ from additory.functions.analyze import analyze
42
+ from additory.functions.expressions import expressions
43
+
44
+ # Import configuration functions
45
+ from additory.core.config import set_expressions_folder, set_default_backend
46
+
47
# Single public entry point: the six main functions and the two configuration
# helpers imported above are exposed as attributes of ``add`` so callers write
# ``additory.add.to(...)``, ``additory.add.transform(...)``, and so on.
add = SimpleNamespace(
    to=to,
    transform=transform,
    snapshot=snapshot,
    synthetic=synthetic,
    analyze=analyze,
    expressions=expressions,
    set_expressions_folder=set_expressions_folder,
    set_default_backend=set_default_backend
)

# Package version string
__version__ = "0.1.1a1"

# Public API: only the namespace and the version are exported by
# ``from additory import *``; the individual functions are reached through ``add``.
__all__ = ['add', '__version__']
@@ -0,0 +1,41 @@
1
+ """
2
+ Common utilities for Additory.
3
+
4
+ This module provides shared utilities used across all functions:
5
+ - validation: Input validation
6
+ - strategy_parser: Strategy parsing
7
+ - column_selector: Column selection
8
+ - result: Result wrappers
9
+ - extractors: Feature extractors
10
+ - unit_conversions: Unit conversion utilities
11
+ - knn_imputation: KNN imputation
12
+ - distributions: Distribution generation
13
+ """
14
+
15
+ from .validation import (
16
+ validate_dataframe,
17
+ validate_not_empty,
18
+ validate_column_name,
19
+ validate_positive_integer,
20
+ validate_percentage,
21
+ validate_mode,
22
+ validate_dict,
23
+ validate_list,
24
+ validate_string,
25
+ validate_boolean,
26
+ validate_optional
27
+ )
28
+
29
+ __all__ = [
30
+ 'validate_dataframe',
31
+ 'validate_not_empty',
32
+ 'validate_column_name',
33
+ 'validate_positive_integer',
34
+ 'validate_percentage',
35
+ 'validate_mode',
36
+ 'validate_dict',
37
+ 'validate_list',
38
+ 'validate_string',
39
+ 'validate_boolean',
40
+ 'validate_optional'
41
+ ]
@@ -0,0 +1,255 @@
1
+ """
2
+ Column selection and validation utilities for Additory.
3
+
4
+ Provides pattern matching and type-based column selection.
5
+ """
6
+
7
+ import re
8
+ from typing import Any, List, Optional, Union
9
+ import polars as pl
10
+
11
+
12
def select_columns(df: pl.DataFrame, columns: Union[str, List[str], None]) -> List[str]:
    """
    Resolve a column specification into concrete column names.

    Args:
        df: DataFrame whose columns are being selected
        columns: What to select. ``None`` or ``'*'`` selects every column,
            a string containing ``'*'`` is treated as a wildcard pattern,
            any other string is an exact column name, and a list may mix
            exact names and wildcard patterns.

    Returns:
        List of selected column names

    Raises:
        ValueError: If a pattern matches nothing or a named column is missing
        TypeError: If columns is not None, a string, or a list

    Example:
        # Select all columns
        cols = select_columns(df, '*')

        # Select by pattern
        cols = select_columns(df, 'age_*')

        # Select specific columns
        cols = select_columns(df, ['name', 'email', 'age'])
    """
    # "Everything" can be spelled either as None or as the bare wildcard
    if columns is None or (isinstance(columns, str) and columns == '*'):
        return df.columns

    if isinstance(columns, str):
        # No wildcard: the string must name an existing column exactly
        if '*' not in columns:
            if columns in df.columns:
                return [columns]
            raise ValueError(f"Column '{columns}' not found in DataFrame")

        # Wildcard: keep matching columns in DataFrame order
        hits = [name for name in df.columns if match_pattern(name, columns)]
        if hits:
            return hits
        raise ValueError(f"No columns match pattern '{columns}'")

    if not isinstance(columns, list):
        raise TypeError(
            f"columns must be None, str, or list, got {type(columns).__name__}"
        )

    # Lists may mix patterns and exact names: expand first, then make sure
    # every resulting name really exists
    resolved = expand_column_patterns(df, columns)
    validate_columns_exist(df, resolved)
    return resolved
77
+
78
+
79
def match_pattern(column_name: str, pattern: str) -> bool:
    """
    Check if column name matches pattern.

    ``*`` is the only wildcard and stands for any run of characters
    (including none, and including newlines). Every other character in the
    pattern is matched literally, and the whole column name must match.

    Args:
        column_name: Column name to check
        pattern: Pattern to match ('*', 'prefix_*', '*_suffix', 'exact')

    Returns:
        True if matches, False otherwise

    Example:
        match_pattern('age_years', 'age_*')  # True
        match_pattern('total_age', '*_age')  # True
        match_pattern('age', 'age')  # True
    """
    # Fast paths: identical strings, or the match-everything wildcard
    if pattern == column_name or pattern == '*':
        return True

    # Escape each literal fragment and join the fragments with '.*' so that
    # only '*' keeps a special meaning
    regex_pattern = '.*'.join(re.escape(part) for part in pattern.split('*'))

    # fullmatch (instead of '^...$' with re.match) prevents '$' from matching
    # just before a trailing newline; DOTALL lets '*' span newlines too
    return re.fullmatch(regex_pattern, column_name, flags=re.DOTALL) is not None
111
+
112
+
113
def validate_columns_exist(df: pl.DataFrame, columns: List[str]) -> bool:
    """
    Validate that all columns exist in DataFrame.

    Args:
        df: DataFrame to check
        columns: List of column names to validate

    Returns:
        True if all exist

    Raises:
        ValueError: If any columns are missing. With exactly one missing
            column the message names it; with several, the message lists
            them together with the available columns.

    Example:
        validate_columns_exist(df, ['name', 'age', 'email'])
    """
    # Read the column list once and test membership against a set, instead
    # of re-reading df.columns and scanning it for every requested name
    available = df.columns
    known = set(available)

    missing = [name for name in columns if name not in known]
    if not missing:
        return True

    if len(missing) == 1:
        raise ValueError(f"Column '{missing[0]}' not found in DataFrame")

    raise ValueError(
        f"Columns {missing} not found in DataFrame. "
        f"Available columns: {available}"
    )
145
+
146
+
147
def expand_column_patterns(df: pl.DataFrame, patterns: List[str]) -> List[str]:
    """
    Turn a list of names and wildcard patterns into concrete column names.

    Entries containing ``'*'`` are expanded against the DataFrame's columns;
    entries without a wildcard are passed through unchanged (their existence
    is not checked here). The result keeps first-seen order.

    Args:
        df: DataFrame to expand patterns from
        patterns: List of patterns to expand

    Returns:
        List of expanded column names (no duplicates)

    Raises:
        ValueError: If a wildcard pattern matches no column

    Example:
        # Input: ['age_*', 'total_*']
        # Output: ['age_years', 'age_months', 'total_sales', 'total_orders']
        cols = expand_column_patterns(df, ['age_*', 'total_*'])
    """
    resolved: List[str] = []
    already_added = set()

    def _add(name):
        # Append each name only the first time it is encountered
        if name not in already_added:
            already_added.add(name)
            resolved.append(name)

    for spec in patterns:
        # Plain names are taken as-is
        if '*' not in spec:
            _add(spec)
            continue

        # Wildcards must hit at least one column, even one already collected
        hits = [name for name in df.columns if match_pattern(name, spec)]
        if not hits:
            raise ValueError(f"No columns match pattern '{spec}'")
        for name in hits:
            _add(name)

    return resolved
186
+
187
+
188
def get_column_type(df: pl.DataFrame, column: str) -> str:
    """
    Classify a column's dtype into a coarse type category.

    Args:
        df: DataFrame containing the column
        column: Column name

    Returns:
        Type string ('numeric', 'string', 'datetime', 'boolean', 'other')

    Raises:
        ValueError: If column doesn't exist
    """
    if column not in df.columns:
        raise ValueError(f"Column '{column}' not found in DataFrame")

    dtype = df[column].dtype

    # Signed/unsigned integers and floats
    numeric_dtypes = (
        pl.Int8, pl.Int16, pl.Int32, pl.Int64,
        pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
        pl.Float32, pl.Float64,
    )
    if dtype in numeric_dtypes:
        return 'numeric'

    # Text-like columns
    if dtype in (pl.Utf8, pl.Categorical):
        return 'string'

    # Temporal columns. Datetime is detected with isinstance rather than by
    # membership (presumably because Datetime dtypes carry time-unit and
    # time-zone parameters -- confirm against the polars version in use).
    is_temporal = (
        dtype in (pl.Date, pl.Time, pl.Duration)
        or isinstance(dtype, pl.Datetime)
    )
    if is_temporal:
        return 'datetime'

    # Anything that is not boolean falls through to the catch-all bucket
    return 'boolean' if dtype == pl.Boolean else 'other'
231
+
232
+
233
def filter_columns_by_type(df: pl.DataFrame, columns: List[str], dtype: str) -> List[str]:
    """
    Keep only the columns whose type category equals ``dtype``.

    Args:
        df: DataFrame to filter columns from
        columns: List of columns to filter
        dtype: Type to filter by ('numeric', 'string', 'datetime', 'boolean')

    Returns:
        List of columns matching the type, in the order given

    Raises:
        ValueError: If any listed column does not exist (raised by
            get_column_type)

    Example:
        numeric_cols = filter_columns_by_type(df, all_cols, 'numeric')
    """
    return [name for name in columns if get_column_type(df, name) == dtype]