additory 0.1.0a4__tar.gz → 0.1.1a1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- additory-0.1.1a1/PKG-INFO +83 -0
- additory-0.1.1a1/README.md +50 -0
- additory-0.1.1a1/additory/__init__.py +63 -0
- additory-0.1.1a1/additory/common/__init__.py +41 -0
- additory-0.1.1a1/additory/common/column_selector.py +255 -0
- additory-0.1.1a1/additory/common/distributions.py +410 -0
- additory-0.1.1a1/additory/common/extractors.py +313 -0
- additory-0.1.1a1/additory/common/knn_imputation.py +332 -0
- additory-0.1.1a1/additory/common/result.py +380 -0
- additory-0.1.1a1/additory/common/strategy_parser.py +243 -0
- additory-0.1.1a1/additory/common/unit_conversions.py +338 -0
- additory-0.1.1a1/additory/common/validation.py +377 -0
- additory-0.1.1a1/additory/core/__init__.py +39 -0
- additory-0.1.1a1/additory/core/backend.py +258 -0
- additory-0.1.1a1/additory/core/config.py +214 -0
- additory-0.1.1a1/additory/core/logging.py +230 -0
- additory-0.1.1a1/additory/core/memory_manager.py +209 -0
- additory-0.1.1a1/additory/expressions/__init__.py +5 -0
- additory-0.1.1a1/additory/expressions/compiler.py +457 -0
- additory-0.1.1a1/additory/expressions/engine.py +328 -0
- additory-0.1.1a1/additory/expressions/integrity.py +179 -0
- additory-0.1.1a1/additory/expressions/loader.py +263 -0
- additory-0.1.1a1/additory/expressions/parser.py +372 -0
- additory-0.1.1a1/additory/expressions/resolver.py +274 -0
- additory-0.1.1a1/additory/functions/__init__.py +1 -0
- additory-0.1.1a1/additory/functions/analyze/__init__.py +144 -0
- additory-0.1.1a1/additory/functions/analyze/cardinality.py +58 -0
- additory-0.1.1a1/additory/functions/analyze/correlations.py +66 -0
- additory-0.1.1a1/additory/functions/analyze/distributions.py +53 -0
- additory-0.1.1a1/additory/functions/analyze/duplicates.py +49 -0
- additory-0.1.1a1/additory/functions/analyze/features.py +61 -0
- additory-0.1.1a1/additory/functions/analyze/imputation.py +66 -0
- additory-0.1.1a1/additory/functions/analyze/outliers.py +65 -0
- additory-0.1.1a1/additory/functions/analyze/patterns.py +65 -0
- additory-0.1.1a1/additory/functions/analyze/presets.py +72 -0
- additory-0.1.1a1/additory/functions/analyze/quality.py +59 -0
- additory-0.1.1a1/additory/functions/analyze/timeseries.py +53 -0
- additory-0.1.1a1/additory/functions/analyze/types.py +45 -0
- additory-0.1.1a1/additory/functions/expressions/__init__.py +161 -0
- additory-0.1.1a1/additory/functions/snapshot/__init__.py +82 -0
- additory-0.1.1a1/additory/functions/snapshot/filter.py +119 -0
- additory-0.1.1a1/additory/functions/synthetic/__init__.py +113 -0
- additory-0.1.1a1/additory/functions/synthetic/mode_detector.py +47 -0
- additory-0.1.1a1/additory/functions/synthetic/strategies/__init__.py +1 -0
- additory-0.1.1a1/additory/functions/synthetic/strategies/advanced.py +35 -0
- additory-0.1.1a1/additory/functions/synthetic/strategies/augmentative.py +160 -0
- additory-0.1.1a1/additory/functions/synthetic/strategies/generative.py +168 -0
- additory-0.1.1a1/additory/functions/synthetic/strategies/presets.py +116 -0
- additory-0.1.1a1/additory/functions/to/__init__.py +188 -0
- additory-0.1.1a1/additory/functions/to/lookup.py +351 -0
- additory-0.1.1a1/additory/functions/to/merge.py +189 -0
- additory-0.1.1a1/additory/functions/to/sort.py +91 -0
- additory-0.1.1a1/additory/functions/to/summarize.py +170 -0
- additory-0.1.1a1/additory/functions/transform/__init__.py +140 -0
- additory-0.1.1a1/additory/functions/transform/datetime.py +79 -0
- additory-0.1.1a1/additory/functions/transform/extract.py +85 -0
- additory-0.1.1a1/additory/functions/transform/harmonize.py +105 -0
- additory-0.1.1a1/additory/functions/transform/knn.py +62 -0
- additory-0.1.1a1/additory/functions/transform/onehotencoding.py +68 -0
- additory-0.1.1a1/additory/functions/transform/transpose.py +42 -0
- additory-0.1.1a1/additory.egg-info/PKG-INFO +83 -0
- additory-0.1.1a1/additory.egg-info/SOURCES.txt +65 -0
- additory-0.1.1a1/additory.egg-info/dependency_links.txt +1 -0
- additory-0.1.1a1/additory.egg-info/requires.txt +13 -0
- additory-0.1.1a1/additory.egg-info/top_level.txt +1 -0
- additory-0.1.1a1/pyproject.toml +79 -0
- additory-0.1.0a4/LICENSE +0 -21
- additory-0.1.0a4/PKG-INFO +0 -311
- additory-0.1.0a4/README.md +0 -276
- additory-0.1.0a4/additory/__init__.py +0 -19
- additory-0.1.0a4/additory/analysis/__init__.py +0 -48
- additory-0.1.0a4/additory/analysis/cardinality.py +0 -126
- additory-0.1.0a4/additory/analysis/correlations.py +0 -124
- additory-0.1.0a4/additory/analysis/distributions.py +0 -376
- additory-0.1.0a4/additory/analysis/quality.py +0 -158
- additory-0.1.0a4/additory/analysis/scan.py +0 -400
- additory-0.1.0a4/additory/common/__init__.py +0 -157
- additory-0.1.0a4/additory/common/backend.py +0 -371
- additory-0.1.0a4/additory/common/column_utils.py +0 -191
- additory-0.1.0a4/additory/common/distributions.py +0 -737
- additory-0.1.0a4/additory/common/exceptions.py +0 -62
- additory-0.1.0a4/additory/common/lists.py +0 -229
- additory-0.1.0a4/additory/common/patterns.py +0 -240
- additory-0.1.0a4/additory/common/resolver.py +0 -567
- additory-0.1.0a4/additory/common/sample_data.py +0 -182
- additory-0.1.0a4/additory/common/validation.py +0 -197
- additory-0.1.0a4/additory/core/__init__.py +0 -27
- additory-0.1.0a4/additory/core/ast_builder.py +0 -165
- additory-0.1.0a4/additory/core/backends/__init__.py +0 -23
- additory-0.1.0a4/additory/core/backends/arrow_bridge.py +0 -483
- additory-0.1.0a4/additory/core/backends/cudf_bridge.py +0 -355
- additory-0.1.0a4/additory/core/column_positioning.py +0 -358
- additory-0.1.0a4/additory/core/compiler_polars.py +0 -166
- additory-0.1.0a4/additory/core/config.py +0 -342
- additory-0.1.0a4/additory/core/enhanced_cache_manager.py +0 -1119
- additory-0.1.0a4/additory/core/enhanced_matchers.py +0 -473
- additory-0.1.0a4/additory/core/enhanced_version_manager.py +0 -325
- additory-0.1.0a4/additory/core/executor.py +0 -59
- additory-0.1.0a4/additory/core/integrity_manager.py +0 -477
- additory-0.1.0a4/additory/core/loader.py +0 -190
- additory-0.1.0a4/additory/core/logging.py +0 -24
- additory-0.1.0a4/additory/core/memory_manager.py +0 -547
- additory-0.1.0a4/additory/core/namespace_manager.py +0 -657
- additory-0.1.0a4/additory/core/parser.py +0 -176
- additory-0.1.0a4/additory/core/polars_expression_engine.py +0 -601
- additory-0.1.0a4/additory/core/registry.py +0 -177
- additory-0.1.0a4/additory/core/sample_data_manager.py +0 -492
- additory-0.1.0a4/additory/core/user_namespace.py +0 -751
- additory-0.1.0a4/additory/core/validator.py +0 -27
- additory-0.1.0a4/additory/dynamic_api.py +0 -352
- additory-0.1.0a4/additory/expressions/__init__.py +0 -26
- additory-0.1.0a4/additory/expressions/engine.py +0 -551
- additory-0.1.0a4/additory/expressions/parser.py +0 -176
- additory-0.1.0a4/additory/expressions/proxy.py +0 -549
- additory-0.1.0a4/additory/expressions/registry.py +0 -313
- additory-0.1.0a4/additory/expressions/samples.py +0 -492
- additory-0.1.0a4/additory/synthetic/__init__.py +0 -13
- additory-0.1.0a4/additory/synthetic/column_name_resolver.py +0 -149
- additory-0.1.0a4/additory/synthetic/deduce.py +0 -259
- additory-0.1.0a4/additory/synthetic/distributions.py +0 -22
- additory-0.1.0a4/additory/synthetic/forecast.py +0 -1132
- additory-0.1.0a4/additory/synthetic/linked_list_parser.py +0 -415
- additory-0.1.0a4/additory/synthetic/namespace_lookup.py +0 -129
- additory-0.1.0a4/additory/synthetic/smote.py +0 -320
- additory-0.1.0a4/additory/synthetic/strategies.py +0 -926
- additory-0.1.0a4/additory/synthetic/synthesizer.py +0 -713
- additory-0.1.0a4/additory/utilities/__init__.py +0 -53
- additory-0.1.0a4/additory/utilities/encoding.py +0 -600
- additory-0.1.0a4/additory/utilities/games.py +0 -300
- additory-0.1.0a4/additory/utilities/keys.py +0 -8
- additory-0.1.0a4/additory/utilities/lookup.py +0 -103
- additory-0.1.0a4/additory/utilities/matchers.py +0 -216
- additory-0.1.0a4/additory/utilities/resolvers.py +0 -286
- additory-0.1.0a4/additory/utilities/settings.py +0 -167
- additory-0.1.0a4/additory/utilities/units.py +0 -749
- additory-0.1.0a4/additory/utilities/validators.py +0 -153
- additory-0.1.0a4/additory.egg-info/SOURCES.txt +0 -95
- additory-0.1.0a4/documentation/V0.1.0/add_deduce_function.html +0 -759
- additory-0.1.0a4/documentation/V0.1.0/add_harmonize_units_function.html +0 -564
- additory-0.1.0a4/documentation/V0.1.0/add_onehotencoding_function.html +0 -533
- additory-0.1.0a4/documentation/V0.1.0/add_scan_function.html +0 -701
- additory-0.1.0a4/documentation/V0.1.0/add_synthetic_function.html +0 -596
- additory-0.1.0a4/documentation/V0.1.0/add_to_function.html +0 -707
- additory-0.1.0a4/documentation/V0.1.0/expressions.html +0 -597
- additory-0.1.0a4/pyproject.toml +0 -48
- additory-0.1.0a4/reference/expressions_definitions/age_category_0.1.add +0 -84
- additory-0.1.0a4/reference/expressions_definitions/blood_pressure_category_0.1.add +0 -98
- additory-0.1.0a4/reference/expressions_definitions/bmi2_0.1.add +0 -41
- additory-0.1.0a4/reference/expressions_definitions/bmi3_0.1.add +0 -41
- additory-0.1.0a4/reference/expressions_definitions/bmi_0.1.add +0 -77
- additory-0.1.0a4/reference/expressions_definitions/bmr_0.1.add +0 -117
- additory-0.1.0a4/reference/expressions_definitions/body_fat_percentage_0.1.add +0 -132
- additory-0.1.0a4/reference/expressions_definitions/bsa_0.1.add +0 -85
- additory-0.1.0a4/reference/expressions_definitions/cholesterol_ratio_0.1.add +0 -85
- additory-0.1.0a4/reference/expressions_definitions/fitness_score_0.1.add +0 -129
- additory-0.1.0a4/reference/expressions_definitions/ideal_body_weight_0.1.add +0 -88
- additory-0.1.0a4/reference/expressions_definitions/manifest.json +0 -35
- additory-0.1.0a4/reference/expressions_definitions/waist_hip_ratio_0.1.add +0 -85
- additory-0.1.0a4/user_expressions/bmi1_0.1.add +0 -41
- additory-0.1.0a4/user_expressions/bmi2_0.1.add +0 -41
- additory-0.1.0a4/user_expressions/bmi3_0.1.add +0 -26
- additory-0.1.0a4/user_expressions/bmi_0.1.add +0 -26
- additory-0.1.0a4/user_expressions/manifest.json +0 -22
- {additory-0.1.0a4 → additory-0.1.1a1}/setup.cfg +0 -0
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: additory
|
|
3
|
+
Version: 0.1.1a1
|
|
4
|
+
Summary: Data augmentation library with Polars backend
|
|
5
|
+
Author-email: Additory Team <team@additory.dev>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/additory/additory
|
|
8
|
+
Project-URL: Documentation, https://additory.readthedocs.io
|
|
9
|
+
Project-URL: Repository, https://github.com/additory/additory
|
|
10
|
+
Keywords: data,augmentation,polars,dataframe,synthetic
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Requires-Python: >=3.8
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
Requires-Dist: polars>=0.20.0
|
|
23
|
+
Requires-Dist: pandas>=2.0.0
|
|
24
|
+
Requires-Dist: numpy>=1.24.0
|
|
25
|
+
Provides-Extra: dev
|
|
26
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
27
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
28
|
+
Requires-Dist: black>=23.0.0; extra == "dev"
|
|
29
|
+
Requires-Dist: mypy>=1.0.0; extra == "dev"
|
|
30
|
+
Requires-Dist: hypothesis>=6.0.0; extra == "dev"
|
|
31
|
+
Provides-Extra: gpu
|
|
32
|
+
Requires-Dist: cudf>=23.0.0; extra == "gpu"
|
|
33
|
+
|
|
34
|
+
# Additory v0.1.1a1
|
|
35
|
+
|
|
36
|
+
Data augmentation library with Polars backend.
|
|
37
|
+
|
|
38
|
+
## Installation
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
pip install additory
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Quick Start
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
import additory
|
|
48
|
+
import polars as pl
|
|
49
|
+
|
|
50
|
+
# Add columns from reference DataFrame
|
|
51
|
+
result = additory.add.to(df, reference_df, on='id', bring='price')
|
|
52
|
+
|
|
53
|
+
# Transform columns
|
|
54
|
+
result = additory.add.transform(df, mode='onehotencoding', columns=['category'])
|
|
55
|
+
|
|
56
|
+
# Filter data
|
|
57
|
+
result = additory.add.snapshot(df, where='age > 18')
|
|
58
|
+
|
|
59
|
+
# Generate synthetic data
|
|
60
|
+
result = additory.add.synthetic(df, rows=1000)
|
|
61
|
+
|
|
62
|
+
# Analyze data
|
|
63
|
+
result = additory.add.analyze(df, preset='quick')
|
|
64
|
+
|
|
65
|
+
# Evaluate expressions
|
|
66
|
+
result = additory.add.expressions(df, 'inbuilt:bmi')
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Features
|
|
70
|
+
|
|
71
|
+
- **Blazing Fast**: Built on Polars for maximum performance
|
|
72
|
+
- **Simple API**: Clean, intuitive API with `additory.add.function()` pattern
|
|
73
|
+
- **Flexible**: Works with Polars, pandas, and cuDF
|
|
74
|
+
- **Comprehensive**: 6 main functions covering all data augmentation needs
|
|
75
|
+
- **Well Tested**: 1,023 tests with 90% coverage
|
|
76
|
+
|
|
77
|
+
## Documentation
|
|
78
|
+
|
|
79
|
+
Visit [https://additory.dev](https://additory.dev) for full documentation.
|
|
80
|
+
|
|
81
|
+
## License
|
|
82
|
+
|
|
83
|
+
MIT
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# Additory v0.1.1a1
|
|
2
|
+
|
|
3
|
+
Data augmentation library with Polars backend.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install additory
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Quick Start
|
|
12
|
+
|
|
13
|
+
```python
|
|
14
|
+
import additory
|
|
15
|
+
import polars as pl
|
|
16
|
+
|
|
17
|
+
# Add columns from reference DataFrame
|
|
18
|
+
result = additory.add.to(df, reference_df, on='id', bring='price')
|
|
19
|
+
|
|
20
|
+
# Transform columns
|
|
21
|
+
result = additory.add.transform(df, mode='onehotencoding', columns=['category'])
|
|
22
|
+
|
|
23
|
+
# Filter data
|
|
24
|
+
result = additory.add.snapshot(df, where='age > 18')
|
|
25
|
+
|
|
26
|
+
# Generate synthetic data
|
|
27
|
+
result = additory.add.synthetic(df, rows=1000)
|
|
28
|
+
|
|
29
|
+
# Analyze data
|
|
30
|
+
result = additory.add.analyze(df, preset='quick')
|
|
31
|
+
|
|
32
|
+
# Evaluate expressions
|
|
33
|
+
result = additory.add.expressions(df, 'inbuilt:bmi')
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Features
|
|
37
|
+
|
|
38
|
+
- **Blazing Fast**: Built on Polars for maximum performance
|
|
39
|
+
- **Simple API**: Clean, intuitive API with `additory.add.function()` pattern
|
|
40
|
+
- **Flexible**: Works with Polars, pandas, and cuDF
|
|
41
|
+
- **Comprehensive**: 6 main functions covering all data augmentation needs
|
|
42
|
+
- **Well Tested**: 1,023 tests with 90% coverage
|
|
43
|
+
|
|
44
|
+
## Documentation
|
|
45
|
+
|
|
46
|
+
Visit [https://additory.dev](https://additory.dev) for full documentation.
|
|
47
|
+
|
|
48
|
+
## License
|
|
49
|
+
|
|
50
|
+
MIT
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Additory v0.1.1 - Data Augmentation Library
|
|
3
|
+
|
|
4
|
+
A Polars-first data augmentation library with 6 main functions:
|
|
5
|
+
- to: Add columns from other DataFrames
|
|
6
|
+
- transform: Transform columns (transpose, encode, extract, etc.)
|
|
7
|
+
- snapshot: Filter and select data
|
|
8
|
+
- synthetic: Generate synthetic data
|
|
9
|
+
- analyze: Analyze data quality and patterns
|
|
10
|
+
- expressions: Evaluate expressions and add computed columns
|
|
11
|
+
|
|
12
|
+
Usage:
|
|
13
|
+
import additory
|
|
14
|
+
|
|
15
|
+
# Add columns
|
|
16
|
+
result = additory.add.to(df, reference_df, on='id', bring='price')
|
|
17
|
+
|
|
18
|
+
# Transform columns
|
|
19
|
+
result = additory.add.transform(df, mode='onehotencoding', columns=['category'])
|
|
20
|
+
|
|
21
|
+
# Filter data
|
|
22
|
+
result = additory.add.snapshot(df, where='age > 18')
|
|
23
|
+
|
|
24
|
+
# Generate synthetic data
|
|
25
|
+
result = additory.add.synthetic(df, rows=1000)
|
|
26
|
+
|
|
27
|
+
# Analyze data
|
|
28
|
+
result = additory.add.analyze(df, preset='quick')
|
|
29
|
+
|
|
30
|
+
# Evaluate expressions
|
|
31
|
+
result = additory.add.expressions(df, 'inbuilt:bmi', 'age * 12')
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
from types import SimpleNamespace
|
|
35
|
+
|
|
36
|
+
# Import main functions
|
|
37
|
+
from additory.functions.to import to
|
|
38
|
+
from additory.functions.transform import transform
|
|
39
|
+
from additory.functions.snapshot import snapshot
|
|
40
|
+
from additory.functions.synthetic import synthetic
|
|
41
|
+
from additory.functions.analyze import analyze
|
|
42
|
+
from additory.functions.expressions import expressions
|
|
43
|
+
|
|
44
|
+
# Import configuration functions
|
|
45
|
+
from additory.core.config import set_expressions_folder, set_default_backend
|
|
46
|
+
|
|
47
|
+
# Create simple API namespace
|
|
48
|
+
add = SimpleNamespace(
|
|
49
|
+
to=to,
|
|
50
|
+
transform=transform,
|
|
51
|
+
snapshot=snapshot,
|
|
52
|
+
synthetic=synthetic,
|
|
53
|
+
analyze=analyze,
|
|
54
|
+
expressions=expressions,
|
|
55
|
+
set_expressions_folder=set_expressions_folder,
|
|
56
|
+
set_default_backend=set_default_backend
|
|
57
|
+
)
|
|
58
|
+
|
|
59
|
+
# Version
|
|
60
|
+
__version__ = "0.1.1a1"
|
|
61
|
+
|
|
62
|
+
# Public API
|
|
63
|
+
__all__ = ['add', '__version__']
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Common utilities for Additory.
|
|
3
|
+
|
|
4
|
+
This module provides shared utilities used across all functions:
|
|
5
|
+
- validation: Input validation
|
|
6
|
+
- strategy_parser: Strategy parsing
|
|
7
|
+
- column_selector: Column selection
|
|
8
|
+
- result: Result wrappers
|
|
9
|
+
- extractors: Feature extractors
|
|
10
|
+
- unit_conversions: Unit conversion utilities
|
|
11
|
+
- knn_imputation: KNN imputation
|
|
12
|
+
- distributions: Distribution generation
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from .validation import (
|
|
16
|
+
validate_dataframe,
|
|
17
|
+
validate_not_empty,
|
|
18
|
+
validate_column_name,
|
|
19
|
+
validate_positive_integer,
|
|
20
|
+
validate_percentage,
|
|
21
|
+
validate_mode,
|
|
22
|
+
validate_dict,
|
|
23
|
+
validate_list,
|
|
24
|
+
validate_string,
|
|
25
|
+
validate_boolean,
|
|
26
|
+
validate_optional
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
__all__ = [
|
|
30
|
+
'validate_dataframe',
|
|
31
|
+
'validate_not_empty',
|
|
32
|
+
'validate_column_name',
|
|
33
|
+
'validate_positive_integer',
|
|
34
|
+
'validate_percentage',
|
|
35
|
+
'validate_mode',
|
|
36
|
+
'validate_dict',
|
|
37
|
+
'validate_list',
|
|
38
|
+
'validate_string',
|
|
39
|
+
'validate_boolean',
|
|
40
|
+
'validate_optional'
|
|
41
|
+
]
|
|
@@ -0,0 +1,255 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Column selection and validation utilities for Additory.
|
|
3
|
+
|
|
4
|
+
Provides pattern matching and type-based column selection.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import re
|
|
8
|
+
from typing import Any, List, Optional, Union
|
|
9
|
+
import polars as pl
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def select_columns(df: pl.DataFrame, columns: Union[str, List[str], None]) -> List[str]:
    """
    Resolve a column specification into concrete column names of ``df``.

    Args:
        df: DataFrame whose columns are being selected
        columns: What to select:
            - None or '*': every column (``df.columns`` is returned)
            - str containing '*': wildcard pattern, tested against each
              column with match_pattern
            - str without '*': a single exact column name
            - list: names and/or wildcard patterns, expanded with
              expand_column_patterns and checked with validate_columns_exist

    Returns:
        List of selected column names

    Raises:
        ValueError: If a pattern matches no column or a named column is missing
        TypeError: If columns is not None, a str, or a list

    Example:
        # Select all columns
        cols = select_columns(df, '*')

        # Select by pattern
        cols = select_columns(df, 'age_*')

        # Select specific columns
        cols = select_columns(df, ['name', 'email', 'age'])
    """
    # "Everything" can be spelled either as None or as a bare '*'.
    if columns is None or (isinstance(columns, str) and columns == '*'):
        return df.columns

    # A list may mix exact names and wildcard patterns.
    if isinstance(columns, list):
        resolved = expand_column_patterns(df, columns)
        validate_columns_exist(df, resolved)
        return resolved

    if not isinstance(columns, str):
        raise TypeError(
            f"columns must be None, str, or list, got {type(columns).__name__}"
        )

    # Single string without a wildcard: must name an existing column.
    if '*' not in columns:
        if columns in df.columns:
            return [columns]
        raise ValueError(f"Column '{columns}' not found in DataFrame")

    # Single wildcard pattern: keep DataFrame column order.
    hits = [name for name in df.columns if match_pattern(name, columns)]
    if hits:
        return hits
    raise ValueError(f"No columns match pattern '{columns}'")
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def match_pattern(column_name: str, pattern: str) -> bool:
    """
    Check if column name matches pattern.

    '*' is the only wildcard and stands for any run of characters
    (including none and including newlines). Every other character in the
    pattern is taken literally, and the pattern must cover the whole name.

    Args:
        column_name: Column name to check
        pattern: Pattern to match ('*', 'prefix_*', '*_suffix', 'exact')

    Returns:
        True if matches, False otherwise

    Example:
        match_pattern('age_years', 'age_*')  # True
        match_pattern('total_age', '*_age')  # True
        match_pattern('age', 'age')  # True
    """
    # Fast paths: identical strings, or the match-everything wildcard.
    if pattern == column_name or pattern == '*':
        return True

    # Escape every regex metacharacter, then turn the escaped '*' back
    # into "any run of characters".
    regex_pattern = re.escape(pattern).replace(r'\*', '.*')

    # fullmatch instead of match + '^...$': '$' also matches just before a
    # trailing newline, which would let 'name\n' match the pattern 'name'
    # suffix. DOTALL lets '*' span newlines inside a column name.
    return re.fullmatch(regex_pattern, column_name, flags=re.DOTALL) is not None
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def validate_columns_exist(df: pl.DataFrame, columns: List[str]) -> bool:
    """
    Ensure every requested column is present in the DataFrame.

    Args:
        df: DataFrame to check
        columns: List of column names to validate

    Returns:
        True when no requested column is missing

    Raises:
        ValueError: If any columns are missing. A single missing column is
            reported by name; several are reported together with the
            available columns.

    Example:
        validate_columns_exist(df, ['name', 'age', 'email'])
    """
    # Collect the missing names in the order they were requested.
    absent = [name for name in columns if name not in df.columns]

    if not absent:
        return True

    if len(absent) == 1:
        raise ValueError(f"Column '{absent[0]}' not found in DataFrame")

    raise ValueError(
        f"Columns {absent} not found in DataFrame. "
        f"Available columns: {df.columns}"
    )
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def expand_column_patterns(df: pl.DataFrame, patterns: List[str]) -> List[str]:
    """
    Turn a list of names and wildcard patterns into concrete column names.

    Entries containing '*' are matched against the DataFrame's columns with
    match_pattern. Entries without '*' are passed through unchanged and are
    not checked for existence here.

    Args:
        df: DataFrame to expand patterns from
        patterns: List of patterns to expand

    Returns:
        List of expanded column names in first-seen order (no duplicates)

    Raises:
        ValueError: If a wildcard pattern matches no column

    Example:
        # Input: ['age_*', 'total_*']
        # Output: ['age_years', 'age_months', 'total_sales', 'total_orders']
        cols = expand_column_patterns(df, ['age_*', 'total_*'])
    """
    # A dict keeps insertion order and ignores repeated keys, giving an
    # ordered, duplicate-free collection of names.
    ordered = {}

    for entry in patterns:
        if '*' not in entry:
            # Exact column name: keep as given.
            ordered.setdefault(entry, None)
            continue

        hits = [name for name in df.columns if match_pattern(name, entry)]
        if not hits:
            raise ValueError(f"No columns match pattern '{entry}'")

        for name in hits:
            ordered.setdefault(name, None)

    return list(ordered)
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def get_column_type(df: pl.DataFrame, column: str) -> str:
    """
    Classify a column's Polars dtype into a coarse type category.

    Args:
        df: DataFrame containing the column
        column: Column name

    Returns:
        Type string ('numeric', 'string', 'datetime', 'boolean', 'other')

    Raises:
        ValueError: If column doesn't exist
    """
    if column not in df.columns:
        raise ValueError(f"Column '{column}' not found in DataFrame")

    dtype = df[column].dtype

    integer_and_float_types = (
        pl.Int8, pl.Int16, pl.Int32, pl.Int64,
        pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
        pl.Float32, pl.Float64,
    )
    text_types = (pl.Utf8, pl.Categorical)
    temporal_types = (pl.Date, pl.Time, pl.Duration)

    if dtype in integer_and_float_types:
        return 'numeric'

    if dtype in text_types:
        return 'string'

    # Datetime is checked with isinstance rather than membership
    # (presumably because the dtype carries time-unit/timezone info).
    if dtype in temporal_types or isinstance(dtype, pl.Datetime):
        return 'datetime'

    if dtype == pl.Boolean:
        return 'boolean'

    # Anything not recognised above.
    return 'other'
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def filter_columns_by_type(df: pl.DataFrame, columns: List[str], dtype: str) -> List[str]:
    """
    Keep only the columns whose type category equals ``dtype``.

    Args:
        df: DataFrame to filter columns from
        columns: List of columns to filter
        dtype: Type to filter by ('numeric', 'string', 'datetime', 'boolean')

    Returns:
        List of columns matching the type, in the order given

    Raises:
        ValueError: If a listed column is not in the DataFrame (raised by
            get_column_type)

    Example:
        numeric_cols = filter_columns_by_type(df, all_cols, 'numeric')
    """
    return [name for name in columns if get_column_type(df, name) == dtype]
|