additory 0.1.0a3__py3-none-any.whl → 0.1.1a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- additory/__init__.py +58 -14
- additory/common/__init__.py +31 -147
- additory/common/column_selector.py +255 -0
- additory/common/distributions.py +286 -613
- additory/common/extractors.py +313 -0
- additory/common/knn_imputation.py +332 -0
- additory/common/result.py +380 -0
- additory/common/strategy_parser.py +243 -0
- additory/common/unit_conversions.py +338 -0
- additory/common/validation.py +283 -103
- additory/core/__init__.py +34 -22
- additory/core/backend.py +258 -0
- additory/core/config.py +177 -305
- additory/core/logging.py +230 -24
- additory/core/memory_manager.py +157 -495
- additory/expressions/__init__.py +2 -23
- additory/expressions/compiler.py +457 -0
- additory/expressions/engine.py +264 -487
- additory/expressions/integrity.py +179 -0
- additory/expressions/loader.py +263 -0
- additory/expressions/parser.py +363 -167
- additory/expressions/resolver.py +274 -0
- additory/functions/__init__.py +1 -0
- additory/functions/analyze/__init__.py +144 -0
- additory/functions/analyze/cardinality.py +58 -0
- additory/functions/analyze/correlations.py +66 -0
- additory/functions/analyze/distributions.py +53 -0
- additory/functions/analyze/duplicates.py +49 -0
- additory/functions/analyze/features.py +61 -0
- additory/functions/analyze/imputation.py +66 -0
- additory/functions/analyze/outliers.py +65 -0
- additory/functions/analyze/patterns.py +65 -0
- additory/functions/analyze/presets.py +72 -0
- additory/functions/analyze/quality.py +59 -0
- additory/functions/analyze/timeseries.py +53 -0
- additory/functions/analyze/types.py +45 -0
- additory/functions/expressions/__init__.py +161 -0
- additory/functions/snapshot/__init__.py +82 -0
- additory/functions/snapshot/filter.py +119 -0
- additory/functions/synthetic/__init__.py +113 -0
- additory/functions/synthetic/mode_detector.py +47 -0
- additory/functions/synthetic/strategies/__init__.py +1 -0
- additory/functions/synthetic/strategies/advanced.py +35 -0
- additory/functions/synthetic/strategies/augmentative.py +160 -0
- additory/functions/synthetic/strategies/generative.py +168 -0
- additory/functions/synthetic/strategies/presets.py +116 -0
- additory/functions/to/__init__.py +188 -0
- additory/functions/to/lookup.py +351 -0
- additory/functions/to/merge.py +189 -0
- additory/functions/to/sort.py +91 -0
- additory/functions/to/summarize.py +170 -0
- additory/functions/transform/__init__.py +140 -0
- additory/functions/transform/datetime.py +79 -0
- additory/functions/transform/extract.py +85 -0
- additory/functions/transform/harmonize.py +105 -0
- additory/functions/transform/knn.py +62 -0
- additory/functions/transform/onehotencoding.py +68 -0
- additory/functions/transform/transpose.py +42 -0
- additory-0.1.1a1.dist-info/METADATA +83 -0
- additory-0.1.1a1.dist-info/RECORD +62 -0
- additory/analysis/__init__.py +0 -48
- additory/analysis/cardinality.py +0 -126
- additory/analysis/correlations.py +0 -124
- additory/analysis/distributions.py +0 -376
- additory/analysis/quality.py +0 -158
- additory/analysis/scan.py +0 -400
- additory/common/backend.py +0 -371
- additory/common/column_utils.py +0 -191
- additory/common/exceptions.py +0 -62
- additory/common/lists.py +0 -229
- additory/common/patterns.py +0 -240
- additory/common/resolver.py +0 -567
- additory/common/sample_data.py +0 -182
- additory/core/ast_builder.py +0 -165
- additory/core/backends/__init__.py +0 -23
- additory/core/backends/arrow_bridge.py +0 -483
- additory/core/backends/cudf_bridge.py +0 -355
- additory/core/column_positioning.py +0 -358
- additory/core/compiler_polars.py +0 -166
- additory/core/enhanced_cache_manager.py +0 -1119
- additory/core/enhanced_matchers.py +0 -473
- additory/core/enhanced_version_manager.py +0 -325
- additory/core/executor.py +0 -59
- additory/core/integrity_manager.py +0 -477
- additory/core/loader.py +0 -190
- additory/core/namespace_manager.py +0 -657
- additory/core/parser.py +0 -176
- additory/core/polars_expression_engine.py +0 -601
- additory/core/registry.py +0 -176
- additory/core/sample_data_manager.py +0 -492
- additory/core/user_namespace.py +0 -751
- additory/core/validator.py +0 -27
- additory/dynamic_api.py +0 -304
- additory/expressions/proxy.py +0 -549
- additory/expressions/registry.py +0 -313
- additory/expressions/samples.py +0 -492
- additory/synthetic/__init__.py +0 -13
- additory/synthetic/column_name_resolver.py +0 -149
- additory/synthetic/distributions.py +0 -22
- additory/synthetic/forecast.py +0 -1132
- additory/synthetic/linked_list_parser.py +0 -415
- additory/synthetic/namespace_lookup.py +0 -129
- additory/synthetic/smote.py +0 -320
- additory/synthetic/strategies.py +0 -850
- additory/synthetic/synthesizer.py +0 -713
- additory/utilities/__init__.py +0 -53
- additory/utilities/encoding.py +0 -600
- additory/utilities/games.py +0 -300
- additory/utilities/keys.py +0 -8
- additory/utilities/lookup.py +0 -103
- additory/utilities/matchers.py +0 -216
- additory/utilities/resolvers.py +0 -286
- additory/utilities/settings.py +0 -167
- additory/utilities/units.py +0 -749
- additory/utilities/validators.py +0 -153
- additory-0.1.0a3.dist-info/METADATA +0 -288
- additory-0.1.0a3.dist-info/RECORD +0 -71
- additory-0.1.0a3.dist-info/licenses/LICENSE +0 -21
- {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
- {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Generate synthetic data from scratch.
|
|
3
|
+
|
|
4
|
+
This module provides functionality to create synthetic data using various strategies.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import polars as pl
|
|
8
|
+
from typing import Dict, Any
|
|
9
|
+
import numpy as np
|
|
10
|
+
|
|
11
|
+
from additory.common.distributions import (
|
|
12
|
+
generate_normal, generate_uniform, generate_correlated
|
|
13
|
+
)
|
|
14
|
+
from additory.core.logging import Logger
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def generate_data(n_rows: int, strategy: Dict[str, str]) -> pl.DataFrame:
    """
    Generate synthetic data from scratch.

    Each entry of ``strategy`` produces one column: the key is the column
    name and the value is a strategy string handed to ``generate_column``.

    Args:
        n_rows: Number of rows to generate; must be positive
        strategy: Dictionary mapping column to generation strategy

    Returns:
        DataFrame with synthetic data, one column per strategy entry

    Raises:
        ValueError: If ``strategy`` is empty (or None) or ``n_rows`` is not
            positive

    Example:
        >>> result = generate_data(n_rows=1000, strategy={
        ...     'id': 'increment:start=1',
        ...     'age': 'range:18-65',
        ...     'status': 'choice:[Active,Inactive,Pending]'
        ... })
    """
    logger = Logger()

    # Validate before logging: the log message calls len(strategy), which
    # would raise TypeError for None and hide the intended ValueError.
    if not strategy:
        raise ValueError("strategy dictionary cannot be empty")

    if n_rows <= 0:
        raise ValueError(f"n_rows must be positive, got {n_rows}")

    logger.info(f"Generating {n_rows} rows with {len(strategy)} columns")

    # Generate each column
    columns = {}
    for col_name, strategy_value in strategy.items():
        logger.info(f"Generating column: {col_name}")
        columns[col_name] = generate_column(n_rows, strategy_value)

    # Create DataFrame
    result = pl.DataFrame(columns)

    logger.info(f"Generated {len(result)} rows × {len(result.columns)} columns")

    return result
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def generate_column(n_rows: int, strategy_value: str) -> pl.Series:
    """
    Build one synthetic column from a strategy string.

    The string is parsed with ``parse_strategy_value`` and the resulting
    strategy type selects the generator.

    Args:
        n_rows: Number of values to produce
        strategy_value: Strategy string, e.g. 'range:18-65'

    Returns:
        Series with generated data

    Raises:
        ValueError: If the strategy type is not one of the supported ones
        KeyError: If a required parameter for the strategy was not parsed

    Supported strategies:
        - increment:start=1:step=1   (both parameters default to 1)
        - range:18-65                (integers, upper bound included)
        - choice:[A,B,C]             (optional 'weights' used when present)
        - normal:mean=50:std=10
        - uniform:low=0:high=100
    """
    strategy_type, options = parse_strategy_value(strategy_value)

    if strategy_type == 'increment':
        first = options.get('start', 1)
        stride = options.get('step', 1)
        stop = first + n_rows * stride
        return pl.Series(range(first, stop, stride))

    if strategy_type == 'range':
        lower, upper = options['range']
        # randint excludes its upper argument, so add one to include `upper`.
        return pl.Series(np.random.randint(lower, upper + 1, n_rows))

    if strategy_type == 'choice':
        pool = options['choices']
        probabilities = options.get('weights')
        if not probabilities:
            return pl.Series(np.random.choice(pool, n_rows))
        return pl.Series(np.random.choice(pool, n_rows, p=probabilities))

    if strategy_type == 'normal':
        return generate_normal(n_rows, options['mean'], options['std'])

    if strategy_type == 'uniform':
        return generate_uniform(n_rows, options['low'], options['high'])

    raise ValueError(f"Unsupported strategy type: {strategy_type}")
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def parse_strategy_value(strategy_value: str) -> tuple[str, Dict[str, Any]]:
    """
    Parse strategy string into type and parameters.

    The string is split on ':'. The first field is the strategy type; the
    remaining fields are interpreted according to that type:

    - increment: ``key=value`` fields, values converted to int
    - range: one ``low-high`` field of integers (either bound may be negative,
      e.g. ``range:-10-10`` or ``range:-20--5``)
    - choice: one ``[a,b,c]`` field, split on commas with whitespace trimmed
    - normal / uniform: ``key=value`` fields, values converted to float

    Any other strategy type is returned unchanged with empty parameters.

    Args:
        strategy_value: Strategy string

    Returns:
        Tuple of (strategy_type, parameters)

    Raises:
        ValueError: If a numeric value cannot be converted or a range field
            is not of the form ``low-high``

    Examples:
        >>> parse_strategy_value('increment:start=1:step=2')
        ('increment', {'start': 1, 'step': 2})

        >>> parse_strategy_value('range:18-65')
        ('range', {'range': (18, 65)})

        >>> parse_strategy_value('range:-10-10')
        ('range', {'range': (-10, 10)})

        >>> parse_strategy_value('choice:[A,B,C]')
        ('choice', {'choices': ['A', 'B', 'C']})
    """
    parts = strategy_value.split(':')
    strategy_type = parts[0]
    params: Dict[str, Any] = {}

    if strategy_type == 'increment':
        params = _parse_key_value_fields(parts[1:], int)

    elif strategy_type == 'range':
        if len(parts) > 1:
            params['range'] = _parse_int_range(parts[1])

    elif strategy_type == 'choice':
        if len(parts) > 1:
            # Remove surrounding brackets, then split by comma
            choices_str = parts[1].strip('[]')
            params['choices'] = [c.strip() for c in choices_str.split(',')]

    elif strategy_type in ('normal', 'uniform'):
        params = _parse_key_value_fields(parts[1:], float)

    return strategy_type, params


def _parse_key_value_fields(fields, cast) -> Dict[str, Any]:
    """Convert 'key=value' fields to a dict, applying ``cast`` to each value."""
    parsed: Dict[str, Any] = {}
    for field in fields:
        # Fields without '=' are ignored.
        if '=' in field:
            key, value = field.split('=')
            parsed[key] = cast(value)
    return parsed


def _parse_int_range(range_str: str) -> tuple[int, int]:
    """Parse 'low-high' into an (int, int) pair, allowing negative bounds."""
    text = range_str.strip()
    # Search from index 1 so a leading '-' is read as the sign of the lower
    # bound rather than as the separator.
    sep = text.find('-', 1)
    if sep == -1:
        raise ValueError(
            f"Invalid range '{range_str}': expected the form 'low-high'"
        )
    return int(text[:sep]), int(text[sep + 1:])
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Preset configurations for common synthetic data types.
|
|
3
|
+
|
|
4
|
+
This module provides preset configurations for generating common data types.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import polars as pl
|
|
8
|
+
from typing import Dict, List
|
|
9
|
+
|
|
10
|
+
from additory.functions.synthetic.strategies.generative import generate_data
|
|
11
|
+
from additory.core.logging import Logger
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Define presets
|
|
15
|
+
# Built-in presets: preset name -> {column name -> strategy string}.
# apply_preset() passes the selected mapping to generate_data(), and
# list_presets() returns the names in the insertion order used here.
PRESETS: Dict[str, Dict[str, str]] = {
    # Incrementing id, integer age, and two categorical columns.
    'users': {
        'id': 'increment:start=1',
        'age': 'range:18-75',
        'status': 'choice:[Active,Inactive,Pending]',
        'country': 'choice:[USA,UK,Canada,Australia,Germany]'
    },
    # Ids start at 1000; amount uses the 'uniform' strategy.
    'transactions': {
        'transaction_id': 'increment:start=1000',
        'amount': 'uniform:low=10:high=1000',
        'currency': 'choice:[USD,EUR,GBP]',
        'status': 'choice:[completed,pending,failed]'
    },
    # Price uses the 'uniform' strategy; stock is an integer range.
    'products': {
        'product_id': 'increment:start=1',
        'price': 'uniform:low=5:high=500',
        'stock': 'range:0-1000',
        'category': 'choice:[Electronics,Clothing,Food,Books]'
    },
    # Single normally distributed column; no timestamp column is defined.
    'timeseries': {
        'value': 'normal:mean=100:std=20'
    },
    # Weight and height use the 'normal' strategy.
    'medical': {
        'patient_id': 'increment:start=1',
        'age': 'range:18-90',
        'weight': 'normal:mean=70:std=15',
        'height': 'normal:mean=170:std=10'
    }
}
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def apply_preset(preset_name: str, n_rows: int) -> pl.DataFrame:
    """
    Generate a DataFrame from one of the named presets.

    Looks up the preset's column strategies and passes them to
    ``generate_data``.

    Args:
        preset_name: Key into ``PRESETS``
        n_rows: Number of rows to generate

    Returns:
        DataFrame with synthetic data

    Raises:
        ValueError: If ``preset_name`` is not a known preset; the message
            lists the available preset names

    Example:
        >>> result = apply_preset('users', n_rows=1000)
    """
    logger = Logger()

    # Reject unknown presets up front with a message naming the valid ones.
    if preset_name not in PRESETS:
        available = list_presets()
        message = (
            f"Preset '{preset_name}' not found. "
            f"Available presets: {', '.join(available)}"
        )
        raise ValueError(message)

    logger.info(f"Applying preset: {preset_name}")

    # Resolve the preset to its strategy mapping and generate the rows.
    result = generate_data(n_rows, get_preset_strategy(preset_name))

    logger.info(f"Preset '{preset_name}' applied: {len(result)} rows")
    return result
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def get_preset_strategy(preset_name: str) -> Dict[str, str]:
    """
    Return the column-strategy mapping registered for a preset.

    Args:
        preset_name: Key into ``PRESETS``

    Returns:
        A shallow copy of the preset's strategy dictionary, so changes made
        by the caller do not affect ``PRESETS``

    Raises:
        ValueError: If ``preset_name`` is not a known preset
    """
    if preset_name in PRESETS:
        registered = PRESETS[preset_name]
        return registered.copy()

    raise ValueError(f"Preset '{preset_name}' not found")
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def list_presets() -> List[str]:
    """
    Return the names of all registered presets.

    Returns:
        A new list of preset names, in the order they appear in ``PRESETS``

    Example:
        >>> presets = list_presets()
        >>> print(presets)
        ['users', 'transactions', 'products', 'timeseries', 'medical']
    """
    # Iterating a dict yields its keys in insertion order.
    return [name for name in PRESETS]
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Main to function - add columns to DataFrame.
|
|
3
|
+
|
|
4
|
+
This module provides the main user-facing to() function with multiple modes:
|
|
5
|
+
- lookup: Add columns from reference DataFrame
|
|
6
|
+
- summarize: Group and aggregate data
|
|
7
|
+
- merge: Merge multiple DataFrames
|
|
8
|
+
- sort: Sort DataFrame
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import polars as pl
|
|
12
|
+
from typing import Union, List, Optional, Dict, Any
|
|
13
|
+
|
|
14
|
+
from additory.core.backend import detect_backend, to_polars, from_polars
|
|
15
|
+
from additory.core.logging import Logger
|
|
16
|
+
from additory.core.memory_manager import MemoryManager
|
|
17
|
+
from additory.common.validation import validate_dataframe, validate_not_empty
|
|
18
|
+
from additory.common.result import wrap_result
|
|
19
|
+
|
|
20
|
+
from additory.functions.to.lookup import perform_lookup
|
|
21
|
+
from additory.functions.to.summarize import perform_summarize
|
|
22
|
+
from additory.functions.to.merge import perform_merge
|
|
23
|
+
from additory.functions.to.sort import perform_sort
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def to(
    df: Any,
    from_df: Optional[Any] = None,
    bring: Optional[Union[str, List[str]]] = None,
    against: Optional[Union[str, List[str]]] = None,
    on: Optional[Union[str, List[str]]] = None,  # Alias for 'against'
    to: Optional[str] = None,
    bring_at: Optional[str] = None,
    strategy: Optional[Dict] = None,
    **kwargs
) -> Any:
    """
    Add columns to DataFrame using various modes.

    The mode is chosen by ``detect_mode``. Inputs are converted to Polars,
    processed, and converted back to the backend detected from the input
    (for merge mode, from the first DataFrame in the list).

    Args:
        df: Input DataFrame (or list of DataFrames for merge mode)
        from_df: Reference DataFrame (for lookup mode)
        bring: Column(s) to bring (for lookup mode)
        against: Key column(s) for matching (for lookup mode)
        on: Alias for ``against``; only one of the two may be given
        to: Special mode indicator ('@summarize', '@merge', '@sort')
        bring_at: Position to insert columns
        strategy: Strategy dictionary for advanced control
        **kwargs: Mode-specific parameters, forwarded to the summarize,
            merge, or sort implementation

    Returns:
        DataFrame with added columns (wrapped in Result)

    Raises:
        ValueError: If both ``on`` and ``against`` are given, if the mode
            cannot be determined, or if merge mode receives an empty list
        TypeError: If merge mode is requested and ``df`` is not a list

    Modes:
        1. Lookup (default): Add columns from reference DataFrame
           - Triggered by: from_df is provided
           - Example: to(df, from_df=products, bring='price', against='product_id')

        2. Summarize: Group and aggregate data
           - Triggered by: to='@summarize'
           - Example: to(df, to='@summarize', group_by='category', aggregations={'sales': 'sum'})

        3. Merge: Merge multiple DataFrames
           - Triggered by: to='@merge'
           - Example: to([df1, df2, df3], to='@merge')

        4. Sort: Sort DataFrame
           - Triggered by: to='@sort'
           - Example: to(df, to='@sort', by='date', descending=True)
    """
    logger = Logger()
    memory_manager = MemoryManager()

    # Handle 'on' as alias for 'against'; supplying both is ambiguous.
    if on is not None:
        if against is not None:
            raise ValueError("Cannot specify both 'on' and 'against' parameters")
        against = on

    try:
        # Detect mode
        mode = detect_mode(df, from_df, to, **kwargs)
        logger.set_context('to', {'mode': mode})
        logger.info(f"Starting to() function in '{mode}' mode")

        if mode == 'merge':
            # Merge mode is special: df is a list of DataFrames.
            if not isinstance(df, list):
                raise TypeError("For merge mode, df must be a list of DataFrames")
            # An empty list would otherwise fail below with a bare
            # IndexError from df[0].
            if not df:
                raise ValueError(
                    "For merge mode, df must contain at least one DataFrame"
                )

            # The output backend follows the first DataFrame.
            backend = detect_backend(df[0])

            # Convert all to Polars
            polars_dfs = [to_polars(d) for d in df]

            # Perform merge
            result = perform_merge(polars_dfs, **kwargs)

        else:
            # Single DataFrame modes: validate the input first.
            validate_dataframe(df)
            validate_not_empty(df)

            # Detect backend and convert to Polars
            backend = detect_backend(df)
            polars_df = to_polars(df)

            # Dispatch to appropriate mode
            if mode == 'summarize':
                result = perform_summarize(polars_df, **kwargs)
            elif mode == 'sort':
                result = perform_sort(polars_df, **kwargs)
            elif mode == 'lookup':
                # Convert from_df to Polars
                polars_from_df = to_polars(from_df)
                result = perform_lookup(
                    polars_df,
                    polars_from_df,
                    bring,
                    against,
                    bring_at,
                    strategy
                )
            else:
                raise ValueError(f"Unknown mode: {mode}")

        # Convert back to original backend
        result = from_polars(result, backend)

        # Cleanup
        memory_manager.cleanup()

        # Wrap result
        logger.info(f"to() function complete: {len(result)} rows, {len(result.columns)} columns")
        return wrap_result(result, 'to', metadata={'mode': mode})

    except Exception as e:
        # Log every failure, then re-raise it unchanged to the caller.
        logger.error(f"Error in to() function: {str(e)}", error_location="to")
        raise
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def detect_mode(
    df: Any,
    from_df: Optional[Any],
    to: Optional[str],
    **kwargs
) -> str:
    """
    Work out which to() mode the given arguments select.

    An explicit ``to`` marker takes precedence; otherwise providing
    ``from_df`` selects lookup mode. ``df`` and ``**kwargs`` are accepted
    but not inspected.

    Args:
        df: Input DataFrame
        from_df: Reference DataFrame
        to: Mode indicator ('@summarize', '@merge', '@sort')
        **kwargs: Additional parameters

    Returns:
        Mode string ('lookup', 'summarize', 'merge', 'sort')

    Raises:
        ValueError: If mode cannot be determined
    """
    # Explicit markers, checked in this order before anything else.
    explicit_modes = (
        ('@summarize', 'summarize'),
        ('@merge', 'merge'),
        ('@sort', 'sort'),
    )
    for marker, mode_name in explicit_modes:
        if to == marker:
            return mode_name

    # No recognized marker: a reference DataFrame means lookup.
    if from_df is not None:
        return 'lookup'

    raise ValueError(
        "Cannot determine mode. Please provide either:\n"
        " - from_df (for lookup mode)\n"
        " - to='@summarize' (for summarize mode)\n"
        " - to='@merge' (for merge mode)\n"
        " - to='@sort' (for sort mode)"
    )
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
# Public API of this module: only to() is exported by a star-import.
__all__ = ['to']
|