additory-0.1.0a3-py3-none-any.whl → additory-0.1.1a1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- additory/__init__.py +58 -14
- additory/common/__init__.py +31 -147
- additory/common/column_selector.py +255 -0
- additory/common/distributions.py +286 -613
- additory/common/extractors.py +313 -0
- additory/common/knn_imputation.py +332 -0
- additory/common/result.py +380 -0
- additory/common/strategy_parser.py +243 -0
- additory/common/unit_conversions.py +338 -0
- additory/common/validation.py +283 -103
- additory/core/__init__.py +34 -22
- additory/core/backend.py +258 -0
- additory/core/config.py +177 -305
- additory/core/logging.py +230 -24
- additory/core/memory_manager.py +157 -495
- additory/expressions/__init__.py +2 -23
- additory/expressions/compiler.py +457 -0
- additory/expressions/engine.py +264 -487
- additory/expressions/integrity.py +179 -0
- additory/expressions/loader.py +263 -0
- additory/expressions/parser.py +363 -167
- additory/expressions/resolver.py +274 -0
- additory/functions/__init__.py +1 -0
- additory/functions/analyze/__init__.py +144 -0
- additory/functions/analyze/cardinality.py +58 -0
- additory/functions/analyze/correlations.py +66 -0
- additory/functions/analyze/distributions.py +53 -0
- additory/functions/analyze/duplicates.py +49 -0
- additory/functions/analyze/features.py +61 -0
- additory/functions/analyze/imputation.py +66 -0
- additory/functions/analyze/outliers.py +65 -0
- additory/functions/analyze/patterns.py +65 -0
- additory/functions/analyze/presets.py +72 -0
- additory/functions/analyze/quality.py +59 -0
- additory/functions/analyze/timeseries.py +53 -0
- additory/functions/analyze/types.py +45 -0
- additory/functions/expressions/__init__.py +161 -0
- additory/functions/snapshot/__init__.py +82 -0
- additory/functions/snapshot/filter.py +119 -0
- additory/functions/synthetic/__init__.py +113 -0
- additory/functions/synthetic/mode_detector.py +47 -0
- additory/functions/synthetic/strategies/__init__.py +1 -0
- additory/functions/synthetic/strategies/advanced.py +35 -0
- additory/functions/synthetic/strategies/augmentative.py +160 -0
- additory/functions/synthetic/strategies/generative.py +168 -0
- additory/functions/synthetic/strategies/presets.py +116 -0
- additory/functions/to/__init__.py +188 -0
- additory/functions/to/lookup.py +351 -0
- additory/functions/to/merge.py +189 -0
- additory/functions/to/sort.py +91 -0
- additory/functions/to/summarize.py +170 -0
- additory/functions/transform/__init__.py +140 -0
- additory/functions/transform/datetime.py +79 -0
- additory/functions/transform/extract.py +85 -0
- additory/functions/transform/harmonize.py +105 -0
- additory/functions/transform/knn.py +62 -0
- additory/functions/transform/onehotencoding.py +68 -0
- additory/functions/transform/transpose.py +42 -0
- additory-0.1.1a1.dist-info/METADATA +83 -0
- additory-0.1.1a1.dist-info/RECORD +62 -0
- additory/analysis/__init__.py +0 -48
- additory/analysis/cardinality.py +0 -126
- additory/analysis/correlations.py +0 -124
- additory/analysis/distributions.py +0 -376
- additory/analysis/quality.py +0 -158
- additory/analysis/scan.py +0 -400
- additory/common/backend.py +0 -371
- additory/common/column_utils.py +0 -191
- additory/common/exceptions.py +0 -62
- additory/common/lists.py +0 -229
- additory/common/patterns.py +0 -240
- additory/common/resolver.py +0 -567
- additory/common/sample_data.py +0 -182
- additory/core/ast_builder.py +0 -165
- additory/core/backends/__init__.py +0 -23
- additory/core/backends/arrow_bridge.py +0 -483
- additory/core/backends/cudf_bridge.py +0 -355
- additory/core/column_positioning.py +0 -358
- additory/core/compiler_polars.py +0 -166
- additory/core/enhanced_cache_manager.py +0 -1119
- additory/core/enhanced_matchers.py +0 -473
- additory/core/enhanced_version_manager.py +0 -325
- additory/core/executor.py +0 -59
- additory/core/integrity_manager.py +0 -477
- additory/core/loader.py +0 -190
- additory/core/namespace_manager.py +0 -657
- additory/core/parser.py +0 -176
- additory/core/polars_expression_engine.py +0 -601
- additory/core/registry.py +0 -176
- additory/core/sample_data_manager.py +0 -492
- additory/core/user_namespace.py +0 -751
- additory/core/validator.py +0 -27
- additory/dynamic_api.py +0 -304
- additory/expressions/proxy.py +0 -549
- additory/expressions/registry.py +0 -313
- additory/expressions/samples.py +0 -492
- additory/synthetic/__init__.py +0 -13
- additory/synthetic/column_name_resolver.py +0 -149
- additory/synthetic/distributions.py +0 -22
- additory/synthetic/forecast.py +0 -1132
- additory/synthetic/linked_list_parser.py +0 -415
- additory/synthetic/namespace_lookup.py +0 -129
- additory/synthetic/smote.py +0 -320
- additory/synthetic/strategies.py +0 -850
- additory/synthetic/synthesizer.py +0 -713
- additory/utilities/__init__.py +0 -53
- additory/utilities/encoding.py +0 -600
- additory/utilities/games.py +0 -300
- additory/utilities/keys.py +0 -8
- additory/utilities/lookup.py +0 -103
- additory/utilities/matchers.py +0 -216
- additory/utilities/resolvers.py +0 -286
- additory/utilities/settings.py +0 -167
- additory/utilities/units.py +0 -749
- additory/utilities/validators.py +0 -153
- additory-0.1.0a3.dist-info/METADATA +0 -288
- additory-0.1.0a3.dist-info/RECORD +0 -71
- additory-0.1.0a3.dist-info/licenses/LICENSE +0 -21
- {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
- {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Snapshot function - Filter and select data.
|
|
3
|
+
|
|
4
|
+
This module provides the main snapshot function for filtering and selecting data.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import polars as pl
|
|
8
|
+
from typing import List, Optional
|
|
9
|
+
|
|
10
|
+
from additory.common.validation import validate_dataframe, validate_not_empty
|
|
11
|
+
from additory.common.result import wrap_result
|
|
12
|
+
from additory.core.backend import detect_backend, to_polars, from_polars
|
|
13
|
+
from additory.core.logging import Logger
|
|
14
|
+
from additory.functions.snapshot.filter import apply_filter, select_columns
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def snapshot(
    df,
    where: Optional[str] = None,
    columns: Optional[List[str]] = None
):
    """
    Filter and select data from a DataFrame.

    Args:
        df: Input DataFrame (Polars, pandas, or cuDF)
        where: Optional SQL-like WHERE clause used to filter rows
        columns: Optional list of column names to keep

    Returns:
        DataFrameResult wrapping the filtered/selected data

    Examples:
        >>> # Simple filter
        >>> result = snapshot(df, where='age > 18')

        >>> # Filter + select
        >>> result = snapshot(df,
        ...                   where='age > 18 AND status == "active"',
        ...                   columns=['name', 'email', 'age'])

        >>> # Complex conditions
        >>> result = snapshot(df,
        ...                   where='(age >= 18 AND status == "active") OR country == "USA"')
    """
    logger = Logger()
    logger.info("[snapshot] Starting snapshot() function")

    try:
        # Validate the input, then normalize to Polars for processing.
        validate_dataframe(df)
        validate_not_empty(df)
        source_backend = detect_backend(df)
        frame = to_polars(df)

        in_rows, in_cols = frame.shape
        logger.info(f"Input: {in_rows} rows × {in_cols} columns")

        # Row filter first, then column projection.
        if where:
            frame = apply_filter(frame, where)
        if columns:
            frame = select_columns(frame, columns)

        # Hand the data back in the caller's original backend.
        output = from_polars(frame, source_backend)

        out_rows, out_cols = frame.shape
        logger.info(f"Output: {out_rows} rows × {out_cols} columns")
        logger.info("[snapshot] snapshot() function complete")

        return wrap_result(output, 'snapshot', metadata={
            'where': where,
            'columns': columns,
            'rows_filtered': len(frame)
        })

    except Exception as e:
        logger.error(f"Error in snapshot() function: {e}", error_location="snapshot")
        raise
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Filter and select data.
|
|
3
|
+
|
|
4
|
+
This module provides filtering functionality for the snapshot function.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import polars as pl
|
|
8
|
+
from typing import List, Optional
|
|
9
|
+
|
|
10
|
+
from additory.common.validation import validate_dataframe
|
|
11
|
+
from additory.common.column_selector import select_columns as select_cols
|
|
12
|
+
from additory.core.logging import Logger
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def apply_filter(df: pl.DataFrame, where: str) -> pl.DataFrame:
    """
    Filter a DataFrame using a SQL-like WHERE clause.

    Args:
        df: Input DataFrame
        where: Filter expression (SQL-like WHERE clause); an empty/None
            clause is a no-op

    Returns:
        DataFrame containing only the rows matching the clause

    Example:
        >>> result = apply_filter(df, 'age > 18 AND status == "active"')
    """
    logger = Logger()

    validate_dataframe(df)

    # No clause means nothing to do — return the input untouched.
    if not where:
        return df

    logger.info(f"Applying filter: {where}")

    predicate = parse_where_clause(df, where)
    filtered = df.filter(predicate)

    logger.info(f"Filter applied: {len(filtered)} rows remaining")

    return filtered
|
|
48
|
+
def parse_where_clause(df: pl.DataFrame, where: str) -> pl.Expr:
    """
    Parse a SQL-like WHERE clause into a Polars expression.

    Args:
        df: Input DataFrame (supplies the column names visible to the clause)
        where: Filter expression

    Returns:
        Polars boolean expression suitable for ``df.filter``

    Raises:
        ValueError: If the clause cannot be evaluated

    Supports:
        - Comparison: >, <, >=, <=, ==, !=
        - Logical: AND, OR, NOT
        - Parentheses for grouping

    Example:
        >>> expr = parse_where_clause(df, 'age > 18 AND status == "active"')
    """
    # Translate SQL-style logical operators into Polars operators.
    #
    # BUG FIX: Python's & and | bind TIGHTER than comparison operators, so a
    # naive substitution turned 'age > 18 AND status == "x"' into
    # 'age > 18 & status == "x"', which parses as 'age > (18 & status) == "x"'.
    # Wrapping each operand in parentheses during substitution (plus an outer
    # pair so nested clauses stay balanced) restores the intended grouping.
    #
    # NOTE(review): a string literal inside the clause that itself contains
    # ' AND ' / ' OR ' would also be rewritten — acceptable for the simple
    # clauses this parser targets; confirm against callers.
    expr_str = f"({where})"
    expr_str = expr_str.replace(' AND ', ') & (')
    expr_str = expr_str.replace(' OR ', ') | (')
    # '~' applies only to the following atom, so 'NOT x > 5' still parses as
    # '(~x) > 5' — same limitation as the original; use 'NOT (x > 5)'.
    expr_str = expr_str.replace(' NOT ', ' ~')

    try:
        # Build the evaluation namespace: each column name resolves to a
        # pl.col reference; 'pl' itself is exposed for literals/helpers.
        namespace = {col: pl.col(col) for col in df.columns}
        namespace['pl'] = pl

        # SECURITY: eval() on a caller-supplied string executes arbitrary
        # Python. Only use with trusted input; a real parser
        # (expressions.parser) should replace this.
        result = eval(expr_str, namespace)
        return result
    except Exception as e:
        raise ValueError(f"Invalid WHERE clause: {where}. Error: {e}") from e
90
|
+
def select_columns(df: pl.DataFrame, columns: List[str]) -> pl.DataFrame:
    """
    Select specified columns from a DataFrame.

    Args:
        df: Input DataFrame
        columns: Column names to keep; an empty/None list is a no-op

    Returns:
        DataFrame restricted to the requested columns

    Example:
        >>> result = select_columns(df, ['name', 'age', 'email'])
    """
    logger = Logger()

    validate_dataframe(df)

    # Nothing requested — return the input untouched.
    if not columns:
        return df

    logger.info(f"Selecting {len(columns)} columns")

    # Polars handles the projection (and raises on unknown columns).
    selected = df.select(columns)

    logger.info(f"Column selection complete")

    return selected
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Synthetic function - Generate synthetic data.
|
|
3
|
+
|
|
4
|
+
This module provides the main synthetic function for generating synthetic data
|
|
5
|
+
in three modes: augment, create, and preset.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import polars as pl
|
|
9
|
+
from typing import Dict, Optional, Union
|
|
10
|
+
|
|
11
|
+
from additory.common.validation import validate_dataframe, validate_not_empty
|
|
12
|
+
from additory.common.result import wrap_result
|
|
13
|
+
from additory.core.backend import detect_backend, to_polars, from_polars
|
|
14
|
+
from additory.core.logging import Logger
|
|
15
|
+
from additory.functions.synthetic.mode_detector import detect_mode
|
|
16
|
+
from additory.functions.synthetic.strategies.generative import generate_data
|
|
17
|
+
from additory.functions.synthetic.strategies.augmentative import augment_data
|
|
18
|
+
from additory.functions.synthetic.strategies.presets import apply_preset
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def synthetic(
    df_or_mode,
    n_rows: Optional[Union[int, str]] = None,
    strategy: Optional[Dict] = None,
    preset: Optional[str] = None,
    **kwargs
):
    """
    Generate synthetic data.

    Args:
        df_or_mode: DataFrame (augment mode) or '@new' (create mode)
        n_rows: Number of rows to generate (int, or '50%'-style string in
            augment mode)
        strategy: Strategy dictionary
        preset: Preset name
        **kwargs: Mode-specific parameters

    Returns:
        DataFrameResult with synthetic data

    Modes:
        - augment: Add synthetic rows to existing data
        - create: Create synthetic data from scratch
        - preset: Use preset configuration

    Examples:
        >>> # Augment mode
        >>> result = synthetic(df, n_rows=100)
        >>> result = synthetic(df, n_rows="50%")

        >>> # Create mode
        >>> result = synthetic('@new', n_rows=100, strategy={
        ...     'id': 'increment:start=1',
        ...     'age': 'range:18-65',
        ...     'status': 'choice:[Active,Inactive,Pending]'
        ... })

        >>> # Preset mode
        >>> result = synthetic('@new', n_rows=100, preset='users')
    """
    logger = Logger()
    logger.info("[synthetic] Starting synthetic() function")

    try:
        # Work out which of the three modes was requested.
        mode = detect_mode(df_or_mode, strategy, preset)
        logger.info(f"Mode detected: {mode}")

        if mode == 'augment':
            # Augment: validate the incoming frame, normalize to Polars,
            # remember the caller's backend for the round-trip.
            validate_dataframe(df_or_mode)
            validate_not_empty(df_or_mode)
            backend = detect_backend(df_or_mode)
            source = to_polars(df_or_mode)
            generated = augment_data(source, n_rows, strategy)
        elif mode == 'create':
            if not n_rows:
                raise ValueError("n_rows is required for create mode")
            generated = generate_data(n_rows, strategy)
            backend = 'polars'
        elif mode == 'preset':
            if not n_rows:
                raise ValueError("n_rows is required for preset mode")
            generated = apply_preset(preset, n_rows)
            backend = 'polars'
        else:
            raise ValueError(f"Unknown mode: {mode}")

        # Convert back to the caller's backend.
        output = from_polars(generated, backend)

        col_count = len(output.columns) if hasattr(output, 'columns') else 'N/A'
        logger.info(f"Output: {len(output)} rows × {col_count} columns")
        logger.info("[synthetic] synthetic() function complete")

        return wrap_result(output, 'synthetic', metadata={
            'mode': mode,
            'rows_generated': n_rows
        })

    except Exception as e:
        logger.error(f"Error in synthetic() function: {e}", error_location="synthetic")
        raise
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Mode detection for synthetic data generation.
|
|
3
|
+
|
|
4
|
+
This module detects which mode to use: augment, create, or preset.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from typing import Optional, Dict
|
|
8
|
+
from additory.common.validation import is_dataframe
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def detect_mode(df_or_mode, strategy: Optional[Dict], preset: Optional[str]) -> str:
    """
    Detect the synthetic data generation mode.

    Args:
        df_or_mode: DataFrame (augment mode) or '@new' (create mode)
        strategy: Strategy dictionary
        preset: Preset name

    Returns:
        Mode string: 'augment', 'create', or 'preset'

    Raises:
        ValueError: If the first parameter is neither a DataFrame nor '@new'

    Example:
        >>> mode = detect_mode(df, None, None)        # 'augment'
        >>> mode = detect_mode('@new', strategy, None)  # 'create'
        >>> mode = detect_mode('@new', None, 'users')   # 'preset'
    """
    # Guard clauses in priority order: preset wins, then a real DataFrame,
    # then the '@new' sentinel.
    if preset is not None:
        return 'preset'

    if is_dataframe(df_or_mode):
        return 'augment'

    if df_or_mode == '@new':
        return 'create'

    raise ValueError(
        "Invalid parameters for synthetic(). "
        "First parameter must be a DataFrame or '@new'"
    )
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Synthetic strategies module
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Advanced synthetic data generation strategies.
|
|
3
|
+
|
|
4
|
+
This module provides advanced strategies like SMOTE, correlations, etc.
|
|
5
|
+
(Placeholder for future implementation)
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import polars as pl
|
|
9
|
+
from typing import Dict
|
|
10
|
+
|
|
11
|
+
from additory.core.logging import Logger
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def apply_advanced_strategy(df: pl.DataFrame, strategy: Dict) -> pl.DataFrame:
    """
    Apply an advanced synthetic data strategy.

    Args:
        df: Input DataFrame
        strategy: Advanced strategy configuration (currently ignored)

    Returns:
        The input DataFrame unchanged — this is a stub.

    Note:
        Placeholder for future advanced strategies such as:
        - SMOTE (Synthetic Minority Over-sampling Technique)
        - Conditional generation
        - Time series forecasting
        - Correlation preservation
    """
    # Nothing implemented yet: warn and pass the data through untouched.
    log = Logger()
    log.warning("Advanced strategies not yet implemented")
    return df
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Augment existing data with synthetic rows.
|
|
3
|
+
|
|
4
|
+
This module provides functionality to add synthetic rows to existing data.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import polars as pl
|
|
8
|
+
from typing import Dict, Optional, Union
|
|
9
|
+
import numpy as np
|
|
10
|
+
|
|
11
|
+
from additory.common.validation import validate_dataframe, validate_not_empty
|
|
12
|
+
from additory.common.distributions import generate_normal, generate_uniform
|
|
13
|
+
from additory.core.logging import Logger
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def augment_data(
    df: pl.DataFrame,
    n_rows: Union[int, str],
    strategy: Optional[Dict] = None
) -> pl.DataFrame:
    """
    Add synthetic rows to existing data.

    Args:
        df: Input DataFrame
        n_rows: Number of rows to add — an int, or a percentage string
            such as '50%' (relative to the current row count)
        strategy: Optional augmentation strategy

    Returns:
        DataFrame containing the original rows followed by the synthetic ones

    Example:
        >>> result = augment_data(df, n_rows=100)   # Add 100 rows
        >>> result = augment_data(df, n_rows="50%") # Add 50% more rows
    """
    logger = Logger()

    validate_dataframe(df)
    validate_not_empty(df)

    # Resolve percentages against the current row count.
    target = parse_n_rows(n_rows, len(df))

    logger.info(f"Augmenting {len(df)} rows with {target} synthetic rows")

    extra = generate_synthetic_rows(df, target, strategy)

    # Original rows first, synthetic rows appended after.
    combined = pl.concat([df, extra])

    logger.info(f"Augmentation complete: {len(combined)} total rows")

    return combined
|
58
|
+
def parse_n_rows(n_rows: Union[int, str], df_len: int) -> int:
    """
    Parse the n_rows parameter into a concrete row count.

    Args:
        n_rows: Number of rows — either an absolute int, or a percentage
            string such as '50%' interpreted relative to ``df_len``
        df_len: Number of rows in the source DataFrame

    Returns:
        Non-negative number of rows to generate (percentages truncate)

    Raises:
        ValueError: If n_rows is negative, a malformed percentage string,
            or neither an int nor a percentage string

    Example:
        >>> parse_n_rows(100, 1000)
        100
        >>> parse_n_rows("50%", 1000)
        500
    """
    # bool is a subclass of int — reject it explicitly so True/False are
    # not silently treated as 1/0 rows.
    if isinstance(n_rows, int) and not isinstance(n_rows, bool):
        if n_rows < 0:
            raise ValueError(f"Invalid n_rows: {n_rows}. Must be non-negative")
        return n_rows

    if isinstance(n_rows, str) and n_rows.endswith('%'):
        try:
            percentage = float(n_rows.rstrip('%'))
        except ValueError:
            # Re-raise with the module's standard message instead of the
            # bare float() parse error.
            raise ValueError(
                f"Invalid n_rows: {n_rows}. Must be int or percentage string"
            ) from None
        if percentage < 0:
            raise ValueError(f"Invalid n_rows: {n_rows}. Must be non-negative")
        return int(df_len * percentage / 100)

    raise ValueError(f"Invalid n_rows: {n_rows}. Must be int or percentage string")
|
|
86
|
+
def generate_synthetic_rows(
    df: pl.DataFrame,
    n_rows: int,
    strategy: Optional[Dict]
) -> pl.DataFrame:
    """
    Generate synthetic rows matching existing data.

    Args:
        df: Input DataFrame whose per-column dtypes and value distributions
            are mimicked
        n_rows: Number of synthetic rows to generate
        strategy: Optional strategy (currently unused in this generator)

    Returns:
        DataFrame with n_rows synthetic rows and the same columns as df
    """
    logger = Logger()

    int_dtypes = (pl.Int8, pl.Int16, pl.Int32, pl.Int64,
                  pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64)
    float_dtypes = (pl.Float32, pl.Float64)

    def _all_nulls(dtype):
        # Fallback column: no data to mimic, or unsupported dtype.
        return pl.Series([None] * n_rows, dtype=dtype)

    columns = {}

    for col in df.columns:
        logger.info(f"Generating synthetic column: {col}")

        dtype = df[col].dtype
        col_data = df[col].drop_nulls()

        # Entirely-null column: nothing to sample from.
        if len(col_data) == 0:
            columns[col] = _all_nulls(dtype)
            continue

        if dtype in int_dtypes:
            # Match mean/std of the observed values, then cast back to the
            # integer dtype. NOTE(review): assumes generate_normal returns a
            # pl.Series — confirm against common.distributions.
            mean = float(col_data.mean())
            # BUG FIX: std() is null for a single observation, and
            # float(None) raises TypeError — fall back to 0.0.
            std = float(col_data.std() or 0.0)
            columns[col] = generate_normal(n_rows, mean, std).cast(dtype)

        elif dtype in float_dtypes:
            mean = float(col_data.mean())
            std = float(col_data.std() or 0.0)
            columns[col] = generate_normal(n_rows, mean, std)

        elif dtype == pl.Utf8:
            # Strings: sample (with replacement) from the observed values.
            values = col_data.to_list()
            columns[col] = pl.Series(np.random.choice(values, n_rows))

        elif dtype == pl.Boolean:
            # Booleans: preserve the observed True-ratio.
            true_ratio = float(col_data.sum()) / len(col_data)
            columns[col] = pl.Series(np.random.random(n_rows) < true_ratio)

        else:
            # Other types (dates, structs, ...) — nulls only for now.
            columns[col] = _all_nulls(dtype)

    return pl.DataFrame(columns)