additory-0.1.0a3-py3-none-any.whl → additory-0.1.1a1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- additory/__init__.py +58 -14
- additory/common/__init__.py +31 -147
- additory/common/column_selector.py +255 -0
- additory/common/distributions.py +286 -613
- additory/common/extractors.py +313 -0
- additory/common/knn_imputation.py +332 -0
- additory/common/result.py +380 -0
- additory/common/strategy_parser.py +243 -0
- additory/common/unit_conversions.py +338 -0
- additory/common/validation.py +283 -103
- additory/core/__init__.py +34 -22
- additory/core/backend.py +258 -0
- additory/core/config.py +177 -305
- additory/core/logging.py +230 -24
- additory/core/memory_manager.py +157 -495
- additory/expressions/__init__.py +2 -23
- additory/expressions/compiler.py +457 -0
- additory/expressions/engine.py +264 -487
- additory/expressions/integrity.py +179 -0
- additory/expressions/loader.py +263 -0
- additory/expressions/parser.py +363 -167
- additory/expressions/resolver.py +274 -0
- additory/functions/__init__.py +1 -0
- additory/functions/analyze/__init__.py +144 -0
- additory/functions/analyze/cardinality.py +58 -0
- additory/functions/analyze/correlations.py +66 -0
- additory/functions/analyze/distributions.py +53 -0
- additory/functions/analyze/duplicates.py +49 -0
- additory/functions/analyze/features.py +61 -0
- additory/functions/analyze/imputation.py +66 -0
- additory/functions/analyze/outliers.py +65 -0
- additory/functions/analyze/patterns.py +65 -0
- additory/functions/analyze/presets.py +72 -0
- additory/functions/analyze/quality.py +59 -0
- additory/functions/analyze/timeseries.py +53 -0
- additory/functions/analyze/types.py +45 -0
- additory/functions/expressions/__init__.py +161 -0
- additory/functions/snapshot/__init__.py +82 -0
- additory/functions/snapshot/filter.py +119 -0
- additory/functions/synthetic/__init__.py +113 -0
- additory/functions/synthetic/mode_detector.py +47 -0
- additory/functions/synthetic/strategies/__init__.py +1 -0
- additory/functions/synthetic/strategies/advanced.py +35 -0
- additory/functions/synthetic/strategies/augmentative.py +160 -0
- additory/functions/synthetic/strategies/generative.py +168 -0
- additory/functions/synthetic/strategies/presets.py +116 -0
- additory/functions/to/__init__.py +188 -0
- additory/functions/to/lookup.py +351 -0
- additory/functions/to/merge.py +189 -0
- additory/functions/to/sort.py +91 -0
- additory/functions/to/summarize.py +170 -0
- additory/functions/transform/__init__.py +140 -0
- additory/functions/transform/datetime.py +79 -0
- additory/functions/transform/extract.py +85 -0
- additory/functions/transform/harmonize.py +105 -0
- additory/functions/transform/knn.py +62 -0
- additory/functions/transform/onehotencoding.py +68 -0
- additory/functions/transform/transpose.py +42 -0
- additory-0.1.1a1.dist-info/METADATA +83 -0
- additory-0.1.1a1.dist-info/RECORD +62 -0
- additory/analysis/__init__.py +0 -48
- additory/analysis/cardinality.py +0 -126
- additory/analysis/correlations.py +0 -124
- additory/analysis/distributions.py +0 -376
- additory/analysis/quality.py +0 -158
- additory/analysis/scan.py +0 -400
- additory/common/backend.py +0 -371
- additory/common/column_utils.py +0 -191
- additory/common/exceptions.py +0 -62
- additory/common/lists.py +0 -229
- additory/common/patterns.py +0 -240
- additory/common/resolver.py +0 -567
- additory/common/sample_data.py +0 -182
- additory/core/ast_builder.py +0 -165
- additory/core/backends/__init__.py +0 -23
- additory/core/backends/arrow_bridge.py +0 -483
- additory/core/backends/cudf_bridge.py +0 -355
- additory/core/column_positioning.py +0 -358
- additory/core/compiler_polars.py +0 -166
- additory/core/enhanced_cache_manager.py +0 -1119
- additory/core/enhanced_matchers.py +0 -473
- additory/core/enhanced_version_manager.py +0 -325
- additory/core/executor.py +0 -59
- additory/core/integrity_manager.py +0 -477
- additory/core/loader.py +0 -190
- additory/core/namespace_manager.py +0 -657
- additory/core/parser.py +0 -176
- additory/core/polars_expression_engine.py +0 -601
- additory/core/registry.py +0 -176
- additory/core/sample_data_manager.py +0 -492
- additory/core/user_namespace.py +0 -751
- additory/core/validator.py +0 -27
- additory/dynamic_api.py +0 -304
- additory/expressions/proxy.py +0 -549
- additory/expressions/registry.py +0 -313
- additory/expressions/samples.py +0 -492
- additory/synthetic/__init__.py +0 -13
- additory/synthetic/column_name_resolver.py +0 -149
- additory/synthetic/distributions.py +0 -22
- additory/synthetic/forecast.py +0 -1132
- additory/synthetic/linked_list_parser.py +0 -415
- additory/synthetic/namespace_lookup.py +0 -129
- additory/synthetic/smote.py +0 -320
- additory/synthetic/strategies.py +0 -850
- additory/synthetic/synthesizer.py +0 -713
- additory/utilities/__init__.py +0 -53
- additory/utilities/encoding.py +0 -600
- additory/utilities/games.py +0 -300
- additory/utilities/keys.py +0 -8
- additory/utilities/lookup.py +0 -103
- additory/utilities/matchers.py +0 -216
- additory/utilities/resolvers.py +0 -286
- additory/utilities/settings.py +0 -167
- additory/utilities/units.py +0 -749
- additory/utilities/validators.py +0 -153
- additory-0.1.0a3.dist-info/METADATA +0 -288
- additory-0.1.0a3.dist-info/RECORD +0 -71
- additory-0.1.0a3.dist-info/licenses/LICENSE +0 -21
- {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
- {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
additory/synthetic/smote.py
DELETED
@@ -1,320 +0,0 @@
"""
SMOTE (Synthetic Minority Over-sampling Technique) for Synthetic Data Generation

Provides imbalanced data handling strategies:
- SMOTE: Generate synthetic samples for minority class
- Balance: Balance class distribution
- Oversample: Simple oversampling with variation
"""

from typing import List, Optional, Dict, Any, Tuple
import warnings

import numpy as np

from additory.common.exceptions import ValidationError, AugmentError


def calculate_distances(point: np.ndarray, data: np.ndarray) -> np.ndarray:
    """
    Calculate Euclidean distances from point to all points in data.

    Args:
        point: Single data point (1D array)
        data: Array of data points (2D array)

    Returns:
        Array of distances
    """
    return np.sqrt(np.sum((data - point) ** 2, axis=1))


def find_k_nearest_neighbors(
    point_idx: int,
    data: np.ndarray,
    k: int = 5
) -> np.ndarray:
    """
    Find k nearest neighbors of a point.

    Args:
        point_idx: Index of the point
        data: Array of all data points
        k: Number of neighbors to find

    Returns:
        Array of indices of k nearest neighbors
    """
    point = data[point_idx]
    distances = calculate_distances(point, data)

    # Exclude the point itself
    distances[point_idx] = np.inf

    # Get k nearest
    nearest_indices = np.argsort(distances)[:k]

    return nearest_indices


def generate_synthetic_sample(
    point: np.ndarray,
    neighbor: np.ndarray,
    seed: Optional[int] = None
) -> np.ndarray:
    """
    Generate synthetic sample between point and neighbor.

    Uses linear interpolation with random weight.

    Args:
        point: Original data point
        neighbor: Neighbor data point
        seed: Random seed

    Returns:
        Synthetic sample
    """
    if seed is not None:
        np.random.seed(seed)

    # Random weight between 0 and 1
    weight = np.random.random()

    # Linear interpolation
    synthetic = point + weight * (neighbor - point)

    return synthetic


def smote_generate(
    data: np.ndarray,
    n_samples: int,
    k_neighbors: int = 5,
    seed: Optional[int] = None
) -> np.ndarray:
    """
    Generate synthetic samples using SMOTE algorithm.

    SMOTE creates synthetic samples by:
    1. For each sample, find k nearest neighbors
    2. Randomly select one neighbor
    3. Create synthetic sample along line between sample and neighbor

    Args:
        data: Original data (2D array: samples x features)
        n_samples: Number of synthetic samples to generate
        k_neighbors: Number of nearest neighbors to consider
        seed: Random seed for reproducibility

    Returns:
        Array of synthetic samples

    Raises:
        ValidationError: If parameters invalid
    """
    n_original, n_features = data.shape

    # Validate parameters
    if n_samples <= 0:
        raise ValidationError(f"n_samples must be positive, got {n_samples}")

    if k_neighbors <= 0:
        raise ValidationError(f"k_neighbors must be positive, got {k_neighbors}")

    if k_neighbors >= n_original:
        warnings.warn(
            f"k_neighbors ({k_neighbors}) >= number of samples ({n_original}). "
            f"Using k_neighbors={n_original - 1}"
        )
        k_neighbors = n_original - 1

    if n_original < 2:
        raise ValidationError(
            f"Need at least 2 samples for SMOTE, got {n_original}"
        )

    # Set seed for reproducibility
    if seed is not None:
        np.random.seed(seed)

    # Generate synthetic samples
    synthetic_samples = []

    for i in range(n_samples):
        # Randomly select a sample
        sample_idx = np.random.randint(0, n_original)
        sample = data[sample_idx]

        # Find k nearest neighbors
        neighbor_indices = find_k_nearest_neighbors(sample_idx, data, k_neighbors)

        # Randomly select one neighbor
        neighbor_idx = np.random.choice(neighbor_indices)
        neighbor = data[neighbor_idx]

        # Generate synthetic sample
        synthetic = generate_synthetic_sample(sample, neighbor, seed=None)
        synthetic_samples.append(synthetic)

    return np.array(synthetic_samples)


def apply_smote_strategy(
    df_polars,
    columns: List[str],
    n_rows: int,
    k_neighbors: int = 5,
    seed: Optional[int] = None
) -> Dict[str, List[float]]:
    """
    Apply SMOTE to generate synthetic rows for specified columns.

    Args:
        df_polars: Input Polars DataFrame
        columns: List of column names to use for SMOTE
        n_rows: Number of synthetic rows to generate
        k_neighbors: Number of nearest neighbors
        seed: Random seed for reproducibility

    Returns:
        Dictionary mapping column names to generated values

    Raises:
        ValidationError: If columns invalid or insufficient data
    """
    # Validate columns exist
    for col in columns:
        if col not in df_polars.columns:
            raise ValidationError(f"Column '{col}' not found in DataFrame")

    # Extract data for specified columns
    data_list = []
    for col in columns:
        col_data = df_polars[col].to_numpy()

        # Check if numeric
        if not np.issubdtype(col_data.dtype, np.number):
            raise ValidationError(
                f"SMOTE requires numeric columns. Column '{col}' is not numeric."
            )

        # Check for nulls
        if np.any(np.isnan(col_data)):
            raise ValidationError(
                f"SMOTE requires non-null values. Column '{col}' contains nulls."
            )

        data_list.append(col_data)

    # Stack into 2D array (samples x features)
    data = np.column_stack(data_list)

    # Generate synthetic samples
    synthetic_data = smote_generate(data, n_rows, k_neighbors, seed)

    # Split back into columns
    result = {}
    for i, col in enumerate(columns):
        result[col] = synthetic_data[:, i].tolist()

    return result


def balance_classes(
    df_polars,
    class_column: str,
    target_ratio: float = 1.0,
    method: str = "smote",
    k_neighbors: int = 5,
    seed: Optional[int] = None
) -> Tuple[int, str]:
    """
    Calculate how many samples needed to balance classes.

    Args:
        df_polars: Input Polars DataFrame
        class_column: Column containing class labels
        target_ratio: Target ratio of minority to majority class (default: 1.0 for perfect balance)
        method: Balancing method ('smote' or 'oversample')
        k_neighbors: Number of neighbors for SMOTE
        seed: Random seed

    Returns:
        Tuple of (n_samples_needed, minority_class)

    Raises:
        ValidationError: If class column invalid
    """
    # Validate class column
    if class_column not in df_polars.columns:
        raise ValidationError(f"Class column '{class_column}' not found in DataFrame")

    # Get class counts
    class_counts = df_polars[class_column].value_counts()

    if len(class_counts) < 2:
        raise ValidationError(
            f"Need at least 2 classes for balancing, found {len(class_counts)}"
        )

    # Find minority and majority classes
    class_counts_dict = dict(zip(
        class_counts[class_column].to_list(),
        class_counts['counts'].to_list()
    ))

    minority_class = min(class_counts_dict, key=class_counts_dict.get)
    majority_class = max(class_counts_dict, key=class_counts_dict.get)

    minority_count = class_counts_dict[minority_class]
    majority_count = class_counts_dict[majority_class]

    # Calculate target count for minority class
    target_count = int(majority_count * target_ratio)

    # Calculate how many samples needed
    n_samples_needed = max(0, target_count - minority_count)

    return n_samples_needed, minority_class


def generate_smote_values(
    df_polars,
    columns: List[str],
    n_rows: int,
    k_neighbors: int = 5,
    seed: Optional[int] = None,
    **params
) -> Dict[str, List[Any]]:
    """
    Main SMOTE generation function.

    Args:
        df_polars: Input Polars DataFrame
        columns: Columns to use for SMOTE (numeric only)
        n_rows: Number of synthetic rows to generate
        k_neighbors: Number of nearest neighbors (default: 5)
        seed: Random seed for reproducibility
        **params: Additional parameters (reserved for future use)

    Returns:
        Dictionary mapping column names to generated values

    Raises:
        ValidationError: If parameters invalid
        AugmentError: If generation fails
    """
    try:
        return apply_smote_strategy(
            df_polars,
            columns,
            n_rows,
            k_neighbors,
            seed
        )

    except Exception as e:
        if isinstance(e, (ValidationError, AugmentError)):
            raise
        raise AugmentError(f"SMOTE generation failed: {e}")
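For readers comparing the two releases: the removed smote_generate followed the classic SMOTE recipe stated in its docstring (pick a sample, pick one of its k nearest neighbours, interpolate between them). A minimal usage sketch is shown below; it would only run against the old 0.1.0a3 wheel, since the import path is gone in 0.1.1a1, and the toy array and parameter values are invented for illustration.

# Sketch against additory 0.1.0a3 only; the module was deleted in 0.1.1a1.
import numpy as np
from additory.synthetic.smote import smote_generate

# 6 original samples x 2 numeric features (hypothetical data)
data = np.array([
    [1.0, 10.0],
    [1.2, 11.0],
    [0.9, 9.5],
    [5.0, 50.0],
    [5.1, 52.0],
    [4.8, 49.0],
])

# Interpolate 4 synthetic rows, each between a randomly chosen sample and one
# of its 3 nearest neighbours; seed fixes the random draws for reproducibility.
synthetic = smote_generate(data, n_samples=4, k_neighbors=3, seed=42)
print(synthetic.shape)  # (4, 2)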