additory-0.1.0a4-py3-none-any.whl → additory-0.1.1a1-py3-none-any.whl

This diff compares the contents of two publicly released package versions as they appear in one of the supported public registries. It is provided for informational purposes only.
Files changed (121)
  1. additory/__init__.py +58 -14
  2. additory/common/__init__.py +31 -147
  3. additory/common/column_selector.py +255 -0
  4. additory/common/distributions.py +286 -613
  5. additory/common/extractors.py +313 -0
  6. additory/common/knn_imputation.py +332 -0
  7. additory/common/result.py +380 -0
  8. additory/common/strategy_parser.py +243 -0
  9. additory/common/unit_conversions.py +338 -0
  10. additory/common/validation.py +283 -103
  11. additory/core/__init__.py +34 -22
  12. additory/core/backend.py +258 -0
  13. additory/core/config.py +177 -305
  14. additory/core/logging.py +230 -24
  15. additory/core/memory_manager.py +157 -495
  16. additory/expressions/__init__.py +2 -23
  17. additory/expressions/compiler.py +457 -0
  18. additory/expressions/engine.py +264 -487
  19. additory/expressions/integrity.py +179 -0
  20. additory/expressions/loader.py +263 -0
  21. additory/expressions/parser.py +363 -167
  22. additory/expressions/resolver.py +274 -0
  23. additory/functions/__init__.py +1 -0
  24. additory/functions/analyze/__init__.py +144 -0
  25. additory/functions/analyze/cardinality.py +58 -0
  26. additory/functions/analyze/correlations.py +66 -0
  27. additory/functions/analyze/distributions.py +53 -0
  28. additory/functions/analyze/duplicates.py +49 -0
  29. additory/functions/analyze/features.py +61 -0
  30. additory/functions/analyze/imputation.py +66 -0
  31. additory/functions/analyze/outliers.py +65 -0
  32. additory/functions/analyze/patterns.py +65 -0
  33. additory/functions/analyze/presets.py +72 -0
  34. additory/functions/analyze/quality.py +59 -0
  35. additory/functions/analyze/timeseries.py +53 -0
  36. additory/functions/analyze/types.py +45 -0
  37. additory/functions/expressions/__init__.py +161 -0
  38. additory/functions/snapshot/__init__.py +82 -0
  39. additory/functions/snapshot/filter.py +119 -0
  40. additory/functions/synthetic/__init__.py +113 -0
  41. additory/functions/synthetic/mode_detector.py +47 -0
  42. additory/functions/synthetic/strategies/__init__.py +1 -0
  43. additory/functions/synthetic/strategies/advanced.py +35 -0
  44. additory/functions/synthetic/strategies/augmentative.py +160 -0
  45. additory/functions/synthetic/strategies/generative.py +168 -0
  46. additory/functions/synthetic/strategies/presets.py +116 -0
  47. additory/functions/to/__init__.py +188 -0
  48. additory/functions/to/lookup.py +351 -0
  49. additory/functions/to/merge.py +189 -0
  50. additory/functions/to/sort.py +91 -0
  51. additory/functions/to/summarize.py +170 -0
  52. additory/functions/transform/__init__.py +140 -0
  53. additory/functions/transform/datetime.py +79 -0
  54. additory/functions/transform/extract.py +85 -0
  55. additory/functions/transform/harmonize.py +105 -0
  56. additory/functions/transform/knn.py +62 -0
  57. additory/functions/transform/onehotencoding.py +68 -0
  58. additory/functions/transform/transpose.py +42 -0
  59. additory-0.1.1a1.dist-info/METADATA +83 -0
  60. additory-0.1.1a1.dist-info/RECORD +62 -0
  61. additory/analysis/__init__.py +0 -48
  62. additory/analysis/cardinality.py +0 -126
  63. additory/analysis/correlations.py +0 -124
  64. additory/analysis/distributions.py +0 -376
  65. additory/analysis/quality.py +0 -158
  66. additory/analysis/scan.py +0 -400
  67. additory/common/backend.py +0 -371
  68. additory/common/column_utils.py +0 -191
  69. additory/common/exceptions.py +0 -62
  70. additory/common/lists.py +0 -229
  71. additory/common/patterns.py +0 -240
  72. additory/common/resolver.py +0 -567
  73. additory/common/sample_data.py +0 -182
  74. additory/core/ast_builder.py +0 -165
  75. additory/core/backends/__init__.py +0 -23
  76. additory/core/backends/arrow_bridge.py +0 -483
  77. additory/core/backends/cudf_bridge.py +0 -355
  78. additory/core/column_positioning.py +0 -358
  79. additory/core/compiler_polars.py +0 -166
  80. additory/core/enhanced_cache_manager.py +0 -1119
  81. additory/core/enhanced_matchers.py +0 -473
  82. additory/core/enhanced_version_manager.py +0 -325
  83. additory/core/executor.py +0 -59
  84. additory/core/integrity_manager.py +0 -477
  85. additory/core/loader.py +0 -190
  86. additory/core/namespace_manager.py +0 -657
  87. additory/core/parser.py +0 -176
  88. additory/core/polars_expression_engine.py +0 -601
  89. additory/core/registry.py +0 -177
  90. additory/core/sample_data_manager.py +0 -492
  91. additory/core/user_namespace.py +0 -751
  92. additory/core/validator.py +0 -27
  93. additory/dynamic_api.py +0 -352
  94. additory/expressions/proxy.py +0 -549
  95. additory/expressions/registry.py +0 -313
  96. additory/expressions/samples.py +0 -492
  97. additory/synthetic/__init__.py +0 -13
  98. additory/synthetic/column_name_resolver.py +0 -149
  99. additory/synthetic/deduce.py +0 -259
  100. additory/synthetic/distributions.py +0 -22
  101. additory/synthetic/forecast.py +0 -1132
  102. additory/synthetic/linked_list_parser.py +0 -415
  103. additory/synthetic/namespace_lookup.py +0 -129
  104. additory/synthetic/smote.py +0 -320
  105. additory/synthetic/strategies.py +0 -926
  106. additory/synthetic/synthesizer.py +0 -713
  107. additory/utilities/__init__.py +0 -53
  108. additory/utilities/encoding.py +0 -600
  109. additory/utilities/games.py +0 -300
  110. additory/utilities/keys.py +0 -8
  111. additory/utilities/lookup.py +0 -103
  112. additory/utilities/matchers.py +0 -216
  113. additory/utilities/resolvers.py +0 -286
  114. additory/utilities/settings.py +0 -167
  115. additory/utilities/units.py +0 -749
  116. additory/utilities/validators.py +0 -153
  117. additory-0.1.0a4.dist-info/METADATA +0 -311
  118. additory-0.1.0a4.dist-info/RECORD +0 -72
  119. additory-0.1.0a4.dist-info/licenses/LICENSE +0 -21
  120. {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
  121. {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
additory/synthetic/smote.py
@@ -1,320 +0,0 @@
- """
- SMOTE (Synthetic Minority Over-sampling Technique) for Synthetic Data Generation
-
- Provides imbalanced data handling strategies:
- SMOTE: Generate synthetic samples for minority class
- Balance: Balance class distribution
- Oversample: Simple oversampling with variation
- """
-
- from typing import List, Optional, Dict, Any, Tuple
- import warnings
-
- import numpy as np
-
- from additory.common.exceptions import ValidationError, AugmentError
-
-
- def calculate_distances(point: np.ndarray, data: np.ndarray) -> np.ndarray:
-     """
-     Calculate Euclidean distances from point to all points in data.
-
-     Args:
-         point: Single data point (1D array)
-         data: Array of data points (2D array)
-
-     Returns:
-         Array of distances
-     """
-     return np.sqrt(np.sum((data - point) ** 2, axis=1))
-
-
- def find_k_nearest_neighbors(
-     point_idx: int,
-     data: np.ndarray,
-     k: int = 5
- ) -> np.ndarray:
-     """
-     Find k nearest neighbors of a point.
-
-     Args:
-         point_idx: Index of the point
-         data: Array of all data points
-         k: Number of neighbors to find
-
-     Returns:
-         Array of indices of k nearest neighbors
-     """
-     point = data[point_idx]
-     distances = calculate_distances(point, data)
-
-     # Exclude the point itself
-     distances[point_idx] = np.inf
-
-     # Get k nearest
-     nearest_indices = np.argsort(distances)[:k]
-
-     return nearest_indices
-
-
- def generate_synthetic_sample(
-     point: np.ndarray,
-     neighbor: np.ndarray,
-     seed: Optional[int] = None
- ) -> np.ndarray:
-     """
-     Generate synthetic sample between point and neighbor.
-
-     Uses linear interpolation with random weight.
-
-     Args:
-         point: Original data point
-         neighbor: Neighbor data point
-         seed: Random seed
-
-     Returns:
-         Synthetic sample
-     """
-     if seed is not None:
-         np.random.seed(seed)
-
-     # Random weight between 0 and 1
-     weight = np.random.random()
-
-     # Linear interpolation
-     synthetic = point + weight * (neighbor - point)
-
-     return synthetic
-
-
- def smote_generate(
-     data: np.ndarray,
-     n_samples: int,
-     k_neighbors: int = 5,
-     seed: Optional[int] = None
- ) -> np.ndarray:
-     """
-     Generate synthetic samples using SMOTE algorithm.
-
-     SMOTE creates synthetic samples by:
-     1. For each sample, find k nearest neighbors
-     2. Randomly select one neighbor
-     3. Create synthetic sample along line between sample and neighbor
-
-     Args:
-         data: Original data (2D array: samples x features)
-         n_samples: Number of synthetic samples to generate
-         k_neighbors: Number of nearest neighbors to consider
-         seed: Random seed for reproducibility
-
-     Returns:
-         Array of synthetic samples
-
-     Raises:
-         ValidationError: If parameters invalid
-     """
-     n_original, n_features = data.shape
-
-     # Validate parameters
-     if n_samples <= 0:
-         raise ValidationError(f"n_samples must be positive, got {n_samples}")
-
-     if k_neighbors <= 0:
-         raise ValidationError(f"k_neighbors must be positive, got {k_neighbors}")
-
-     if k_neighbors >= n_original:
-         warnings.warn(
-             f"k_neighbors ({k_neighbors}) >= number of samples ({n_original}). "
-             f"Using k_neighbors={n_original - 1}"
-         )
-         k_neighbors = n_original - 1
-
-     if n_original < 2:
-         raise ValidationError(
-             f"Need at least 2 samples for SMOTE, got {n_original}"
-         )
-
-     # Set seed for reproducibility
-     if seed is not None:
-         np.random.seed(seed)
-
-     # Generate synthetic samples
-     synthetic_samples = []
-
-     for i in range(n_samples):
-         # Randomly select a sample
-         sample_idx = np.random.randint(0, n_original)
-         sample = data[sample_idx]
-
-         # Find k nearest neighbors
-         neighbor_indices = find_k_nearest_neighbors(sample_idx, data, k_neighbors)
-
-         # Randomly select one neighbor
-         neighbor_idx = np.random.choice(neighbor_indices)
-         neighbor = data[neighbor_idx]
-
-         # Generate synthetic sample
-         synthetic = generate_synthetic_sample(sample, neighbor, seed=None)
-         synthetic_samples.append(synthetic)
-
-     return np.array(synthetic_samples)
-
-
- def apply_smote_strategy(
-     df_polars,
-     columns: List[str],
-     n_rows: int,
-     k_neighbors: int = 5,
-     seed: Optional[int] = None
- ) -> Dict[str, List[float]]:
-     """
-     Apply SMOTE to generate synthetic rows for specified columns.
-
-     Args:
-         df_polars: Input Polars DataFrame
-         columns: List of column names to use for SMOTE
-         n_rows: Number of synthetic rows to generate
-         k_neighbors: Number of nearest neighbors
-         seed: Random seed for reproducibility
-
-     Returns:
-         Dictionary mapping column names to generated values
-
-     Raises:
-         ValidationError: If columns invalid or insufficient data
-     """
-     # Validate columns exist
-     for col in columns:
-         if col not in df_polars.columns:
-             raise ValidationError(f"Column '{col}' not found in DataFrame")
-
-     # Extract data for specified columns
-     data_list = []
-     for col in columns:
-         col_data = df_polars[col].to_numpy()
-
-         # Check if numeric
-         if not np.issubdtype(col_data.dtype, np.number):
-             raise ValidationError(
-                 f"SMOTE requires numeric columns. Column '{col}' is not numeric."
-             )
-
-         # Check for nulls
-         if np.any(np.isnan(col_data)):
-             raise ValidationError(
-                 f"SMOTE requires non-null values. Column '{col}' contains nulls."
-             )
-
-         data_list.append(col_data)
-
-     # Stack into 2D array (samples x features)
-     data = np.column_stack(data_list)
-
-     # Generate synthetic samples
-     synthetic_data = smote_generate(data, n_rows, k_neighbors, seed)
-
-     # Split back into columns
-     result = {}
-     for i, col in enumerate(columns):
-         result[col] = synthetic_data[:, i].tolist()
-
-     return result
-
-
- def balance_classes(
-     df_polars,
-     class_column: str,
-     target_ratio: float = 1.0,
-     method: str = "smote",
-     k_neighbors: int = 5,
-     seed: Optional[int] = None
- ) -> Tuple[int, str]:
-     """
-     Calculate how many samples needed to balance classes.
-
-     Args:
-         df_polars: Input Polars DataFrame
-         class_column: Column containing class labels
-         target_ratio: Target ratio of minority to majority class (default: 1.0 for perfect balance)
-         method: Balancing method ('smote' or 'oversample')
-         k_neighbors: Number of neighbors for SMOTE
-         seed: Random seed
-
-     Returns:
-         Tuple of (n_samples_needed, minority_class)
-
-     Raises:
-         ValidationError: If class column invalid
-     """
-     # Validate class column
-     if class_column not in df_polars.columns:
-         raise ValidationError(f"Class column '{class_column}' not found in DataFrame")
-
-     # Get class counts
-     class_counts = df_polars[class_column].value_counts()
-
-     if len(class_counts) < 2:
-         raise ValidationError(
-             f"Need at least 2 classes for balancing, found {len(class_counts)}"
-         )
-
-     # Find minority and majority classes
-     class_counts_dict = dict(zip(
-         class_counts[class_column].to_list(),
-         class_counts['counts'].to_list()
-     ))
-
-     minority_class = min(class_counts_dict, key=class_counts_dict.get)
-     majority_class = max(class_counts_dict, key=class_counts_dict.get)
-
-     minority_count = class_counts_dict[minority_class]
-     majority_count = class_counts_dict[majority_class]
-
-     # Calculate target count for minority class
-     target_count = int(majority_count * target_ratio)
-
-     # Calculate how many samples needed
-     n_samples_needed = max(0, target_count - minority_count)
-
-     return n_samples_needed, minority_class
-
-
- def generate_smote_values(
-     df_polars,
-     columns: List[str],
-     n_rows: int,
-     k_neighbors: int = 5,
-     seed: Optional[int] = None,
-     **params
- ) -> Dict[str, List[Any]]:
-     """
-     Main SMOTE generation function.
-
-     Args:
-         df_polars: Input Polars DataFrame
-         columns: Columns to use for SMOTE (numeric only)
-         n_rows: Number of synthetic rows to generate
-         k_neighbors: Number of nearest neighbors (default: 5)
-         seed: Random seed for reproducibility
-         **params: Additional parameters (reserved for future use)
-
-     Returns:
-         Dictionary mapping column names to generated values
-
-     Raises:
-         ValidationError: If parameters invalid
-         AugmentError: If generation fails
-     """
-     try:
-         return apply_smote_strategy(
-             df_polars,
-             columns,
-             n_rows,
-             k_neighbors,
-             seed
-         )
-
-     except Exception as e:
-         if isinstance(e, (ValidationError, AugmentError)):
-             raise
-         raise AugmentError(f"SMOTE generation failed: {e}")
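The removed module's smote_generate boils down to k-nearest-neighbour interpolation: pick a base row, find its nearest neighbours by Euclidean distance, choose one at random, and blend the two with a random weight. Below is a minimal, self-contained sketch of that step using only NumPy. It is illustration, not additory API: the function name toy_smote is invented for this example, and it assumes numeric, non-null input just as the deleted code did.

import numpy as np

def toy_smote(X, n_samples, k=5, seed=None):
    """Interpolate between rows of X and their nearest neighbours (sketch only)."""
    rng = np.random.default_rng(seed)
    n, n_features = X.shape
    k = min(k, n - 1)  # clamp k, as the removed smote_generate did with a warning
    out = np.empty((n_samples, n_features))
    for i in range(n_samples):
        idx = rng.integers(n)                            # random base sample
        dist = np.sqrt(((X - X[idx]) ** 2).sum(axis=1))  # Euclidean distances
        dist[idx] = np.inf                               # exclude the point itself
        neighbour = X[rng.choice(np.argsort(dist)[:k])]  # one of the k nearest neighbours
        w = rng.random()                                 # random weight in [0, 1)
        out[i] = X[idx] + w * (neighbour - X[idx])       # linear interpolation
    return out

# Example: three synthetic rows drawn between the corners of a small 2-D cloud.
X = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])
print(toy_smote(X, n_samples=3, k=2, seed=42))

One deliberate difference: the sketch uses a local NumPy Generator rather than seeding the global np.random state as the deleted module did, which keeps the example reproducible without side effects; it says nothing about how additory 0.1.1a1 itself generates synthetic rows.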