additory 0.1.0a4__py3-none-any.whl → 0.1.1a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121)
  1. additory/__init__.py +58 -14
  2. additory/common/__init__.py +31 -147
  3. additory/common/column_selector.py +255 -0
  4. additory/common/distributions.py +286 -613
  5. additory/common/extractors.py +313 -0
  6. additory/common/knn_imputation.py +332 -0
  7. additory/common/result.py +380 -0
  8. additory/common/strategy_parser.py +243 -0
  9. additory/common/unit_conversions.py +338 -0
  10. additory/common/validation.py +283 -103
  11. additory/core/__init__.py +34 -22
  12. additory/core/backend.py +258 -0
  13. additory/core/config.py +177 -305
  14. additory/core/logging.py +230 -24
  15. additory/core/memory_manager.py +157 -495
  16. additory/expressions/__init__.py +2 -23
  17. additory/expressions/compiler.py +457 -0
  18. additory/expressions/engine.py +264 -487
  19. additory/expressions/integrity.py +179 -0
  20. additory/expressions/loader.py +263 -0
  21. additory/expressions/parser.py +363 -167
  22. additory/expressions/resolver.py +274 -0
  23. additory/functions/__init__.py +1 -0
  24. additory/functions/analyze/__init__.py +144 -0
  25. additory/functions/analyze/cardinality.py +58 -0
  26. additory/functions/analyze/correlations.py +66 -0
  27. additory/functions/analyze/distributions.py +53 -0
  28. additory/functions/analyze/duplicates.py +49 -0
  29. additory/functions/analyze/features.py +61 -0
  30. additory/functions/analyze/imputation.py +66 -0
  31. additory/functions/analyze/outliers.py +65 -0
  32. additory/functions/analyze/patterns.py +65 -0
  33. additory/functions/analyze/presets.py +72 -0
  34. additory/functions/analyze/quality.py +59 -0
  35. additory/functions/analyze/timeseries.py +53 -0
  36. additory/functions/analyze/types.py +45 -0
  37. additory/functions/expressions/__init__.py +161 -0
  38. additory/functions/snapshot/__init__.py +82 -0
  39. additory/functions/snapshot/filter.py +119 -0
  40. additory/functions/synthetic/__init__.py +113 -0
  41. additory/functions/synthetic/mode_detector.py +47 -0
  42. additory/functions/synthetic/strategies/__init__.py +1 -0
  43. additory/functions/synthetic/strategies/advanced.py +35 -0
  44. additory/functions/synthetic/strategies/augmentative.py +160 -0
  45. additory/functions/synthetic/strategies/generative.py +168 -0
  46. additory/functions/synthetic/strategies/presets.py +116 -0
  47. additory/functions/to/__init__.py +188 -0
  48. additory/functions/to/lookup.py +351 -0
  49. additory/functions/to/merge.py +189 -0
  50. additory/functions/to/sort.py +91 -0
  51. additory/functions/to/summarize.py +170 -0
  52. additory/functions/transform/__init__.py +140 -0
  53. additory/functions/transform/datetime.py +79 -0
  54. additory/functions/transform/extract.py +85 -0
  55. additory/functions/transform/harmonize.py +105 -0
  56. additory/functions/transform/knn.py +62 -0
  57. additory/functions/transform/onehotencoding.py +68 -0
  58. additory/functions/transform/transpose.py +42 -0
  59. additory-0.1.1a1.dist-info/METADATA +83 -0
  60. additory-0.1.1a1.dist-info/RECORD +62 -0
  61. additory/analysis/__init__.py +0 -48
  62. additory/analysis/cardinality.py +0 -126
  63. additory/analysis/correlations.py +0 -124
  64. additory/analysis/distributions.py +0 -376
  65. additory/analysis/quality.py +0 -158
  66. additory/analysis/scan.py +0 -400
  67. additory/common/backend.py +0 -371
  68. additory/common/column_utils.py +0 -191
  69. additory/common/exceptions.py +0 -62
  70. additory/common/lists.py +0 -229
  71. additory/common/patterns.py +0 -240
  72. additory/common/resolver.py +0 -567
  73. additory/common/sample_data.py +0 -182
  74. additory/core/ast_builder.py +0 -165
  75. additory/core/backends/__init__.py +0 -23
  76. additory/core/backends/arrow_bridge.py +0 -483
  77. additory/core/backends/cudf_bridge.py +0 -355
  78. additory/core/column_positioning.py +0 -358
  79. additory/core/compiler_polars.py +0 -166
  80. additory/core/enhanced_cache_manager.py +0 -1119
  81. additory/core/enhanced_matchers.py +0 -473
  82. additory/core/enhanced_version_manager.py +0 -325
  83. additory/core/executor.py +0 -59
  84. additory/core/integrity_manager.py +0 -477
  85. additory/core/loader.py +0 -190
  86. additory/core/namespace_manager.py +0 -657
  87. additory/core/parser.py +0 -176
  88. additory/core/polars_expression_engine.py +0 -601
  89. additory/core/registry.py +0 -177
  90. additory/core/sample_data_manager.py +0 -492
  91. additory/core/user_namespace.py +0 -751
  92. additory/core/validator.py +0 -27
  93. additory/dynamic_api.py +0 -352
  94. additory/expressions/proxy.py +0 -549
  95. additory/expressions/registry.py +0 -313
  96. additory/expressions/samples.py +0 -492
  97. additory/synthetic/__init__.py +0 -13
  98. additory/synthetic/column_name_resolver.py +0 -149
  99. additory/synthetic/deduce.py +0 -259
  100. additory/synthetic/distributions.py +0 -22
  101. additory/synthetic/forecast.py +0 -1132
  102. additory/synthetic/linked_list_parser.py +0 -415
  103. additory/synthetic/namespace_lookup.py +0 -129
  104. additory/synthetic/smote.py +0 -320
  105. additory/synthetic/strategies.py +0 -926
  106. additory/synthetic/synthesizer.py +0 -713
  107. additory/utilities/__init__.py +0 -53
  108. additory/utilities/encoding.py +0 -600
  109. additory/utilities/games.py +0 -300
  110. additory/utilities/keys.py +0 -8
  111. additory/utilities/lookup.py +0 -103
  112. additory/utilities/matchers.py +0 -216
  113. additory/utilities/resolvers.py +0 -286
  114. additory/utilities/settings.py +0 -167
  115. additory/utilities/units.py +0 -749
  116. additory/utilities/validators.py +0 -153
  117. additory-0.1.0a4.dist-info/METADATA +0 -311
  118. additory-0.1.0a4.dist-info/RECORD +0 -72
  119. additory-0.1.0a4.dist-info/licenses/LICENSE +0 -21
  120. {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
  121. {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
additory/__init__.py CHANGED
@@ -1,19 +1,63 @@
1
- # additory/__init__.py
1
+ """
2
+ Additory v0.1.1 - Data Augmentation Library
2
3
 
3
- from .dynamic_api import add as _api_instance
4
+ A Polars-first data augmentation library with 5 main functions:
5
+ - to: Add columns from other DataFrames
6
+ - transform: Transform columns (transpose, encode, extract, etc.)
7
+ - snapshot: Filter and select data
8
+ - synthetic: Generate synthetic data
9
+ - analyze: Analyze data quality and patterns
10
+ - expressions: Evaluate expressions and add computed columns
4
11
 
5
- # Version information
6
- __version__ = "0.1.0a4"
12
+ Usage:
13
+ import additory
14
+
15
+ # Add columns
16
+ result = additory.add.to(df, reference_df, on='id', bring='price')
17
+
18
+ # Transform columns
19
+ result = additory.add.transform(df, mode='onehotencoding', columns=['category'])
20
+
21
+ # Filter data
22
+ result = additory.add.snapshot(df, where='age > 18')
23
+
24
+ # Generate synthetic data
25
+ result = additory.add.synthetic(df, rows=1000)
26
+
27
+ # Analyze data
28
+ result = additory.add.analyze(df, preset='quick')
29
+
30
+ # Evaluate expressions
31
+ result = additory.add.expressions(df, 'inbuilt:bmi', 'age * 12')
32
+ """
7
33
 
8
- # Expose the API instance normally
9
- add = _api_instance
34
+ from types import SimpleNamespace
10
35
 
11
- # Module-level __getattr__ to forward dynamic attributes
12
- def __getattr__(name):
13
- # Delegate all unknown attributes to the API instance
14
- return getattr(_api_instance, name)
36
+ # Import main functions
37
+ from additory.functions.to import to
38
+ from additory.functions.transform import transform
39
+ from additory.functions.snapshot import snapshot
40
+ from additory.functions.synthetic import synthetic
41
+ from additory.functions.analyze import analyze
42
+ from additory.functions.expressions import expressions
15
43
 
16
- __all__ = [
17
- "add",
18
- "__version__",
19
- ]
44
+ # Import configuration functions
45
+ from additory.core.config import set_expressions_folder, set_default_backend
46
+
47
+ # Create simple API namespace
48
+ add = SimpleNamespace(
49
+ to=to,
50
+ transform=transform,
51
+ snapshot=snapshot,
52
+ synthetic=synthetic,
53
+ analyze=analyze,
54
+ expressions=expressions,
55
+ set_expressions_folder=set_expressions_folder,
56
+ set_default_backend=set_default_backend
57
+ )
58
+
59
+ # Version
60
+ __version__ = "0.1.1a1"
61
+
62
+ # Public API
63
+ __all__ = ['add', '__version__']
additory/common/__init__.py CHANGED
@@ -1,157 +1,41 @@
1
1
  """
2
- Common Utilities Module
3
-
4
- Shared functionality used by both synthetic and expressions modules:
5
- - Distribution functions (normal, uniform, skewed, etc.)
6
- - List file management (.list format)
7
- - Pattern file management (.properties format)
8
- - Fallback resolution logic
9
-
10
- This module eliminates code duplication and provides consistent behavior
11
- across synthetic and expression data generation.
2
+ Common utilities for Additory.
3
+
4
+ This module provides shared utilities used across all functions:
5
+ - validation: Input validation
6
+ - strategy_parser: Strategy parsing
7
+ - column_selector: Column selection
8
+ - result: Result wrappers
9
+ - extractors: Feature extractors
10
+ - unit_conversions: Unit conversion utilities
11
+ - knn_imputation: KNN imputation
12
+ - distributions: Distribution generation
12
13
  """
13
14
 
14
- from .distributions import (
15
- generate_normal,
16
- generate_uniform,
17
- generate_skewed,
18
- generate_beta,
19
- generate_gamma,
20
- generate_exponential_dist,
21
- generate_kde,
22
- generate_multivariate_normal,
23
- generate_distribution_values,
24
- estimate_distribution_params,
25
- calculate_skewness,
26
- detect_distribution_type,
27
- DistributionType,
28
- )
29
-
30
- from .lists import (
31
- load_list_file,
32
- parse_list_file,
33
- get_list_values,
34
- list_all_lists,
35
- )
36
-
37
- from .patterns import (
38
- load_properties_file,
39
- parse_properties_file,
40
- get_pattern,
41
- list_all_patterns,
42
- )
43
-
44
- from .resolver import (
45
- resolve_pattern,
46
- resolve_with_logging,
47
- PatternResolutionResult,
48
- PreferMode,
49
- )
50
-
51
- from .backend import (
52
- detect_backend,
53
- is_dataframe,
54
- to_polars,
55
- from_polars,
56
- BackendType,
57
- )
58
-
59
15
  from .validation import (
60
16
  validate_dataframe,
61
- validate_columns_exist,
62
- validate_positive_number,
63
- validate_non_negative_number,
64
- validate_parameter_choice,
65
- validate_ratio,
66
- validate_string_not_empty,
67
- validate_integer_in_range,
68
- ValidationError,
69
- )
70
-
71
- from .exceptions import (
72
- AdditoryError,
73
- ValidationError,
74
- BackendError,
75
- ConversionError,
76
- ExpressionError,
77
- ConfigurationError,
78
- UnitConversionError,
79
- EncodingError,
80
- LookupError,
81
- SyntheticDataError,
82
- AugmentError,
83
- )
84
-
85
- from .column_utils import (
86
- sanitize_column_name,
87
- generate_safe_column_name,
17
+ validate_not_empty,
88
18
  validate_column_name,
89
- truncate_column_name,
90
- generate_column_names_with_prefix_suffix,
19
+ validate_positive_integer,
20
+ validate_percentage,
21
+ validate_mode,
22
+ validate_dict,
23
+ validate_list,
24
+ validate_string,
25
+ validate_boolean,
26
+ validate_optional
91
27
  )
92
28
 
93
29
  __all__ = [
94
- # Distribution functions
95
- "generate_normal",
96
- "generate_uniform",
97
- "generate_skewed",
98
- "generate_beta",
99
- "generate_gamma",
100
- "generate_exponential_dist",
101
- "generate_kde",
102
- "generate_multivariate_normal",
103
- "generate_distribution_values",
104
- "estimate_distribution_params",
105
- "calculate_skewness",
106
- "detect_distribution_type",
107
- "DistributionType",
108
- # List management
109
- "load_list_file",
110
- "parse_list_file",
111
- "get_list_values",
112
- "list_all_lists",
113
- # Pattern management
114
- "load_properties_file",
115
- "parse_properties_file",
116
- "get_pattern",
117
- "list_all_patterns",
118
- # Resolution
119
- "resolve_pattern",
120
- "resolve_with_logging",
121
- "PatternResolutionResult",
122
- "PreferMode",
123
- # Backend detection
124
- "detect_backend",
125
- "is_dataframe",
126
- "to_polars",
127
- "from_polars",
128
- "BackendType",
129
- # Validation
130
- "validate_dataframe",
131
- "validate_columns_exist",
132
- "validate_positive_number",
133
- "validate_non_negative_number",
134
- "validate_parameter_choice",
135
- "validate_ratio",
136
- "validate_string_not_empty",
137
- "validate_integer_in_range",
138
- "ValidationError",
139
- # Exceptions
140
- "AdditoryError",
141
- "ValidationError",
142
- "BackendError",
143
- "ConversionError",
144
- "ExpressionError",
145
- "ConfigurationError",
146
- "UnitConversionError",
147
- "EncodingError",
148
- "LookupError",
149
- "SyntheticDataError",
150
- "AugmentError",
151
- # Column utilities
152
- "sanitize_column_name",
153
- "generate_safe_column_name",
154
- "validate_column_name",
155
- "truncate_column_name",
156
- "generate_column_names_with_prefix_suffix",
30
+ 'validate_dataframe',
31
+ 'validate_not_empty',
32
+ 'validate_column_name',
33
+ 'validate_positive_integer',
34
+ 'validate_percentage',
35
+ 'validate_mode',
36
+ 'validate_dict',
37
+ 'validate_list',
38
+ 'validate_string',
39
+ 'validate_boolean',
40
+ 'validate_optional'
157
41
  ]
additory/common/column_selector.py ADDED
@@ -0,0 +1,255 @@
1
+ """
2
+ Column selection and validation utilities for Additory.
3
+
4
+ Provides pattern matching and type-based column selection.
5
+ """
6
+
7
+ import re
8
+ from typing import Any, List, Optional, Union
9
+ import polars as pl
10
+
11
+
12
+ def select_columns(df: pl.DataFrame, columns: Union[str, List[str], None]) -> List[str]:
13
+ """
14
+ Select columns from DataFrame with pattern matching support.
15
+
16
+ Args:
17
+ df: DataFrame to select columns from
18
+ columns: Column specification (None='*', str pattern, or list)
19
+
20
+ Returns:
21
+ List of selected column names
22
+
23
+ Raises:
24
+ ValueError: If no columns match the pattern
25
+
26
+ Example:
27
+ # Select all columns
28
+ cols = select_columns(df, '*')
29
+
30
+ # Select by pattern
31
+ cols = select_columns(df, 'age_*')
32
+
33
+ # Select specific columns
34
+ cols = select_columns(df, ['name', 'email', 'age'])
35
+ """
36
+ # None means all columns
37
+ if columns is None:
38
+ return df.columns
39
+
40
+ # String pattern
41
+ if isinstance(columns, str):
42
+ # '*' means all columns
43
+ if columns == '*':
44
+ return df.columns
45
+
46
+ # Check if it's a pattern or exact match
47
+ if '*' in columns:
48
+ # Pattern matching
49
+ matched = []
50
+ for col in df.columns:
51
+ if match_pattern(col, columns):
52
+ matched.append(col)
53
+
54
+ if not matched:
55
+ raise ValueError(f"No columns match pattern '{columns}'")
56
+
57
+ return matched
58
+ else:
59
+ # Exact match
60
+ if columns not in df.columns:
61
+ raise ValueError(f"Column '{columns}' not found in DataFrame")
62
+ return [columns]
63
+
64
+ # List of columns
65
+ if isinstance(columns, list):
66
+ # Expand any patterns in the list
67
+ expanded = expand_column_patterns(df, columns)
68
+
69
+ # Validate all columns exist
70
+ validate_columns_exist(df, expanded)
71
+
72
+ return expanded
73
+
74
+ raise TypeError(
75
+ f"columns must be None, str, or list, got {type(columns).__name__}"
76
+ )
77
+
78
+
79
+ def match_pattern(column_name: str, pattern: str) -> bool:
80
+ """
81
+ Check if column name matches pattern.
82
+
83
+ Args:
84
+ column_name: Column name to check
85
+ pattern: Pattern to match ('*', 'prefix_*', '*_suffix', 'exact')
86
+
87
+ Returns:
88
+ True if matches, False otherwise
89
+
90
+ Example:
91
+ match_pattern('age_years', 'age_*') # True
92
+ match_pattern('total_age', '*_age') # True
93
+ match_pattern('age', 'age') # True
94
+ """
95
+ # Exact match
96
+ if pattern == column_name:
97
+ return True
98
+
99
+ # Wildcard match all
100
+ if pattern == '*':
101
+ return True
102
+
103
+ # Convert pattern to regex
104
+ # Escape special regex characters except *
105
+ regex_pattern = re.escape(pattern).replace(r'\*', '.*')
106
+
107
+ # Anchor to start and end
108
+ regex_pattern = f'^{regex_pattern}$'
109
+
110
+ return bool(re.match(regex_pattern, column_name))
111
+
112
+
113
+ def validate_columns_exist(df: pl.DataFrame, columns: List[str]) -> bool:
114
+ """
115
+ Validate that all columns exist in DataFrame.
116
+
117
+ Args:
118
+ df: DataFrame to check
119
+ columns: List of column names to validate
120
+
121
+ Returns:
122
+ True if all exist
123
+
124
+ Raises:
125
+ ValueError: If any columns are missing
126
+
127
+ Example:
128
+ validate_columns_exist(df, ['name', 'age', 'email'])
129
+ """
130
+ missing = []
131
+ for col in columns:
132
+ if col not in df.columns:
133
+ missing.append(col)
134
+
135
+ if missing:
136
+ if len(missing) == 1:
137
+ raise ValueError(f"Column '{missing[0]}' not found in DataFrame")
138
+ else:
139
+ raise ValueError(
140
+ f"Columns {missing} not found in DataFrame. "
141
+ f"Available columns: {df.columns}"
142
+ )
143
+
144
+ return True
145
+
146
+
147
+ def expand_column_patterns(df: pl.DataFrame, patterns: List[str]) -> List[str]:
148
+ """
149
+ Expand column patterns to actual column names.
150
+
151
+ Args:
152
+ df: DataFrame to expand patterns from
153
+ patterns: List of patterns to expand
154
+
155
+ Returns:
156
+ List of expanded column names (no duplicates)
157
+
158
+ Example:
159
+ # Input: ['age_*', 'total_*']
160
+ # Output: ['age_years', 'age_months', 'total_sales', 'total_orders']
161
+ cols = expand_column_patterns(df, ['age_*', 'total_*'])
162
+ """
163
+ expanded = []
164
+ seen = set()
165
+
166
+ for pattern in patterns:
167
+ # If pattern contains wildcard, expand it
168
+ if '*' in pattern:
169
+ matched = False
170
+ for col in df.columns:
171
+ if match_pattern(col, pattern):
172
+ if col not in seen:
173
+ expanded.append(col)
174
+ seen.add(col)
175
+ matched = True
176
+
177
+ if not matched:
178
+ raise ValueError(f"No columns match pattern '{pattern}'")
179
+ else:
180
+ # Exact column name
181
+ if pattern not in seen:
182
+ expanded.append(pattern)
183
+ seen.add(pattern)
184
+
185
+ return expanded
186
+
187
+
188
+ def get_column_type(df: pl.DataFrame, column: str) -> str:
189
+ """
190
+ Get the data type of a column.
191
+
192
+ Args:
193
+ df: DataFrame containing the column
194
+ column: Column name
195
+
196
+ Returns:
197
+ Type string ('numeric', 'string', 'datetime', 'boolean', 'other')
198
+
199
+ Raises:
200
+ ValueError: If column doesn't exist
201
+ """
202
+ if column not in df.columns:
203
+ raise ValueError(f"Column '{column}' not found in DataFrame")
204
+
205
+ dtype = df[column].dtype
206
+
207
+ # Numeric types
208
+ if dtype in [pl.Int8, pl.Int16, pl.Int32, pl.Int64,
209
+ pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
210
+ pl.Float32, pl.Float64]:
211
+ return 'numeric'
212
+
213
+ # String types
214
+ if dtype in [pl.Utf8, pl.Categorical]:
215
+ return 'string'
216
+
217
+ # Datetime types
218
+ if dtype in [pl.Date, pl.Time, pl.Duration]:
219
+ return 'datetime'
220
+
221
+ # Check for Datetime with timezone info
222
+ if isinstance(dtype, pl.Datetime):
223
+ return 'datetime'
224
+
225
+ # Boolean type
226
+ if dtype == pl.Boolean:
227
+ return 'boolean'
228
+
229
+ # Other types
230
+ return 'other'
231
+
232
+
233
+ def filter_columns_by_type(df: pl.DataFrame, columns: List[str], dtype: str) -> List[str]:
234
+ """
235
+ Filter columns by data type.
236
+
237
+ Args:
238
+ df: DataFrame to filter columns from
239
+ columns: List of columns to filter
240
+ dtype: Type to filter by ('numeric', 'string', 'datetime', 'boolean')
241
+
242
+ Returns:
243
+ List of columns matching the type
244
+
245
+ Example:
246
+ numeric_cols = filter_columns_by_type(df, all_cols, 'numeric')
247
+ """
248
+ filtered = []
249
+
250
+ for col in columns:
251
+ col_type = get_column_type(df, col)
252
+ if col_type == dtype:
253
+ filtered.append(col)
254
+
255
+ return filtered