additory 0.1.0a4__py3-none-any.whl → 0.1.1a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121):
  1. additory/__init__.py +58 -14
  2. additory/common/__init__.py +31 -147
  3. additory/common/column_selector.py +255 -0
  4. additory/common/distributions.py +286 -613
  5. additory/common/extractors.py +313 -0
  6. additory/common/knn_imputation.py +332 -0
  7. additory/common/result.py +380 -0
  8. additory/common/strategy_parser.py +243 -0
  9. additory/common/unit_conversions.py +338 -0
  10. additory/common/validation.py +283 -103
  11. additory/core/__init__.py +34 -22
  12. additory/core/backend.py +258 -0
  13. additory/core/config.py +177 -305
  14. additory/core/logging.py +230 -24
  15. additory/core/memory_manager.py +157 -495
  16. additory/expressions/__init__.py +2 -23
  17. additory/expressions/compiler.py +457 -0
  18. additory/expressions/engine.py +264 -487
  19. additory/expressions/integrity.py +179 -0
  20. additory/expressions/loader.py +263 -0
  21. additory/expressions/parser.py +363 -167
  22. additory/expressions/resolver.py +274 -0
  23. additory/functions/__init__.py +1 -0
  24. additory/functions/analyze/__init__.py +144 -0
  25. additory/functions/analyze/cardinality.py +58 -0
  26. additory/functions/analyze/correlations.py +66 -0
  27. additory/functions/analyze/distributions.py +53 -0
  28. additory/functions/analyze/duplicates.py +49 -0
  29. additory/functions/analyze/features.py +61 -0
  30. additory/functions/analyze/imputation.py +66 -0
  31. additory/functions/analyze/outliers.py +65 -0
  32. additory/functions/analyze/patterns.py +65 -0
  33. additory/functions/analyze/presets.py +72 -0
  34. additory/functions/analyze/quality.py +59 -0
  35. additory/functions/analyze/timeseries.py +53 -0
  36. additory/functions/analyze/types.py +45 -0
  37. additory/functions/expressions/__init__.py +161 -0
  38. additory/functions/snapshot/__init__.py +82 -0
  39. additory/functions/snapshot/filter.py +119 -0
  40. additory/functions/synthetic/__init__.py +113 -0
  41. additory/functions/synthetic/mode_detector.py +47 -0
  42. additory/functions/synthetic/strategies/__init__.py +1 -0
  43. additory/functions/synthetic/strategies/advanced.py +35 -0
  44. additory/functions/synthetic/strategies/augmentative.py +160 -0
  45. additory/functions/synthetic/strategies/generative.py +168 -0
  46. additory/functions/synthetic/strategies/presets.py +116 -0
  47. additory/functions/to/__init__.py +188 -0
  48. additory/functions/to/lookup.py +351 -0
  49. additory/functions/to/merge.py +189 -0
  50. additory/functions/to/sort.py +91 -0
  51. additory/functions/to/summarize.py +170 -0
  52. additory/functions/transform/__init__.py +140 -0
  53. additory/functions/transform/datetime.py +79 -0
  54. additory/functions/transform/extract.py +85 -0
  55. additory/functions/transform/harmonize.py +105 -0
  56. additory/functions/transform/knn.py +62 -0
  57. additory/functions/transform/onehotencoding.py +68 -0
  58. additory/functions/transform/transpose.py +42 -0
  59. additory-0.1.1a1.dist-info/METADATA +83 -0
  60. additory-0.1.1a1.dist-info/RECORD +62 -0
  61. additory/analysis/__init__.py +0 -48
  62. additory/analysis/cardinality.py +0 -126
  63. additory/analysis/correlations.py +0 -124
  64. additory/analysis/distributions.py +0 -376
  65. additory/analysis/quality.py +0 -158
  66. additory/analysis/scan.py +0 -400
  67. additory/common/backend.py +0 -371
  68. additory/common/column_utils.py +0 -191
  69. additory/common/exceptions.py +0 -62
  70. additory/common/lists.py +0 -229
  71. additory/common/patterns.py +0 -240
  72. additory/common/resolver.py +0 -567
  73. additory/common/sample_data.py +0 -182
  74. additory/core/ast_builder.py +0 -165
  75. additory/core/backends/__init__.py +0 -23
  76. additory/core/backends/arrow_bridge.py +0 -483
  77. additory/core/backends/cudf_bridge.py +0 -355
  78. additory/core/column_positioning.py +0 -358
  79. additory/core/compiler_polars.py +0 -166
  80. additory/core/enhanced_cache_manager.py +0 -1119
  81. additory/core/enhanced_matchers.py +0 -473
  82. additory/core/enhanced_version_manager.py +0 -325
  83. additory/core/executor.py +0 -59
  84. additory/core/integrity_manager.py +0 -477
  85. additory/core/loader.py +0 -190
  86. additory/core/namespace_manager.py +0 -657
  87. additory/core/parser.py +0 -176
  88. additory/core/polars_expression_engine.py +0 -601
  89. additory/core/registry.py +0 -177
  90. additory/core/sample_data_manager.py +0 -492
  91. additory/core/user_namespace.py +0 -751
  92. additory/core/validator.py +0 -27
  93. additory/dynamic_api.py +0 -352
  94. additory/expressions/proxy.py +0 -549
  95. additory/expressions/registry.py +0 -313
  96. additory/expressions/samples.py +0 -492
  97. additory/synthetic/__init__.py +0 -13
  98. additory/synthetic/column_name_resolver.py +0 -149
  99. additory/synthetic/deduce.py +0 -259
  100. additory/synthetic/distributions.py +0 -22
  101. additory/synthetic/forecast.py +0 -1132
  102. additory/synthetic/linked_list_parser.py +0 -415
  103. additory/synthetic/namespace_lookup.py +0 -129
  104. additory/synthetic/smote.py +0 -320
  105. additory/synthetic/strategies.py +0 -926
  106. additory/synthetic/synthesizer.py +0 -713
  107. additory/utilities/__init__.py +0 -53
  108. additory/utilities/encoding.py +0 -600
  109. additory/utilities/games.py +0 -300
  110. additory/utilities/keys.py +0 -8
  111. additory/utilities/lookup.py +0 -103
  112. additory/utilities/matchers.py +0 -216
  113. additory/utilities/resolvers.py +0 -286
  114. additory/utilities/settings.py +0 -167
  115. additory/utilities/units.py +0 -749
  116. additory/utilities/validators.py +0 -153
  117. additory-0.1.0a4.dist-info/METADATA +0 -311
  118. additory-0.1.0a4.dist-info/RECORD +0 -72
  119. additory-0.1.0a4.dist-info/licenses/LICENSE +0 -21
  120. {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
  121. {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,168 @@
1
+ """
2
+ Generate synthetic data from scratch.
3
+
4
+ This module provides functionality to create synthetic data using various strategies.
5
+ """
6
+
7
+ import polars as pl
8
+ from typing import Dict, Any
9
+ import numpy as np
10
+
11
+ from additory.common.distributions import (
12
+ generate_normal, generate_uniform, generate_correlated
13
+ )
14
+ from additory.core.logging import Logger
15
+
16
+
17
def generate_data(n_rows: int, strategy: Dict[str, str]) -> pl.DataFrame:
    """
    Generate synthetic data from scratch.

    Arguments are validated before anything is logged or generated, so an
    invalid call fails fast with a ``ValueError``.

    Args:
        n_rows: Number of rows to generate; must be positive.
        strategy: Dictionary mapping column name to generation strategy
            string. Each value is handed to ``generate_column``.

    Returns:
        DataFrame with one column per entry in ``strategy``.

    Raises:
        ValueError: If ``strategy`` is empty (or ``None``), or if ``n_rows``
            is not positive.

    Example:
        >>> result = generate_data(n_rows=1000, strategy={
        ...     'id': 'increment:start=1',
        ...     'age': 'range:18-65',
        ...     'status': 'choice:[Active,Inactive,Pending]'
        ... })
    """
    # Validate first: logging len(strategy) before this check made
    # strategy=None fail with a TypeError instead of the intended error.
    if not strategy:
        raise ValueError("strategy dictionary cannot be empty")

    if n_rows <= 0:
        raise ValueError(f"n_rows must be positive, got {n_rows}")

    logger = Logger()
    logger.info(f"Generating {n_rows} rows with {len(strategy)} columns")

    # Generate each column independently from its own strategy string.
    columns = {}
    for col_name, strategy_value in strategy.items():
        logger.info(f"Generating column: {col_name}")
        columns[col_name] = generate_column(n_rows, strategy_value)

    # Create DataFrame
    result = pl.DataFrame(columns)

    logger.info(f"Generated {len(result)} rows × {len(result.columns)} columns")

    return result
56
+
57
+
58
def generate_column(n_rows: int, strategy_value: str) -> pl.Series:
    """
    Build one synthetic column from a strategy string.

    The string is parsed with ``parse_strategy_value`` and the resulting
    strategy type selects the generator.

    Args:
        n_rows: Number of rows
        strategy_value: Strategy string

    Returns:
        Series with generated data

    Raises:
        ValueError: If the strategy type is not one of the supported ones.
        KeyError: If a required parameter for the chosen strategy was not
            present in the parsed strategy string.

    Supported strategies:
        - increment:start=1:step=1
        - range:18-65
        - choice:[A,B,C]
        - normal:mean=50:std=10
        - uniform:low=0:high=100
    """
    strategy_type, options = parse_strategy_value(strategy_value)

    if strategy_type == 'increment':
        first = options.get('start', 1)
        stride = options.get('step', 1)
        stop = first + n_rows * stride
        return pl.Series(range(first, stop, stride))

    if strategy_type == 'range':
        lower, upper = options['range']
        # randint excludes its upper bound; +1 keeps `upper` reachable.
        return pl.Series(np.random.randint(lower, upper + 1, n_rows))

    if strategy_type == 'choice':
        pool = options['choices']
        probabilities = options.get('weights')
        if not probabilities:
            return pl.Series(np.random.choice(pool, n_rows))
        return pl.Series(np.random.choice(pool, n_rows, p=probabilities))

    if strategy_type == 'normal':
        return generate_normal(n_rows, options['mean'], options['std'])

    if strategy_type == 'uniform':
        return generate_uniform(n_rows, options['low'], options['high'])

    raise ValueError(f"Unsupported strategy type: {strategy_type}")
109
+
110
+
111
def parse_strategy_value(strategy_value: str) -> tuple[str, Dict[str, Any]]:
    """
    Parse strategy string into type and parameters.

    The text before the first ':' is the strategy type. The remainder is
    interpreted per type:

    - increment: ``key=value`` segments, values parsed as ``int``
    - normal / uniform: ``key=value`` segments, values parsed as ``float``
    - range: ``low-high`` with integer bounds; either bound may be negative
    - choice: comma-separated items, optionally wrapped in ``[...]``; a
      bracketed list may contain ':' inside its items

    Any other strategy type is returned with an empty parameter dict.

    Args:
        strategy_value: Strategy string

    Returns:
        Tuple of (strategy_type, parameters)

    Raises:
        ValueError: If a numeric value cannot be converted or a range
            specification is not of the form ``low-high``.

    Examples:
        >>> parse_strategy_value('increment:start=1:step=2')
        ('increment', {'start': 1, 'step': 2})

        >>> parse_strategy_value('range:18-65')
        ('range', {'range': (18, 65)})

        >>> parse_strategy_value('range:-10-10')
        ('range', {'range': (-10, 10)})

        >>> parse_strategy_value('choice:[A,B,C]')
        ('choice', {'choices': ['A', 'B', 'C']})
    """
    parts = strategy_value.split(':')
    strategy_type = parts[0]
    params: Dict[str, Any] = {}

    if strategy_type == 'increment':
        params = _parse_key_values(parts[1:], int)

    elif strategy_type in ('normal', 'uniform'):
        params = _parse_key_values(parts[1:], float)

    elif strategy_type == 'range':
        if len(parts) > 1:
            params['range'] = _parse_int_range(parts[1])

    elif strategy_type == 'choice':
        if len(parts) > 1:
            params['choices'] = _parse_choices(strategy_value, parts[1])

    return strategy_type, params


def _parse_key_values(segments, cast) -> Dict[str, Any]:
    """Convert ``key=value`` segments to a dict, applying ``cast`` to values."""
    parsed: Dict[str, Any] = {}
    for segment in segments:
        # partition keeps any further '=' inside the value, so a malformed
        # value fails in cast() with a ValueError naming the bad text.
        key, separator, raw_value = segment.partition('=')
        if separator:
            parsed[key.strip()] = cast(raw_value)
    return parsed


def _parse_int_range(range_str: str) -> tuple:
    """Parse ``low-high`` into an ``(int, int)`` tuple; bounds may be negative."""
    spec = range_str.strip()
    # Search from index 1 so a leading '-' is read as the sign of the lower
    # bound rather than as the separator.
    separator = spec.find('-', 1)
    if separator == -1:
        raise ValueError(
            f"Invalid range specification '{range_str}': expected 'low-high'"
        )
    return int(spec[:separator]), int(spec[separator + 1:])


def _parse_choices(strategy_value: str, first_segment: str) -> list:
    """Extract the list of choices, tolerating ':' inside a bracketed list."""
    segment = first_segment
    if segment.lstrip().startswith('[') and ']' not in segment:
        # Splitting on ':' cut the bracketed list short (an item contains
        # ':'); recover everything up to the first closing bracket.
        remainder = strategy_value.partition(':')[2]
        closing = remainder.find(']')
        if closing != -1:
            segment = remainder[:closing + 1]
    body = segment.strip().strip('[]')
    return [item.strip() for item in body.split(',')]
@@ -0,0 +1,116 @@
1
+ """
2
+ Preset configurations for common synthetic data types.
3
+
4
+ This module provides preset configurations for generating common data types.
5
+ """
6
+
7
+ import polars as pl
8
+ from typing import Dict, List
9
+
10
+ from additory.functions.synthetic.strategies.generative import generate_data
11
+ from additory.core.logging import Logger
12
+
13
+
14
# Built-in presets: preset name -> {column name: strategy string}.
# The strategy strings are consumed by generate_data().
PRESETS: Dict[str, Dict[str, str]] = {
    # User accounts
    'users': {
        'id': 'increment:start=1',
        'age': 'range:18-75',
        'status': 'choice:[Active,Inactive,Pending]',
        'country': 'choice:[USA,UK,Canada,Australia,Germany]',
    },
    # Payment transactions
    'transactions': {
        'transaction_id': 'increment:start=1000',
        'amount': 'uniform:low=10:high=1000',
        'currency': 'choice:[USD,EUR,GBP]',
        'status': 'choice:[completed,pending,failed]',
    },
    # Product catalogue
    'products': {
        'product_id': 'increment:start=1',
        'price': 'uniform:low=5:high=500',
        'stock': 'range:0-1000',
        'category': 'choice:[Electronics,Clothing,Food,Books]',
    },
    # Single normally distributed value column
    'timeseries': {
        'value': 'normal:mean=100:std=20',
    },
    # Patient records
    'medical': {
        'patient_id': 'increment:start=1',
        'age': 'range:18-90',
        'weight': 'normal:mean=70:std=15',
        'height': 'normal:mean=170:std=10',
    },
}
44
+
45
+
46
def apply_preset(preset_name: str, n_rows: int) -> pl.DataFrame:
    """
    Generate a DataFrame from one of the named presets.

    Looks the preset up in ``PRESETS`` and passes its strategy dictionary to
    ``generate_data``.

    Args:
        preset_name: Name of preset
        n_rows: Number of rows to generate

    Returns:
        DataFrame with synthetic data

    Raises:
        ValueError: If preset not found; the message lists the available
            preset names.

    Example:
        >>> result = apply_preset('users', n_rows=1000)
    """
    logger = Logger()

    is_known = preset_name in PRESETS
    if not is_known:
        available = list_presets()
        raise ValueError(
            f"Preset '{preset_name}' not found. "
            f"Available presets: {', '.join(available)}"
        )

    logger.info(f"Applying preset: {preset_name}")

    # Resolve the preset to its strategy dictionary, then generate from it.
    result = generate_data(n_rows, get_preset_strategy(preset_name))

    logger.info(f"Preset '{preset_name}' applied: {len(result)} rows")
    return result
83
+
84
+
85
def get_preset_strategy(preset_name: str) -> Dict[str, str]:
    """
    Return the strategy dictionary registered for a preset.

    A copy is returned, so callers can modify it without altering
    ``PRESETS``.

    Args:
        preset_name: Name of preset

    Returns:
        Strategy dictionary

    Raises:
        ValueError: If preset not found
    """
    preset = PRESETS.get(preset_name)
    if preset is None:
        raise ValueError(f"Preset '{preset_name}' not found")

    return dict(preset)
102
+
103
+
104
def list_presets() -> List[str]:
    """
    Return the names of all presets defined in ``PRESETS``.

    Returns:
        List of preset names, in definition order

    Example:
        >>> presets = list_presets()
        >>> print(presets)
        ['users', 'transactions', 'products', 'timeseries', 'medical']
    """
    # Iterating a dict yields its keys in insertion order.
    return list(PRESETS)
@@ -0,0 +1,188 @@
1
+ """
2
+ Main to function - add columns to DataFrame.
3
+
4
+ This module provides the main user-facing to() function with multiple modes:
5
+ - lookup: Add columns from reference DataFrame
6
+ - summarize: Group and aggregate data
7
+ - merge: Merge multiple DataFrames
8
+ - sort: Sort DataFrame
9
+ """
10
+
11
+ import polars as pl
12
+ from typing import Union, List, Optional, Dict, Any
13
+
14
+ from additory.core.backend import detect_backend, to_polars, from_polars
15
+ from additory.core.logging import Logger
16
+ from additory.core.memory_manager import MemoryManager
17
+ from additory.common.validation import validate_dataframe, validate_not_empty
18
+ from additory.common.result import wrap_result
19
+
20
+ from additory.functions.to.lookup import perform_lookup
21
+ from additory.functions.to.summarize import perform_summarize
22
+ from additory.functions.to.merge import perform_merge
23
+ from additory.functions.to.sort import perform_sort
24
+
25
+
26
def to(
    df: Any,
    from_df: Optional[Any] = None,
    bring: Optional[Union[str, List[str]]] = None,
    against: Optional[Union[str, List[str]]] = None,
    on: Optional[Union[str, List[str]]] = None,  # Alias for 'against'
    to: Optional[str] = None,
    bring_at: Optional[str] = None,
    strategy: Optional[Dict] = None,
    **kwargs
) -> Any:
    """
    Add columns to DataFrame using various modes.

    The input is converted to Polars, the selected mode is executed, and the
    result is converted back to the backend detected from ``df`` (from the
    first element of the list in merge mode) before being wrapped.

    Args:
        df: Input DataFrame (or list of DataFrames for merge mode)
        from_df: Reference DataFrame (for lookup mode)
        bring: Column(s) to bring (for lookup mode)
        against: Key column(s) for matching (for lookup mode)
        on: Alias for ``against``; only one of the two may be given
        to: Special mode indicator ('@summarize', '@merge', '@sort')
        bring_at: Position to insert columns
        strategy: Strategy dictionary for advanced control
        **kwargs: Mode-specific parameters, forwarded to the summarize,
            merge and sort implementations

    Returns:
        DataFrame with added columns (wrapped in Result)

    Raises:
        ValueError: If both ``on`` and ``against`` are given, if the mode
            cannot be determined, or if merge mode receives an empty list.
        TypeError: If merge mode is requested and ``df`` is not a list.

    Modes:
        1. Lookup (default): Add columns from reference DataFrame
           - Triggered by: from_df is provided
           - Example: to(df, from_df=products, bring='price', against='product_id')

        2. Summarize: Group and aggregate data
           - Triggered by: to='@summarize'
           - Example: to(df, to='@summarize', group_by='category', aggregations={'sales': 'sum'})

        3. Merge: Merge multiple DataFrames
           - Triggered by: to='@merge'
           - Example: to([df1, df2, df3], to='@merge')

        4. Sort: Sort DataFrame
           - Triggered by: to='@sort'
           - Example: to(df, to='@sort', by='date', descending=True)
    """
    logger = Logger()
    memory_manager = MemoryManager()

    # Handle 'on' as alias for 'against'
    if on is not None and against is None:
        against = on
    elif on is not None and against is not None:
        raise ValueError("Cannot specify both 'on' and 'against' parameters")

    try:
        # Detect mode
        mode = detect_mode(df, from_df, to, **kwargs)
        logger.set_context('to', {'mode': mode})
        logger.info(f"Starting to() function in '{mode}' mode")

        # Handle merge mode specially (df is a list)
        if mode == 'merge':
            # Validate list of DataFrames
            if not isinstance(df, list):
                raise TypeError("For merge mode, df must be a list of DataFrames")
            # Guard the df[0] access below: an empty list would otherwise
            # surface as a bare IndexError with no hint about the cause.
            if not df:
                raise ValueError(
                    "For merge mode, df must contain at least one DataFrame"
                )

            # Detect backend from first DataFrame
            backend = detect_backend(df[0])

            # Convert all to Polars
            polars_dfs = [to_polars(d) for d in df]

            # Perform merge
            result = perform_merge(polars_dfs, **kwargs)

        else:
            # Single DataFrame modes
            # Validate input
            validate_dataframe(df)
            validate_not_empty(df)

            # Detect backend and convert to Polars
            backend = detect_backend(df)
            polars_df = to_polars(df)

            # Dispatch to appropriate mode
            if mode == 'summarize':
                result = perform_summarize(polars_df, **kwargs)
            elif mode == 'sort':
                result = perform_sort(polars_df, **kwargs)
            elif mode == 'lookup':
                # Convert from_df to Polars
                polars_from_df = to_polars(from_df)
                result = perform_lookup(
                    polars_df,
                    polars_from_df,
                    bring,
                    against,
                    bring_at,
                    strategy
                )
            else:
                raise ValueError(f"Unknown mode: {mode}")

        # Convert back to original backend
        result = from_polars(result, backend)

        # Cleanup
        # NOTE(review): cleanup is skipped when anything above raises;
        # confirm whether it should run in a `finally` block instead.
        memory_manager.cleanup()

        # Wrap result
        logger.info(f"to() function complete: {len(result)} rows, {len(result.columns)} columns")
        return wrap_result(result, 'to', metadata={'mode': mode})

    except Exception as e:
        # Log at the API boundary, then let the caller see the original error.
        logger.error(f"Error in to() function: {str(e)}", error_location="to")
        raise
142
+
143
+
144
def detect_mode(
    df: Any,
    from_df: Optional[Any],
    to: Optional[str],
    **kwargs
) -> str:
    """
    Work out which mode a ``to()`` call is asking for.

    An explicit ``to`` marker is checked first; otherwise a supplied
    ``from_df`` selects lookup mode. ``df`` and ``**kwargs`` are accepted
    but not inspected.

    Args:
        df: Input DataFrame
        from_df: Reference DataFrame
        to: Mode indicator
        **kwargs: Additional parameters

    Returns:
        Mode string ('lookup', 'summarize', 'merge', 'sort')

    Raises:
        ValueError: If mode cannot be determined
    """
    # Explicit markers take precedence over from_df.
    explicit_modes = (
        ('@summarize', 'summarize'),
        ('@merge', 'merge'),
        ('@sort', 'sort'),
    )
    for marker, mode_name in explicit_modes:
        if to == marker:
            return mode_name

    # No marker matched: a reference DataFrame means lookup.
    if from_df is not None:
        return 'lookup'

    raise ValueError(
        "Cannot determine mode. Please provide either:\n"
        "  - from_df (for lookup mode)\n"
        "  - to='@summarize' (for summarize mode)\n"
        "  - to='@merge' (for merge mode)\n"
        "  - to='@sort' (for sort mode)"
    )
186
+
187
+
188
+ __all__ = ['to']