additory 0.1.0a4__py3-none-any.whl → 0.1.1a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121) hide show
  1. additory/__init__.py +58 -14
  2. additory/common/__init__.py +31 -147
  3. additory/common/column_selector.py +255 -0
  4. additory/common/distributions.py +286 -613
  5. additory/common/extractors.py +313 -0
  6. additory/common/knn_imputation.py +332 -0
  7. additory/common/result.py +380 -0
  8. additory/common/strategy_parser.py +243 -0
  9. additory/common/unit_conversions.py +338 -0
  10. additory/common/validation.py +283 -103
  11. additory/core/__init__.py +34 -22
  12. additory/core/backend.py +258 -0
  13. additory/core/config.py +177 -305
  14. additory/core/logging.py +230 -24
  15. additory/core/memory_manager.py +157 -495
  16. additory/expressions/__init__.py +2 -23
  17. additory/expressions/compiler.py +457 -0
  18. additory/expressions/engine.py +264 -487
  19. additory/expressions/integrity.py +179 -0
  20. additory/expressions/loader.py +263 -0
  21. additory/expressions/parser.py +363 -167
  22. additory/expressions/resolver.py +274 -0
  23. additory/functions/__init__.py +1 -0
  24. additory/functions/analyze/__init__.py +144 -0
  25. additory/functions/analyze/cardinality.py +58 -0
  26. additory/functions/analyze/correlations.py +66 -0
  27. additory/functions/analyze/distributions.py +53 -0
  28. additory/functions/analyze/duplicates.py +49 -0
  29. additory/functions/analyze/features.py +61 -0
  30. additory/functions/analyze/imputation.py +66 -0
  31. additory/functions/analyze/outliers.py +65 -0
  32. additory/functions/analyze/patterns.py +65 -0
  33. additory/functions/analyze/presets.py +72 -0
  34. additory/functions/analyze/quality.py +59 -0
  35. additory/functions/analyze/timeseries.py +53 -0
  36. additory/functions/analyze/types.py +45 -0
  37. additory/functions/expressions/__init__.py +161 -0
  38. additory/functions/snapshot/__init__.py +82 -0
  39. additory/functions/snapshot/filter.py +119 -0
  40. additory/functions/synthetic/__init__.py +113 -0
  41. additory/functions/synthetic/mode_detector.py +47 -0
  42. additory/functions/synthetic/strategies/__init__.py +1 -0
  43. additory/functions/synthetic/strategies/advanced.py +35 -0
  44. additory/functions/synthetic/strategies/augmentative.py +160 -0
  45. additory/functions/synthetic/strategies/generative.py +168 -0
  46. additory/functions/synthetic/strategies/presets.py +116 -0
  47. additory/functions/to/__init__.py +188 -0
  48. additory/functions/to/lookup.py +351 -0
  49. additory/functions/to/merge.py +189 -0
  50. additory/functions/to/sort.py +91 -0
  51. additory/functions/to/summarize.py +170 -0
  52. additory/functions/transform/__init__.py +140 -0
  53. additory/functions/transform/datetime.py +79 -0
  54. additory/functions/transform/extract.py +85 -0
  55. additory/functions/transform/harmonize.py +105 -0
  56. additory/functions/transform/knn.py +62 -0
  57. additory/functions/transform/onehotencoding.py +68 -0
  58. additory/functions/transform/transpose.py +42 -0
  59. additory-0.1.1a1.dist-info/METADATA +83 -0
  60. additory-0.1.1a1.dist-info/RECORD +62 -0
  61. additory/analysis/__init__.py +0 -48
  62. additory/analysis/cardinality.py +0 -126
  63. additory/analysis/correlations.py +0 -124
  64. additory/analysis/distributions.py +0 -376
  65. additory/analysis/quality.py +0 -158
  66. additory/analysis/scan.py +0 -400
  67. additory/common/backend.py +0 -371
  68. additory/common/column_utils.py +0 -191
  69. additory/common/exceptions.py +0 -62
  70. additory/common/lists.py +0 -229
  71. additory/common/patterns.py +0 -240
  72. additory/common/resolver.py +0 -567
  73. additory/common/sample_data.py +0 -182
  74. additory/core/ast_builder.py +0 -165
  75. additory/core/backends/__init__.py +0 -23
  76. additory/core/backends/arrow_bridge.py +0 -483
  77. additory/core/backends/cudf_bridge.py +0 -355
  78. additory/core/column_positioning.py +0 -358
  79. additory/core/compiler_polars.py +0 -166
  80. additory/core/enhanced_cache_manager.py +0 -1119
  81. additory/core/enhanced_matchers.py +0 -473
  82. additory/core/enhanced_version_manager.py +0 -325
  83. additory/core/executor.py +0 -59
  84. additory/core/integrity_manager.py +0 -477
  85. additory/core/loader.py +0 -190
  86. additory/core/namespace_manager.py +0 -657
  87. additory/core/parser.py +0 -176
  88. additory/core/polars_expression_engine.py +0 -601
  89. additory/core/registry.py +0 -177
  90. additory/core/sample_data_manager.py +0 -492
  91. additory/core/user_namespace.py +0 -751
  92. additory/core/validator.py +0 -27
  93. additory/dynamic_api.py +0 -352
  94. additory/expressions/proxy.py +0 -549
  95. additory/expressions/registry.py +0 -313
  96. additory/expressions/samples.py +0 -492
  97. additory/synthetic/__init__.py +0 -13
  98. additory/synthetic/column_name_resolver.py +0 -149
  99. additory/synthetic/deduce.py +0 -259
  100. additory/synthetic/distributions.py +0 -22
  101. additory/synthetic/forecast.py +0 -1132
  102. additory/synthetic/linked_list_parser.py +0 -415
  103. additory/synthetic/namespace_lookup.py +0 -129
  104. additory/synthetic/smote.py +0 -320
  105. additory/synthetic/strategies.py +0 -926
  106. additory/synthetic/synthesizer.py +0 -713
  107. additory/utilities/__init__.py +0 -53
  108. additory/utilities/encoding.py +0 -600
  109. additory/utilities/games.py +0 -300
  110. additory/utilities/keys.py +0 -8
  111. additory/utilities/lookup.py +0 -103
  112. additory/utilities/matchers.py +0 -216
  113. additory/utilities/resolvers.py +0 -286
  114. additory/utilities/settings.py +0 -167
  115. additory/utilities/units.py +0 -749
  116. additory/utilities/validators.py +0 -153
  117. additory-0.1.0a4.dist-info/METADATA +0 -311
  118. additory-0.1.0a4.dist-info/RECORD +0 -72
  119. additory-0.1.0a4.dist-info/licenses/LICENSE +0 -21
  120. {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
  121. {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,179 @@
1
+ """
2
+ Expression integrity verification for Additory.
3
+
4
+ Verifies SHA-256 hashes of expressions to detect tampering or corruption.
5
+ """
6
+
7
+ import hashlib
8
+ import re
9
+ from typing import Dict
10
+ from pathlib import Path
11
+
12
+
13
def verify_sha(expression: str, expected_sha: str) -> bool:
    """
    Check an expression against its recorded SHA-256 digest.

    The digest of the expression is obtained from compute_sha() and compared
    with expected_sha. Letter case and surrounding whitespace in
    expected_sha are ignored: a hex digest denotes the same value in either
    case, so an upper-case or padded stored hash must not be reported as
    tampering.

    Args:
        expression: Expression string
        expected_sha: Expected SHA hash (hex string)

    Returns:
        True if matches, False if mismatch (including when expected_sha is
        not a string)

    Example:
        is_valid = verify_sha('weight / (height ** 2)', 'a1b2c3d4e5f6...')
    """
    # A non-string can never equal a digest; answer False rather than raise.
    if not isinstance(expected_sha, str):
        return False

    actual_sha = compute_sha(expression)

    # hexdigest() output is lower-case, so fold the stored value to match.
    return actual_sha == expected_sha.strip().lower()
35
+
36
+
37
def compute_sha(expression: str) -> str:
    """
    Return the SHA-256 hex digest of an expression.

    The expression is first passed through normalize_expression() and the
    result is hashed as UTF-8 bytes.

    Args:
        expression: Expression string

    Returns:
        SHA-256 hash as hex string

    Example:
        sha = compute_sha('weight / (height ** 2)')
        # Returns: 'a1b2c3d4e5f6...'
    """
    canonical_bytes = normalize_expression(expression).encode('utf-8')

    digest = hashlib.sha256()
    digest.update(canonical_bytes)
    return digest.hexdigest()
58
+
59
+
60
def normalize_expression(expression: str) -> str:
    """
    Normalize expression for consistent hashing.

    Leading and trailing whitespace is removed, and every internal run of
    whitespace (spaces, tabs, newlines) is collapsed to a single space.
    Whitespace is only collapsed, never deleted, so padding next to a
    bracket or operator survives as one space.

    Args:
        expression: Expression string

    Returns:
        Normalized expression string

    Example:
        normalized = normalize_expression('  weight  /  ( height ** 2 )  ')
        # Returns: 'weight / ( height ** 2 )'
    """
    # Strip the ends first so the substitution cannot leave an edge space.
    return re.sub(r'\s+', ' ', expression.strip())
81
+
82
+
83
def generate_sha_for_file(file_path: str) -> Dict[str, str]:
    """
    Build a name -> SHA mapping for every expression in one file.

    The file is read with load_expressions_from_file() using the
    placeholder namespace 'temp'. Each entry's expression text is then
    hashed with compute_sha().

    Args:
        file_path: Path to .add file

    Returns:
        Dictionary mapping expression name to SHA hash

    Example:
        hashes = generate_sha_for_file('core.add')
        # Returns: {'bmi': 'a1b2c3...', 'bsa': 'f6e5d4...'}
    """
    from additory.expressions.loader import load_expressions_from_file

    def _expression_text(entry):
        # Dictionary entries carry the text under 'expression'; anything
        # else is hashed as-is.
        if isinstance(entry, dict):
            return entry.get('expression', '')
        return entry

    loaded = load_expressions_from_file(file_path, 'temp')

    return {
        expr_name: compute_sha(_expression_text(entry))
        for expr_name, entry in loaded.items()
    }
114
+
115
+
116
def verify_all_expressions(expressions: Dict[str, Dict]) -> Dict[str, bool]:
    """
    Run the integrity check over a set of expression definitions.

    Entries with no 'sha' value (missing or empty) are reported as True
    without being hashed. Every failing entry is also reported through
    log_integrity_warning().

    Args:
        expressions: Dictionary of expression definitions
            Each value should have 'expression' and 'sha' keys

    Returns:
        Dictionary mapping expression name to verification result

    Example:
        results = verify_all_expressions(loaded_expressions)
        # Returns: {'bmi': True, 'bsa': True, 'custom': False}
    """
    outcome = {}

    for expr_name, entry in expressions.items():
        text = entry.get('expression', '')
        recorded_sha = entry.get('sha', '')

        if recorded_sha:
            passed = verify_sha(text, recorded_sha)
            outcome[expr_name] = passed

            if not passed:
                # Recompute the digest only to show it in the warning.
                computed_sha = compute_sha(text)
                owner = entry.get('namespace', 'unknown')
                log_integrity_warning(expr_name, owner, recorded_sha, computed_sha)
        else:
            # Nothing to compare against: treated as passing.
            outcome[expr_name] = True

    return outcome
154
+
155
+
156
def log_integrity_warning(
    expression_name: str,
    namespace: str,
    expected_sha: str,
    actual_sha: str
):
    """
    Emit a warning describing a failed integrity check.

    A new Logger from additory.core.logging is created on each call and a
    single multi-line warning is sent to it.

    Args:
        expression_name: Name of expression
        namespace: Namespace name
        expected_sha: Expected SHA hash
        actual_sha: Actual SHA hash
    """
    from additory.core.logging import Logger

    log = Logger()
    message = (
        f"Expression '{expression_name}' in namespace '{namespace}' failed integrity check\n"
        f" Expected SHA: {expected_sha}\n"
        f" Actual SHA: {actual_sha}\n"
        f" This expression may have been modified. Use in development mode only."
    )
    log.warning(message)
@@ -0,0 +1,263 @@
1
+ """
2
+ Expression loader for Additory.
3
+
4
+ Loads expressions from .add files (TOML format) in specified folders.
5
+ """
6
+
7
+ import sys
8
+ from pathlib import Path
9
+ from typing import Dict, List
10
+
11
+ # Use tomllib for Python 3.11+ or tomli for earlier versions
12
+ if sys.version_info >= (3, 11):
13
+ import tomllib
14
+ else:
15
+ try:
16
+ import tomli as tomllib
17
+ except ImportError:
18
+ raise ImportError(
19
+ "tomli is required for Python < 3.11. Install with: pip install tomli"
20
+ )
21
+
22
+
23
def load_expressions_from_folder(folder_path: str, namespace: str) -> Dict[str, Dict]:
    """
    Collect the expressions defined by every .add file in one folder.

    Files are taken in the order returned by find_add_files(). Each file is
    loaded with load_expressions_from_file() and checked against what has
    been collected so far by check_duplicate_names() before being merged.

    Args:
        folder_path: Path to folder containing .add files
        namespace: Namespace name for these expressions

    Returns:
        Dictionary mapping expression name to definition
        (empty when the folder holds no .add files)

    Raises:
        FileNotFoundError: If folder_path does not exist
        ValueError: If folder_path is not a directory

    Example:
        expressions = load_expressions_from_folder('/path/to/inbuilt', 'inbuilt')
        # Returns: {'bmi': {...}, 'bsa': {...}, ...}
    """
    root = Path(folder_path)

    if not root.exists():
        raise FileNotFoundError(f"Folder '{folder_path}' not found")
    if not root.is_dir():
        raise ValueError(f"'{folder_path}' is not a directory")

    combined = {}

    # An empty file list simply skips the loop and yields {}.
    for add_file in find_add_files(folder_path):
        loaded = load_expressions_from_file(add_file, namespace)
        check_duplicate_names(combined, loaded, add_file)
        combined.update(loaded)

    return combined
65
+
66
+
67
def load_expressions_from_file(file_path: str, namespace: str) -> Dict[str, Dict]:
    """
    Load expressions from a single .add file.

    Each top-level TOML entry is validated with
    validate_expression_definition() and copied into a definition that also
    carries its name, the given namespace and the source file name.

    Args:
        file_path: Path to .add file
        namespace: Namespace name

    Returns:
        Dictionary mapping expression name to definition

    Raises:
        ValueError: If the file cannot be read or parsed (the underlying
            error is chained as __cause__), or if a definition is invalid

    Example:
        expressions = load_expressions_from_file('/path/to/core.add', 'inbuilt')
        # Returns: {'bmi': {...}, 'bsa': {...}}
    """
    file_name = Path(file_path).name

    try:
        toml_data = parse_toml_file(file_path)
    except Exception as e:
        # Chain explicitly so the original read/parse error stays attached.
        raise ValueError(f"File '{file_name}' is not valid TOML: {str(e)}") from e

    expressions = {}

    for name, definition in toml_data.items():
        validate_expression_definition(name, definition, file_path)

        expression_def = {
            'name': name,
            'expression': definition['expression'],
            'description': definition['description'],
            'sha': definition['sha'],
            'namespace': namespace,
            'source_file': file_name,
        }

        # Optional fields are copied only when the file supplies them.
        for optional_field in ('author', 'version', 'tags'):
            if optional_field in definition:
                expression_def[optional_field] = definition[optional_field]

        expressions[name] = expression_def

    return expressions
117
+
118
+
119
+ def parse_toml_file(file_path: str) -> Dict:
120
+ """
121
+ Parse TOML file.
122
+
123
+ Args:
124
+ file_path: Path to TOML file
125
+
126
+ Returns:
127
+ Parsed TOML as dictionary
128
+ """
129
+ path = Path(file_path)
130
+
131
+ if not path.exists():
132
+ raise FileNotFoundError(f"File '{file_path}' not found")
133
+
134
+ # Read and parse TOML
135
+ with open(path, 'rb') as f:
136
+ return tomllib.load(f)
137
+
138
+
139
def validate_expression_definition(name: str, definition: Dict, file_path: str) -> bool:
    """
    Check that one expression definition is well formed.

    The definition must be a dictionary whose 'expression', 'description'
    and 'sha' entries are non-blank strings. If present, 'author' and
    'version' must be strings and 'tags' must be a list of strings.

    Args:
        name: Expression name
        definition: Expression definition dictionary
        file_path: Source file path (for error messages)

    Returns:
        True if valid

    Raises:
        ValueError: If validation fails
    """
    file_name = Path(file_path).name

    def problem(detail):
        # Every message shares the same "Expression ... in ..." lead-in.
        return ValueError(f"Expression '{name}' in '{file_name}' {detail}")

    if not isinstance(definition, dict):
        raise problem("must be a table/dictionary")

    # Mandatory entries: present, string-typed, and not blank.
    for field in ('expression', 'description', 'sha'):
        if field not in definition:
            raise problem(f"missing required field '{field}'")

        value = definition[field]
        if not isinstance(value, str):
            raise problem(f"field '{field}' must be a string")
        if not value.strip():
            raise problem(f"field '{field}' cannot be empty")

    # Optional scalar entries only need to be strings when supplied.
    for optional_field in ('author', 'version'):
        if optional_field in definition and not isinstance(definition[optional_field], str):
            raise problem(f"field '{optional_field}' must be a string")

    if 'tags' in definition:
        tags = definition['tags']
        if not isinstance(tags, list):
            raise problem("field 'tags' must be a list")
        if any(not isinstance(tag, str) for tag in tags):
            raise problem("field 'tags' must contain only strings")

    return True
205
+
206
+
207
def find_add_files(folder_path: str) -> List[str]:
    """
    List the .add files that sit directly inside a folder.

    Sub-folders are not searched.

    Args:
        folder_path: Path to folder

    Returns:
        List of .add file paths (sorted alphabetically)
    """
    # Sorting happens on the Path objects, before conversion to strings.
    ordered = sorted(Path(folder_path).glob('*.add'))
    return [str(entry) for entry in ordered]
227
+
228
+
229
def check_duplicate_names(
    expressions_dict: Dict[str, Dict],
    new_expressions: Dict[str, Dict],
    source_file: str
) -> bool:
    """
    Reject new expressions whose names are already taken.

    Every clash is collected first, so a single error lists all of them
    together with the file each side came from.

    Args:
        expressions_dict: Existing expressions
        new_expressions: New expressions to add
        source_file: Source file of new expressions

    Returns:
        True if no duplicates

    Raises:
        ValueError: If duplicates found
    """
    clashes = [
        (name, expressions_dict[name]['source_file'], Path(source_file).name)
        for name in new_expressions
        if name in expressions_dict
    ]

    if not clashes:
        return True

    report = ["Duplicate expression names found:"]
    report.extend(
        f" - '{name}' in '{existing_file}' and '{new_file}'"
        for name, existing_file, new_file in clashes
    )
    report.append("Please rename one of them.")
    raise ValueError('\n'.join(report))