additory 0.1.0a4__py3-none-any.whl → 0.1.1a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121) hide show
  1. additory/__init__.py +58 -14
  2. additory/common/__init__.py +31 -147
  3. additory/common/column_selector.py +255 -0
  4. additory/common/distributions.py +286 -613
  5. additory/common/extractors.py +313 -0
  6. additory/common/knn_imputation.py +332 -0
  7. additory/common/result.py +380 -0
  8. additory/common/strategy_parser.py +243 -0
  9. additory/common/unit_conversions.py +338 -0
  10. additory/common/validation.py +283 -103
  11. additory/core/__init__.py +34 -22
  12. additory/core/backend.py +258 -0
  13. additory/core/config.py +177 -305
  14. additory/core/logging.py +230 -24
  15. additory/core/memory_manager.py +157 -495
  16. additory/expressions/__init__.py +2 -23
  17. additory/expressions/compiler.py +457 -0
  18. additory/expressions/engine.py +264 -487
  19. additory/expressions/integrity.py +179 -0
  20. additory/expressions/loader.py +263 -0
  21. additory/expressions/parser.py +363 -167
  22. additory/expressions/resolver.py +274 -0
  23. additory/functions/__init__.py +1 -0
  24. additory/functions/analyze/__init__.py +144 -0
  25. additory/functions/analyze/cardinality.py +58 -0
  26. additory/functions/analyze/correlations.py +66 -0
  27. additory/functions/analyze/distributions.py +53 -0
  28. additory/functions/analyze/duplicates.py +49 -0
  29. additory/functions/analyze/features.py +61 -0
  30. additory/functions/analyze/imputation.py +66 -0
  31. additory/functions/analyze/outliers.py +65 -0
  32. additory/functions/analyze/patterns.py +65 -0
  33. additory/functions/analyze/presets.py +72 -0
  34. additory/functions/analyze/quality.py +59 -0
  35. additory/functions/analyze/timeseries.py +53 -0
  36. additory/functions/analyze/types.py +45 -0
  37. additory/functions/expressions/__init__.py +161 -0
  38. additory/functions/snapshot/__init__.py +82 -0
  39. additory/functions/snapshot/filter.py +119 -0
  40. additory/functions/synthetic/__init__.py +113 -0
  41. additory/functions/synthetic/mode_detector.py +47 -0
  42. additory/functions/synthetic/strategies/__init__.py +1 -0
  43. additory/functions/synthetic/strategies/advanced.py +35 -0
  44. additory/functions/synthetic/strategies/augmentative.py +160 -0
  45. additory/functions/synthetic/strategies/generative.py +168 -0
  46. additory/functions/synthetic/strategies/presets.py +116 -0
  47. additory/functions/to/__init__.py +188 -0
  48. additory/functions/to/lookup.py +351 -0
  49. additory/functions/to/merge.py +189 -0
  50. additory/functions/to/sort.py +91 -0
  51. additory/functions/to/summarize.py +170 -0
  52. additory/functions/transform/__init__.py +140 -0
  53. additory/functions/transform/datetime.py +79 -0
  54. additory/functions/transform/extract.py +85 -0
  55. additory/functions/transform/harmonize.py +105 -0
  56. additory/functions/transform/knn.py +62 -0
  57. additory/functions/transform/onehotencoding.py +68 -0
  58. additory/functions/transform/transpose.py +42 -0
  59. additory-0.1.1a1.dist-info/METADATA +83 -0
  60. additory-0.1.1a1.dist-info/RECORD +62 -0
  61. additory/analysis/__init__.py +0 -48
  62. additory/analysis/cardinality.py +0 -126
  63. additory/analysis/correlations.py +0 -124
  64. additory/analysis/distributions.py +0 -376
  65. additory/analysis/quality.py +0 -158
  66. additory/analysis/scan.py +0 -400
  67. additory/common/backend.py +0 -371
  68. additory/common/column_utils.py +0 -191
  69. additory/common/exceptions.py +0 -62
  70. additory/common/lists.py +0 -229
  71. additory/common/patterns.py +0 -240
  72. additory/common/resolver.py +0 -567
  73. additory/common/sample_data.py +0 -182
  74. additory/core/ast_builder.py +0 -165
  75. additory/core/backends/__init__.py +0 -23
  76. additory/core/backends/arrow_bridge.py +0 -483
  77. additory/core/backends/cudf_bridge.py +0 -355
  78. additory/core/column_positioning.py +0 -358
  79. additory/core/compiler_polars.py +0 -166
  80. additory/core/enhanced_cache_manager.py +0 -1119
  81. additory/core/enhanced_matchers.py +0 -473
  82. additory/core/enhanced_version_manager.py +0 -325
  83. additory/core/executor.py +0 -59
  84. additory/core/integrity_manager.py +0 -477
  85. additory/core/loader.py +0 -190
  86. additory/core/namespace_manager.py +0 -657
  87. additory/core/parser.py +0 -176
  88. additory/core/polars_expression_engine.py +0 -601
  89. additory/core/registry.py +0 -177
  90. additory/core/sample_data_manager.py +0 -492
  91. additory/core/user_namespace.py +0 -751
  92. additory/core/validator.py +0 -27
  93. additory/dynamic_api.py +0 -352
  94. additory/expressions/proxy.py +0 -549
  95. additory/expressions/registry.py +0 -313
  96. additory/expressions/samples.py +0 -492
  97. additory/synthetic/__init__.py +0 -13
  98. additory/synthetic/column_name_resolver.py +0 -149
  99. additory/synthetic/deduce.py +0 -259
  100. additory/synthetic/distributions.py +0 -22
  101. additory/synthetic/forecast.py +0 -1132
  102. additory/synthetic/linked_list_parser.py +0 -415
  103. additory/synthetic/namespace_lookup.py +0 -129
  104. additory/synthetic/smote.py +0 -320
  105. additory/synthetic/strategies.py +0 -926
  106. additory/synthetic/synthesizer.py +0 -713
  107. additory/utilities/__init__.py +0 -53
  108. additory/utilities/encoding.py +0 -600
  109. additory/utilities/games.py +0 -300
  110. additory/utilities/keys.py +0 -8
  111. additory/utilities/lookup.py +0 -103
  112. additory/utilities/matchers.py +0 -216
  113. additory/utilities/resolvers.py +0 -286
  114. additory/utilities/settings.py +0 -167
  115. additory/utilities/units.py +0 -749
  116. additory/utilities/validators.py +0 -153
  117. additory-0.1.0a4.dist-info/METADATA +0 -311
  118. additory-0.1.0a4.dist-info/RECORD +0 -72
  119. additory-0.1.0a4.dist-info/licenses/LICENSE +0 -21
  120. {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
  121. {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
additory/common/lists.py DELETED
@@ -1,229 +0,0 @@
1
- """
2
- List File Management
3
-
4
- Handles loading and parsing of .list files containing static value lists.
5
-
6
- File Format (.list):
7
- [lists]
8
- first_names = Arjun, Vikram, Samuel, James, Mary
9
- last_names = Sharma, Kumar, Smith, Johnson
10
-
11
- [relationships]
12
- first_names[0] = last_names[0, 1]
13
-
14
- Usage:
15
- from additory.common.lists import load_list_file, get_list_values
16
-
17
- lists = load_list_file("reference/schema_definitions/global.list")
18
- first_names = get_list_values("first_names", lists)
19
- """
20
-
21
- from typing import Dict, List, Optional
22
- from pathlib import Path
23
- import re
24
-
25
-
26
class ListFileError(Exception):
    """Error type used for every failure to locate, read, or parse a .list file."""
29
-
30
-
31
def parse_list_file(content: str) -> Dict[str, List[str]]:
    """
    Turn the text of a .list file into a mapping of list name -> values.

    Only entries inside the ``[lists]`` section are read. Lines in any
    other section (including ``[relationships]``) and lines that appear
    before the first section header are ignored. Everything after a ``#``
    on a line is treated as a comment.

    Format:
        [lists]
        list_name = value1, value2, value3

        [relationships]
        list1[0] = list2[1, 2]

    Args:
        content: File content as string

    Returns:
        Dictionary mapping list names to value lists. When a name is
        defined more than once, the last definition wins.

    Raises:
        ListFileError: If a ``[lists]`` entry has no ``=``, has an invalid
            name, or has no non-empty values.

    Example:
        >>> content = '''
        ... [lists]
        ... names = Alice, Bob, Charlie
        ... statuses = Active, Inactive
        ... '''
        >>> lists = parse_list_file(content)
        >>> lists['names']
        ['Alice', 'Bob', 'Charlie']
    """
    parsed: Dict[str, List[str]] = {}
    section = None

    for lineno, raw in enumerate(content.split('\n'), 1):
        # Drop the trailing comment (if any) and surrounding whitespace.
        text = raw.partition('#')[0].strip()
        if not text:
            continue

        # A "[name]" line switches the active section.
        if text.startswith('[') and text.endswith(']'):
            section = text[1:-1].strip()
            continue

        # Entries outside [lists] are deliberately skipped, not rejected.
        if section != 'lists':
            continue

        key, separator, remainder = text.partition('=')
        if not separator:
            raise ListFileError(
                f"Line {lineno}: Invalid format. Expected 'name = value1, value2, ...'"
            )

        key = key.strip()
        if re.match(r'^[a-zA-Z_][a-zA-Z0-9_]*$', key) is None:
            raise ListFileError(
                f"Line {lineno}: Invalid list name '{key}'. "
                f"Must start with letter/underscore and contain only alphanumeric/underscore."
            )

        # Split on commas, trim each piece, and discard blanks.
        trimmed = (chunk.strip() for chunk in remainder.split(','))
        items = [piece for piece in trimmed if piece]
        if not items:
            raise ListFileError(
                f"Line {lineno}: List '{key}' has no values"
            )

        parsed[key] = items

    return parsed
115
-
116
-
117
def load_list_file(filepath: str) -> Dict[str, List[str]]:
    """
    Load and parse a .list file.

    Args:
        filepath: Path to .list file

    Returns:
        Dictionary mapping list names to value lists

    Raises:
        ListFileError: If the path does not exist, does not have the
            ``.list`` extension, cannot be read or decoded as UTF-8, or
            fails to parse. When another exception is wrapped, it is
            attached as ``__cause__``.

    Example:
        >>> lists = load_list_file("reference/schema_definitions/global.list")
        >>> lists['first_names']
        ['Arjun', 'Vikram', 'Samuel', ...]
    """
    path = Path(filepath)

    if not path.exists():
        raise ListFileError(f"List file not found: {filepath}")

    if path.suffix != '.list':
        raise ListFileError(f"File must have .list extension: {filepath}")

    # Reading and parsing are separated so an I/O problem (permissions,
    # path is a directory, bad encoding) is reported as a read failure
    # instead of being mislabelled as a parse failure.
    try:
        content = path.read_text(encoding='utf-8')
    except (OSError, UnicodeDecodeError) as exc:
        raise ListFileError(f"Failed to read file {filepath}: {exc}") from exc

    try:
        return parse_list_file(content)
    except ListFileError:
        # Already the right type and message; pass it through untouched.
        raise
    except Exception as exc:
        raise ListFileError(f"Failed to parse {filepath}: {exc}") from exc
152
-
153
-
154
def get_list_values(list_name: str, lists: Dict[str, List[str]]) -> Optional[List[str]]:
    """
    Look up one list by name.

    Args:
        list_name: Name of the list
        lists: Dictionary of lists (from load_list_file or parse_list_file)

    Returns:
        The stored list of values, or None when the name is not present

    Example:
        >>> lists = load_list_file("global.list")
        >>> values = get_list_values("first_names", lists)
        >>> print(values[:3])
        ['Arjun', 'Vikram', 'Samuel']
    """
    if list_name in lists:
        return lists[list_name]
    return None
172
-
173
-
174
def list_all_lists(lists: Dict[str, List[str]]) -> List[str]:
    """
    Return the names of every list in the mapping, in mapping order.

    Args:
        lists: Dictionary of lists

    Returns:
        A new list containing the list names

    Example:
        >>> lists = load_list_file("global.list")
        >>> names = list_all_lists(lists)
        >>> print(names)
        ['first_names', 'last_names', 'banks', 'statuses', ...]
    """
    return [*lists]
191
-
192
-
193
def validate_list_file(filepath: str) -> tuple[bool, List[str]]:
    """
    Check a .list file and report problems instead of raising.

    The file is loaded with ``load_list_file``; any exception it raises is
    converted into a single error message. A file that loads but defines
    no lists, or defines a list with no values, is also reported.

    Args:
        filepath: Path to .list file

    Returns:
        Tuple of (is_valid, error_messages)

    Example:
        >>> is_valid, errors = validate_list_file("global.list")
        >>> if not is_valid:
        ...     for error in errors:
        ...         print(error)
    """
    try:
        loaded = load_list_file(filepath)

        # A file that parses but yields nothing is still considered invalid.
        problems = [] if loaded else ["File contains no lists"]

        # Duplicate names need no check here: later entries overwrite
        # earlier ones in the dict.
        problems += [
            f"List '{list_name}' is empty"
            for list_name, members in loaded.items()
            if not members
        ]

        return (not problems, problems)

    except ListFileError as failure:
        return (False, [str(failure)])
    except Exception as failure:
        return (False, [f"Unexpected error: {failure}"])
additory/common/patterns.py DELETED
@@ -1,240 +0,0 @@
1
- """
2
- Pattern File Management
3
-
4
- Handles loading and parsing of .properties files containing regex patterns.
5
-
6
- File Format (.properties):
7
- # Email patterns
8
- email_generic = [A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Z|a-z]{2,}
9
- email_us = [A-Za-z0-9._%+-]+@(gmail|yahoo|outlook)\\.com
10
-
11
- # Phone patterns
12
- phone_us = \\+1-\\d{3}-\\d{3}-\\d{4}
13
- phone_in = \\+91-\\d{10}
14
-
15
- Usage:
16
- from additory.common.patterns import load_properties_file, get_pattern
17
-
18
- patterns = load_properties_file("reference/schema_definitions/global.properties")
19
- email_pattern = get_pattern("email_generic", patterns)
20
- """
21
-
22
- from typing import Dict, Optional, List
23
- from pathlib import Path
24
- import re
25
-
26
-
27
class PatternFileError(Exception):
    """Error type used for every failure to locate, read, or parse a .properties file."""
30
-
31
-
32
def _strip_properties_comment(line: str) -> str:
    """Return *line* cut at the first ``#`` that is not backslash-escaped."""
    escaped = False
    for index, char in enumerate(line):
        if escaped:
            # Previous character was a backslash: this one is taken literally.
            escaped = False
        elif char == '\\':
            escaped = True
        elif char == '#':
            return line[:index]
    return line


def parse_properties_file(content: str) -> Dict[str, str]:
    r"""
    Parse .properties file content into dictionary of patterns.

    Format:
        # Comment
        pattern_name = regex_pattern
        another_pattern = another_regex   # trailing comment

    A ``#`` starts a comment unless it is escaped with a backslash. Write
    ``\#`` to match a literal ``#``; the escape is kept in the stored
    pattern, where it is still a valid regex for a literal ``#``. Only the
    first ``=`` on a line separates the name from the pattern.

    Args:
        content: File content as string

    Returns:
        Dictionary mapping pattern names to regex patterns

    Raises:
        PatternFileError: If a line has no ``=``, the name is invalid, the
            pattern is empty, or a name is defined twice.

    Example:
        >>> parse_properties_file(r"phone = \+1-\d{3}  # US numbers")
        {'phone': '\\+1-\\d{3}'}
        >>> parse_properties_file(r"tag = \#\w+")
        {'tag': '\\#\\w+'}
    """
    # Compiled once instead of being looked up on every line.
    name_re = re.compile(r'^[a-zA-Z_][a-zA-Z0-9_]*$')
    patterns: Dict[str, str] = {}

    for line_num, raw_line in enumerate(content.split('\n'), 1):
        # Remove comments (respecting "\#") and strip whitespace
        line = _strip_properties_comment(raw_line).strip()

        # Skip empty lines
        if not line:
            continue

        # Parse pattern definitions
        name, separator, pattern = line.partition('=')
        if not separator:
            raise PatternFileError(
                f"Line {line_num}: Invalid format. Expected 'name = pattern'"
            )

        name = name.strip()
        pattern = pattern.strip()

        # Validate pattern name
        if not name_re.match(name):
            raise PatternFileError(
                f"Line {line_num}: Invalid pattern name '{name}'. "
                f"Must start with letter/underscore and contain only alphanumeric/underscore."
            )

        # Check for empty pattern
        if not pattern:
            raise PatternFileError(
                f"Line {line_num}: Pattern '{name}' has no value"
            )

        # Check for duplicate names
        if name in patterns:
            raise PatternFileError(
                f"Line {line_num}: Duplicate pattern name '{name}'"
            )

        patterns[name] = pattern

    return patterns
102
-
103
-
104
def load_properties_file(filepath: str) -> Dict[str, str]:
    """
    Load and parse a .properties file.

    Args:
        filepath: Path to .properties file

    Returns:
        Dictionary mapping pattern names to regex patterns

    Raises:
        PatternFileError: If the path does not exist, does not have the
            ``.properties`` extension, cannot be read or decoded as UTF-8,
            or fails to parse. When another exception is wrapped, it is
            attached as ``__cause__``.

    Example:
        >>> patterns = load_properties_file("reference/schema_definitions/global.properties")
        >>> sorted(patterns)[:2]
        ['email_generic', 'email_us']
    """
    path = Path(filepath)

    if not path.exists():
        raise PatternFileError(f"Pattern file not found: {filepath}")

    if path.suffix != '.properties':
        raise PatternFileError(f"File must have .properties extension: {filepath}")

    # Reading and parsing are separated so an I/O problem (permissions,
    # path is a directory, bad encoding) is reported as a read failure
    # instead of being mislabelled as a parse failure.
    try:
        content = path.read_text(encoding='utf-8')
    except (OSError, UnicodeDecodeError) as exc:
        raise PatternFileError(f"Failed to read file {filepath}: {exc}") from exc

    try:
        return parse_properties_file(content)
    except PatternFileError:
        # Already the right type and message; pass it through untouched.
        raise
    except Exception as exc:
        raise PatternFileError(f"Failed to parse {filepath}: {exc}") from exc
139
-
140
-
141
def get_pattern(pattern_name: str, patterns: Dict[str, str]) -> Optional[str]:
    """
    Look up one regex pattern by name.

    Args:
        pattern_name: Name of the pattern
        patterns: Dictionary of patterns (from load_properties_file or parse_properties_file)

    Returns:
        The stored pattern string, or None when the name is not present

    Example:
        >>> get_pattern("digits", {"digits": "[0-9]+"})
        '[0-9]+'
        >>> get_pattern("missing", {"digits": "[0-9]+"}) is None
        True
    """
    if pattern_name in patterns:
        return patterns[pattern_name]
    return None
159
-
160
-
161
def list_all_patterns(patterns: Dict[str, str]) -> List[str]:
    """
    Return the names of every pattern in the mapping, in mapping order.

    Args:
        patterns: Dictionary of patterns

    Returns:
        A new list containing the pattern names

    Example:
        >>> patterns = load_properties_file("global.properties")
        >>> names = list_all_patterns(patterns)
        >>> print(names)
        ['email_generic', 'email_us', 'phone_us', 'phone_in', ...]
    """
    return [*patterns]
178
-
179
-
180
def validate_properties_file(filepath: str) -> tuple[bool, List[str]]:
    """
    Check a .properties file and report problems instead of raising.

    The file is loaded with ``load_properties_file``; any exception it
    raises is converted into a single error message. A file that loads but
    defines no patterns, or defines a pattern with an empty value, is also
    reported.

    Args:
        filepath: Path to .properties file

    Returns:
        Tuple of (is_valid, error_messages)

    Example:
        >>> is_valid, errors = validate_properties_file("global.properties")
        >>> if not is_valid:
        ...     for error in errors:
        ...         print(error)
    """
    try:
        loaded = load_properties_file(filepath)

        # A file that parses but yields nothing is still considered invalid.
        problems = [] if loaded else ["File contains no patterns"]

        problems += [
            f"Pattern '{pattern_name}' is empty"
            for pattern_name, regex in loaded.items()
            if not regex
        ]

        return (not problems, problems)

    except PatternFileError as failure:
        return (False, [str(failure)])
    except Exception as failure:
        return (False, [f"Unexpected error: {failure}"])
216
-
217
-
218
def is_regex_pattern(value: str) -> bool:
    r"""
    Report whether a string contains any regex metacharacter.

    The characters looked for are: \ [ ] ( ) { } + * ? ^ $ | .

    Args:
        value: String to check

    Returns:
        True if at least one of those characters occurs in ``value``,
        False otherwise (including for the empty string)

    Example:
        >>> is_regex_pattern(r"CUST\d{8}")
        True
        >>> is_regex_pattern("first_names")
        False
        >>> is_regex_pattern("[A-Z]+")
        True
    """
    # Character class listing every metacharacter treated as "regex-like".
    special_class = r'[\\[\](){}+*?^$|.]'
    hit = re.search(special_class, value)
    return hit is not None