additory 0.1.0a4__py3-none-any.whl → 0.1.1a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121)
  1. additory/__init__.py +58 -14
  2. additory/common/__init__.py +31 -147
  3. additory/common/column_selector.py +255 -0
  4. additory/common/distributions.py +286 -613
  5. additory/common/extractors.py +313 -0
  6. additory/common/knn_imputation.py +332 -0
  7. additory/common/result.py +380 -0
  8. additory/common/strategy_parser.py +243 -0
  9. additory/common/unit_conversions.py +338 -0
  10. additory/common/validation.py +283 -103
  11. additory/core/__init__.py +34 -22
  12. additory/core/backend.py +258 -0
  13. additory/core/config.py +177 -305
  14. additory/core/logging.py +230 -24
  15. additory/core/memory_manager.py +157 -495
  16. additory/expressions/__init__.py +2 -23
  17. additory/expressions/compiler.py +457 -0
  18. additory/expressions/engine.py +264 -487
  19. additory/expressions/integrity.py +179 -0
  20. additory/expressions/loader.py +263 -0
  21. additory/expressions/parser.py +363 -167
  22. additory/expressions/resolver.py +274 -0
  23. additory/functions/__init__.py +1 -0
  24. additory/functions/analyze/__init__.py +144 -0
  25. additory/functions/analyze/cardinality.py +58 -0
  26. additory/functions/analyze/correlations.py +66 -0
  27. additory/functions/analyze/distributions.py +53 -0
  28. additory/functions/analyze/duplicates.py +49 -0
  29. additory/functions/analyze/features.py +61 -0
  30. additory/functions/analyze/imputation.py +66 -0
  31. additory/functions/analyze/outliers.py +65 -0
  32. additory/functions/analyze/patterns.py +65 -0
  33. additory/functions/analyze/presets.py +72 -0
  34. additory/functions/analyze/quality.py +59 -0
  35. additory/functions/analyze/timeseries.py +53 -0
  36. additory/functions/analyze/types.py +45 -0
  37. additory/functions/expressions/__init__.py +161 -0
  38. additory/functions/snapshot/__init__.py +82 -0
  39. additory/functions/snapshot/filter.py +119 -0
  40. additory/functions/synthetic/__init__.py +113 -0
  41. additory/functions/synthetic/mode_detector.py +47 -0
  42. additory/functions/synthetic/strategies/__init__.py +1 -0
  43. additory/functions/synthetic/strategies/advanced.py +35 -0
  44. additory/functions/synthetic/strategies/augmentative.py +160 -0
  45. additory/functions/synthetic/strategies/generative.py +168 -0
  46. additory/functions/synthetic/strategies/presets.py +116 -0
  47. additory/functions/to/__init__.py +188 -0
  48. additory/functions/to/lookup.py +351 -0
  49. additory/functions/to/merge.py +189 -0
  50. additory/functions/to/sort.py +91 -0
  51. additory/functions/to/summarize.py +170 -0
  52. additory/functions/transform/__init__.py +140 -0
  53. additory/functions/transform/datetime.py +79 -0
  54. additory/functions/transform/extract.py +85 -0
  55. additory/functions/transform/harmonize.py +105 -0
  56. additory/functions/transform/knn.py +62 -0
  57. additory/functions/transform/onehotencoding.py +68 -0
  58. additory/functions/transform/transpose.py +42 -0
  59. additory-0.1.1a1.dist-info/METADATA +83 -0
  60. additory-0.1.1a1.dist-info/RECORD +62 -0
  61. additory/analysis/__init__.py +0 -48
  62. additory/analysis/cardinality.py +0 -126
  63. additory/analysis/correlations.py +0 -124
  64. additory/analysis/distributions.py +0 -376
  65. additory/analysis/quality.py +0 -158
  66. additory/analysis/scan.py +0 -400
  67. additory/common/backend.py +0 -371
  68. additory/common/column_utils.py +0 -191
  69. additory/common/exceptions.py +0 -62
  70. additory/common/lists.py +0 -229
  71. additory/common/patterns.py +0 -240
  72. additory/common/resolver.py +0 -567
  73. additory/common/sample_data.py +0 -182
  74. additory/core/ast_builder.py +0 -165
  75. additory/core/backends/__init__.py +0 -23
  76. additory/core/backends/arrow_bridge.py +0 -483
  77. additory/core/backends/cudf_bridge.py +0 -355
  78. additory/core/column_positioning.py +0 -358
  79. additory/core/compiler_polars.py +0 -166
  80. additory/core/enhanced_cache_manager.py +0 -1119
  81. additory/core/enhanced_matchers.py +0 -473
  82. additory/core/enhanced_version_manager.py +0 -325
  83. additory/core/executor.py +0 -59
  84. additory/core/integrity_manager.py +0 -477
  85. additory/core/loader.py +0 -190
  86. additory/core/namespace_manager.py +0 -657
  87. additory/core/parser.py +0 -176
  88. additory/core/polars_expression_engine.py +0 -601
  89. additory/core/registry.py +0 -177
  90. additory/core/sample_data_manager.py +0 -492
  91. additory/core/user_namespace.py +0 -751
  92. additory/core/validator.py +0 -27
  93. additory/dynamic_api.py +0 -352
  94. additory/expressions/proxy.py +0 -549
  95. additory/expressions/registry.py +0 -313
  96. additory/expressions/samples.py +0 -492
  97. additory/synthetic/__init__.py +0 -13
  98. additory/synthetic/column_name_resolver.py +0 -149
  99. additory/synthetic/deduce.py +0 -259
  100. additory/synthetic/distributions.py +0 -22
  101. additory/synthetic/forecast.py +0 -1132
  102. additory/synthetic/linked_list_parser.py +0 -415
  103. additory/synthetic/namespace_lookup.py +0 -129
  104. additory/synthetic/smote.py +0 -320
  105. additory/synthetic/strategies.py +0 -926
  106. additory/synthetic/synthesizer.py +0 -713
  107. additory/utilities/__init__.py +0 -53
  108. additory/utilities/encoding.py +0 -600
  109. additory/utilities/games.py +0 -300
  110. additory/utilities/keys.py +0 -8
  111. additory/utilities/lookup.py +0 -103
  112. additory/utilities/matchers.py +0 -216
  113. additory/utilities/resolvers.py +0 -286
  114. additory/utilities/settings.py +0 -167
  115. additory/utilities/units.py +0 -749
  116. additory/utilities/validators.py +0 -153
  117. additory-0.1.0a4.dist-info/METADATA +0 -311
  118. additory-0.1.0a4.dist-info/RECORD +0 -72
  119. additory-0.1.0a4.dist-info/licenses/LICENSE +0 -21
  120. {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
  121. {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
additory/analysis/scan.py DELETED
@@ -1,400 +0,0 @@
1
- """
2
- Data Profiling and Analysis
3
-
4
- Provides comprehensive data profiling through the scan() function.
5
- """
6
-
7
- from dataclasses import dataclass, field
8
- from typing import Dict, List, Optional, Union, Any
9
- import json
10
- import polars as pl
11
- import pandas as pd
12
- import numpy as np
13
-
14
- from additory.common.backend import detect_backend, to_polars
15
- from additory.analysis.distributions import (
16
- detect_distributions,
17
- DistributionFit
18
- )
19
- from additory.analysis.correlations import (
20
- calculate_correlations,
21
- CorrelationResult
22
- )
23
- from additory.analysis.cardinality import (
24
- analyze_all_cardinality,
25
- CardinalityInfo
26
- )
27
- from additory.analysis.quality import (
28
- analyze_all_quality,
29
- QualityMetrics,
30
- is_numeric_dtype
31
- )
32
-
33
# Scan presets for common use cases.
# Every preset carries the three analysis flags plus a human-readable description.
def _make_preset(distributions, correlations, cardinality, description):
    """Build one preset entry with the standard flag keys."""
    return {
        'detect_distributions_flag': distributions,
        'detect_correlations_flag': correlations,
        'detect_cardinality_flag': cardinality,
        'description': description,
    }


SCAN_PRESETS = {
    'quick': _make_preset(False, False, True,
                          'Quick scan: quality + cardinality only'),
    'distributions': _make_preset(True, False, False,
                                  'Distribution-focused: detect distributions only'),
    'correlations': _make_preset(False, True, False,
                                 'Correlation-focused: correlation analysis only'),
    'full': _make_preset(True, True, True,
                         'Full analysis: all features enabled'),
    'minimal': _make_preset(False, False, False,
                            'Minimal scan: quality metrics only'),
}
66
-
67
-
68
@dataclass
class ColumnInfo:
    """Per-column profile: dtype plus null and uniqueness statistics."""
    name: str
    dtype: str
    null_count: int
    null_percentage: float
    unique_count: int

    def __repr__(self):
        # Compact one-line form used when listing columns in scan output.
        template = "ColumnInfo(name='{0}', dtype='{1}', nulls={2:.1f}%)"
        return template.format(self.name, self.dtype, self.null_percentage)
79
-
80
-
81
@dataclass
class ScanResult:
    """
    Aggregated results of a DataFrame scan.

    Bundles the per-column overview with the optional distribution,
    correlation, cardinality, and quality analyses, plus metadata about
    which analyses were enabled for the run.
    """
    # Basic info
    shape: tuple
    columns: List[ColumnInfo]

    # Analysis results
    distributions: Dict[str, List[DistributionFit]] = field(default_factory=dict)
    correlations: List[CorrelationResult] = field(default_factory=list)
    cardinality: Dict[str, CardinalityInfo] = field(default_factory=dict)
    quality: Dict[str, QualityMetrics] = field(default_factory=dict)

    # Metadata
    preset_used: Optional[str] = None
    analysis_enabled: Dict[str, bool] = field(default_factory=dict)

    def summary(self) -> str:
        """Render the scan results as a human-readable multi-line string."""
        out = []
        add = out.append

        add("DataFrame Scan Results")
        add(f"Shape: {self.shape[0]:,} rows × {self.shape[1]} columns")
        add("")

        if self.preset_used:
            desc = SCAN_PRESETS.get(self.preset_used, {}).get('description', 'Custom preset')
            add(f"Preset: {self.preset_used} ({desc})")
            add("")

        # Per-column overview
        add("Columns:")
        for col in self.columns:
            add(f" {col.name}: {col.dtype} ({col.null_percentage:.1f}% null, {col.unique_count:,} unique)")
        add("")

        # Best-fitting distribution per column (first entry is the best fit)
        if self.distributions:
            add("Top Distributions:")
            for col_name, fits in self.distributions.items():
                if fits:
                    best = fits[0]
                    add(f" {col_name}: {best.distribution} (score: {best.score:.3f})")
            add("")

        # Only correlations with |r| >= 0.7 are surfaced, capped at five
        if self.correlations:
            add("Strong Correlations:")
            strong = [c for c in self.correlations if abs(c.correlation) >= 0.7]
            for corr in strong[:5]:
                add(f" {corr.column1} ↔ {corr.column2}: {corr.correlation:.3f} ({corr.method})")
            if len(strong) > 5:
                add(f" ... and {len(strong) - 5} more")
            add("")

        # Flag columns classified as high-cardinality
        if self.cardinality:
            high = [name for name, info in self.cardinality.items()
                    if info.classification == 'high']
            if high:
                add(f"High Cardinality Columns: {', '.join(high)}")
                add("")

        return "\n".join(out)

    def to_dict(self) -> Dict[str, Any]:
        """Convert scan results to dictionary format."""
        column_entries = [
            {
                'name': col.name,
                'dtype': col.dtype,
                'null_count': col.null_count,
                'null_percentage': col.null_percentage,
                'unique_count': col.unique_count,
            }
            for col in self.columns
        ]

        distribution_entries = {}
        for col_name, fits in self.distributions.items():
            distribution_entries[col_name] = [
                {
                    'distribution': fit.distribution,
                    'score': fit.score,
                    'parameters': fit.parameters,
                }
                for fit in fits
            ]

        correlation_entries = []
        for corr in self.correlations:
            correlation_entries.append({
                'column1': corr.column1,
                'column2': corr.column2,
                'correlation': corr.correlation,
                'method': corr.method,
                'p_value': corr.p_value,
            })

        cardinality_entries = {
            col_name: {
                'unique_count': info.unique_count,
                'total_count': info.total_count,
                'unique_ratio': info.unique_ratio,
                'classification': info.classification,
                'top_values': info.top_values,
            }
            for col_name, info in self.cardinality.items()
        }

        quality_entries = {
            col_name: {
                'missing_count': metrics.missing_count,
                'missing_percentage': metrics.missing_percentage,
                'data_type': metrics.data_type,
                'summary_stats': metrics.summary_stats,
            }
            for col_name, metrics in self.quality.items()
        }

        return {
            'shape': self.shape,
            'columns': column_entries,
            'distributions': distribution_entries,
            'correlations': correlation_entries,
            'cardinality': cardinality_entries,
            'quality': quality_entries,
            'metadata': {
                'preset_used': self.preset_used,
                'analysis_enabled': self.analysis_enabled,
            },
        }

    def to_json(self, indent: int = 2) -> str:
        """Serialize the scan results to a JSON string."""
        # default=str covers non-JSON-native values such as dtypes/tuples.
        return json.dumps(self.to_dict(), indent=indent, default=str)
213
-
214
-
215
def scan(
    df: Union[pl.DataFrame, pd.DataFrame, Any],
    preset: Optional[str] = None,
    detect_distributions_flag: Optional[bool] = None,
    detect_correlations_flag: Optional[bool] = None,
    detect_cardinality_flag: Optional[bool] = None,
    top_n_distributions: int = 3,
    correlation_methods: Optional[List[str]] = None,
    correlation_threshold: float = 0.3,
    cardinality_top_n: int = 10,
    verbose: bool = True
) -> ScanResult:
    """
    Scan a DataFrame to detect distributions, correlations, and cardinality.

    Accepts pandas, polars, or cuDF DataFrames. Automatically converts to
    Polars for processing and returns a ScanResult with all analysis results.

    Analyses performed:
    - Distribution detection for numeric columns
    - Correlation analysis between numeric columns
    - Cardinality analysis (unique values)
    - Data quality metrics (always run)

    Args:
        df: DataFrame to analyze (pandas, polars, or cuDF)
        preset: Optional preset ('quick', 'distributions', 'correlations',
            'full', 'minimal')
        detect_distributions_flag: Whether to detect distributions. None
            (the default) defers to the preset, falling back to True.
            An explicit True/False always wins over the preset.
        detect_correlations_flag: Whether to calculate correlations
            (same None/preset semantics as above).
        detect_cardinality_flag: Whether to analyze cardinality
            (same None/preset semantics as above).
        top_n_distributions: Number of top distributions per column (default: 3)
        correlation_methods: Correlation methods to use
            (default: ['pearson', 'spearman'])
        correlation_threshold: Minimum correlation to report (default: 0.3)
        cardinality_top_n: Number of top values per column (default: 10)
        verbose: Whether to print progress messages (default: True)

    Returns:
        ScanResult object containing all analysis results.

    Raises:
        ValueError: If `preset` is not one of the known preset names.

    Presets:
        - 'quick': Quality + cardinality only (fast)
        - 'distributions': Distribution detection only
        - 'correlations': Correlation analysis only
        - 'full': All analyses enabled
        - 'minimal': Quality metrics only (fastest)

    Example:
        >>> import pandas as pd
        >>> from additory.analysis.scan import scan
        >>>
        >>> df = pd.DataFrame({
        ...     'age': [25, 30, 35, 40, 45],
        ...     'income': [50000, 60000, 70000, 80000, 90000],
        ...     'category': ['A', 'B', 'A', 'B', 'A']
        ... })
        >>>
        >>> result = scan(df)
        >>> print(result.summary())
        >>>
        >>> result = scan(df, preset='quick')
        >>> result = scan(df, preset='distributions', top_n_distributions=5)
    """
    import gc  # imported once; used for the two explicit cleanup points below

    # Resolve analysis flags. BUGFIX: the previous implementation tested
    # `'flag' not in locals()`, which is always false for a parameter, so an
    # explicit True could never override a preset. The None sentinel makes
    # explicit arguments always win while keeping the effective defaults
    # (all analyses enabled when no preset is given) unchanged.
    if preset:
        if preset not in SCAN_PRESETS:
            available = ', '.join(SCAN_PRESETS.keys())
            raise ValueError(f"Unknown preset '{preset}'. Available presets: {available}")
        preset_config = SCAN_PRESETS[preset]
        if detect_distributions_flag is None:
            detect_distributions_flag = preset_config['detect_distributions_flag']
        if detect_correlations_flag is None:
            detect_correlations_flag = preset_config['detect_correlations_flag']
        if detect_cardinality_flag is None:
            detect_cardinality_flag = preset_config['detect_cardinality_flag']
    # Without a preset (or when the preset left nothing to say), every
    # unspecified flag defaults to enabled.
    if detect_distributions_flag is None:
        detect_distributions_flag = True
    if detect_correlations_flag is None:
        detect_correlations_flag = True
    if detect_cardinality_flag is None:
        detect_cardinality_flag = True

    # Set default correlation methods
    if correlation_methods is None:
        correlation_methods = ['pearson', 'spearman']

    # Convert to Polars for processing
    original_backend = detect_backend(df)
    if verbose:
        print(f"Scanning {original_backend} DataFrame with shape {df.shape}")

    df_polars = to_polars(df)

    # Drop this function's reference to the original frame if it was
    # converted, so large intermediates can be reclaimed promptly.
    if original_backend != 'polars':
        del df
        gc.collect()

    # Get basic info
    shape = df_polars.shape
    row_count = shape[0]

    # Build the per-column overview (dtype, null stats, uniqueness).
    columns = []
    for col_name in df_polars.columns:
        col_series = df_polars[col_name]
        null_count = col_series.null_count()
        # BUGFIX: guard against an empty frame (ZeroDivisionError before).
        null_percentage = (null_count / row_count) * 100 if row_count else 0.0
        columns.append(ColumnInfo(
            name=col_name,
            dtype=str(col_series.dtype),
            null_count=null_count,
            null_percentage=null_percentage,
            unique_count=col_series.n_unique()
        ))

    # Initialize result
    result = ScanResult(
        shape=shape,
        columns=columns,
        preset_used=preset,
        analysis_enabled={
            'distributions': detect_distributions_flag,
            'correlations': detect_correlations_flag,
            'cardinality': detect_cardinality_flag
        }
    )

    # Quality analysis (always performed)
    if verbose:
        print("Analyzing data quality...")
    result.quality = analyze_all_quality(df_polars)

    # Numeric columns drive both distribution and correlation analysis;
    # compute the list once instead of twice.
    numeric_columns = [col.name for col in columns
                       if is_numeric_dtype(df_polars[col.name].dtype)]

    # Distribution analysis
    if detect_distributions_flag:
        if verbose:
            print("Detecting distributions...")
        if numeric_columns:
            result.distributions = detect_distributions(
                df_polars,
                columns=numeric_columns,
                top_n=top_n_distributions
            )
        elif verbose:
            print("No numeric columns found for distribution analysis")

    # Correlation analysis
    if detect_correlations_flag:
        if verbose:
            print("Calculating correlations...")
        if len(numeric_columns) >= 2:
            result.correlations = calculate_correlations(
                df_polars,
                columns=numeric_columns,
                methods=correlation_methods,
                threshold=correlation_threshold
            )
        elif verbose:
            print(f"Need at least 2 numeric columns for correlation analysis (found {len(numeric_columns)})")

    # Cardinality analysis
    if detect_cardinality_flag:
        if verbose:
            print("Analyzing cardinality...")
        result.cardinality = analyze_all_cardinality(df_polars, top_n=cardinality_top_n)

    if verbose:
        print("Scan complete!")

    # Final memory cleanup
    del df_polars
    gc.collect()

    return result