additory 0.1.0a4__py3-none-any.whl → 0.1.1a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121)
  1. additory/__init__.py +58 -14
  2. additory/common/__init__.py +31 -147
  3. additory/common/column_selector.py +255 -0
  4. additory/common/distributions.py +286 -613
  5. additory/common/extractors.py +313 -0
  6. additory/common/knn_imputation.py +332 -0
  7. additory/common/result.py +380 -0
  8. additory/common/strategy_parser.py +243 -0
  9. additory/common/unit_conversions.py +338 -0
  10. additory/common/validation.py +283 -103
  11. additory/core/__init__.py +34 -22
  12. additory/core/backend.py +258 -0
  13. additory/core/config.py +177 -305
  14. additory/core/logging.py +230 -24
  15. additory/core/memory_manager.py +157 -495
  16. additory/expressions/__init__.py +2 -23
  17. additory/expressions/compiler.py +457 -0
  18. additory/expressions/engine.py +264 -487
  19. additory/expressions/integrity.py +179 -0
  20. additory/expressions/loader.py +263 -0
  21. additory/expressions/parser.py +363 -167
  22. additory/expressions/resolver.py +274 -0
  23. additory/functions/__init__.py +1 -0
  24. additory/functions/analyze/__init__.py +144 -0
  25. additory/functions/analyze/cardinality.py +58 -0
  26. additory/functions/analyze/correlations.py +66 -0
  27. additory/functions/analyze/distributions.py +53 -0
  28. additory/functions/analyze/duplicates.py +49 -0
  29. additory/functions/analyze/features.py +61 -0
  30. additory/functions/analyze/imputation.py +66 -0
  31. additory/functions/analyze/outliers.py +65 -0
  32. additory/functions/analyze/patterns.py +65 -0
  33. additory/functions/analyze/presets.py +72 -0
  34. additory/functions/analyze/quality.py +59 -0
  35. additory/functions/analyze/timeseries.py +53 -0
  36. additory/functions/analyze/types.py +45 -0
  37. additory/functions/expressions/__init__.py +161 -0
  38. additory/functions/snapshot/__init__.py +82 -0
  39. additory/functions/snapshot/filter.py +119 -0
  40. additory/functions/synthetic/__init__.py +113 -0
  41. additory/functions/synthetic/mode_detector.py +47 -0
  42. additory/functions/synthetic/strategies/__init__.py +1 -0
  43. additory/functions/synthetic/strategies/advanced.py +35 -0
  44. additory/functions/synthetic/strategies/augmentative.py +160 -0
  45. additory/functions/synthetic/strategies/generative.py +168 -0
  46. additory/functions/synthetic/strategies/presets.py +116 -0
  47. additory/functions/to/__init__.py +188 -0
  48. additory/functions/to/lookup.py +351 -0
  49. additory/functions/to/merge.py +189 -0
  50. additory/functions/to/sort.py +91 -0
  51. additory/functions/to/summarize.py +170 -0
  52. additory/functions/transform/__init__.py +140 -0
  53. additory/functions/transform/datetime.py +79 -0
  54. additory/functions/transform/extract.py +85 -0
  55. additory/functions/transform/harmonize.py +105 -0
  56. additory/functions/transform/knn.py +62 -0
  57. additory/functions/transform/onehotencoding.py +68 -0
  58. additory/functions/transform/transpose.py +42 -0
  59. additory-0.1.1a1.dist-info/METADATA +83 -0
  60. additory-0.1.1a1.dist-info/RECORD +62 -0
  61. additory/analysis/__init__.py +0 -48
  62. additory/analysis/cardinality.py +0 -126
  63. additory/analysis/correlations.py +0 -124
  64. additory/analysis/distributions.py +0 -376
  65. additory/analysis/quality.py +0 -158
  66. additory/analysis/scan.py +0 -400
  67. additory/common/backend.py +0 -371
  68. additory/common/column_utils.py +0 -191
  69. additory/common/exceptions.py +0 -62
  70. additory/common/lists.py +0 -229
  71. additory/common/patterns.py +0 -240
  72. additory/common/resolver.py +0 -567
  73. additory/common/sample_data.py +0 -182
  74. additory/core/ast_builder.py +0 -165
  75. additory/core/backends/__init__.py +0 -23
  76. additory/core/backends/arrow_bridge.py +0 -483
  77. additory/core/backends/cudf_bridge.py +0 -355
  78. additory/core/column_positioning.py +0 -358
  79. additory/core/compiler_polars.py +0 -166
  80. additory/core/enhanced_cache_manager.py +0 -1119
  81. additory/core/enhanced_matchers.py +0 -473
  82. additory/core/enhanced_version_manager.py +0 -325
  83. additory/core/executor.py +0 -59
  84. additory/core/integrity_manager.py +0 -477
  85. additory/core/loader.py +0 -190
  86. additory/core/namespace_manager.py +0 -657
  87. additory/core/parser.py +0 -176
  88. additory/core/polars_expression_engine.py +0 -601
  89. additory/core/registry.py +0 -177
  90. additory/core/sample_data_manager.py +0 -492
  91. additory/core/user_namespace.py +0 -751
  92. additory/core/validator.py +0 -27
  93. additory/dynamic_api.py +0 -352
  94. additory/expressions/proxy.py +0 -549
  95. additory/expressions/registry.py +0 -313
  96. additory/expressions/samples.py +0 -492
  97. additory/synthetic/__init__.py +0 -13
  98. additory/synthetic/column_name_resolver.py +0 -149
  99. additory/synthetic/deduce.py +0 -259
  100. additory/synthetic/distributions.py +0 -22
  101. additory/synthetic/forecast.py +0 -1132
  102. additory/synthetic/linked_list_parser.py +0 -415
  103. additory/synthetic/namespace_lookup.py +0 -129
  104. additory/synthetic/smote.py +0 -320
  105. additory/synthetic/strategies.py +0 -926
  106. additory/synthetic/synthesizer.py +0 -713
  107. additory/utilities/__init__.py +0 -53
  108. additory/utilities/encoding.py +0 -600
  109. additory/utilities/games.py +0 -300
  110. additory/utilities/keys.py +0 -8
  111. additory/utilities/lookup.py +0 -103
  112. additory/utilities/matchers.py +0 -216
  113. additory/utilities/resolvers.py +0 -286
  114. additory/utilities/settings.py +0 -167
  115. additory/utilities/units.py +0 -749
  116. additory/utilities/validators.py +0 -153
  117. additory-0.1.0a4.dist-info/METADATA +0 -311
  118. additory-0.1.0a4.dist-info/RECORD +0 -72
  119. additory-0.1.0a4.dist-info/licenses/LICENSE +0 -21
  120. {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
  121. {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
additory/synthetic/deduce.py
@@ -1,259 +0,0 @@
- #!/usr/bin/env python3
- """
- Text-based label deduction for additory.
-
- Uses TF-IDF + cosine similarity to deduce labels from text.
- Pure Python, no LLMs, offline-first.
- """
-
- import math
- import re
- from collections import Counter
- from typing import Union, List, Optional
- import pandas as pd
- import polars as pl
-
-
- def tokenize(text: str) -> List[str]:
-     """
-     Tokenize text into words.
-
-     Args:
-         text: Input text
-
-     Returns:
-         List of lowercase tokens
-     """
-     if text is None or not isinstance(text, str):
-         return []
-
-     text = text.lower()
-     text = re.sub(r"[^a-z0-9\s]", " ", text)
-     return [w for w in text.split() if w]
-
-
- def vectorize(tokens: List[str]) -> Counter:
-     """
-     Convert tokens to TF vector (term frequency).
-
-     Args:
-         tokens: List of tokens
-
-     Returns:
-         Counter with term frequencies
-     """
-     return Counter(tokens)
-
-
- def cosine_similarity(v1: Counter, v2: Counter) -> float:
-     """
-     Compute cosine similarity between two vectors.
-
-     Args:
-         v1: First vector (Counter)
-         v2: Second vector (Counter)
-
-     Returns:
-         Similarity score (0-1)
-     """
-     # Dot product
-     dot = sum(v1[t] * v2[t] for t in v1 if t in v2)
-
-     # Magnitudes
-     mag1 = math.sqrt(sum(v * v for v in v1.values()))
-     mag2 = math.sqrt(sum(v * v for v in v2.values()))
-
-     if mag1 == 0 or mag2 == 0:
-         return 0.0
-
-     return dot / (mag1 * mag2)
-
-
- def _deduce_polars(
-     df: pl.DataFrame,
-     from_column: Union[str, List[str]],
-     to_column: str,
-     min_examples: int = 3
- ) -> pl.DataFrame:
-     """
-     Deduce missing labels using text similarity (Polars-native).
-
-     Args:
-         df: Polars DataFrame
-         from_column: Text column(s) to analyze
-         to_column: Label column to fill
-         min_examples: Minimum labeled examples required
-
-     Returns:
-         DataFrame with deduced labels
-
-     Raises:
-         ValueError: If insufficient labeled examples
-     """
-     # Normalize from_column to list
-     if isinstance(from_column, str):
-         source_cols = [from_column]
-     else:
-         source_cols = from_column
-
-     # Validate columns exist
-     for col in source_cols:
-         if col not in df.columns:
-             raise ValueError(f"Column '{col}' not found in DataFrame")
-
-     if to_column not in df.columns:
-         raise ValueError(f"Column '{to_column}' not found in DataFrame")
-
-     # Create combined text column if multiple sources
-     if len(source_cols) == 1:
-         text_col = source_cols[0]
-         df_work = df.clone()
-     else:
-         # Concatenate multiple columns with spaces
-         df_work = df.with_columns([
-             pl.concat_str(
-                 [pl.col(c).fill_null("") for c in source_cols],
-                 separator=" "
-             ).alias("__deduce_text__")
-         ])
-         text_col = "__deduce_text__"
-
-     # Split into labeled and unlabeled
-     labeled_df = df_work.filter(pl.col(to_column).is_not_null())
-     unlabeled_df = df_work.filter(pl.col(to_column).is_null())
-
-     # Check if we have enough labeled examples
-     n_labeled = len(labeled_df)
-     if n_labeled == 0:
-         raise ValueError(
-             f"⚠️ Cannot deduce labels: No labeled examples found in '{to_column}' column.\n"
-             f"Please manually label at least {min_examples} examples per category, then run again.\n\n"
-             f"Note: additory uses pure Python text similarity (no LLMs, no external calls).\n"
-             f"Your data never leaves your machine."
-         )
-
-     if n_labeled < min_examples:
-         print(
-             f"⚠️ Only {n_labeled} labeled examples found. "
-             f"For better accuracy, label at least {min_examples} examples.\n"
-             f"Proceeding with available data..."
-         )
-
-     # If no unlabeled rows, return original
-     if len(unlabeled_df) == 0:
-         if len(source_cols) > 1:
-             # Remove temporary column
-             return df_work.drop("__deduce_text__")
-         return df_work
-
-     # Precompute vectors for labeled rows
-     labeled_vectors = []
-     for row in labeled_df.iter_rows(named=True):
-         text = row[text_col]
-         label = row[to_column]
-         tokens = tokenize(text)
-         vec = vectorize(tokens)
-         labeled_vectors.append((vec, label))
-
-     # Deduce labels for unlabeled rows
-     deduced_labels = []
-     for row in unlabeled_df.iter_rows(named=True):
-         text = row[text_col]
-         tokens = tokenize(text)
-         vec = vectorize(tokens)
-
-         # Find most similar labeled example
-         best_label = None
-         best_score = -1.0
-
-         for labeled_vec, label in labeled_vectors:
-             score = cosine_similarity(vec, labeled_vec)
-             if score > best_score:
-                 best_score = score
-                 best_label = label
-
-         deduced_labels.append(best_label)
-
-     # Create deduced labels series
-     deduced_series = pl.Series(to_column, deduced_labels)
-
-     # Update unlabeled rows with deduced labels
-     unlabeled_df = unlabeled_df.with_columns([deduced_series])
-
-     # Combine labeled and unlabeled back together
-     result_df = pl.concat([labeled_df, unlabeled_df])
-
-     # Remove temporary column if created
-     if len(source_cols) > 1:
-         result_df = result_df.drop("__deduce_text__")
-
-     # Print success message
-     n_deduced = len(deduced_labels)
-     print(f"✓ Deduced {n_deduced} label{'s' if n_deduced != 1 else ''} from {n_labeled} examples (offline, no LLMs)")
-
-     return result_df
-
-
- def deduce(
-     df: Union[pd.DataFrame, pl.DataFrame],
-     from_column: Union[str, List[str]],
-     to_column: str
- ) -> Union[pd.DataFrame, pl.DataFrame]:
-     """
-     Deduce missing labels based on text similarity to labeled examples.
-
-     Uses cosine similarity on TF-IDF vectors. Pure Python, no LLMs, offline-first.
-     Requires at least 3 labeled examples to work.
-
-     When multiple source columns are provided, they are concatenated with
-     spaces before computing similarity.
-
-     Args:
-         df: DataFrame with some labeled and some unlabeled rows
-         from_column: Text column(s) to analyze
-             - str: Single column (e.g., "comment")
-             - List[str]: Multiple columns (e.g., ["comment", "notes"])
-         to_column: Label column to fill (e.g., "status")
-
-     Returns:
-         DataFrame with deduced labels filled in
-
-     Examples:
-         # Single column
-         >>> result = add.deduce(df, from_column="comment", to_column="status")
-
-         # Multiple columns (better accuracy)
-         >>> result = add.deduce(
-         ...     df,
-         ...     from_column=["comment", "notes", "description"],
-         ...     to_column="status"
-         ... )
-
-     Privacy: Your data never leaves your machine. No external connections.
-     """
-     # Detect input backend
-     if isinstance(df, pd.DataFrame):
-         backend = "pandas"
-         # Convert to Polars
-         df_polars = pl.from_pandas(df)
-     elif isinstance(df, pl.DataFrame):
-         backend = "polars"
-         df_polars = df
-     else:
-         # Try arrow bridge (for cudf, etc.)
-         try:
-             df_polars = pl.from_arrow(df)
-             backend = "arrow"
-         except Exception:
-             raise TypeError(f"Unsupported DataFrame type: {type(df)}")
-
-     # Process in Polars
-     result_polars = _deduce_polars(df_polars, from_column, to_column)
-
-     # Convert back to original format
-     if backend == "pandas":
-         return result_polars.to_pandas()
-     elif backend == "polars":
-         return result_polars
-     else:  # arrow
-         return result_polars.to_arrow()
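
The hunk above removes additory/synthetic/deduce.py in its entirety. Despite the docstring's mention of TF-IDF, the removed helpers compared plain term-frequency vectors with cosine similarity and assigned each unlabeled row the label of its highest-scoring labeled neighbour. The sketch below is illustrative only (it is not part of the 0.1.1a1 API) and reproduces that calculation on two short strings:

import math
from collections import Counter

def tf_cosine(a: str, b: str) -> float:
    # Term-frequency vectors, as in the removed vectorize()/cosine_similarity()
    v1, v2 = Counter(a.lower().split()), Counter(b.lower().split())
    dot = sum(v1[t] * v2[t] for t in v1 if t in v2)
    mag1 = math.sqrt(sum(v * v for v in v1.values()))
    mag2 = math.sqrt(sum(v * v for v in v2.values()))
    return 0.0 if mag1 == 0 or mag2 == 0 else dot / (mag1 * mag2)

print(round(tf_cosine("shipment arrived late", "package arrived very late"), 3))  # 0.577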
@@ -1,22 +0,0 @@
1
- """
2
- Distribution Strategies for Synthetic Data Generation
3
-
4
- DEPRECATED: This module has been moved to additory.common.distributions
5
- Please update your imports to use additory.common.distributions instead.
6
-
7
- This file is kept for backward compatibility and will be removed in a future version.
8
- """
9
-
10
- import warnings
11
-
12
- # Issue deprecation warning
13
- warnings.warn(
14
- "additory.synthetic.distributions is deprecated. "
15
- "Please use additory.common.distributions instead. "
16
- "This module will be removed in a future version.",
17
- DeprecationWarning,
18
- stacklevel=2
19
- )
20
-
21
- # Import everything from common.distributions for backward compatibility
22
- from additory.common.distributions import * # noqa: F401, F403
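
The second hunk drops the backward-compatibility shim itself. Per its own deprecation message, code that still imports additory.synthetic.distributions should switch to the consolidated module, which remains present in 0.1.1a1 (see additory/common/distributions.py in the file list). A minimal migration, assuming the same names are re-exported from the new location:

# Before (0.1.0a4) — deprecated shim, emitted a DeprecationWarning at import time:
# import additory.synthetic.distributions as dist
# After (0.1.1a1) — import the consolidated module directly:
import additory.common.distributions as dist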