autopreprocessor 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Saad Sayed
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,101 @@
1
+ Metadata-Version: 2.4
2
+ Name: autopreprocessor
3
+ Version: 0.1.0
4
+ Summary: Automatic preprocessing for tabular datasets using heuristics and RAG-assisted encoding detection.
5
+ Author: Saad Sayed
6
+ Requires-Python: >=3.9
7
+ Description-Content-Type: text/markdown
8
+ License-File: LICENSE.txt
9
+ Requires-Dist: numpy
10
+ Requires-Dist: pandas
11
+ Requires-Dist: scikit-learn
12
+ Requires-Dist: sentence-transformers
13
+ Requires-Dist: chromadb
14
+ Requires-Dist: langchain-community
15
+ Requires-Dist: tqdm
16
+ Dynamic: license-file
17
+
18
+ # AutoPreprocessor
19
+
20
+ An experimental AutoML preprocessing package that combines rule-based preprocessing with a lightweight Retrieval-Augmented Generation (RAG) system to automatically suggest encoding strategies for tabular datasets.
21
+
22
+ The project uses Sentence Transformers and ChromaDB locally. It does not rely on external LLM APIs.
23
+
24
+ ## Features
25
+
26
+ - Mean, median, and mode imputation
27
+ - Label encoding for binary columns
28
+ - One-hot encoding
29
+ - RAG-assisted ordinal encoding
30
+ - Standard scaling and robust scaling
31
+ - Automatic dropping of columns with excessive missing values
32
+
33
+ ## Installation
34
+
35
+ Install locally from the project root:
36
+
37
+ ```bash
38
+ pip install .
39
+ ```
40
+
41
+ For development, you can also install dependencies directly:
42
+
43
+ ```bash
44
+ pip install -r requirements.txt
45
+ ```
46
+
47
+ ## Usage
48
+
49
+ ```python
50
+ import pandas as pd
51
+ from autopreprocessor import AutoPreprocessor
52
+
53
+ data = pd.read_csv("sample.csv")
54
+
55
+ auto = AutoPreprocessor()
56
+ X_processed = auto.fit_transform(data)
57
+
58
+ print(auto.report_)
59
+ ```
60
+
61
+ ## RAG And ChromaDB
62
+
63
+ The package preserves the existing RAG behavior and ChromaDB usage. `AutoPreprocessor` loads the persistent ChromaDB collection named `rag_collection` from:
64
+
65
+ ```text
66
+ autopreprocessor/chroma_db/
67
+ ```
68
+
69
+ The ChromaDB path is resolved with `pathlib` relative to the installed package location, so it can work across machines and operating systems.
70
+
71
+ ## Project Structure
72
+
73
+ ```text
74
+ AutoPreprocessor/
75
+ |
76
+ |-- autopreprocessor/
77
+ | |-- __init__.py
78
+ | |-- auto_preprocessor.py
79
+ | |-- chroma_db/
80
+ |
81
+ |-- requirements.txt
82
+ |-- setup.py
83
+ |-- pyproject.toml
84
+ |-- README.md
85
+ |-- .gitignore
86
+ ```
87
+
88
+ ## API
89
+
90
+ ```python
91
+ from autopreprocessor import AutoPreprocessor
92
+ ```
93
+
94
+ `AutoPreprocessor` exposes:
95
+
96
+ - `fit(data)`
97
+ - `transform(data)`
98
+ - `fit_transform(data)`
99
+ - `report_`
100
+
101
+ The preprocessing logic is preserved from the original notebook implementation.
@@ -0,0 +1,84 @@
1
+ # AutoPreprocessor
2
+
3
+ An experimental AutoML preprocessing package that combines rule-based preprocessing with a lightweight Retrieval-Augmented Generation (RAG) system to automatically suggest encoding strategies for tabular datasets.
4
+
5
+ The project uses Sentence Transformers and ChromaDB locally. It does not rely on external LLM APIs.
6
+
7
+ ## Features
8
+
9
+ - Mean, median, and mode imputation
10
+ - Label encoding for binary columns
11
+ - One-hot encoding
12
+ - RAG-assisted ordinal encoding
13
+ - Standard scaling and robust scaling
14
+ - Automatic dropping of columns with excessive missing values
15
+
16
+ ## Installation
17
+
18
+ Install locally from the project root:
19
+
20
+ ```bash
21
+ pip install .
22
+ ```
23
+
24
+ For development, you can also install dependencies directly:
25
+
26
+ ```bash
27
+ pip install -r requirements.txt
28
+ ```
29
+
30
+ ## Usage
31
+
32
+ ```python
33
+ import pandas as pd
34
+ from autopreprocessor import AutoPreprocessor
35
+
36
+ data = pd.read_csv("sample.csv")
37
+
38
+ auto = AutoPreprocessor()
39
+ X_processed = auto.fit_transform(data)
40
+
41
+ print(auto.report_)
42
+ ```
43
+
44
+ ## RAG And ChromaDB
45
+
46
+ The package preserves the existing RAG behavior and ChromaDB usage. `AutoPreprocessor` loads the persistent ChromaDB collection named `rag_collection` from:
47
+
48
+ ```text
49
+ autopreprocessor/chroma_db/
50
+ ```
51
+
52
+ The ChromaDB path is resolved with `pathlib` relative to the installed package location, so it can work across machines and operating systems.
53
+
54
+ ## Project Structure
55
+
56
+ ```text
57
+ AutoPreprocessor/
58
+ |
59
+ |-- autopreprocessor/
60
+ | |-- __init__.py
61
+ | |-- auto_preprocessor.py
62
+ | |-- chroma_db/
63
+ |
64
+ |-- requirements.txt
65
+ |-- setup.py
66
+ |-- pyproject.toml
67
+ |-- README.md
68
+ |-- .gitignore
69
+ ```
70
+
71
+ ## API
72
+
73
+ ```python
74
+ from autopreprocessor import AutoPreprocessor
75
+ ```
76
+
77
+ `AutoPreprocessor` exposes:
78
+
79
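+ - `fit(data)` — classifies the columns and fits the preprocessing pipeline; it prompts on standard input (`Y`/`N`) to confirm each column proposed for ordinal encoding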
80
+ - `transform(data)`
81
+ - `fit_transform(data)`
82
+ - `report_`
83
+
84
+ The preprocessing logic is preserved from the original notebook implementation.
@@ -0,0 +1,2 @@
1
+ from .auto_preprocessor import AutoPreprocessor
2
+ __version__ = "0.1.0"
@@ -0,0 +1,221 @@
1
+ from pathlib import Path
2
+
3
+ import pandas as pd
4
+ from tqdm import tqdm
5
+ from sentence_transformers import SentenceTransformer
6
+ import chromadb
7
+ from sklearn.model_selection import train_test_split
8
+ from sklearn import datasets
9
+ from sklearn.pipeline import Pipeline, make_pipeline
10
+ from sklearn.compose import ColumnTransformer, make_column_selector
11
+ from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler, OrdinalEncoder, LabelEncoder
12
+ from sklearn.impute import SimpleImputer
13
+
14
+
15
+ BASE_DIR = Path(__file__).parent
16
+
17
+
18
class AutoPreprocessor:
    """Heuristic, RAG-assisted preprocessor for tabular ``pandas`` data.

    ``fit`` inspects every column of a DataFrame, decides how it should be
    imputed, encoded and scaled, and builds a scikit-learn
    ``ColumnTransformer`` that applies those decisions. Categorical columns
    with more than two categories are additionally looked up in a local
    ChromaDB collection to decide between one-hot and ordinal encoding.

    Attributes:
        preprocessor: The ``ColumnTransformer`` built by ``fit``; ``None``
            until ``fit`` has been called.
        report_: Dict mapping each preprocessing role (for example
            ``"Mean Imputer"``) to the list of columns assigned to it;
            ``None`` until ``fit`` has been called.
    """

    # Minimum similarity (1 - distance) for a retrieved document to be trusted.
    SIMILARITY_THRESHOLD = 0.5
    # Encoding assumed when retrieval gives no confident answer.
    DEFAULT_ENCODING = "onehot"
    # Columns whose share of missing values exceeds this are dropped.
    MAX_MISSING_RATIO = 0.5
    # |skew| above this selects median imputation / robust scaling.
    SKEW_LIMIT = 1
    # Numeric columns whose unique-values/rows ratio is below this are not scaled.
    LOW_CARDINALITY_RATIO = 0.05

    def __init__(self):
        self.preprocessor = None
        self.report_ = None

    def fit(self, data, *, interactive=True):
        """Classify the columns of ``data`` and fit the preprocessing pipeline.

        Args:
            data: ``pandas.DataFrame`` to analyse and fit on.
            interactive: When true (the default), every column proposed for
                ordinal encoding is shown on stdout and confirmed with a
                ``Y``/``N`` prompt on stdin. When false, the proposals are
                accepted as-is and nothing is read from stdin, which makes
                ``fit`` usable in scripts and services.

        Returns:
            ``self``, with ``preprocessor`` and ``report_`` populated.

        Raises:
            ValueError: If ``data`` has no rows.
        """
        # Without this guard the cardinality ratio below divides by zero.
        if len(data) == 0:
            raise ValueError("Cannot fit AutoPreprocessor on a dataset with no rows.")

        roles = self._classify_columns(data)

        suggested = self._suggest_ordinal_columns(data, roles["onehot_encoder"])
        roles["ordinal_encoder"] = suggested
        roles["onehot_encoder"] = [
            col for col in roles["onehot_encoder"] if col not in suggested
        ]
        if interactive:
            self._confirm_ordinal_columns(
                data, roles["ordinal_encoder"], roles["onehot_encoder"]
            )

        # Dropped columns never enter any role list (see _classify_columns),
        # so no extra clean-up pass over the lists is needed here.
        self.preprocessor = ColumnTransformer(
            transformers=self._build_transformers(roles),
            remainder='drop'
        )
        self.preprocessor.fit(data)

        self.report_ = {
            "Mean Imputer": roles["mean_imputer"],
            "Median Imputer": roles["median_imputer"],
            "Mode Imputer": roles["mode_imputer"],
            "Label Encoder": roles["label_encoder"],
            "OneHot Encoder": roles["onehot_encoder"],
            "Ordinal Encoder": roles["ordinal_encoder"],
            "Standard Scaler": roles["standard_scaler"],
            "Robust Scaler": roles["robust_scaler"],
            "No Scaling Required": roles["no_scaling"],
            "Drop Columns": roles["drop_cols"]
        }

        return self

    def transform(self, data):
        """Apply the fitted pipeline to ``data`` and return the result.

        Raises:
            AttributeError: If called before ``fit``. The exception type is
                the one the unguarded call used to raise, so existing
                ``except AttributeError`` handlers keep working; only the
                message is now explicit.
        """
        if self.preprocessor is None:
            raise AttributeError(
                "This AutoPreprocessor instance is not fitted yet. "
                "Call 'fit' or 'fit_transform' before 'transform'."
            )
        return self.preprocessor.transform(data)

    def fit_transform(self, data, *, interactive=True):
        """Fit on ``data`` (see ``fit``) and return the transformed ``data``."""
        self.fit(data, interactive=interactive)
        return self.transform(data)

    @staticmethod
    def _is_categorical(series):
        """Return True for object, string and categorical dtype columns."""
        # A plain ``dtype == object`` test misses pandas' dedicated string
        # and categorical dtypes, which would then be sent down the numeric
        # path and fail on ``skew()``.
        dtype = series.dtype
        return (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )

    def _classify_columns(self, data):
        """Assign every column of ``data`` to imputation/encoding/scaling roles."""
        roles = {
            name: []
            for name in (
                "mean_imputer", "median_imputer", "mode_imputer",
                "label_encoder", "onehot_encoder", "ordinal_encoder",
                "standard_scaler", "robust_scaler", "no_scaling", "drop_cols",
            )
        }
        n_rows = len(data)

        for col in tqdm(data.columns, desc="Classifying Columns"):
            series = data[col]
            if series.isnull().mean() > self.MAX_MISSING_RATIO:
                roles["drop_cols"].append(col)
                continue

            has_missing = series.isnull().sum() > 0

            if self._is_categorical(series):
                if has_missing:
                    roles["mode_imputer"].append(col)
                if series.nunique() == 2:
                    roles["label_encoder"].append(col)
                else:
                    # Provisional: the RAG step may move it to ordinal.
                    roles["onehot_encoder"].append(col)
                continue

            # Skewness is computed lazily and at most once per column.
            skewed = None
            if has_missing:
                skewed = abs(series.skew()) > self.SKEW_LIMIT
                target = "median_imputer" if skewed else "mean_imputer"
                roles[target].append(col)

            if series.nunique() / n_rows < self.LOW_CARDINALITY_RATIO:
                roles["no_scaling"].append(col)
            else:
                if skewed is None:
                    skewed = abs(series.skew()) > self.SKEW_LIMIT
                target = "robust_scaler" if skewed else "standard_scaler"
                roles[target].append(col)

        return roles

    def _suggest_ordinal_columns(self, data, candidates):
        """Return the ``candidates`` that the RAG lookup labels as ordinal."""
        # Load the embedding model and the vector store only when there is
        # something to look up; purely numeric/binary data needs neither.
        if not candidates:
            return []

        model = SentenceTransformer("all-MiniLM-L6-v2")
        client = chromadb.PersistentClient(
            path=str(BASE_DIR / "chroma_db")
        )
        collection = client.get_collection("rag_collection")

        ordinal = []
        for col in tqdm(candidates, desc="RAG Encoding Prediction"):
            values = pd.Series(data[col].dropna().unique()).astype(str).str.lower().tolist()
            query_embedding = model.encode(" ".join(values))
            results = collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=3
            )
            distances = results["distances"][0]
            metadatas = results["metadatas"][0]
            if not distances:
                # Empty collection: nothing retrieved, keep the default encoding.
                continue

            # NOTE(review): treating 1 - distance as a similarity presumes the
            # collection uses a cosine-style distance; confirm against how
            # ``rag_collection`` was created.
            similarities = [1 - distance for distance in distances]
            best = max(range(len(similarities)), key=similarities.__getitem__)

            prediction = self.DEFAULT_ENCODING
            if similarities[best] > self.SIMILARITY_THRESHOLD:
                meta = metadatas[best] or {}
                prediction = meta.get("encoding", self.DEFAULT_ENCODING)
            if prediction == "ordinal":
                ordinal.append(col)

        return ordinal

    @staticmethod
    def _confirm_ordinal_columns(data, ordinal, onehot):
        """Ask on stdin whether each proposed ordinal column stays ordinal.

        Rejected columns are moved, in place, from ``ordinal`` to the end of
        ``onehot``.
        """
        print("Ordinal encoding columns:\n")
        # Iterate over a copy because rejected columns are removed in the loop.
        for col in list(ordinal):
            print(f"Column: {col}")
            print("Unique values:")
            print(data[col].dropna().unique())

            while True:
                answer = input("Keep as Ordinal? (Y/N): ").strip().upper()
                if answer == "Y":
                    break
                if answer == "N":
                    onehot.append(col)
                    ordinal.remove(col)
                    break
                print("Only Y and N are expected.")

    @staticmethod
    def _build_transformers(roles):
        """Turn the role lists into ``ColumnTransformer`` transformer triples.

        The order of the returned triples fixes the column order of the
        transformed output, so it must not be rearranged.
        """
        def both(first, second):
            wanted = set(roles[second])
            return [col for col in roles[first] if col in wanted]

        def without(name, excluded):
            return [col for col in roles[name] if col not in excluded]

        def imputer(strategy):
            return ('imputer', SimpleImputer(strategy=strategy))

        def encoder(kind):
            if kind == "onehot":
                estimator = OneHotEncoder(handle_unknown='ignore')
            elif kind == "ordinal":
                estimator = OrdinalEncoder(
                    handle_unknown='use_encoded_value', unknown_value=-1
                )
            else:
                # Binary ("label") columns are encoded to 0/1 with OrdinalEncoder.
                estimator = OrdinalEncoder()
            return ('encoder', estimator)

        def scaler(kind):
            estimator = StandardScaler() if kind == "standard" else RobustScaler()
            return ('scaler', estimator)

        mode_imputed = set(roles["mode_imputer"])
        numeric_imputed = set(roles["mean_imputer"]) | set(roles["median_imputer"])

        # (name, columns, pipeline steps); ``None`` steps mean passthrough.
        plan = [
            ('mode_label', both("mode_imputer", "label_encoder"),
             [imputer('most_frequent'), encoder("label")]),
            ('mode_onehot', both("mode_imputer", "onehot_encoder"),
             [imputer('most_frequent'), encoder("onehot")]),
            ('mode_ordinal', both("mode_imputer", "ordinal_encoder"),
             [imputer('most_frequent'), encoder("ordinal")]),
            ('mean_standard', both("mean_imputer", "standard_scaler"),
             [imputer('mean'), scaler("standard")]),
            ('mean_robust', both("mean_imputer", "robust_scaler"),
             [imputer('mean'), scaler("robust")]),
            ('mean_noscale', both("mean_imputer", "no_scaling"),
             [imputer('mean')]),
            ('median_standard', both("median_imputer", "standard_scaler"),
             [imputer('median'), scaler("standard")]),
            ('median_robust', both("median_imputer", "robust_scaler"),
             [imputer('median'), scaler("robust")]),
            ('median_noscale', both("median_imputer", "no_scaling"),
             [imputer('median')]),
            ('label', without("label_encoder", mode_imputed),
             [encoder("label")]),
            ('onehot', without("onehot_encoder", mode_imputed),
             [encoder("onehot")]),
            ('ordinal', without("ordinal_encoder", mode_imputed),
             [encoder("ordinal")]),
            ('robust', without("robust_scaler", numeric_imputed),
             [scaler("robust")]),
            ('standard', without("standard_scaler", numeric_imputed),
             [scaler("standard")]),
            ('no_scaling', without("no_scaling", numeric_imputed), None),
        ]

        transformers = []
        for name, columns, steps in plan:
            if not columns:
                continue
            step = 'passthrough' if steps is None else Pipeline(steps)
            transformers.append((name, step, columns))
        return transformers
@@ -0,0 +1,101 @@
1
+ Metadata-Version: 2.4
2
+ Name: autopreprocessor
3
+ Version: 0.1.0
4
+ Summary: Automatic preprocessing for tabular datasets using heuristics and RAG-assisted encoding detection.
5
+ Author: Saad Sayed
6
+ Requires-Python: >=3.9
7
+ Description-Content-Type: text/markdown
8
+ License-File: LICENSE.txt
9
+ Requires-Dist: numpy
10
+ Requires-Dist: pandas
11
+ Requires-Dist: scikit-learn
12
+ Requires-Dist: sentence-transformers
13
+ Requires-Dist: chromadb
14
+ Requires-Dist: langchain-community
15
+ Requires-Dist: tqdm
16
+ Dynamic: license-file
17
+
18
+ # AutoPreprocessor
19
+
20
+ An experimental AutoML preprocessing package that combines rule-based preprocessing with a lightweight Retrieval-Augmented Generation (RAG) system to automatically suggest encoding strategies for tabular datasets.
21
+
22
+ The project uses Sentence Transformers and ChromaDB locally. It does not rely on external LLM APIs.
23
+
24
+ ## Features
25
+
26
+ - Mean, median, and mode imputation
27
+ - Label encoding for binary columns
28
+ - One-hot encoding
29
+ - RAG-assisted ordinal encoding
30
+ - Standard scaling and robust scaling
31
+ - Automatic dropping of columns with excessive missing values
32
+
33
+ ## Installation
34
+
35
+ Install locally from the project root:
36
+
37
+ ```bash
38
+ pip install .
39
+ ```
40
+
41
+ For development, you can also install dependencies directly:
42
+
43
+ ```bash
44
+ pip install -r requirements.txt
45
+ ```
46
+
47
+ ## Usage
48
+
49
+ ```python
50
+ import pandas as pd
51
+ from autopreprocessor import AutoPreprocessor
52
+
53
+ data = pd.read_csv("sample.csv")
54
+
55
+ auto = AutoPreprocessor()
56
+ X_processed = auto.fit_transform(data)
57
+
58
+ print(auto.report_)
59
+ ```
60
+
61
+ ## RAG And ChromaDB
62
+
63
+ The package preserves the existing RAG behavior and ChromaDB usage. `AutoPreprocessor` loads the persistent ChromaDB collection named `rag_collection` from:
64
+
65
+ ```text
66
+ autopreprocessor/chroma_db/
67
+ ```
68
+
69
+ The ChromaDB path is resolved with `pathlib` relative to the installed package location, so it can work across machines and operating systems.
70
+
71
+ ## Project Structure
72
+
73
+ ```text
74
+ AutoPreprocessor/
75
+ |
76
+ |-- autopreprocessor/
77
+ | |-- __init__.py
78
+ | |-- auto_preprocessor.py
79
+ | |-- chroma_db/
80
+ |
81
+ |-- requirements.txt
82
+ |-- setup.py
83
+ |-- pyproject.toml
84
+ |-- README.md
85
+ |-- .gitignore
86
+ ```
87
+
88
+ ## API
89
+
90
+ ```python
91
+ from autopreprocessor import AutoPreprocessor
92
+ ```
93
+
94
+ `AutoPreprocessor` exposes:
95
+
96
+ - `fit(data)`
97
+ - `transform(data)`
98
+ - `fit_transform(data)`
99
+ - `report_`
100
+
101
+ The preprocessing logic is preserved from the original notebook implementation.
@@ -0,0 +1,11 @@
1
+ LICENSE.txt
2
+ README.md
3
+ pyproject.toml
4
+ setup.py
5
+ autopreprocessor/__init__.py
6
+ autopreprocessor/auto_preprocessor.py
7
+ autopreprocessor.egg-info/PKG-INFO
8
+ autopreprocessor.egg-info/SOURCES.txt
9
+ autopreprocessor.egg-info/dependency_links.txt
10
+ autopreprocessor.egg-info/requires.txt
11
+ autopreprocessor.egg-info/top_level.txt
@@ -0,0 +1,7 @@
1
+ numpy
2
+ pandas
3
+ scikit-learn
4
+ sentence-transformers
5
+ chromadb
6
+ langchain-community
7
+ tqdm
@@ -0,0 +1 @@
1
+ autopreprocessor
@@ -0,0 +1,24 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "autopreprocessor"
7
+ version = "0.1.0"
8
+ description = "Automatic preprocessing for tabular datasets using heuristics and RAG-assisted encoding detection."
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+
12
+ authors = [
13
+ {name = "Saad Sayed"}
14
+ ]
15
+
16
+ dependencies = [
17
+ "numpy",
18
+ "pandas",
19
+ "scikit-learn",
20
+ "sentence-transformers",
21
+ "chromadb",
22
+ "langchain-community",
23
+ "tqdm"
24
+ ]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,5 @@
1
"""Setuptools build script; project metadata is declared in pyproject.toml."""
from setuptools import setup, find_packages

setup(
    packages=find_packages(),
    # AutoPreprocessor.fit opens the persisted ChromaDB store that sits next
    # to the module (autopreprocessor/chroma_db/), so the store has to be
    # shipped with the package or the collection lookup fails after install.
    # The patterns cover files directly under chroma_db/ and one directory
    # level below it; widen them if the store nests deeper.
    package_data={"autopreprocessor": ["chroma_db/*", "chroma_db/*/*"]},
)