autopreprocessor 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- autopreprocessor-0.1.0/LICENSE.txt +21 -0
- autopreprocessor-0.1.0/PKG-INFO +101 -0
- autopreprocessor-0.1.0/README.md +84 -0
- autopreprocessor-0.1.0/autopreprocessor/__init__.py +2 -0
- autopreprocessor-0.1.0/autopreprocessor/auto_preprocessor.py +221 -0
- autopreprocessor-0.1.0/autopreprocessor.egg-info/PKG-INFO +101 -0
- autopreprocessor-0.1.0/autopreprocessor.egg-info/SOURCES.txt +11 -0
- autopreprocessor-0.1.0/autopreprocessor.egg-info/dependency_links.txt +1 -0
- autopreprocessor-0.1.0/autopreprocessor.egg-info/requires.txt +7 -0
- autopreprocessor-0.1.0/autopreprocessor.egg-info/top_level.txt +1 -0
- autopreprocessor-0.1.0/pyproject.toml +24 -0
- autopreprocessor-0.1.0/setup.cfg +4 -0
- autopreprocessor-0.1.0/setup.py +5 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Saad Sayed
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: autopreprocessor
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Automatic preprocessing for tabular datasets using heuristics and RAG-assisted encoding detection.
|
|
5
|
+
Author: Saad Sayed
|
|
6
|
+
Requires-Python: >=3.9
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
License-File: LICENSE.txt
|
|
9
|
+
Requires-Dist: numpy
|
|
10
|
+
Requires-Dist: pandas
|
|
11
|
+
Requires-Dist: scikit-learn
|
|
12
|
+
Requires-Dist: sentence-transformers
|
|
13
|
+
Requires-Dist: chromadb
|
|
14
|
+
Requires-Dist: langchain-community
|
|
15
|
+
Requires-Dist: tqdm
|
|
16
|
+
Dynamic: license-file
|
|
17
|
+
|
|
18
|
+
# AutoPreprocessor
|
|
19
|
+
|
|
20
|
+
An experimental AutoML preprocessing package that combines rule-based preprocessing with a lightweight Retrieval-Augmented Generation (RAG) system to automatically suggest encoding strategies for tabular datasets.
|
|
21
|
+
|
|
22
|
+
The project uses Sentence Transformers and ChromaDB locally. It does not rely on external LLM APIs.
|
|
23
|
+
|
|
24
|
+
## Features
|
|
25
|
+
|
|
26
|
+
- Mean, median, and mode imputation
|
|
27
|
+
- Label encoding for binary columns
|
|
28
|
+
- One-hot encoding
|
|
29
|
+
- RAG-assisted ordinal encoding
|
|
30
|
+
- Standard scaling and robust scaling
|
|
31
|
+
- Automatic dropping of columns with excessive missing values
|
|
32
|
+
|
|
33
|
+
## Installation
|
|
34
|
+
|
|
35
|
+
Install locally from the project root:
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
pip install .
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
For development, you can also install dependencies directly:
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
pip install -r requirements.txt
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Usage
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
import pandas as pd
|
|
51
|
+
from autopreprocessor import AutoPreprocessor
|
|
52
|
+
|
|
53
|
+
data = pd.read_csv("sample.csv")
|
|
54
|
+
|
|
55
|
+
auto = AutoPreprocessor()
|
|
56
|
+
X_processed = auto.fit_transform(data)
|
|
57
|
+
|
|
58
|
+
print(auto.report_)
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## RAG And ChromaDB
|
|
62
|
+
|
|
63
|
+
The package preserves the existing RAG behavior and ChromaDB usage. `AutoPreprocessor` loads the persistent ChromaDB collection named `rag_collection` from:
|
|
64
|
+
|
|
65
|
+
```text
|
|
66
|
+
autopreprocessor/chroma_db/
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
The ChromaDB path is resolved with `pathlib` relative to the installed package location, so it can work across machines and operating systems.
|
|
70
|
+
|
|
71
|
+
## Project Structure
|
|
72
|
+
|
|
73
|
+
```text
|
|
74
|
+
AutoPreprocessor/
|
|
75
|
+
|
|
|
76
|
+
|-- autopreprocessor/
|
|
77
|
+
| |-- __init__.py
|
|
78
|
+
| |-- auto_preprocessor.py
|
|
79
|
+
| |-- chroma_db/
|
|
80
|
+
|
|
|
81
|
+
|-- requirements.txt
|
|
82
|
+
|-- setup.py
|
|
83
|
+
|-- pyproject.toml
|
|
84
|
+
|-- README.md
|
|
85
|
+
|-- .gitignore
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## API
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
from autopreprocessor import AutoPreprocessor
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
`AutoPreprocessor` exposes:
|
|
95
|
+
|
|
96
|
+
- `fit(data)`
|
|
97
|
+
- `transform(data)`
|
|
98
|
+
- `fit_transform(data)`
|
|
99
|
+
- `report_`
|
|
100
|
+
|
|
101
|
+
The preprocessing logic is preserved from the original notebook implementation.
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
# AutoPreprocessor
|
|
2
|
+
|
|
3
|
+
An experimental AutoML preprocessing package that combines rule-based preprocessing with a lightweight Retrieval-Augmented Generation (RAG) system to automatically suggest encoding strategies for tabular datasets.
|
|
4
|
+
|
|
5
|
+
The project uses Sentence Transformers and ChromaDB locally. It does not rely on external LLM APIs.
|
|
6
|
+
|
|
7
|
+
## Features
|
|
8
|
+
|
|
9
|
+
- Mean, median, and mode imputation
|
|
10
|
+
- Label encoding for binary columns
|
|
11
|
+
- One-hot encoding
|
|
12
|
+
- RAG-assisted ordinal encoding
|
|
13
|
+
- Standard scaling and robust scaling
|
|
14
|
+
- Automatic dropping of columns with excessive missing values
|
|
15
|
+
|
|
16
|
+
## Installation
|
|
17
|
+
|
|
18
|
+
Install locally from the project root:
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
pip install .
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
For development, you can also install dependencies directly:
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
pip install -r requirements.txt
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Usage
|
|
31
|
+
|
|
32
|
+
```python
|
|
33
|
+
import pandas as pd
|
|
34
|
+
from autopreprocessor import AutoPreprocessor
|
|
35
|
+
|
|
36
|
+
data = pd.read_csv("sample.csv")
|
|
37
|
+
|
|
38
|
+
auto = AutoPreprocessor()
|
|
39
|
+
X_processed = auto.fit_transform(data)
|
|
40
|
+
|
|
41
|
+
print(auto.report_)
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## RAG And ChromaDB
|
|
45
|
+
|
|
46
|
+
The package preserves the existing RAG behavior and ChromaDB usage. `AutoPreprocessor` loads the persistent ChromaDB collection named `rag_collection` from:
|
|
47
|
+
|
|
48
|
+
```text
|
|
49
|
+
autopreprocessor/chroma_db/
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
The ChromaDB path is resolved with `pathlib` relative to the installed package location, so it can work across machines and operating systems.
|
|
53
|
+
|
|
54
|
+
## Project Structure
|
|
55
|
+
|
|
56
|
+
```text
|
|
57
|
+
AutoPreprocessor/
|
|
58
|
+
|
|
|
59
|
+
|-- autopreprocessor/
|
|
60
|
+
| |-- __init__.py
|
|
61
|
+
| |-- auto_preprocessor.py
|
|
62
|
+
| |-- chroma_db/
|
|
63
|
+
|
|
|
64
|
+
|-- requirements.txt
|
|
65
|
+
|-- setup.py
|
|
66
|
+
|-- pyproject.toml
|
|
67
|
+
|-- README.md
|
|
68
|
+
|-- .gitignore
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## API
|
|
72
|
+
|
|
73
|
+
```python
|
|
74
|
+
from autopreprocessor import AutoPreprocessor
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
`AutoPreprocessor` exposes:
|
|
78
|
+
|
|
79
|
+
- `fit(data)`
|
|
80
|
+
- `transform(data)`
|
|
81
|
+
- `fit_transform(data)`
|
|
82
|
+
- `report_`
|
|
83
|
+
|
|
84
|
+
The preprocessing logic is preserved from the original notebook implementation.
|
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from tqdm import tqdm
|
|
5
|
+
from sentence_transformers import SentenceTransformer
|
|
6
|
+
import chromadb
|
|
7
|
+
from sklearn.model_selection import train_test_split
|
|
8
|
+
from sklearn import datasets
|
|
9
|
+
from sklearn.pipeline import Pipeline, make_pipeline
|
|
10
|
+
from sklearn.compose import ColumnTransformer, make_column_selector
|
|
11
|
+
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler, OrdinalEncoder, LabelEncoder
|
|
12
|
+
from sklearn.impute import SimpleImputer
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Directory containing this module; the persistent ChromaDB store is looked up
# relative to it (BASE_DIR / "chroma_db") rather than the working directory.
BASE_DIR = Path(__file__).parent
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class AutoPreprocessor:
    """Build and apply a scikit-learn ``ColumnTransformer`` chosen by heuristics.

    Every column of the training frame is assigned an imputation strategy
    (mean / median / mode / none) and either an encoder (label, one-hot,
    ordinal) or a scaler (standard, robust, none).  One-hot candidates are
    additionally looked up in the ``rag_collection`` ChromaDB collection to
    decide whether ordinal encoding is a better fit.

    Parameters
    ----------
    confirm_ordinal : bool, default True
        When true, ``fit`` asks on stdin (``input``) whether each column that
        the lookup suggested as ordinal should stay ordinal.  Set to false to
        accept the suggestions without prompting (e.g. in batch jobs).

    Attributes
    ----------
    preprocessor : ColumnTransformer or None
        The fitted transformer; ``None`` until ``fit`` succeeds.
    report_ : dict or None
        Maps each strategy name to the list of columns assigned to it.
    """

    # A retrieved neighbour is trusted only above this similarity.
    SIMILARITY_THRESHOLD = 0.5
    # Encoding used when the lookup yields nothing trustworthy.
    DEFAULT_ENCODING = "onehot"
    # Columns with a larger share of missing values are dropped.
    MAX_MISSING_FRACTION = 0.5
    # Numeric columns with a smaller unique/rows ratio are left unscaled.
    LOW_CARDINALITY_RATIO = 0.05
    # |skew| above this selects median imputation / robust scaling.
    SKEW_LIMIT = 1

    def __init__(self, confirm_ordinal=True):
        self.confirm_ordinal = confirm_ordinal
        self.preprocessor = None
        self.report_ = None

    @staticmethod
    def _is_categorical(series):
        """Return True for object, pandas string, and categorical columns.

        Checking only ``dtype == object`` misses the dedicated string dtype
        (the default for text columns in pandas 3) and ``category`` columns,
        which would then be treated as numeric.
        """
        dtype = series.dtype
        return (
            isinstance(dtype, (pd.StringDtype, pd.CategoricalDtype))
            or pd.api.types.is_object_dtype(dtype)
        )

    @classmethod
    def _classify_column(cls, series):
        """Pick the imputer and encoder/scaler for one non-empty column.

        Returns
        -------
        None
            If the column should be dropped (too many missing values).
        tuple
            ``(imputer, handler)`` where ``imputer`` is ``"mean"``,
            ``"median"``, ``"mode"`` or ``None`` and ``handler`` is one of
            ``"label"``, ``"onehot"``, ``"standard"``, ``"robust"``, ``"none"``.
        """
        if series.isnull().mean() > cls.MAX_MISSING_FRACTION:
            return None

        has_missing = series.isnull().sum() > 0

        if cls._is_categorical(series):
            imputer = "mode" if has_missing else None
            handler = "label" if series.nunique() == 2 else "onehot"
            return imputer, handler

        def is_skewed():
            # Evaluated lazily: skew is only needed on some paths.
            return abs(series.skew()) > cls.SKEW_LIMIT

        imputer = None
        if has_missing:
            imputer = "median" if is_skewed() else "mean"

        if series.nunique() / len(series) < cls.LOW_CARDINALITY_RATIO:
            handler = "none"
        elif is_skewed():
            handler = "robust"
        else:
            handler = "standard"
        return imputer, handler

    @staticmethod
    def _load_rag_backend():
        """Load the embedding model and the persistent ``rag_collection``."""
        model = SentenceTransformer("all-MiniLM-L6-v2")
        client = chromadb.PersistentClient(path=str(BASE_DIR / "chroma_db"))
        return model, client.get_collection("rag_collection")

    def _suggest_encoding(self, series, model, collection):
        """Return the encoding suggested by the nearest stored example.

        The query text is the lower-cased unique values of the column joined
        by spaces.  Falls back to ``DEFAULT_ENCODING`` when the collection
        returns no hits, when the best hit is not similar enough, or when it
        carries no ``"encoding"`` metadata.
        """
        values = (
            pd.Series(series.dropna().unique()).astype(str).str.lower().tolist()
        )
        query_embedding = model.encode(" ".join(values))
        results = collection.query(
            query_embeddings=[query_embedding.tolist()],
            n_results=3,
        )
        distances = results["distances"][0]
        metadatas = results["metadatas"][0]
        if not distances:
            return self.DEFAULT_ENCODING

        # NOTE(review): ``1 - distance`` is a similarity only if the collection
        # was created with a cosine distance space -- confirm.
        similarities = [1 - distance for distance in distances]
        best = max(range(len(similarities)), key=similarities.__getitem__)
        if similarities[best] > self.SIMILARITY_THRESHOLD:
            metadata = metadatas[best] or {}
            return metadata.get("encoding", self.DEFAULT_ENCODING)
        return self.DEFAULT_ENCODING

    @staticmethod
    def _confirm_ordinal_columns(data, onehot_encoder, ordinal_encoder):
        """Interactively move rejected ordinal suggestions back to one-hot.

        Both lists are modified in place; rejected columns are appended to the
        end of ``onehot_encoder``.
        """
        print("Ordinal encoding columns:\n")
        for col in list(ordinal_encoder):
            print(f"Column: {col}")
            print("Unique values:")
            print(data[col].dropna().unique())

            while True:
                proceed = input("Keep as Ordinal? (Y/N): ").strip().upper()
                if proceed == "Y":
                    break
                if proceed == "N":
                    onehot_encoder.append(col)
                    ordinal_encoder.remove(col)
                    break
                print("Only Y and N are expected.")

    @staticmethod
    def _build_transformers(report):
        """Turn the per-strategy column lists into ColumnTransformer entries.

        The entry names and their order match the column layout of the
        transformed output, so they must stay stable.
        """
        mean_imputer = report["Mean Imputer"]
        median_imputer = report["Median Imputer"]
        mode_imputer = report["Mode Imputer"]
        label_encoder = report["Label Encoder"]
        onehot_encoder = report["OneHot Encoder"]
        ordinal_encoder = report["Ordinal Encoder"]
        standard_scaler = report["Standard Scaler"]
        robust_scaler = report["Robust Scaler"]
        no_scaling = report["No Scaling Required"]

        mode_cols = set(mode_imputer)
        numeric_imputed = set(mean_imputer) | set(median_imputer)

        def both(imputed, handled):
            # Columns needing this imputer AND this handler, in imputer order.
            members = set(handled)
            return [col for col in imputed if col in members]

        def only(handled, imputed):
            # Columns needing this handler but no imputation.
            return [col for col in handled if col not in imputed]

        def pipeline(imputer=None, step=None):
            steps = []
            if imputer is not None:
                steps.append(('imputer', SimpleImputer(strategy=imputer)))
            if step is not None:
                steps.append(step)
            return Pipeline(steps)

        def label_step():
            return ('encoder', OrdinalEncoder())

        def onehot_step():
            return ('encoder', OneHotEncoder(handle_unknown='ignore'))

        def ordinal_step():
            return ('encoder', OrdinalEncoder(
                handle_unknown='use_encoded_value', unknown_value=-1))

        def standard_step():
            return ('scaler', StandardScaler())

        def robust_step():
            return ('scaler', RobustScaler())

        candidates = [
            ('mode_label', pipeline('most_frequent', label_step()),
             both(mode_imputer, label_encoder)),
            ('mode_onehot', pipeline('most_frequent', onehot_step()),
             both(mode_imputer, onehot_encoder)),
            ('mode_ordinal', pipeline('most_frequent', ordinal_step()),
             both(mode_imputer, ordinal_encoder)),
            ('mean_standard', pipeline('mean', standard_step()),
             both(mean_imputer, standard_scaler)),
            ('mean_robust', pipeline('mean', robust_step()),
             both(mean_imputer, robust_scaler)),
            ('mean_noscale', pipeline('mean'),
             both(mean_imputer, no_scaling)),
            ('median_standard', pipeline('median', standard_step()),
             both(median_imputer, standard_scaler)),
            ('median_robust', pipeline('median', robust_step()),
             both(median_imputer, robust_scaler)),
            ('median_noscale', pipeline('median'),
             both(median_imputer, no_scaling)),
            ('label', pipeline(step=label_step()),
             only(label_encoder, mode_cols)),
            ('onehot', pipeline(step=onehot_step()),
             only(onehot_encoder, mode_cols)),
            ('ordinal', pipeline(step=ordinal_step()),
             only(ordinal_encoder, mode_cols)),
            ('robust', pipeline(step=robust_step()),
             only(robust_scaler, numeric_imputed)),
            ('standard', pipeline(step=standard_step()),
             only(standard_scaler, numeric_imputed)),
            ('no_scaling', 'passthrough',
             only(no_scaling, numeric_imputed)),
        ]
        # ColumnTransformer entries with no columns are simply left out.
        return [entry for entry in candidates if entry[2]]

    def fit(self, data):
        """Choose a strategy for every column and fit the transformer.

        Parameters
        ----------
        data : pandas.DataFrame
            Training data; must contain at least one row.

        Returns
        -------
        AutoPreprocessor
            ``self``, with ``preprocessor`` and ``report_`` populated.

        Raises
        ------
        ValueError
            If ``data`` has no rows.
        """
        if len(data) == 0:
            raise ValueError("Cannot fit AutoPreprocessor on an empty dataset.")

        mean_imputer, median_imputer, mode_imputer = [], [], []
        label_encoder, onehot_encoder, ordinal_encoder = [], [], []
        standard_scaler, robust_scaler, no_scaling = [], [], []
        drop_cols = []

        imputer_lists = {
            "mean": mean_imputer,
            "median": median_imputer,
            "mode": mode_imputer,
        }
        handler_lists = {
            "label": label_encoder,
            "onehot": onehot_encoder,
            "standard": standard_scaler,
            "robust": robust_scaler,
            "none": no_scaling,
        }

        for col in tqdm(data.columns, desc="Classifying Columns"):
            roles = self._classify_column(data[col])
            if roles is None:
                drop_cols.append(col)
                continue
            imputer, handler = roles
            if imputer is not None:
                imputer_lists[imputer].append(col)
            handler_lists[handler].append(col)

        # The embedding model and the vector store are only needed when some
        # column is a one-hot candidate, so load them lazily.
        if onehot_encoder:
            model, collection = self._load_rag_backend()
            for col in tqdm(onehot_encoder, desc="RAG Encoding Prediction"):
                suggestion = self._suggest_encoding(data[col], model, collection)
                if suggestion == "ordinal":
                    ordinal_encoder.append(col)
            onehot_encoder[:] = [
                col for col in onehot_encoder if col not in ordinal_encoder
            ]

        if self.confirm_ordinal:
            self._confirm_ordinal_columns(data, onehot_encoder, ordinal_encoder)

        report = {
            "Mean Imputer": mean_imputer,
            "Median Imputer": median_imputer,
            "Mode Imputer": mode_imputer,
            "Label Encoder": label_encoder,
            "OneHot Encoder": onehot_encoder,
            "Ordinal Encoder": ordinal_encoder,
            "Standard Scaler": standard_scaler,
            "Robust Scaler": robust_scaler,
            "No Scaling Required": no_scaling,
            "Drop Columns": drop_cols
        }

        # Columns not named in any transformer (the dropped ones) are removed
        # from the output by remainder='drop'.
        preprocessor = ColumnTransformer(
            transformers=self._build_transformers(report),
            remainder='drop'
        )
        preprocessor.fit(data)

        self.preprocessor = preprocessor
        self.report_ = report
        return self

    def transform(self, data):
        """Apply the fitted transformer to ``data``.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called.  This exception also derives from
            ``AttributeError``, which is what an unfitted call raised before.
        """
        if self.preprocessor is None:
            from sklearn.exceptions import NotFittedError

            raise NotFittedError(
                "This AutoPreprocessor instance is not fitted yet. "
                "Call 'fit' before using 'transform'."
            )
        return self.preprocessor.transform(data)

    def fit_transform(self, data):
        """Fit on ``data`` and return its transformed version."""
        return self.fit(data).transform(data)
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: autopreprocessor
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Automatic preprocessing for tabular datasets using heuristics and RAG-assisted encoding detection.
|
|
5
|
+
Author: Saad Sayed
|
|
6
|
+
Requires-Python: >=3.9
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
License-File: LICENSE.txt
|
|
9
|
+
Requires-Dist: numpy
|
|
10
|
+
Requires-Dist: pandas
|
|
11
|
+
Requires-Dist: scikit-learn
|
|
12
|
+
Requires-Dist: sentence-transformers
|
|
13
|
+
Requires-Dist: chromadb
|
|
14
|
+
Requires-Dist: langchain-community
|
|
15
|
+
Requires-Dist: tqdm
|
|
16
|
+
Dynamic: license-file
|
|
17
|
+
|
|
18
|
+
# AutoPreprocessor
|
|
19
|
+
|
|
20
|
+
An experimental AutoML preprocessing package that combines rule-based preprocessing with a lightweight Retrieval-Augmented Generation (RAG) system to automatically suggest encoding strategies for tabular datasets.
|
|
21
|
+
|
|
22
|
+
The project uses Sentence Transformers and ChromaDB locally. It does not rely on external LLM APIs.
|
|
23
|
+
|
|
24
|
+
## Features
|
|
25
|
+
|
|
26
|
+
- Mean, median, and mode imputation
|
|
27
|
+
- Label encoding for binary columns
|
|
28
|
+
- One-hot encoding
|
|
29
|
+
- RAG-assisted ordinal encoding
|
|
30
|
+
- Standard scaling and robust scaling
|
|
31
|
+
- Automatic dropping of columns with excessive missing values
|
|
32
|
+
|
|
33
|
+
## Installation
|
|
34
|
+
|
|
35
|
+
Install locally from the project root:
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
pip install .
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
For development, you can also install dependencies directly:
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
pip install -r requirements.txt
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Usage
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
import pandas as pd
|
|
51
|
+
from autopreprocessor import AutoPreprocessor
|
|
52
|
+
|
|
53
|
+
data = pd.read_csv("sample.csv")
|
|
54
|
+
|
|
55
|
+
auto = AutoPreprocessor()
|
|
56
|
+
X_processed = auto.fit_transform(data)
|
|
57
|
+
|
|
58
|
+
print(auto.report_)
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## RAG And ChromaDB
|
|
62
|
+
|
|
63
|
+
The package preserves the existing RAG behavior and ChromaDB usage. `AutoPreprocessor` loads the persistent ChromaDB collection named `rag_collection` from:
|
|
64
|
+
|
|
65
|
+
```text
|
|
66
|
+
autopreprocessor/chroma_db/
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
The ChromaDB path is resolved with `pathlib` relative to the installed package location, so it can work across machines and operating systems.
|
|
70
|
+
|
|
71
|
+
## Project Structure
|
|
72
|
+
|
|
73
|
+
```text
|
|
74
|
+
AutoPreprocessor/
|
|
75
|
+
|
|
|
76
|
+
|-- autopreprocessor/
|
|
77
|
+
| |-- __init__.py
|
|
78
|
+
| |-- auto_preprocessor.py
|
|
79
|
+
| |-- chroma_db/
|
|
80
|
+
|
|
|
81
|
+
|-- requirements.txt
|
|
82
|
+
|-- setup.py
|
|
83
|
+
|-- pyproject.toml
|
|
84
|
+
|-- README.md
|
|
85
|
+
|-- .gitignore
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## API
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
from autopreprocessor import AutoPreprocessor
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
`AutoPreprocessor` exposes:
|
|
95
|
+
|
|
96
|
+
- `fit(data)`
|
|
97
|
+
- `transform(data)`
|
|
98
|
+
- `fit_transform(data)`
|
|
99
|
+
- `report_`
|
|
100
|
+
|
|
101
|
+
The preprocessing logic is preserved from the original notebook implementation.
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
LICENSE.txt
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
setup.py
|
|
5
|
+
autopreprocessor/__init__.py
|
|
6
|
+
autopreprocessor/auto_preprocessor.py
|
|
7
|
+
autopreprocessor.egg-info/PKG-INFO
|
|
8
|
+
autopreprocessor.egg-info/SOURCES.txt
|
|
9
|
+
autopreprocessor.egg-info/dependency_links.txt
|
|
10
|
+
autopreprocessor.egg-info/requires.txt
|
|
11
|
+
autopreprocessor.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
autopreprocessor
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "autopreprocessor"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Automatic preprocessing for tabular datasets using heuristics and RAG-assisted encoding detection."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "Saad Sayed"}
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
dependencies = [
|
|
17
|
+
"numpy",
|
|
18
|
+
"pandas",
|
|
19
|
+
"scikit-learn",
|
|
20
|
+
"sentence-transformers",
|
|
21
|
+
"chromadb",
|
|
22
|
+
"langchain-community",
|
|
23
|
+
"tqdm"
|
|
24
|
+
]
|