dataset-preprocessing-agent 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Ton Nom
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,148 @@
1
+ Metadata-Version: 2.4
2
+ Name: dataset-preprocessing-agent
3
+ Version: 0.1.0
4
+ Summary: LLM-based automated dataset standardization and evaluation framework
5
+ License: MIT License
6
+
7
+ Copyright (c) 2025 Ton Nom
8
+
9
+ Permission is hereby granted, free of charge, to any person obtaining a copy
10
+ of this software and associated documentation files (the "Software"), to deal
11
+ in the Software without restriction, including without limitation the rights
12
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13
+ copies of the Software, and to permit persons to whom the Software is
14
+ furnished to do so, subject to the following conditions:
15
+
16
+ The above copyright notice and this permission notice shall be included in all
17
+ copies or substantial portions of the Software.
18
+
19
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25
+ SOFTWARE.
26
+ Requires-Python: >=3.10
27
+ Description-Content-Type: text/markdown
28
+ License-File: LICENCE
29
+ Requires-Dist: datasets>=3.6.0
30
+ Requires-Dist: unitxt
31
+ Requires-Dist: transformers
32
+ Requires-Dist: torch
33
+ Requires-Dist: accelerate
34
+ Requires-Dist: sentence-transformers
35
+ Requires-Dist: openai
36
+ Requires-Dist: pandas
37
+ Requires-Dist: tasksource
38
+ Provides-Extra: notebook
39
+ Requires-Dist: seaborn; extra == "notebook"
40
+ Requires-Dist: matplotlib; extra == "notebook"
41
+ Requires-Dist: ipywidgets; extra == "notebook"
42
+ Dynamic: license-file
43
+
44
+ # dataset-preprocessing-agent
45
+
46
+ **Automated dataset standardization using LLM agents.**
47
+
48
+ Every HuggingFace dataset has a unique schema (`tweet_text`, `review_body`, `sentence1`, …), making it hard to reuse models across tasks without manual mapping work. This library automates that step: given a raw dataset, an LLM inspects a small sample and produces a JSON mapping of raw column names to a canonical schema, evaluated against [Unitxt](https://github.com/IBM/unitxt) and [tasksource](https://github.com/sileod/tasksource) ground truths.
49
+
50
+ ---
51
+
52
+ ## Installation
53
+
54
+ ```bash
55
+ pip install dataset-preprocessing-agent
56
+ ```
57
+
58
+ For notebook visualization support:
59
+
60
+ ```bash
61
+ pip install "dataset-preprocessing-agent[notebook]"
62
+ ```
63
+
64
+ **Python 3.10+ required.**
65
+
66
+ For the API backend, export your OpenRouter key:
67
+
68
+ ```bash
69
+ export OPENROUTER_API_KEY="your_key_here"
70
+ ```
71
+
72
+ ---
73
+
74
+ ## Quick Start
75
+
76
+ ```python
77
+ from dataset_preprocessing_agent.standardize_api import load_standardized_dataset
78
+
79
+ result = load_standardized_dataset("glue", config="sst2")
80
+ print(result["mapping"])
81
+ # {"task": "classification", "text": "sentence", "label": "label"}
82
+ ```
83
+
84
+ ### Evaluate against Unitxt ground truth
85
+
86
+ ```python
87
+ from dataset_preprocessing_agent.eval import evaluate
88
+
89
+ result = evaluate(hf_name="glue", hf_config="sst2", card_id="sst2")
90
+ print(result["score"])
91
+ print(result["gt_cols"]) # e.g. ['label', 'sentence']
92
+ print(result["pred_cols"]) # e.g. ['label', 'sentence']
93
+ ```
94
+
95
+ ### Evaluate against tasksource ground truth
96
+
97
+ ```python
98
+ from dataset_preprocessing_agent.eval_ts import evaluate_ts
99
+
100
+ result = evaluate_ts("glue", "rte")
101
+ print(result["score"])
102
+ print(result["ts_gt"]) # GT mapping from tasksource preprocessing
103
+ ```
104
+
105
+ ---
106
+
107
+ ## Architecture
108
+
109
+ The pipeline runs in three stages:
110
+
111
+ 1. **Standardization** — an LLM inspects 5–10 raw samples and outputs a JSON mapping of raw column names to canonical fields (`task`, `text` / `text_a` + `text_b`, `label`).
112
+ 2. **Mapping application** — `apply_llm_mapping` renames columns and converts integer labels to class name strings.
113
+ 3. **Evaluation** — the set of raw HuggingFace columns selected by the predicted mapping is compared with the ground-truth column set using Jaccard similarity (size of the intersection divided by size of the union).
114
+
115
+ ### Backends
116
+
117
+ | Module | Backend |
118
+ |--------|---------|
119
+ | `standardize_api` | Cloud LLM via OpenRouter API |
120
+ | `standardize_local` | Local HuggingFace model |
121
+
122
+ ### Baselines
123
+
124
+ | Baseline | Method |
125
+ |----------|--------|
126
+ | `baseline_keyword_match` | Synonym dictionary matching |
127
+ | `baseline_embedding_match` | Cosine similarity via `all-MiniLM-L6-v2` |
128
+
129
+ ### Evaluation backends
130
+
131
+ | Module | Ground truth |
132
+ |--------|-------------|
133
+ | `eval` | Unitxt task cards |
134
+ | `eval_ts` | tasksource preprocessing objects |
135
+
136
+ ---
137
+
138
+ ## Dependencies
139
+
140
+ | Package | Purpose |
141
+ |---------|---------|
142
+ | `unitxt` | Ground-truth task cards |
143
+ | `tasksource` | Ground-truth preprocessing objects |
144
+ | `datasets` | HuggingFace dataset loading |
145
+ | `transformers` + `torch` + `accelerate` | Local model inference |
146
+ | `openai` | OpenRouter API client |
147
+ | `sentence-transformers` | Embedding baseline |
148
+ | `pandas` | Result DataFrames |
@@ -0,0 +1,105 @@
1
+ # dataset-preprocessing-agent
2
+
3
+ **Automated dataset standardization using LLM agents.**
4
+
5
+ Every HuggingFace dataset has a unique schema (`tweet_text`, `review_body`, `sentence1`, …), making it hard to reuse models across tasks without manual mapping work. This library automates that step: given a raw dataset, an LLM inspects a small sample and produces a JSON mapping of raw column names to a canonical schema, evaluated against [Unitxt](https://github.com/IBM/unitxt) and [tasksource](https://github.com/sileod/tasksource) ground truths.
6
+
7
+ ---
8
+
9
+ ## Installation
10
+
11
+ ```bash
12
+ pip install dataset-preprocessing-agent
13
+ ```
14
+
15
+ For notebook visualization support:
16
+
17
+ ```bash
18
+ pip install "dataset-preprocessing-agent[notebook]"
19
+ ```
20
+
21
+ **Python 3.10+ required.**
22
+
23
+ For the API backend, export your OpenRouter key:
24
+
25
+ ```bash
26
+ export OPENROUTER_API_KEY="your_key_here"
27
+ ```
28
+
29
+ ---
30
+
31
+ ## Quick Start
32
+
33
+ ```python
34
+ from dataset_preprocessing_agent.standardize_api import load_standardized_dataset
35
+
36
+ result = load_standardized_dataset("glue", config="sst2")
37
+ print(result["mapping"])
38
+ # {"task": "classification", "text": "sentence", "label": "label"}
39
+ ```
40
+
41
+ ### Evaluate against Unitxt ground truth
42
+
43
+ ```python
44
+ from dataset_preprocessing_agent.eval import evaluate
45
+
46
+ result = evaluate(hf_name="glue", hf_config="sst2", card_id="sst2")
47
+ print(result["score"])
48
+ print(result["gt_cols"]) # e.g. ['label', 'sentence']
49
+ print(result["pred_cols"]) # e.g. ['label', 'sentence']
50
+ ```
51
+
52
+ ### Evaluate against tasksource ground truth
53
+
54
+ ```python
55
+ from dataset_preprocessing_agent.eval_ts import evaluate_ts
56
+
57
+ result = evaluate_ts("glue", "rte")
58
+ print(result["score"])
59
+ print(result["ts_gt"]) # GT mapping from tasksource preprocessing
60
+ ```
61
+
62
+ ---
63
+
64
+ ## Architecture
65
+
66
+ The pipeline runs in three stages:
67
+
68
+ 1. **Standardization** — an LLM inspects 5–10 raw samples and outputs a JSON mapping of raw column names to canonical fields (`task`, `text` / `text_a` + `text_b`, `label`).
69
+ 2. **Mapping application** — `apply_llm_mapping` renames columns and converts integer labels to class name strings.
70
+ 3. **Evaluation** — the set of raw HuggingFace columns selected by the predicted mapping is compared with the ground-truth column set using Jaccard similarity (size of the intersection divided by size of the union).
71
+
72
+ ### Backends
73
+
74
+ | Module | Backend |
75
+ |--------|---------|
76
+ | `standardize_api` | Cloud LLM via OpenRouter API |
77
+ | `standardize_local` | Local HuggingFace model |
78
+
79
+ ### Baselines
80
+
81
+ | Baseline | Method |
82
+ |----------|--------|
83
+ | `baseline_keyword_match` | Synonym dictionary matching |
84
+ | `baseline_embedding_match` | Cosine similarity via `all-MiniLM-L6-v2` |
85
+
86
+ ### Evaluation backends
87
+
88
+ | Module | Ground truth |
89
+ |--------|-------------|
90
+ | `eval` | Unitxt task cards |
91
+ | `eval_ts` | tasksource preprocessing objects |
92
+
93
+ ---
94
+
95
+ ## Dependencies
96
+
97
+ | Package | Purpose |
98
+ |---------|---------|
99
+ | `unitxt` | Ground-truth task cards |
100
+ | `tasksource` | Ground-truth preprocessing objects |
101
+ | `datasets` | HuggingFace dataset loading |
102
+ | `transformers` + `torch` + `accelerate` | Local model inference |
103
+ | `openai` | OpenRouter API client |
104
+ | `sentence-transformers` | Embedding baseline |
105
+ | `pandas` | Result DataFrames |
@@ -0,0 +1,41 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "dataset-preprocessing-agent"
7
+ version = "0.1.0"
8
+ description = "LLM-based automated dataset standardization and evaluation framework"
9
+ readme = "README.md"
10
+ license = { file = "LICENCE" }
11
+ requires-python = ">=3.10"
12
+ dependencies = [
13
+ "datasets>=3.6.0",
14
+ "unitxt",
15
+ "transformers",
16
+ "torch",
17
+ "accelerate",
18
+ "sentence-transformers",
19
+ "openai",
20
+ "pandas",
21
+ "tasksource",
22
+ ]
23
+
24
+ [project.optional-dependencies]
25
+ notebook = [
26
+ "seaborn",
27
+ "matplotlib",
28
+ "ipywidgets",
29
+ ]
30
+
31
+ [tool.setuptools]
32
+ include-package-data = false
33
+
34
+ [tool.setuptools.packages.find]
35
+ where = ["src"]
36
+ include = ["dataset_preprocessing_agent*"]
37
+ exclude = ["notebooks*", "results*", "tests*"]
38
+
39
+ [tool.pyright]
40
+ pythonVersion = "3.10"
41
+ extraPaths = ["src"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,14 @@
1
+ from .standardize_api import load_standardized_dataset
2
+ from .standardize_local import load_standardized_dataset_local
3
+ from .eval import evaluate
4
+ from .eval_ts import evaluate_ts
5
+ from .baselines import baseline_keyword_match, baseline_embedding_match
6
+
7
+ __all__ = [
8
+ "load_standardized_dataset",
9
+ "load_standardized_dataset_local",
10
+ "evaluate",
11
+ "evaluate_ts",
12
+ "baseline_keyword_match",
13
+ "baseline_embedding_match",
14
+ ]
@@ -0,0 +1,174 @@
1
+ """
2
+ Baseline methods for dataset column mapping.
3
+ Compares against LLM-based standardization approach.
4
+ """
5
+ import re
6
+ from datasets import load_dataset
7
+ from sentence_transformers import SentenceTransformer, util
8
+ from .utils import generate_code
9
+
10
# Split names tried, in order, when opening a dataset.
_FALLBACK_SPLITS = ["train", "test", "validation"]


def _load_split(name: str, config: str | None) -> object:
    """Open the first accessible split of a HuggingFace dataset in streaming mode.

    Splits are tried in the order given by ``_FALLBACK_SPLITS``.  When no
    config was supplied and the loader complains that a config name is
    missing, the first quoted token in the error message that differs from
    ``name`` is used as the config and the same split is retried once.

    Args:
        name: Dataset identifier passed to ``load_dataset``.
        config: Dataset configuration name, or ``None`` to let the loader
            pick (or to trigger the automatic retry described above).

    Returns:
        Whatever ``load_dataset(..., streaming=True)`` returns for the first
        split that loads successfully.

    Raises:
        ValueError: If no split could be loaded.  The last underlying error
            is attached as ``__cause__`` so its traceback is not lost.
    """
    last_err: Exception | None = None
    for split in _FALLBACK_SPLITS:
        try:
            if config:
                return load_dataset(name, config, split=split, streaming=True)
            return load_dataset(name, split=split, streaming=True)
        except Exception as e:  # best-effort probing: any failure moves on
            last_err = e
            err_str = str(e)

            # Only the "missing config" failure is recoverable, and only when
            # the caller did not choose a config themselves.
            if config is not None or "Config name is missing" not in err_str:
                continue

            # The loader lists the available configs as quoted names.
            candidates = [c for c in re.findall(r"'([^']+)'", err_str) if c != name]
            if not candidates:
                # Nothing to retry with; other splits would fail the same way.
                break

            try:
                return load_dataset(name, candidates[0], split=split, streaming=True)
            except Exception as e2:
                last_err = e2

    raise ValueError(f"No accessible split for {name}/{config}: {last_err}") from last_err
39
+
40
+
41
# Canonical field -> lowercase column names accepted as that field by
# ``baseline_keyword_match``.  Each synonym is listed once per field.
FIELD_SYNONYMS = {
    "text": ["text", "sentence", "review", "body", "content", "tweet", "document"],
    "label": ["label", "target", "score", "class", "category", "sentiment"],
    "text_a": ["text_a", "premise", "sentence1", "context", "source", "question", "query", "prompt"],
    "text_b": ["text_b", "hypothesis", "sentence2", "response", "answer", "output"],
}

# Canonical field names that ``baseline_embedding_match`` scores every
# dataset column against.
STANDARD_FIELDS = ["text", "label", "text_a", "text_b", "question", "answer", "input", "output"]

# Cache slot for the sentence-transformer; filled on first use by
# ``_get_embedding_model``.
_embedding_model = None
51
+
52
+
53
def _get_embedding_model():
    """
    Return the shared sentence-transformer, building it on first use.

    The instance is stored in the module-level ``_embedding_model`` so that
    later calls reuse it instead of constructing the model again.

    Returns:
        The cached ``SentenceTransformer("all-MiniLM-L6-v2")`` instance.
    """
    global _embedding_model
    if _embedding_model is not None:
        return _embedding_model
    _embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    return _embedding_model
64
+
65
+
66
+ def _infer_task(mapping: dict) -> str:
67
+ """
68
+ Infer task type from mapping keys.
69
+
70
+ Args:
71
+ mapping: Dictionary mapping standard field names to source column names.
72
+
73
+ Returns:
74
+ Inferred task type as string (nli, qa, generation, or classification).
75
+ """
76
+ keys = set(mapping.keys())
77
+ if keys & {"text_a", "text_b", "premise", "hypothesis"}:
78
+ return "nli"
79
+ if keys & {"question", "answer"}:
80
+ return "qa"
81
+ if keys & {"input", "output"}:
82
+ return "generation"
83
+ return "classification"
84
+
85
+
86
def baseline_keyword_match(dataset, config: str | None = None) -> dict:
    """
    Rule-based keyword matching baseline.

    For every field in ``FIELD_SYNONYMS`` the first dataset column (in the
    dataset's own column order) whose lowercased name appears in that
    field's synonym list is selected.  Using the dataset order keeps the
    result reproducible when several columns match the same field.

    Args:
        dataset: Dataset name (str) or an already-loaded dataset object.
        config: Optional dataset configuration name; only used when
            ``dataset`` is a name.

    Returns:
        Dictionary with keys:
            "mapping": standard field -> source column, plus the inferred
                "task" entry.
            "code": result of ``generate_code(mapping)``.
            "score": always 0.0 here.
            "dataset": the dataset object that was inspected.
    """
    ds = _load_split(dataset, config) if isinstance(dataset, str) else dataset

    features = ds.features
    if features is None:
        # No declared schema: fall back to the keys of the first example.
        sample = next(iter(ds))
        features = {k: type(v).__name__ for k, v in sample.items()}

    # A list (not a set) so that ties are broken by column order rather
    # than by string-hash order, which can differ between interpreter runs.
    columns = list(features.keys())

    mapping = {}
    for field, synonyms in FIELD_SYNONYMS.items():
        match = next((col for col in columns if col.lower() in synonyms), None)
        if match is not None:
            mapping[field] = match

    mapping["task"] = _infer_task(mapping)

    return {
        "mapping": mapping,
        "code": generate_code(mapping),
        "score": 0.0,
        "dataset": ds,
    }
125
+
126
+
127
def baseline_embedding_match(dataset, config: str | None = None, threshold: float = 0.6) -> dict:
    """
    Semantic similarity baseline using sentence-transformers.

    Column names and the names in ``STANDARD_FIELDS`` are embedded with the
    shared sentence-transformer; each standard field is mapped to its most
    similar column when that cosine similarity reaches ``threshold``.

    Args:
        dataset: Dataset name (str) or an already-loaded dataset object.
        config: Optional dataset configuration name; only used when
            ``dataset`` is a name.
        threshold: Minimum cosine similarity for a field/column pair to be
            accepted.

    Returns:
        Dictionary with keys:
            "mapping": standard field -> source column, plus the inferred
                "task" entry.
            "code": result of ``generate_code(mapping)``.
            "score": always 0.0 here.
            "dataset": the dataset object that was inspected.
    """
    ds = _load_split(dataset, config) if isinstance(dataset, str) else dataset

    features = ds.features
    if features is None:
        # No declared schema: fall back to the keys of the first example.
        sample = next(iter(ds))
        features = {k: type(v).__name__ for k, v in sample.items()}
    columns = list(features.keys())

    mapping = {}
    # With no columns there is nothing to embed, and taking an argmax over
    # an empty similarity row would fail; skip straight to an empty mapping.
    if columns:
        model = _get_embedding_model()
        col_embeddings = model.encode(columns, convert_to_tensor=True)
        field_embeddings = model.encode(STANDARD_FIELDS, convert_to_tensor=True)

        # Row i holds the similarities of STANDARD_FIELDS[i] to every column.
        similarities = util.cos_sim(field_embeddings, col_embeddings)

        for i, field in enumerate(STANDARD_FIELDS):
            best_idx = similarities[i].argmax().item()
            best_score = similarities[i][best_idx].item()
            if best_score >= threshold:
                mapping[field] = columns[best_idx]

    mapping["task"] = _infer_task(mapping)

    return {
        "mapping": mapping,
        "code": generate_code(mapping),
        "score": 0.0,
        "dataset": ds,
    }
173
+
174
+