chatan 0.1.0__tar.gz → 0.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {chatan-0.1.0 → chatan-0.1.3}/.gitignore +2 -0
- chatan-0.1.3/.readthedocs.yml +22 -0
- chatan-0.1.3/LICENSE +21 -0
- chatan-0.1.3/PKG-INFO +124 -0
- chatan-0.1.3/README.md +86 -0
- chatan-0.1.3/docs/Makefile +20 -0
- chatan-0.1.3/docs/make.bat +35 -0
- chatan-0.1.3/docs/source/api.rst +17 -0
- chatan-0.1.3/docs/source/conf.py +46 -0
- chatan-0.1.3/docs/source/examples.rst +72 -0
- chatan-0.1.3/docs/source/index.rst +53 -0
- chatan-0.1.3/docs/source/other_examples.rst +91 -0
- chatan-0.1.3/docs/source/quickstart.rst +65 -0
- {chatan-0.1.0 → chatan-0.1.3}/pyproject.toml +16 -0
- chatan-0.1.3/src/chatan/__init__.py +11 -0
- {chatan-0.1.0 → chatan-0.1.3}/src/chatan/dataset.py +50 -5
- chatan-0.1.3/src/chatan/evaluate.py +320 -0
- {chatan-0.1.0 → chatan-0.1.3}/src/chatan/generator.py +51 -12
- chatan-0.1.3/src/chatan/viewer.py +581 -0
- chatan-0.1.3/tests/test_evaluate.py +618 -0
- {chatan-0.1.0 → chatan-0.1.3}/tests/test_generator.py +73 -3
- {chatan-0.1.0 → chatan-0.1.3}/uv.lock +2941 -691
- chatan-0.1.0/PKG-INFO +0 -83
- chatan-0.1.0/README.md +0 -52
- chatan-0.1.0/src/chatan/__init__.py +0 -9
- {chatan-0.1.0 → chatan-0.1.3}/.python-version +0 -0
- {chatan-0.1.0 → chatan-0.1.3}/main.py +0 -0
- {chatan-0.1.0 → chatan-0.1.3}/src/chatan/sampler.py +0 -0
- {chatan-0.1.0 → chatan-0.1.3}/tests/test_dataset_comprehensive.py +0 -0
- {chatan-0.1.0 → chatan-0.1.3}/tests/test_datset.py +0 -0
- {chatan-0.1.0 → chatan-0.1.3}/tests/test_sampler.py +0 -0
chatan-0.1.3/.readthedocs.yml
ADDED
@@ -0,0 +1,22 @@
version: 2

build:
  os: ubuntu-22.04
  tools:
    python: "3.11"
  jobs:
    pre_create_environment:
      - asdf plugin add uv
      - asdf install uv latest
      - asdf global uv latest
    create_environment:
      - uv venv "${READTHEDOCS_VIRTUALENV_PATH}"
    install:
      - UV_PROJECT_ENVIRONMENT="${READTHEDOCS_VIRTUALENV_PATH}" uv sync --frozen --group docs

sphinx:
  configuration: docs/source/conf.py

formats:
  - pdf
  - epub
chatan-0.1.3/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 Christian Reetz

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
chatan-0.1.3/PKG-INFO
ADDED
@@ -0,0 +1,124 @@
Metadata-Version: 2.4
Name: chatan
Version: 0.1.3
Summary: Create synthetic datasets with LLM generators and samplers
Project-URL: Documentation, https://github.com/cdreetz/chatan#readme
Project-URL: Issues, https://github.com/cdreetz/chatan/issues
Project-URL: Source, https://github.com/cdreetz/chatan
Author-email: Christian Reetz <cdreetz@gmail.com>
License-Expression: MIT
License-File: LICENSE
Keywords: dataset generation,llm,machine learning,synthetic data
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Requires-Python: >=3.8
Requires-Dist: anthropic>=0.7.0
Requires-Dist: datasets>=2.0.0
Requires-Dist: nltk>=3.9.1
Requires-Dist: numpy>=1.20.0
Requires-Dist: openai>=1.0.0
Requires-Dist: pandas>=1.3.0
Requires-Dist: pydantic>=2.0.0
Requires-Dist: scikit-learn>=1.3.2
Requires-Dist: sentence-transformers>=3.2.1
Requires-Dist: torch>=2.5.1
Requires-Dist: tqdm>=4.0.0
Requires-Dist: transformers>=4.0.0
Description-Content-Type: text/markdown

# Chatan

Create diverse, synthetic datasets. Start from scratch or augment an existing dataset. Simply define your dataset schema as a set of generators, typically LLMs with a prompt describing what kind of examples you want.

## Installation

```
pip install chatan
```

## Getting Started

```
import chatan

# Create a generator
gen = chatan.generator("openai", "YOUR_API_KEY")

# Define a dataset schema
ds = chatan.dataset({
    "topic": chatan.sample.choice(["Python", "JavaScript", "Rust"]),
    "prompt": gen("write a programming question about {topic}"),
    "response": gen("answer this question: {prompt}")
})

# Generate the data with a progress bar
df = ds.generate(n=10)
```

## Examples

Create Data Mixes

```
from chatan import dataset, generator, sample
import uuid

gen = generator("openai", "YOUR_API_KEY")

mix = [
    "san antonio, tx",
    "marfa, tx",
    "paris, fr"
]

ds = dataset({
    "id": sample.uuid(),
    "topic": sample.choice(mix),
    "prompt": gen("write an example question about the history of {topic}"),
    "response": gen("respond to: {prompt}"),
})
```

Augment datasets

```
from chatan import generator, dataset, sample
from datasets import load_dataset

gen = generator("openai", "YOUR_API_KEY")
hf_data = load_dataset("some/dataset")

ds = dataset({
    "original_prompt": sample.from_dataset(hf_data, "prompt"),
    "variation": gen("rewrite this prompt: {original_prompt}"),
    "response": gen("respond to: {variation}")
})
```

## Citation

If you use this code in your research, please cite:

```
@software{reetz2025chatan,
  author = {Reetz, Christian},
  title = {chatan: Create synthetic datasets with LLM generators.},
  url = {https://github.com/cdreetz/chatan},
  year = {2025}
}
```

## Contributing

Community contributions are more than welcome: bug reports, bug fixes, feature requests, and feature additions. Please refer to the Issues tab.
chatan-0.1.3/README.md
ADDED
@@ -0,0 +1,86 @@
# Chatan

Create diverse, synthetic datasets. Start from scratch or augment an existing dataset. Simply define your dataset schema as a set of generators, typically LLMs with a prompt describing what kind of examples you want.

## Installation

```
pip install chatan
```

## Getting Started

```
import chatan

# Create a generator
gen = chatan.generator("openai", "YOUR_API_KEY")

# Define a dataset schema
ds = chatan.dataset({
    "topic": chatan.sample.choice(["Python", "JavaScript", "Rust"]),
    "prompt": gen("write a programming question about {topic}"),
    "response": gen("answer this question: {prompt}")
})

# Generate the data with a progress bar
df = ds.generate(n=10)
```

## Examples

Create Data Mixes

```
from chatan import dataset, generator, sample
import uuid

gen = generator("openai", "YOUR_API_KEY")

mix = [
    "san antonio, tx",
    "marfa, tx",
    "paris, fr"
]

ds = dataset({
    "id": sample.uuid(),
    "topic": sample.choice(mix),
    "prompt": gen("write an example question about the history of {topic}"),
    "response": gen("respond to: {prompt}"),
})
```

Augment datasets

```
from chatan import generator, dataset, sample
from datasets import load_dataset

gen = generator("openai", "YOUR_API_KEY")
hf_data = load_dataset("some/dataset")

ds = dataset({
    "original_prompt": sample.from_dataset(hf_data, "prompt"),
    "variation": gen("rewrite this prompt: {original_prompt}"),
    "response": gen("respond to: {variation}")
})
```

## Citation

If you use this code in your research, please cite:

```
@software{reetz2025chatan,
  author = {Reetz, Christian},
  title = {chatan: Create synthetic datasets with LLM generators.},
  url = {https://github.com/cdreetz/chatan},
  year = {2025}
}
```

## Contributing

Community contributions are more than welcome: bug reports, bug fixes, feature requests, and feature additions. Please refer to the Issues tab.
chatan-0.1.3/docs/Makefile
ADDED
@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS    ?=
SPHINXBUILD   ?= sphinx-build
SOURCEDIR     = source
BUILDDIR      = build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
chatan-0.1.3/docs/make.bat
ADDED
@@ -0,0 +1,35 @@
@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.https://www.sphinx-doc.org/
	exit /b 1
)

if "%1" == "" goto help

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd
chatan-0.1.3/docs/source/api.rst
ADDED
@@ -0,0 +1,17 @@
API Reference
=====================================

Dataset
-------
.. automodule:: chatan.dataset
   :members:

Generator
---------
.. automodule:: chatan.generator
   :members:

Sampler
-------
.. automodule:: chatan.sampler
   :members:
chatan-0.1.3/docs/source/conf.py
ADDED
@@ -0,0 +1,46 @@
# Configuration file for the Sphinx documentation builder.
#
# For the full list of built-in configuration values, see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
#
import os
import sys
sys.path.insert(0, os.path.abspath('../src'))

# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information

project = 'chatan'
copyright = '2025, Christian Reetz'
author = 'Christian Reetz'
release = '0.1.1'

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.viewcode',
    'sphinx.ext.napoleon',
    'sphinx.ext.autosummary',
    'sphinx_autodoc_typehints',
]

templates_path = ['_templates']
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']



# -- Options for HTML output -------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output

html_theme = 'sphinx_rtd_theme'
html_theme_options = {
    'navigation_depth': 5,
}
html_static_path = ['_static']

# -- Extension configuration -------------------------------------------------
chatan-0.1.3/docs/source/examples.rst
ADDED
@@ -0,0 +1,72 @@
Examples
================================================


Basic QA Dataset
----------------------------

.. code-block:: python

   import chatan

   gen = chatan.generator("openai", "YOUR_API_KEY")
   ds = chatan.dataset({
       "question": gen("write an example question from a 5th grade math test"),
       "answer": gen("answer: {question}")
   })

   df = ds.generate(100)

Creating Data Mixes
----------------------------

.. code-block:: python

   import uuid
   from chatan import dataset, generator, sample

   gen = generator("openai", "YOUR_API_KEY")

   mix = [
       "san antonio, tx",
       "marfa, tx",
       "paris, fr"
   ]

   ds = dataset({
       "id": sample.uuid(),
       "topic": sample.choice(mix),
       "prompt": gen("write an example question about the history of {topic}"),
       "response": gen("respond to: {prompt}"),
   })

Dataset Augmentation
-------------------------------

.. code-block:: python

   from datasets import load_dataset
   import chatan

   gen = chatan.generator("openai", "YOUR_API_KEY")
   hf_data = load_dataset("some/dataset")

   ds = chatan.dataset({
       "original_prompt": chatan.sample.from_dataset(hf_data, "prompt"),
       "variation": gen("rewrite this prompt: {original_prompt}"),
       "response": gen("respond to: {variation}")
   })

Saving Datasets
---------------

.. code-block:: python

   # Generate and save
   df = ds.generate(1000)
   ds.save("my_dataset.parquet")
   ds.save("my_dataset.csv", format="csv")

   # Convert to HuggingFace format
   hf_dataset = ds.to_huggingface()
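
Editor's aside on the "Saving Datasets" example above: it stops at `ds.to_huggingface()`. If the goal is to publish the converted dataset, the remaining step is plain `datasets` usage. The following is a minimal sketch, not part of the package docs; it assumes `to_huggingface()` returns a `datasets.Dataset`, and the API key and repo id are placeholders.

```python
import chatan

# Schema copied from the "Basic QA Dataset" example above; the key is a placeholder.
gen = chatan.generator("openai", "YOUR_API_KEY")
ds = chatan.dataset({
    "question": gen("write an example question from a 5th grade math test"),
    "answer": gen("answer: {question}"),
})
df = ds.generate(10)

# Assumption: to_huggingface() returns a datasets.Dataset. push_to_hub is the
# standard Hugging Face datasets API and requires `huggingface-cli login` first.
hf_dataset = ds.to_huggingface()
hf_dataset.push_to_hub("your-username/my-synthetic-dataset")  # placeholder repo id
```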
chatan-0.1.3/docs/source/index.rst
ADDED
@@ -0,0 +1,53 @@
.. chatan documentation master file, created by
   sphinx-quickstart on Mon Jun 9 13:53:32 2025.
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.

chatan documentation
===========================================

Create synthetic datasets with LLM generators and samplers.

.. toctree::
   :maxdepth: 2
   :caption: Contents:
   :hidden:

   quickstart
   examples
   other_examples
   api

Installation
------------

.. code-block:: bash

   pip install chatan

Quick Start
-----------

.. code-block:: python

   import chatan

   # Create a generator
   gen = chatan.generator("openai", "YOUR_API_KEY")

   # Define a dataset schema
   ds = chatan.dataset({
       "topic": chatan.sample.choice(["Python", "JavaScript", "Rust"]),
       "prompt": gen("write a programming question about {topic}"),
       "response": gen("answer this question: {prompt}")
   })

   # Generate the data with a progress bar
   df = ds.generate(n=10)

Indices and tables
==================

* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
chatan-0.1.3/docs/source/other_examples.rst
ADDED
@@ -0,0 +1,91 @@
Examples
================================================


Dataset Triton
----------------------------------------------

.. code-block:: python

   from datasets import load_dataset
   from chatan import generator, dataset, sample
   import pandas as pd

   gen = generator("openai", "YOUR_API_KEY")
   kernelbook = load_dataset("GPUMODE/KernelBook")
   kernelbench = load_dataset("ScalingIntelligence/KernelBench")

   ds_1 = dataset({
       "operation": sample.from_dataset(kernelbench, "id"),
       "prompt": gen("write a prompt asking for a Triton kernel for: {operation}"),
       "response": gen("{prompt}")
   })

   ds_2 = dataset({
       "original_prompt": sample.from_dataset(kernelbook, "python_code"),
       "prompt": gen("write a question asking for this code to be written as a Triton kernel: {original_prompt}"),
       "response": gen("{prompt}")
   })

   df_1 = ds_1.generate(n=500)
   df_2 = ds_2.generate(n=500)
   combined_df = pd.concat([df_1, df_2], ignore_index=True)


~~WIP~~ Ways to Create Complex Mixes
------------------------------------------------------------

.. code-block:: python

   # Method 2: Create a mixed dataset with sampling
   mixed_ds = dataset({
       "dataset_type": sample.choice(["kernelbench", "kernelbook"]),
       "operation": sample.from_dataset(kernelbench, "id"),
       "original_code": sample.from_dataset(kernelbook, "python_code"),
       "prompt": gen("""
       {%- if dataset_type == "kernelbench" -%}
       write a prompt asking for a Triton kernel for: {operation}
       {%- else -%}
       write a question asking for this code to be written as a Triton kernel: {original_code}
       {%- endif -%}
       """),
       "response": gen("{prompt}")
   })

   # Method 3: Use sample.choice to pick between schemas
   schema_choice = sample.choice([
       {"source": "kernelbench", "operation": sample.from_dataset(kernelbench, "id")},
       {"source": "kernelbook", "code": sample.from_dataset(kernelbook, "python_code")}
   ])

   final_ds = dataset({
       "source": schema_choice,
       "prompt": gen("create a Triton kernel prompt based on {source}"),
       "response": gen("{prompt}")
   })

   # Generate the final mixed dataset
   final_df = final_ds.generate(1000)
   final_ds.save("triton_kernel_dataset.parquet")

Transformers Local Generation
----------------------------------------------

.. code-block:: python

   from chatan import generator, dataset, sample

   # Use a local HuggingFace model
   gen = generator("transformers", model="gpt2")

   ds = dataset({
       "topic": sample.choice(["space", "history", "science"]),
       "prompt": gen("Ask a short question about {topic}"),
       "response": gen("{prompt}")
   })

   df = ds.generate(5)
chatan-0.1.3/docs/source/quickstart.rst
ADDED
@@ -0,0 +1,65 @@
Quick Start
===================================

Installation
------------

Install chatan from PyPI:

.. code-block:: bash

   pip install chatan

Basic Usage
-----------

1. **Create a generator**

.. code-block:: python

   import chatan

   gen = chatan.generator("openai", "YOUR_OPENAI_API_KEY")
   # or for Anthropic
   # gen = chatan.generator("anthropic", "YOUR_ANTHROPIC_API_KEY")

2. **Define your dataset schema**

.. code-block:: python

   ds = chatan.dataset({
       "prompt": gen("write a coding question about {language}"),
       "language": chatan.sample.choice(["Python", "JavaScript", "Rust"]),
       "response": gen("answer this question: {prompt}")
   })

3. **Generate data**

.. code-block:: python

   # Generate 100 samples with a progress bar
   df = ds.generate(100)

   # Save to file
   ds.save("my_dataset.parquet")

Core Concepts
-------------

**Generators**
   Use LLMs to create text content. Supports the OpenAI and Anthropic APIs.

**Samplers**
   Create structured data like UUIDs, choices, ranges, and dates.

**Schemas**
   Define relationships between columns using generators and samplers.

**Dependencies**
   Columns can reference other columns using ``{column_name}`` syntax.

Next Steps
----------

- Check out :doc:`examples` for more complex use cases
- Browse the :doc:`api` reference for all available functions
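
Editor's note on the "Core Concepts" section of quickstart.rst above, which describes samplers and the ``{column_name}`` dependency rule in prose only: the sketch below ties the two together using only calls that appear elsewhere in this diff (`sample.uuid`, `sample.choice`, generator prompts, `generate`). The backend name and API key are placeholders, and the sampler names for ranges and dates mentioned in that section do not appear anywhere in this diff, so they are omitted here.

```python
import chatan

gen = chatan.generator("openai", "YOUR_API_KEY")  # placeholder key

ds = chatan.dataset({
    # Samplers produce structured values without an LLM call.
    "id": chatan.sample.uuid(),
    "language": chatan.sample.choice(["Python", "Rust"]),
    # Dependencies: a column's prompt can reference earlier columns by name.
    "prompt": gen("write a short exercise about {language}"),
    "response": gen("solve this exercise: {prompt}"),
})

df = ds.generate(n=5)  # small run; the docs above show runs up to n=1000
```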