synkro 0.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. synkro-0.1.4/.gitignore +183 -0
  2. synkro-0.1.4/LICENSE +21 -0
  3. synkro-0.1.4/MANIFEST.in +3 -0
  4. synkro-0.1.4/PKG-INFO +308 -0
  5. synkro-0.1.4/README.md +274 -0
  6. synkro-0.1.4/examples/advanced_usage.py +308 -0
  7. synkro-0.1.4/examples/finetune_llama.py +134 -0
  8. synkro-0.1.4/examples/quickstart.py +42 -0
  9. synkro-0.1.4/pyproject.toml +73 -0
  10. synkro-0.1.4/synkro/__init__.py +129 -0
  11. synkro-0.1.4/synkro/cli.py +118 -0
  12. synkro-0.1.4/synkro/core/__init__.py +7 -0
  13. synkro-0.1.4/synkro/core/dataset.py +233 -0
  14. synkro-0.1.4/synkro/core/policy.py +197 -0
  15. synkro-0.1.4/synkro/errors.py +195 -0
  16. synkro-0.1.4/synkro/examples/__init__.py +148 -0
  17. synkro-0.1.4/synkro/formatters/__init__.py +12 -0
  18. synkro-0.1.4/synkro/formatters/dpo.py +103 -0
  19. synkro-0.1.4/synkro/formatters/qa.py +85 -0
  20. synkro-0.1.4/synkro/formatters/sft.py +90 -0
  21. synkro-0.1.4/synkro/generation/__init__.py +9 -0
  22. synkro-0.1.4/synkro/generation/generator.py +227 -0
  23. synkro-0.1.4/synkro/generation/planner.py +76 -0
  24. synkro-0.1.4/synkro/generation/responses.py +157 -0
  25. synkro-0.1.4/synkro/generation/scenarios.py +99 -0
  26. synkro-0.1.4/synkro/llm/__init__.py +7 -0
  27. synkro-0.1.4/synkro/llm/client.py +210 -0
  28. synkro-0.1.4/synkro/llm/rate_limits.py +107 -0
  29. synkro-0.1.4/synkro/models/__init__.py +45 -0
  30. synkro-0.1.4/synkro/models/anthropic.py +14 -0
  31. synkro-0.1.4/synkro/models/deepseek.py +11 -0
  32. synkro-0.1.4/synkro/models/google.py +19 -0
  33. synkro-0.1.4/synkro/models/groq.py +14 -0
  34. synkro-0.1.4/synkro/models/ollama.py +18 -0
  35. synkro-0.1.4/synkro/models/openai.py +17 -0
  36. synkro-0.1.4/synkro/models/together.py +13 -0
  37. synkro-0.1.4/synkro/modes/__init__.py +15 -0
  38. synkro-0.1.4/synkro/modes/config.py +66 -0
  39. synkro-0.1.4/synkro/modes/dpo.py +18 -0
  40. synkro-0.1.4/synkro/modes/qa.py +18 -0
  41. synkro-0.1.4/synkro/modes/sft.py +18 -0
  42. synkro-0.1.4/synkro/parsers.py +404 -0
  43. synkro-0.1.4/synkro/pipelines.py +59 -0
  44. synkro-0.1.4/synkro/prompts/__init__.py +44 -0
  45. synkro-0.1.4/synkro/prompts/base.py +167 -0
  46. synkro-0.1.4/synkro/prompts/dpo_templates.py +106 -0
  47. synkro-0.1.4/synkro/prompts/qa_templates.py +97 -0
  48. synkro-0.1.4/synkro/prompts/templates.py +281 -0
  49. synkro-0.1.4/synkro/quality/__init__.py +7 -0
  50. synkro-0.1.4/synkro/quality/grader.py +117 -0
  51. synkro-0.1.4/synkro/quality/refiner.py +137 -0
  52. synkro-0.1.4/synkro/schemas.py +251 -0
  53. synkro-0.1.4/synkro/types/__init__.py +29 -0
  54. synkro-0.1.4/synkro/types/core.py +78 -0
  55. synkro-0.1.4/synkro/types/dataset_type.py +30 -0
  56. synkro-0.1.4/tests/__init__.py +2 -0
  57. synkro-0.1.4/tests/test_imports.py +150 -0
synkro-0.1.4/.gitignore ADDED
@@ -0,0 +1,183 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ .github/workflows/
30
+
31
+ # PyInstaller
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ .python-version
87
+
88
+ # pipenv
89
+ Pipfile.lock
90
+
91
+ # poetry
92
+ poetry.lock
93
+
94
+ # pdm
95
+ .pdm.toml
96
+
97
+ # PEP 582
98
+ __pypackages__/
99
+
100
+ # Celery stuff
101
+ celerybeat-schedule
102
+ celerybeat.pid
103
+
104
+ # SageMath parsed files
105
+ *.sage.py
106
+
107
+ # Environments
108
+ .env
109
+ .venv
110
+ env/
111
+ venv/
112
+ ENV/
113
+ env.bak/
114
+ venv.bak/
115
+
116
+ # Spyder project settings
117
+ .spyderproject
118
+ .spyproject
119
+
120
+ # Rope project settings
121
+ .ropeproject
122
+
123
+ # mkdocs documentation
124
+ /site
125
+
126
+ # mypy
127
+ .mypy_cache/
128
+ .dmypy.json
129
+ dmypy.json
130
+
131
+ # Pyre type checker
132
+ .pyre/
133
+
134
+ # pytype static type analyzer
135
+ .pytype/
136
+
137
+ # Cython debug symbols
138
+ cython_debug/
139
+
140
+ # IDEs
141
+ .vscode/
142
+ .idea/
143
+ *.swp
144
+ *.swo
145
+ *~
146
+ .DS_Store
147
+
148
+ # Project specific
149
+ *.jsonl
150
+ *.json
151
+ !examples/*.json
152
+ !tests/*.json
153
+ output/
154
+ outputs/
155
+ training_data/
156
+ *.pdf
157
+ !examples/*.pdf
158
+ *.docx
159
+ !examples/*.docx
160
+
161
+ # API keys and secrets
162
+ .env
163
+ .env.local
164
+ .env.*.local
165
+ *.key
166
+ *.pem
167
+
168
+ # Model outputs (but not synkro/models/ Python package)
169
+ /models/
170
+ checkpoints/
171
+ *.pt
172
+ *.pth
173
+ *.ckpt
174
+
175
+ # Logs
176
+ *.log
177
+ logs/
178
+
179
+ # Temporary files
180
+ tmp/
181
+ temp/
182
+ *.tmp
183
+
synkro-0.1.4/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Murtaza Meerza
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
synkro-0.1.4/MANIFEST.in ADDED
@@ -0,0 +1,3 @@
1
+ include README.md
2
+ include LICENSE
3
+ recursive-include examples *.py
synkro-0.1.4/PKG-INFO ADDED
@@ -0,0 +1,308 @@
1
+ Metadata-Version: 2.4
2
+ Name: synkro
3
+ Version: 0.1.4
4
+ Summary: Generate training datasets from any document
5
+ Author: Murtaza Meerza
6
+ License-Expression: MIT
7
+ License-File: LICENSE
8
+ Keywords: dataset-generation,fine-tuning,llm,synthetic-data,training-data
9
+ Classifier: Development Status :: 4 - Beta
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
17
+ Requires-Python: >=3.10
18
+ Requires-Dist: beautifulsoup4>=4.12
19
+ Requires-Dist: html2text>=2020.1
20
+ Requires-Dist: httpx>=0.25
21
+ Requires-Dist: instructor>=1.0
22
+ Requires-Dist: litellm>=1.40
23
+ Requires-Dist: mammoth>=1.6
24
+ Requires-Dist: marker-pdf>=0.2
25
+ Requires-Dist: pydantic>=2.0
26
+ Requires-Dist: python-dotenv>=1.0
27
+ Requires-Dist: rich>=13.0
28
+ Requires-Dist: typer>=0.9
29
+ Provides-Extra: dev
30
+ Requires-Dist: pytest-asyncio>=0.21; extra == 'dev'
31
+ Requires-Dist: pytest>=7.0; extra == 'dev'
32
+ Requires-Dist: ruff>=0.1; extra == 'dev'
33
+ Description-Content-Type: text/markdown
34
+
35
+ # Synkro
36
+
37
+ **Generate training datasets from any document.**
38
+
39
+ Turn policies, handbooks, and documentation into high-quality training data for fine-tuning LLMs.
40
+
41
+ ## Features
42
+
43
+ - **Quality Evaluation** - Each response is graded and automatically refined if it fails
44
+ - **Multiple Formats** - SFT (chat), QA (question-answer), DPO (preference pairs)
45
+ - **Any LLM Provider** - OpenAI, Anthropic, Google, Ollama, Groq, and more
46
+ - **File Support** - PDF, DOCX, TXT, Markdown, URLs
47
+ - **CLI Included** - Generate datasets from the command line
48
+
49
+ ## Installation
50
+
51
+ ```bash
52
+ pip install synkro
53
+ ```
54
+
55
+ ## Quick Start
56
+
57
+ ```python
58
+ from synkro.pipelines import create_pipeline
59
+ from synkro.models.google import Google
60
+ from synkro.types import DatasetType
61
+
62
+ pipeline = create_pipeline(
63
+ model=Google.GEMINI_25_FLASH, # Fast generation
64
+ grading_model=Google.GEMINI_25_PRO, # Quality grading
65
+ dataset_type=DatasetType.SFT,
66
+ )
67
+
68
+ dataset = pipeline.generate(
69
+ "All expenses over $50 require manager approval.",
70
+ traces=50,
71
+ )
72
+ dataset.save("training.jsonl")
73
+ ```
74
+
75
+ ### From Files
76
+
77
+ ```python
78
+ from synkro.pipelines import create_pipeline
79
+ from synkro.core.policy import Policy
80
+
81
+ policy = Policy.from_file("handbook.pdf") # PDF, DOCX, TXT, MD
82
+ pipeline = create_pipeline()
83
+ dataset = pipeline.generate(policy, traces=100)
84
+ dataset.save()
85
+ ```
86
+
87
+ ### From URLs
88
+
89
+ ```python
90
+ from synkro.core.policy import Policy
91
+
92
+ policy = Policy.from_url("https://example.com/terms")
93
+ dataset = pipeline.generate(policy)  # `pipeline` comes from create_pipeline(), as in "From Files"
94
+ ```
95
+
96
+ ## Dataset Types
97
+
98
+ | Format | Output | Best For |
99
+ |--------|--------|----------|
100
+ | **SFT** | Chat messages | Fine-tuning chat models |
101
+ | **QA** | Question-answer pairs | RAG systems, knowledge bases |
102
+ | **DPO** | Preference pairs | RLHF, alignment training |
103
+
104
+ ### SFT (Default)
105
+
106
+ ```python
107
+ from synkro.types import DatasetType
108
+
109
+ pipeline = create_pipeline(dataset_type=DatasetType.SFT)
110
+ dataset = pipeline.generate(policy)
111
+ ```
112
+
113
+ Output:
114
+ ```json
115
+ {"messages": [
116
+ {"role": "system", "content": "You are a policy expert..."},
117
+ {"role": "user", "content": "What's the approval process for $350?"},
118
+ {"role": "assistant", "content": "For a $350 expense, you need..."}
119
+ ]}
120
+ ```
121
+
122
+ ### QA
123
+
124
+ ```python
125
+ pipeline = create_pipeline(dataset_type=DatasetType.QA)
126
+ ```
127
+
128
+ Output:
129
+ ```json
130
+ {"question": "What's the approval process?", "answer": "You need...", "context": "..."}
131
+ ```
132
+
133
+ ### DPO
134
+
135
+ ```python
136
+ pipeline = create_pipeline(dataset_type=DatasetType.DPO)
137
+ ```
138
+
139
+ Output:
140
+ ```json
141
+ {"prompt": "What's the process?", "chosen": "Good answer...", "rejected": "Bad answer..."}
142
+ ```
143
+
144
+ ## Evaluation & Grading
145
+
146
+ Every response is graded on policy compliance, citations, and reasoning. Failed responses are automatically refined (up to 3 iterations).
147
+
148
+ ```python
149
+ from synkro.pipelines import create_pipeline
150
+ from synkro.models.openai import OpenAI
151
+
152
+ pipeline = create_pipeline(
153
+ model=OpenAI.GPT_4O_MINI, # Fast generation
154
+ grading_model=OpenAI.GPT_4O, # Quality grading
155
+ max_iterations=3, # Refinement attempts
156
+ )
157
+
158
+ dataset = pipeline.generate(policy, traces=100)
159
+
160
+ # Check quality
161
+ print(f"Pass rate: {dataset.passing_rate:.1%}")
162
+
163
+ # Filter to only passing traces
164
+ high_quality = dataset.filter(passed=True)
165
+ high_quality.save("training.jsonl")
166
+ ```
167
+
168
+ ### Custom Graders
169
+
170
+ ```python
171
+ from synkro.models.anthropic import Anthropic
172
+ from synkro.models.ollama import Ollama
173
+
174
+ # Use Claude for grading
175
+ pipeline = create_pipeline(grading_model=Anthropic.CLAUDE_35_SONNET)
176
+
177
+ # Or free local grading
178
+ pipeline = create_pipeline(grading_model=Ollama.QWEN_25_32B)
179
+ ```
180
+
181
+ ## Models & Providers
182
+
183
+ ### OpenAI
184
+
185
+ ```python
186
+ from synkro.models.openai import OpenAI
187
+
188
+ pipeline = create_pipeline(model=OpenAI.GPT_4O_MINI)
189
+ ```
190
+
191
+ | Model | Use Case |
192
+ |-------|----------|
193
+ | `OpenAI.GPT_4O` | Best quality (grading) |
194
+ | `OpenAI.GPT_4O_MINI` | Fast & cheap (generation) |
195
+ | `OpenAI.O1` | Reasoning tasks |
196
+
197
+ **Env:** `OPENAI_API_KEY`
198
+
199
+ ### Anthropic
200
+
201
+ ```python
202
+ from synkro.models.anthropic import Anthropic
203
+
204
+ pipeline = create_pipeline(model=Anthropic.CLAUDE_35_HAIKU)
205
+ ```
206
+
207
+ | Model | Use Case |
208
+ |-------|----------|
209
+ | `Anthropic.CLAUDE_35_SONNET` | High quality |
210
+ | `Anthropic.CLAUDE_35_HAIKU` | Fast & cheap |
211
+
212
+ **Env:** `ANTHROPIC_API_KEY`
213
+
214
+ ### Google
215
+
216
+ ```python
217
+ from synkro.models.google import Google
218
+
219
+ pipeline = create_pipeline(model=Google.GEMINI_25_FLASH)
220
+ ```
221
+
222
+ | Model | Use Case |
223
+ |-------|----------|
224
+ | `Google.GEMINI_3_PRO` | Most intelligent |
225
+ | `Google.GEMINI_25_FLASH` | Best price-performance |
226
+ | `Google.GEMINI_2_FLASH_LITE` | Cheapest |
227
+
228
+ **Env:** `GEMINI_API_KEY`
229
+
230
+ ### Ollama (Free, Local)
231
+
232
+ ```python
233
+ from synkro.models.ollama import Ollama
234
+
235
+ # First: ollama pull llama3.1:8b
236
+ pipeline = create_pipeline(model=Ollama.LLAMA_31_8B)
237
+ ```
238
+
239
+ | Model | Use Case |
240
+ |-------|----------|
241
+ | `Ollama.LLAMA_31_8B` | Fast local |
242
+ | `Ollama.QWEN_25_32B` | High quality local |
243
+
244
+ **No API key needed**
245
+
246
+ ### Groq (Ultra Fast)
247
+
248
+ ```python
249
+ from synkro.models.groq import Groq
250
+
251
+ pipeline = create_pipeline(model=Groq.LLAMA_33_70B)
252
+ ```
253
+
254
+ | Model | Use Case |
255
+ |-------|----------|
256
+ | `Groq.LLAMA_33_70B` | Fastest inference |
257
+ | `Groq.LLAMA_31_8B` | Budget option |
258
+
259
+ **Env:** `GROQ_API_KEY`
260
+
261
+ ### Model Selection Tips
262
+
263
+ **For Generation (fast, cheap):**
264
+ - `OpenAI.GPT_4O_MINI` - Best balance
265
+ - `Groq.LLAMA_33_70B` - Ultra fast
266
+ - `Ollama.LLAMA_31_8B` - Free
267
+
268
+ **For Grading (high quality):**
269
+ - `OpenAI.GPT_4O` - Best quality
270
+ - `Anthropic.CLAUDE_35_SONNET` - Great alternative
271
+ - `Google.GEMINI_3_PRO` - Most intelligent
272
+
273
+ ## CLI
274
+
275
+ ### Generate
276
+
277
+ ```bash
278
+ # From file
279
+ synkro generate policy.pdf --traces 50 --format sft
280
+
281
+ # From text
282
+ synkro generate 'All expenses over $50 need approval' -n 20
283
+
284
+ # From URL
285
+ synkro generate https://example.com/policy -o training.jsonl
286
+ ```
287
+
288
+ **Options:**
289
+ - `--traces, -n` - Number of traces (default: 20)
290
+ - `--format, -f` - Output format: sft, qa, dpo (default: sft)
291
+ - `--output, -o` - Output file path
292
+ - `--model, -m` - Model for generation
293
+
294
+ ### Demo
295
+
296
+ ```bash
297
+ synkro demo # Quick demo with example policy
298
+ ```
299
+
300
+ ### Version
301
+
302
+ ```bash
303
+ synkro version
304
+ ```
305
+
306
+ ## License
307
+
308
+ MIT