synkro 0.4.30__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. synkro-0.4.30/.gitignore +183 -0
  2. synkro-0.4.30/LICENSE +21 -0
  3. synkro-0.4.30/PKG-INFO +294 -0
  4. synkro-0.4.30/README.md +261 -0
  5. synkro-0.4.30/examples/advanced_usage.py +277 -0
  6. synkro-0.4.30/examples/anthropic_basic.py +45 -0
  7. synkro-0.4.30/examples/finetune_llama.py +236 -0
  8. synkro-0.4.30/examples/multi_file_policy.py +139 -0
  9. synkro-0.4.30/examples/openai_basic.py +45 -0
  10. synkro-0.4.30/examples/policies/hr_policy.md +30 -0
  11. synkro-0.4.30/examples/policies/security_policy.txt +35 -0
  12. synkro-0.4.30/examples/quickstart.py +51 -0
  13. synkro-0.4.30/examples/test_fixes.py +47 -0
  14. synkro-0.4.30/examples/tool_calling.py +208 -0
  15. synkro-0.4.30/pyproject.toml +71 -0
  16. synkro-0.4.30/synkro/__init__.py +186 -0
  17. synkro-0.4.30/synkro/advanced.py +184 -0
  18. synkro-0.4.30/synkro/cli.py +156 -0
  19. synkro-0.4.30/synkro/core/__init__.py +7 -0
  20. synkro-0.4.30/synkro/core/checkpoint.py +250 -0
  21. synkro-0.4.30/synkro/core/dataset.py +402 -0
  22. synkro-0.4.30/synkro/core/policy.py +337 -0
  23. synkro-0.4.30/synkro/errors.py +178 -0
  24. synkro-0.4.30/synkro/examples/__init__.py +148 -0
  25. synkro-0.4.30/synkro/factory.py +287 -0
  26. synkro-0.4.30/synkro/formatters/__init__.py +12 -0
  27. synkro-0.4.30/synkro/formatters/chatml.py +121 -0
  28. synkro-0.4.30/synkro/formatters/sft.py +90 -0
  29. synkro-0.4.30/synkro/formatters/tool_call.py +127 -0
  30. synkro-0.4.30/synkro/generation/__init__.py +9 -0
  31. synkro-0.4.30/synkro/generation/follow_ups.py +134 -0
  32. synkro-0.4.30/synkro/generation/generator.py +232 -0
  33. synkro-0.4.30/synkro/generation/golden_responses.py +244 -0
  34. synkro-0.4.30/synkro/generation/golden_scenarios.py +276 -0
  35. synkro-0.4.30/synkro/generation/golden_tool_responses.py +762 -0
  36. synkro-0.4.30/synkro/generation/logic_extractor.py +126 -0
  37. synkro-0.4.30/synkro/generation/multiturn_responses.py +177 -0
  38. synkro-0.4.30/synkro/generation/planner.py +131 -0
  39. synkro-0.4.30/synkro/generation/responses.py +189 -0
  40. synkro-0.4.30/synkro/generation/scenarios.py +90 -0
  41. synkro-0.4.30/synkro/generation/tool_responses.py +625 -0
  42. synkro-0.4.30/synkro/generation/tool_simulator.py +114 -0
  43. synkro-0.4.30/synkro/interactive/__init__.py +16 -0
  44. synkro-0.4.30/synkro/interactive/hitl_session.py +205 -0
  45. synkro-0.4.30/synkro/interactive/intent_classifier.py +94 -0
  46. synkro-0.4.30/synkro/interactive/logic_map_editor.py +176 -0
  47. synkro-0.4.30/synkro/interactive/rich_ui.py +459 -0
  48. synkro-0.4.30/synkro/interactive/scenario_editor.py +198 -0
  49. synkro-0.4.30/synkro/llm/__init__.py +7 -0
  50. synkro-0.4.30/synkro/llm/client.py +277 -0
  51. synkro-0.4.30/synkro/llm/rate_limits.py +99 -0
  52. synkro-0.4.30/synkro/models/__init__.py +50 -0
  53. synkro-0.4.30/synkro/models/anthropic.py +26 -0
  54. synkro-0.4.30/synkro/models/google.py +19 -0
  55. synkro-0.4.30/synkro/models/local.py +104 -0
  56. synkro-0.4.30/synkro/models/openai.py +31 -0
  57. synkro-0.4.30/synkro/modes/__init__.py +13 -0
  58. synkro-0.4.30/synkro/modes/config.py +65 -0
  59. synkro-0.4.30/synkro/modes/conversation.py +26 -0
  60. synkro-0.4.30/synkro/modes/tool_call.py +18 -0
  61. synkro-0.4.30/synkro/parsers.py +442 -0
  62. synkro-0.4.30/synkro/pipeline/__init__.py +20 -0
  63. synkro-0.4.30/synkro/pipeline/phases.py +592 -0
  64. synkro-0.4.30/synkro/pipeline/runner.py +616 -0
  65. synkro-0.4.30/synkro/pipelines.py +126 -0
  66. synkro-0.4.30/synkro/prompts/__init__.py +57 -0
  67. synkro-0.4.30/synkro/prompts/base.py +167 -0
  68. synkro-0.4.30/synkro/prompts/golden_templates.py +474 -0
  69. synkro-0.4.30/synkro/prompts/interactive_templates.py +198 -0
  70. synkro-0.4.30/synkro/prompts/multiturn_templates.py +156 -0
  71. synkro-0.4.30/synkro/prompts/templates.py +281 -0
  72. synkro-0.4.30/synkro/prompts/tool_templates.py +318 -0
  73. synkro-0.4.30/synkro/quality/__init__.py +14 -0
  74. synkro-0.4.30/synkro/quality/golden_refiner.py +163 -0
  75. synkro-0.4.30/synkro/quality/grader.py +153 -0
  76. synkro-0.4.30/synkro/quality/multiturn_grader.py +150 -0
  77. synkro-0.4.30/synkro/quality/refiner.py +137 -0
  78. synkro-0.4.30/synkro/quality/tool_grader.py +126 -0
  79. synkro-0.4.30/synkro/quality/tool_refiner.py +128 -0
  80. synkro-0.4.30/synkro/quality/verifier.py +228 -0
  81. synkro-0.4.30/synkro/reporting.py +403 -0
  82. synkro-0.4.30/synkro/schemas.py +521 -0
  83. synkro-0.4.30/synkro/types/__init__.py +41 -0
  84. synkro-0.4.30/synkro/types/core.py +126 -0
  85. synkro-0.4.30/synkro/types/dataset_type.py +29 -0
  86. synkro-0.4.30/synkro/types/logic_map.py +345 -0
  87. synkro-0.4.30/synkro/types/tool.py +94 -0
  88. synkro-0.4.30/tests/__init__.py +2 -0
  89. synkro-0.4.30/tests/test_imports.py +129 -0
@@ -0,0 +1,183 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ .github/workflows/
30
+
31
+ # PyInstaller
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ .python-version
87
+
88
+ # pipenv
89
+ Pipfile.lock
90
+
91
+ # poetry
92
+ poetry.lock
93
+
94
+ # pdm
95
+ .pdm.toml
96
+
97
+ # PEP 582
98
+ __pypackages__/
99
+
100
+ # Celery stuff
101
+ celerybeat-schedule
102
+ celerybeat.pid
103
+
104
+ # SageMath parsed files
105
+ *.sage.py
106
+
107
+ # Environments
108
+ .env
109
+ .venv
110
+ env/
111
+ venv/
112
+ ENV/
113
+ env.bak/
114
+ venv.bak/
115
+
116
+ # Spyder project settings
117
+ .spyderproject
118
+ .spyproject
119
+
120
+ # Rope project settings
121
+ .ropeproject
122
+
123
+ # mkdocs documentation
124
+ /site
125
+
126
+ # mypy
127
+ .mypy_cache/
128
+ .dmypy.json
129
+ dmypy.json
130
+
131
+ # Pyre type checker
132
+ .pyre/
133
+
134
+ # pytype static type analyzer
135
+ .pytype/
136
+
137
+ # Cython debug symbols
138
+ cython_debug/
139
+
140
+ # IDEs
141
+ .vscode/
142
+ .idea/
143
+ *.swp
144
+ *.swo
145
+ *~
146
+ .DS_Store
147
+
148
+ # Project specific
149
+ *.jsonl
150
+ *.json
151
+ !examples/*.json
152
+ !tests/*.json
153
+ output/
154
+ outputs/
155
+ training_data/
156
+ *.pdf
157
+ !examples/*.pdf
158
+ *.docx
159
+ !examples/*.docx
160
+
161
+ # API keys and secrets
162
+ .env
163
+ .env.local
164
+ .env.*.local
165
+ *.key
166
+ *.pem
167
+
168
+ # Model outputs (but not synkro/models/ Python package)
169
+ /models/
170
+ checkpoints/
171
+ *.pt
172
+ *.pth
173
+ *.ckpt
174
+
175
+ # Logs
176
+ *.log
177
+ logs/
178
+
179
+ # Temporary files
180
+ tmp/
181
+ temp/
182
+ *.tmp
183
+
synkro-0.4.30/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Murtaza Meerza
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
synkro-0.4.30/PKG-INFO ADDED
@@ -0,0 +1,294 @@
1
+ Metadata-Version: 2.4
2
+ Name: synkro
3
+ Version: 0.4.30
4
+ Summary: Generate training datasets from any document
5
+ Author: Murtaza Meerza
6
+ License-Expression: MIT
7
+ License-File: LICENSE
8
+ Keywords: dataset-generation,fine-tuning,llm,synthetic-data,training-data
9
+ Classifier: Development Status :: 4 - Beta
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
17
+ Requires-Python: >=3.10
18
+ Requires-Dist: beautifulsoup4>=4.12
19
+ Requires-Dist: html2text>=2020.1
20
+ Requires-Dist: httpx>=0.25
21
+ Requires-Dist: litellm>=1.40
22
+ Requires-Dist: mammoth>=1.6
23
+ Requires-Dist: marker-pdf>=0.2
24
+ Requires-Dist: pydantic>=2.0
25
+ Requires-Dist: python-dotenv>=1.0
26
+ Requires-Dist: rich>=13.0
27
+ Requires-Dist: typer>=0.9
28
+ Provides-Extra: dev
29
+ Requires-Dist: pytest-asyncio>=0.21; extra == 'dev'
30
+ Requires-Dist: pytest>=7.0; extra == 'dev'
31
+ Requires-Dist: ruff>=0.1; extra == 'dev'
32
+ Description-Content-Type: text/markdown
33
+
34
+ # Synkro
35
+
36
+ Turn policies, handbooks, and documentation into high-quality training data for fine-tuning LLMs.
37
+
38
+ ## Features
39
+
40
+ - **Quality Evaluation** - Each response is graded and automatically refined if it fails
41
+ - **Multiple Formats** - Conversation (multi-turn), Instruction (single-turn), and Tool Calling
42
+ - **Tool Call Training** - Generate OpenAI function calling format for teaching models to use custom tools
43
+ - **Top LLM Providers** - OpenAI, Anthropic, and Google
44
+ - **File Support** - PDF, DOCX, TXT, Markdown, URLs
45
+ - **CLI Included** - Generate datasets from the command line
46
+
47
+ ## Installation
48
+
49
+ ```bash
50
+ pip install synkro
51
+ ```
52
+
53
+ ## Quick Start
54
+
55
+ ```python
56
+ from synkro.pipelines import create_pipeline
57
+ from synkro.models.google import Google
58
+ from synkro.types import DatasetType
59
+
60
+ pipeline = create_pipeline(
61
+ model=Google.GEMINI_25_FLASH, # Fast generation
62
+ grading_model=Google.GEMINI_25_PRO, # Quality grading
63
+ dataset_type=DatasetType.CONVERSATION,
64
+ )
65
+
66
+ dataset = pipeline.generate(
67
+ "All expenses over $50 require manager approval.",
68
+ traces=50,
69
+ )
70
+ dataset.save("training.jsonl")
71
+ ```
72
+
73
+ ### From Files
74
+
75
+ ```python
76
+ from synkro.pipelines import create_pipeline
77
+ from synkro.core.policy import Policy
78
+
79
+ policy = Policy.from_file("handbook.pdf") # PDF, DOCX, TXT, MD
80
+ pipeline = create_pipeline()
81
+ dataset = pipeline.generate(policy, traces=100)
82
+ dataset.save()
83
+ ```
84
+
85
+ ### From URLs
86
+
87
+ ```python
88
+ from synkro.core.policy import Policy
89
+
90
+ policy = Policy.from_url("https://example.com/terms")
91
+ dataset = pipeline.generate(policy)
92
+ ```
93
+
94
+ ## Dataset Types
95
+
96
+ | Type | Turns | Output Formats | Best For |
97
+ |------|-------|----------------|----------|
98
+ | **CONVERSATION** | Multi | messages | Fine-tuning chat models |
99
+ | **INSTRUCTION** | 1 | messages | Instruction-following models |
100
+ | **TOOL_CALL** | Multi | OpenAI function calling, ChatML | Teaching tool use |
101
+
102
+ ### Conversation (Default)
103
+
104
+ ```python
105
+ from synkro.types import DatasetType
106
+
107
+ pipeline = create_pipeline(dataset_type=DatasetType.CONVERSATION)
108
+ dataset = pipeline.generate(policy)
109
+ ```
110
+
111
+ Output (multi-turn):
112
+ ```json
113
+ {"messages": [
114
+ {"role": "user", "content": "What's the approval process for $350?"},
115
+ {"role": "assistant", "content": "For a $350 expense, you need manager approval..."},
116
+ {"role": "user", "content": "What if my manager is unavailable?"},
117
+ {"role": "assistant", "content": "You can request approval from..."}
118
+ ]}
119
+ ```
120
+
121
+ ### Instruction
122
+
123
+ ```python
124
+ pipeline = create_pipeline(dataset_type=DatasetType.INSTRUCTION)
125
+ dataset = pipeline.generate(policy)
126
+ ```
127
+
128
+ Output (single-turn):
129
+ ```json
130
+ {"messages": [
131
+ {"role": "user", "content": "What's the approval process for $350?"},
132
+ {"role": "assistant", "content": "For a $350 expense, you need manager approval. Submit the expense report with receipt..."}
133
+ ]}
134
+ ```
135
+
136
+ ### Tool Calling
137
+
138
+ Generate training data for teaching models when and how to use your custom tools:
139
+
140
+ ```python
141
+ from synkro import create_pipeline, ToolDefinition, DatasetType
142
+
143
+ # Define your tools
144
+ web_search = ToolDefinition(
145
+ name="web_search",
146
+ description="Search the web for current information",
147
+ parameters={
148
+ "type": "object",
149
+ "properties": {
150
+ "query": {"type": "string", "description": "Search query"}
151
+ },
152
+ "required": ["query"]
153
+ },
154
+ mock_responses=["NYC: 72°F, sunny", "BTC: $67,234"]
155
+ )
156
+
157
+ # Create pipeline with tools
158
+ pipeline = create_pipeline(
159
+ dataset_type=DatasetType.TOOL_CALL,
160
+ tools=[web_search],
161
+ )
162
+
163
+ # Generate from tool usage guidelines
164
+ dataset = pipeline.generate("""
165
+ Use web_search for real-time data like weather, prices.
166
+ Answer general questions directly without tools.
167
+ """, traces=20)
168
+
169
+ dataset.save("tool_training.jsonl", format="tool_call") # OpenAI format
170
+ dataset.save("tool_training_chatml.jsonl", format="chatml")  # ChatML with XML tags (separate file so the OpenAI export is not overwritten)
171
+ ```
172
+
173
+ **Output Formats:**
174
+
175
+ OpenAI function calling (`format="tool_call"`):
176
+ ```json
177
+ {"messages": [
178
+ {"role": "user", "content": "What's the weather in NYC?"},
179
+ {"role": "assistant", "content": null, "tool_calls": [
180
+ {"id": "call_abc", "type": "function", "function": {"name": "web_search", "arguments": "{\"query\": \"weather NYC\"}"}}
181
+ ]},
182
+ {"role": "tool", "tool_call_id": "call_abc", "content": "NYC: 72°F, sunny"},
183
+ {"role": "assistant", "content": "The weather in NYC is 72°F and sunny."}
184
+ ]}
185
+ ```
186
+
187
+ ChatML with XML tags (`format="chatml"`):
188
+ ```json
189
+ {"messages": [
190
+ {"role": "user", "content": "What's the weather in NYC?"},
191
+ {"role": "assistant", "content": "<tool_call>\n{\"name\": \"web_search\", \"arguments\": {\"query\": \"weather NYC\"}}\n</tool_call>"},
192
+ {"role": "tool", "content": "<tool_response>\nNYC: 72°F, sunny\n</tool_response>"},
193
+ {"role": "assistant", "content": "The weather in NYC is 72°F and sunny."}
194
+ ]}
195
+ ```
196
+
197
+ ## Evaluation & Grading
198
+
199
+ Every response is graded on policy compliance, citations, and reasoning. Responses that fail grading are automatically refined, up to `max_iterations` attempts.
200
+
201
+ ```python
202
+ from synkro.pipelines import create_pipeline
203
+ from synkro.models.openai import OpenAI
204
+
205
+ pipeline = create_pipeline(
206
+ model=OpenAI.GPT_4O_MINI, # Fast generation
207
+ grading_model=OpenAI.GPT_4O, # Quality grading
208
+ max_iterations=3, # Refinement attempts
209
+ )
210
+
211
+ dataset = pipeline.generate(policy, traces=100)
212
+
213
+ # Check quality
214
+ print(f"Pass rate: {dataset.passing_rate:.1%}")
215
+
216
+ # Filter to only passing traces
217
+ high_quality = dataset.filter(passed=True)
218
+ high_quality.save("training.jsonl")
219
+ ```
220
+
221
+ ## CLI
222
+
223
+ ```bash
224
+ # From file
225
+ synkro generate policy.pdf --traces 50
226
+
227
+ # From text
228
+ synkro generate 'All expenses over $50 need approval' -n 20
229
+
230
+ # From URL
231
+ synkro generate https://example.com/policy -o training.jsonl
232
+
233
+ # Skip interactive mode
234
+ synkro generate policy.pdf --no-interactive
235
+ ```
236
+
237
+ **Options:**
238
+ - `--traces, -n` - Number of traces (default: 20)
239
+ - `--output, -o` - Output file path
240
+ - `--model, -m` - Model for generation
241
+ - `--interactive/-i, --no-interactive/-I` - Review/edit extracted rules before generation (default: on)
242
+
243
+ ## Interactive Mode
244
+
245
+ By default, synkro extracts policy rules into a Logic Map and lets you review/edit them before generation. The interactive session also shows the recommended conversation turns based on policy complexity:
246
+
247
+ ```
248
+ ╭─────────────────────────── Conversation Settings ────────────────────────────╮
249
+ │ Complexity: Conditional │
250
+ │ Turns: 3 │
251
+ ╰──────────────────────────────────────────────────────────────────────────────╯
252
+
253
+ ╭────────────────────────── 📜 Logic Map (3 rules) ────────────────────────────╮
254
+ │ ├── R001: Expenses over $50 require manager approval │
255
+ │ ├── R002: Client meals limited to $75/person │
256
+ │ └── R003: Receipts required for all expenses │
257
+ ╰──────────────────────────────────────────────────────────────────────────────╯
258
+
259
+ Enter feedback: shorter conversations
260
+ ✓ Set to 2 turns (User requested shorter/simpler conversations)
261
+
262
+ Enter feedback: add a rule for travel expenses
263
+ ✓ Added R004: Travel expenses over $500 require VP approval
264
+
265
+ Enter feedback: done
266
+ ✅ Session complete - 1 rule change(s), 2 turns
267
+ ```
268
+
269
+ You can adjust both **conversation turns** and **rules** using natural language:
270
+
271
+ | Input | Action |
272
+ |-------|--------|
273
+ | `"shorter conversations"` | Reduce turns (1-2) |
274
+ | `"I want 5 turns"` | Set specific turn count |
275
+ | `"more thorough"` | Increase turns (5-6) |
276
+ | `"remove R002"` | Delete a rule |
277
+ | `"add a rule for..."` | Add new rule |
278
+
279
+ Commands: `done`, `undo`, `reset`, `show R001`, `help`
280
+
281
+ ## Logic Map Inspection
282
+
283
+ Access the extracted rules programmatically:
284
+
285
+ ```python
286
+ result = pipeline.generate(policy, traces=50, return_logic_map=True)
287
+
288
+ # Inspect extracted rules
289
+ for rule in result.logic_map.rules:
290
+     print(f"{rule.rule_id}: {rule.text}")
291
+
292
+ # Get the dataset
293
+ dataset = result.dataset
294
+ ```