synkro 0.4.36__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synkro might be problematic. Click here for more details.

Files changed (81) hide show
  1. synkro/__init__.py +331 -0
  2. synkro/advanced.py +184 -0
  3. synkro/cli.py +156 -0
  4. synkro/core/__init__.py +7 -0
  5. synkro/core/checkpoint.py +250 -0
  6. synkro/core/dataset.py +432 -0
  7. synkro/core/policy.py +337 -0
  8. synkro/errors.py +178 -0
  9. synkro/examples/__init__.py +148 -0
  10. synkro/factory.py +291 -0
  11. synkro/formatters/__init__.py +18 -0
  12. synkro/formatters/chatml.py +121 -0
  13. synkro/formatters/langfuse.py +98 -0
  14. synkro/formatters/langsmith.py +98 -0
  15. synkro/formatters/qa.py +112 -0
  16. synkro/formatters/sft.py +90 -0
  17. synkro/formatters/tool_call.py +127 -0
  18. synkro/generation/__init__.py +9 -0
  19. synkro/generation/follow_ups.py +134 -0
  20. synkro/generation/generator.py +314 -0
  21. synkro/generation/golden_responses.py +269 -0
  22. synkro/generation/golden_scenarios.py +333 -0
  23. synkro/generation/golden_tool_responses.py +791 -0
  24. synkro/generation/logic_extractor.py +126 -0
  25. synkro/generation/multiturn_responses.py +177 -0
  26. synkro/generation/planner.py +131 -0
  27. synkro/generation/responses.py +189 -0
  28. synkro/generation/scenarios.py +90 -0
  29. synkro/generation/tool_responses.py +625 -0
  30. synkro/generation/tool_simulator.py +114 -0
  31. synkro/interactive/__init__.py +16 -0
  32. synkro/interactive/hitl_session.py +205 -0
  33. synkro/interactive/intent_classifier.py +94 -0
  34. synkro/interactive/logic_map_editor.py +176 -0
  35. synkro/interactive/rich_ui.py +459 -0
  36. synkro/interactive/scenario_editor.py +198 -0
  37. synkro/llm/__init__.py +7 -0
  38. synkro/llm/client.py +309 -0
  39. synkro/llm/rate_limits.py +99 -0
  40. synkro/models/__init__.py +50 -0
  41. synkro/models/anthropic.py +26 -0
  42. synkro/models/google.py +19 -0
  43. synkro/models/local.py +104 -0
  44. synkro/models/openai.py +31 -0
  45. synkro/modes/__init__.py +13 -0
  46. synkro/modes/config.py +66 -0
  47. synkro/modes/conversation.py +35 -0
  48. synkro/modes/tool_call.py +18 -0
  49. synkro/parsers.py +442 -0
  50. synkro/pipeline/__init__.py +20 -0
  51. synkro/pipeline/phases.py +592 -0
  52. synkro/pipeline/runner.py +769 -0
  53. synkro/pipelines.py +136 -0
  54. synkro/prompts/__init__.py +57 -0
  55. synkro/prompts/base.py +167 -0
  56. synkro/prompts/golden_templates.py +533 -0
  57. synkro/prompts/interactive_templates.py +198 -0
  58. synkro/prompts/multiturn_templates.py +156 -0
  59. synkro/prompts/templates.py +281 -0
  60. synkro/prompts/tool_templates.py +318 -0
  61. synkro/quality/__init__.py +14 -0
  62. synkro/quality/golden_refiner.py +163 -0
  63. synkro/quality/grader.py +153 -0
  64. synkro/quality/multiturn_grader.py +150 -0
  65. synkro/quality/refiner.py +137 -0
  66. synkro/quality/tool_grader.py +126 -0
  67. synkro/quality/tool_refiner.py +128 -0
  68. synkro/quality/verifier.py +228 -0
  69. synkro/reporting.py +464 -0
  70. synkro/schemas.py +521 -0
  71. synkro/types/__init__.py +43 -0
  72. synkro/types/core.py +153 -0
  73. synkro/types/dataset_type.py +33 -0
  74. synkro/types/logic_map.py +348 -0
  75. synkro/types/tool.py +94 -0
  76. synkro-0.4.36.data/data/examples/__init__.py +148 -0
  77. synkro-0.4.36.dist-info/METADATA +507 -0
  78. synkro-0.4.36.dist-info/RECORD +81 -0
  79. synkro-0.4.36.dist-info/WHEEL +4 -0
  80. synkro-0.4.36.dist-info/entry_points.txt +2 -0
  81. synkro-0.4.36.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,148 @@
1
+ """Built-in example policies for instant demos."""
2
+
3
+ EXPENSE_POLICY = """# Company Expense Policy
4
+
5
+ ## Approval Thresholds
6
+ - Expenses under $50: No approval required
7
+ - Expenses $50-$500: Manager approval required
8
+ - Expenses over $500: VP approval required
9
+
10
+ ## Receipt Requirements
11
+ - All expenses over $25 must have a receipt
12
+ - Digital receipts are acceptable
13
+ - Missing receipts require written justification within 48 hours
14
+
15
+ ## Categories
16
+ - Travel: Flights, hotels, ground transportation, meals while traveling
17
+ - Meals: Client meals, team events (max $75/person)
18
+ - Software: Must be on pre-approved list, exceptions need IT approval
19
+ - Equipment: Must be on asset tracking list if over $200
20
+ - Office Supplies: Under $100 can be purchased directly
21
+
22
+ ## Reimbursement Timeline
23
+ - Submit expenses within 30 days of purchase
24
+ - Reimbursements processed within 14 business days
25
+ - Late submissions require manager exception approval
26
+ """
27
+
28
+ HR_HANDBOOK = """# Employee Handbook
29
+
30
+ ## Work Hours
31
+ - Standard work week is 40 hours, Monday through Friday
32
+ - Core hours are 10am to 3pm when all employees should be available
33
+ - Flexible scheduling allowed with manager approval
34
+
35
+ ## Time Off
36
+ - Full-time employees receive 15 days PTO per year
37
+ - PTO accrues monthly (1.25 days per month)
38
+ - Unused PTO can roll over up to 5 days
39
+ - PTO requests must be submitted 2 weeks in advance for 3+ days
40
+
41
+ ## Remote Work
42
+ - Hybrid schedule: minimum 2 days in office per week
43
+ - Fully remote requires director approval
44
+ - Home office stipend of $500 for remote workers
45
+
46
+ ## Performance Reviews
47
+ - Annual reviews conducted in December
48
+ - Mid-year check-ins in June
49
+ - Goals set at start of fiscal year
50
+ - Promotions considered during annual review cycle only
51
+ """
52
+
53
+ REFUND_POLICY = """# Return and Refund Policy
54
+
55
+ ## Eligibility
56
+ - Items can be returned within 30 days of purchase
57
+ - Items must be unused and in original packaging
58
+ - Receipt or proof of purchase required
59
+
60
+ ## Exceptions
61
+ - Final sale items cannot be returned
62
+ - Personalized items cannot be returned
63
+ - Perishable goods cannot be returned after 7 days
64
+
65
+ ## Refund Process
66
+ - Refunds issued to original payment method
67
+ - Processing takes 5-10 business days
68
+ - Shipping costs are non-refundable unless item was defective
69
+
70
+ ## Exchanges
71
+ - Exchanges available within 30 days
72
+ - Size exchanges free of charge
73
+ - Different item exchanges treated as return + new purchase
74
+
75
+ ## Defective Items
76
+ - Report defects within 14 days
77
+ - Photos required for defect claims
78
+ - Replacement or full refund offered for confirmed defects
79
+ """
80
+
81
+ SUPPORT_GUIDELINES = """# Customer Support Guidelines
82
+
83
+ ## Response Times
84
+ - Chat: Respond within 2 minutes
85
+ - Email: Respond within 4 hours during business hours
86
+ - Phone: Answer within 30 seconds, max hold time 3 minutes
87
+
88
+ ## Escalation Tiers
89
+ - Tier 1: General questions, password resets, basic troubleshooting
90
+ - Tier 2: Technical issues, billing disputes, account problems
91
+ - Tier 3: Complex technical issues, executive escalations
92
+
93
+ ## Refund Authority
94
+ - Tier 1 can issue refunds up to $50
95
+ - Tier 2 can issue refunds up to $200
96
+ - Tier 3 or manager approval needed for refunds over $200
97
+
98
+ ## Documentation
99
+ - Log all customer interactions in CRM
100
+ - Include customer sentiment and issue category
101
+ - Note any promised follow-ups with deadlines
102
+ """
103
+
104
+ SECURITY_POLICY = """# Information Security Policy
105
+
106
+ ## Password Requirements
107
+ - Minimum 12 characters
108
+ - Must include uppercase, lowercase, number, and symbol
109
+ - Change every 90 days
110
+ - Cannot reuse last 10 passwords
111
+
112
+ ## Access Control
113
+ - Principle of least privilege applies
114
+ - Access requests require manager approval
115
+ - Quarterly access reviews mandatory
116
+ - Terminate access within 24 hours of employee departure
117
+
118
+ ## Data Classification
119
+ - Public: Marketing materials, job postings
120
+ - Internal: Company announcements, policies
121
+ - Confidential: Customer data, financials
122
+ - Restricted: PII, payment info, credentials
123
+
124
+ ## Incident Response
125
+ - Report security incidents within 1 hour
126
+ - Do not attempt to investigate independently
127
+ - Preserve evidence (don't delete logs or files)
128
+ - Security team leads all incident response
129
+ """
130
+
131
+ # All policies available as a list
132
+ ALL_POLICIES = [
133
+ ("expense", EXPENSE_POLICY),
134
+ ("hr", HR_HANDBOOK),
135
+ ("refund", REFUND_POLICY),
136
+ ("support", SUPPORT_GUIDELINES),
137
+ ("security", SECURITY_POLICY),
138
+ ]
139
+
140
+ __all__ = [
141
+ "EXPENSE_POLICY",
142
+ "HR_HANDBOOK",
143
+ "REFUND_POLICY",
144
+ "SUPPORT_GUIDELINES",
145
+ "SECURITY_POLICY",
146
+ "ALL_POLICIES",
147
+ ]
148
+
@@ -0,0 +1,507 @@
1
+ Metadata-Version: 2.4
2
+ Name: synkro
3
+ Version: 0.4.36
4
+ Summary: Generate training datasets from any document
5
+ Author: Murtaza Meerza
6
+ License-Expression: MIT
7
+ License-File: LICENSE
8
+ Keywords: dataset-generation,fine-tuning,llm,synthetic-data,training-data
9
+ Classifier: Development Status :: 4 - Beta
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
17
+ Requires-Python: >=3.10
18
+ Requires-Dist: beautifulsoup4>=4.12
19
+ Requires-Dist: html2text>=2020.1
20
+ Requires-Dist: httpx>=0.25
21
+ Requires-Dist: litellm>=1.40
22
+ Requires-Dist: mammoth>=1.6
23
+ Requires-Dist: pydantic>=2.0
24
+ Requires-Dist: pymupdf>=1.24
25
+ Requires-Dist: rich>=13.0
26
+ Requires-Dist: typer>=0.9
27
+ Provides-Extra: dev
28
+ Requires-Dist: pytest-asyncio>=0.21; extra == 'dev'
29
+ Requires-Dist: pytest>=7.0; extra == 'dev'
30
+ Requires-Dist: ruff>=0.1; extra == 'dev'
31
+ Description-Content-Type: text/markdown
32
+
33
+ # Synkro
34
+
35
+ Turn policies, handbooks, and documentation into high-quality training data for fine-tuning LLMs.
36
+
37
+ ## Features
38
+
39
+ - **Quality Evaluation** - Each response is graded and automatically refined if it fails
40
+ - **Multiple Formats** - Conversation (multi-turn), Instruction (single-turn), Evaluation (Q&A), and Tool Calling
41
+ - **Eval Platform Support** - Export to LangSmith, Langfuse, or generic Q&A format
42
+ - **Tool Call Training** - Generate OpenAI function calling format for teaching models to use custom tools
43
+ - **Top LLM Providers** - OpenAI, Anthropic, Google, and local models (Ollama, vLLM)
44
+ - **File Support** - PDF, DOCX, TXT, Markdown, URLs
45
+ - **CLI Included** - Generate datasets from the command line
46
+ - **Cost Tracking** - See total cost and LLM call breakdown after each generation
47
+
48
+ ## Installation
49
+
50
+ ```bash
51
+ pip install synkro
52
+ ```
53
+
54
+ ## Quick Start
55
+
56
+ ```python
57
+ from synkro.pipelines import create_pipeline
58
+ from synkro.models.google import Google
59
+ from synkro.types import DatasetType
60
+
61
+ pipeline = create_pipeline(
62
+ model=Google.GEMINI_25_FLASH, # Fast generation
63
+ grading_model=Google.GEMINI_25_PRO, # Quality grading
64
+ dataset_type=DatasetType.CONVERSATION,
65
+ )
66
+
67
+ dataset = pipeline.generate(
68
+ "All expenses over $50 require manager approval.",
69
+ traces=50,
70
+ )
71
+ dataset.save("training.jsonl")
72
+ ```
73
+
74
+ ### From Files
75
+
76
+ ```python
77
+ from synkro.pipelines import create_pipeline
78
+ from synkro.core.policy import Policy
79
+
80
+ policy = Policy.from_file("handbook.pdf") # PDF, DOCX, TXT, MD
81
+ pipeline = create_pipeline()
82
+ dataset = pipeline.generate(policy, traces=100)
83
+ dataset.save()
84
+ ```
85
+
86
+ ### From URLs
87
+
88
+ ```python
89
+ from synkro.core.policy import Policy
90
+
91
+ policy = Policy.from_url("https://example.com/terms")
92
+ dataset = pipeline.generate(policy)
93
+ ```
94
+
95
+ ## Dataset Types
96
+
97
+ | Type | Turns | Output Formats | Best For |
98
+ |------|-------|----------------|----------|
99
+ | **CONVERSATION** | Multi | sft, chatml | Fine-tuning chat models |
100
+ | **INSTRUCTION** | 1 | sft, chatml | Instruction-following models |
101
+ | **EVALUATION** | 1 | qa, langsmith, langfuse | LLM evaluation & benchmarks |
102
+ | **TOOL_CALL** | Multi | tool_call, chatml | Teaching tool use |
103
+
104
+ ### Conversation (Default)
105
+
106
+ ```python
107
+ from synkro.types import DatasetType
108
+
109
+ pipeline = create_pipeline(dataset_type=DatasetType.CONVERSATION)
110
+ dataset = pipeline.generate(policy)
111
+ ```
112
+
113
+ Output (multi-turn):
114
+ ```json
115
+ {"messages": [
116
+ {"role": "user", "content": "What's the approval process for $350?"},
117
+ {"role": "assistant", "content": "For a $350 expense, you need manager approval..."},
118
+ {"role": "user", "content": "What if my manager is unavailable?"},
119
+ {"role": "assistant", "content": "You can request approval from..."}
120
+ ]}
121
+ ```
122
+
123
+ ### Instruction
124
+
125
+ ```python
126
+ pipeline = create_pipeline(dataset_type=DatasetType.INSTRUCTION)
127
+ dataset = pipeline.generate(policy)
128
+ ```
129
+
130
+ Output (single-turn):
131
+ ```json
132
+ {"messages": [
133
+ {"role": "user", "content": "What's the approval process for $350?"},
134
+ {"role": "assistant", "content": "For a $350 expense, you need manager approval. Submit the expense report with receipt..."}
135
+ ]}
136
+ ```
137
+
138
+ ### Evaluation
139
+
140
+ Generate Q&A datasets for LLM evaluation with ground truth:
141
+
142
+ ```python
143
+ pipeline = create_pipeline(dataset_type=DatasetType.EVALUATION)
144
+ dataset = pipeline.generate(policy, traces=50)
145
+
146
+ # Save in different formats
147
+ dataset.save("eval.jsonl", format="qa") # Generic Q&A
148
+ dataset.save("eval.jsonl", format="langsmith") # LangSmith format
149
+ dataset.save("eval.jsonl", format="langfuse") # Langfuse format
150
+ ```
151
+
152
+ Output (`format="qa"`):
153
+ ```json
154
+ {
155
+ "question": "Can I submit a $200 expense without a receipt?",
156
+ "answer": "All expenses require receipts per policy...",
157
+ "expected_outcome": "Deny - missing receipt violates R003",
158
+ "ground_truth_rules": ["R003", "R005"],
159
+ "difficulty": "negative",
160
+ "category": "Receipt Requirements"
161
+ }
162
+ ```
163
+
164
+ Output (`format="langsmith"`):
165
+ ```json
166
+ {
167
+ "inputs": {"question": "...", "context": "..."},
168
+ "outputs": {"answer": "..."},
169
+ "metadata": {"expected_outcome": "...", "ground_truth_rules": [...]}
170
+ }
171
+ ```
172
+
173
+ Output (`format="langfuse"`):
174
+ ```json
175
+ {
176
+ "input": {"question": "...", "context": "..."},
177
+ "expectedOutput": {"answer": "...", "expected_outcome": "..."},
178
+ "metadata": {"ground_truth_rules": [...], "difficulty": "..."}
179
+ }
180
+ ```
181
+
182
+ ### Tool Calling
183
+
184
+ Generate training data for teaching models when and how to use your custom tools:
185
+
186
+ ```python
187
+ from synkro import create_pipeline, ToolDefinition, DatasetType
188
+
189
+ # Define your tools
190
+ web_search = ToolDefinition(
191
+ name="web_search",
192
+ description="Search the web for current information",
193
+ parameters={
194
+ "type": "object",
195
+ "properties": {
196
+ "query": {"type": "string", "description": "Search query"}
197
+ },
198
+ "required": ["query"]
199
+ },
200
+ mock_responses=["NYC: 72°F, sunny", "BTC: $67,234"]
201
+ )
202
+
203
+ # Create pipeline with tools
204
+ pipeline = create_pipeline(
205
+ dataset_type=DatasetType.TOOL_CALL,
206
+ tools=[web_search],
207
+ )
208
+
209
+ # Generate from tool usage guidelines
210
+ dataset = pipeline.generate("""
211
+ Use web_search for real-time data like weather, prices.
212
+ Answer general questions directly without tools.
213
+ """, traces=20)
214
+
215
+ dataset.save("tool_training.jsonl", format="tool_call") # OpenAI format
216
+ dataset.save("tool_training.jsonl", format="chatml") # ChatML with XML tags
217
+ ```
218
+
219
+ **Output Formats:**
220
+
221
+ OpenAI function calling (`format="tool_call"`):
222
+ ```json
223
+ {"messages": [
224
+ {"role": "user", "content": "What's the weather in NYC?"},
225
+ {"role": "assistant", "content": null, "tool_calls": [
226
+ {"id": "call_abc", "type": "function", "function": {"name": "web_search", "arguments": "{\"query\": \"weather NYC\"}"}}
227
+ ]},
228
+ {"role": "tool", "tool_call_id": "call_abc", "content": "NYC: 72°F, sunny"},
229
+ {"role": "assistant", "content": "The weather in NYC is 72°F and sunny."}
230
+ ]}
231
+ ```
232
+
233
+ ChatML with XML tags (`format="chatml"`):
234
+ ```json
235
+ {"messages": [
236
+ {"role": "user", "content": "What's the weather in NYC?"},
237
+ {"role": "assistant", "content": "<tool_call>\n{\"name\": \"web_search\", \"arguments\": {\"query\": \"weather NYC\"}}\n</tool_call>"},
238
+ {"role": "tool", "content": "<tool_response>\nNYC: 72°F, sunny\n</tool_response>"},
239
+ {"role": "assistant", "content": "The weather in NYC is 72°F and sunny."}
240
+ ]}
241
+ ```
242
+
243
+ ## Evaluation & Grading
244
+
245
+ Every response is graded on policy compliance, citations, and reasoning. Failed responses are automatically refined (up to N iterations).
246
+
247
+ ```python
248
+ from synkro.pipelines import create_pipeline
249
+ from synkro.models.openai import OpenAI
250
+
251
+ pipeline = create_pipeline(
252
+ model=OpenAI.GPT_4O_MINI, # Fast generation
253
+ grading_model=OpenAI.GPT_4O, # Quality grading
254
+ max_iterations=3, # Refinement attempts
255
+ )
256
+
257
+ dataset = pipeline.generate(policy, traces=100)
258
+
259
+ # Check quality
260
+ print(f"Pass rate: {dataset.passing_rate:.1%}")
261
+
262
+ # Filter to only passing traces
263
+ high_quality = dataset.filter(passed=True)
264
+ high_quality.save("training.jsonl")
265
+ ```
266
+
267
+ ## Eval API
268
+
269
+ Generate test scenarios and grade your own model's responses against policy compliance.
270
+
271
+ ```python
272
+ import synkro
273
+
274
+ # Generate scenarios with ground truth (no synthetic responses)
275
+ result = synkro.generate_scenarios(
276
+ policy="Expenses over $50 require manager approval...",
277
+ count=100,
278
+ )
279
+
280
+ # Each scenario has ground truth labels
281
+ for scenario in result.scenarios:
282
+ print(scenario.user_message) # "Can I expense a $200 dinner?"
283
+ print(scenario.expected_outcome) # "Requires manager approval per R001"
284
+ print(scenario.target_rule_ids) # ["R001", "R003"]
285
+ print(scenario.scenario_type) # "positive" | "negative" | "edge_case" | "irrelevant"
286
+
287
+ # Grade YOUR model's responses
288
+ for scenario in result.scenarios:
289
+ response = my_model(scenario.user_message) # Your model
290
+ grade = synkro.grade(response, scenario, policy)
291
+
292
+ if not grade.passed:
293
+ print(f"Failed: {grade.feedback}")
294
+ ```
295
+
296
+ ### When to Use
297
+
298
+ | Use Case | API |
299
+ |----------|-----|
300
+ | Generate training data | `synkro.generate()` |
301
+ | Generate eval scenarios | `synkro.generate_scenarios()` |
302
+ | Grade external model | `synkro.grade()` |
303
+
304
+ ### Scenario Types
305
+
306
+ Scenarios are generated with balanced coverage:
307
+
308
+ | Type | % | Description |
309
+ |------|---|-------------|
310
+ | `positive` | 35% | Happy path - user meets all criteria |
311
+ | `negative` | 30% | Violations - user fails one criterion |
312
+ | `edge_case` | 25% | Boundary conditions at exact limits |
313
+ | `irrelevant` | 10% | Outside policy scope |
314
+
315
+ ### EvalScenario Fields
316
+
317
+ ```python
318
+ scenario.user_message # The test input
319
+ scenario.expected_outcome # Ground truth behavior
320
+ scenario.target_rule_ids # Rules being tested
321
+ scenario.scenario_type # positive/negative/edge_case/irrelevant
322
+ scenario.category # Policy category
323
+ scenario.context # Additional context
324
+ ```
325
+
326
+ ### Temperature
327
+
328
+ Use `temperature` to control output diversity:
329
+
330
+ ```python
331
+ # High temp for diverse scenario coverage
332
+ result = synkro.generate_scenarios(policy, temperature=0.8)
333
+
334
+ # Low temp for deterministic training data
335
+ dataset = synkro.generate(policy, temperature=0.2)
336
+ ```
337
+
338
+ ## Cost & Performance
339
+
340
+ Approximate costs using Gemini 2.5 Flash (multi-turn conversations):
341
+
342
+ | Traces | LLM Calls | Time | Cost |
343
+ |--------|-----------|------|------|
344
+ | 100 | ~335 | ~13 min | ~$3 |
345
+ | 500 | ~1,675 | ~1 hour | ~$14 |
346
+ | 1000 | ~3,350 | ~2 hours | ~$28 |
347
+
348
+ *Based on ~3.3 LLM calls per trace (generation + grading) with max_iterations=3. Actual costs vary by policy complexity and turn count.*
349
+
350
+ ## Local LLMs
351
+
352
+ Run with Ollama, vLLM, or any OpenAI-compatible endpoint:
353
+
354
+ ```python
355
+ from synkro import create_pipeline
356
+ from synkro.models import Local
357
+
358
+ # Ollama
359
+ pipeline = create_pipeline(model=Local.OLLAMA("llama3.2"))
360
+
361
+ # vLLM
362
+ pipeline = create_pipeline(model=Local.VLLM("mistral-7b"))
363
+
364
+ # Custom endpoint
365
+ pipeline = create_pipeline(model=Local.CUSTOM("my-model", endpoint="http://localhost:8080"))
366
+ ```
367
+
368
+ **CLI:**
369
+ ```bash
370
+ synkro generate policy.pdf --provider ollama --model llama3.2
371
+ synkro generate policy.pdf --provider vllm --endpoint http://localhost:8000
372
+ ```
373
+
374
+ ## CLI
375
+
376
+ ```bash
377
+ # From file
378
+ synkro generate policy.pdf --traces 50
379
+
380
+ # From text
381
+ synkro generate 'All expenses over $50 need approval' -n 20
382
+
383
+ # From URL
384
+ synkro generate https://example.com/policy -o training.jsonl
385
+
386
+ # Skip interactive mode
387
+ synkro generate policy.pdf --no-interactive
388
+
389
+ # Quick demo with built-in policy
390
+ synkro demo
391
+ ```
392
+
393
+ **Options:**
394
+ - `--traces, -n` - Number of traces (default: 20)
395
+ - `--output, -o` - Output file path
396
+ - `--model, -m` - Model for generation
397
+ - `--format, -f` - Output format: `sft`, `qa`, `langsmith`, `langfuse`, `tool_call`, `chatml`
398
+ - `--provider, -p` - LLM provider for local models (`ollama`, `vllm`)
399
+ - `--endpoint, -e` - Custom API endpoint URL
400
+ - `--interactive/-i, --no-interactive/-I` - Review/edit extracted rules before generation (default: on)
401
+
402
+ ## Interactive Mode
403
+
404
+ By default, synkro extracts policy rules into a Logic Map and lets you review/edit them before generation. The interactive session also shows the recommended conversation turns based on policy complexity:
405
+
406
+ ```
407
+ ╭─────────────────────────── Conversation Settings ────────────────────────────╮
408
+ │ Complexity: Conditional │
409
+ │ Turns: 3 │
410
+ ╰──────────────────────────────────────────────────────────────────────────────╯
411
+
412
+ ╭────────────────────────── 📜 Logic Map (3 rules) ────────────────────────────╮
413
+ │ ├── R001: Expenses over $50 require manager approval │
414
+ │ ├── R002: Client meals limited to $75/person │
415
+ │ └── R003: Receipts required for all expenses │
416
+ ╰──────────────────────────────────────────────────────────────────────────────╯
417
+
418
+ Enter feedback: shorter conversations
419
+ ✓ Set to 2 turns (User requested shorter/simpler conversations)
420
+
421
+ Enter feedback: add a rule for travel expenses
422
+ ✓ Added R004: Travel expenses over $500 require VP approval
423
+
424
+ Enter feedback: done
425
+ ✅ Session complete - 1 rule change(s), 2 turns
426
+ ```
427
+
428
+ You can adjust both **conversation turns** and **rules** using natural language:
429
+
430
+ | Input | Action |
431
+ |-------|--------|
432
+ | `"shorter conversations"` | Reduce turns (1-2) |
433
+ | `"I want 5 turns"` | Set specific turn count |
434
+ | `"more thorough"` | Increase turns (5-6) |
435
+ | `"remove R002"` | Delete a rule |
436
+ | `"add a rule for..."` | Add new rule |
437
+
438
+ Commands: `done`, `undo`, `reset`, `show R001`, `help`
439
+
440
+ ## Advanced Features
441
+
442
+ ### Checkpointing
443
+
444
+ Resume interrupted generations:
445
+
446
+ ```python
447
+ pipeline = create_pipeline(checkpoint_dir="./checkpoints")
448
+ dataset = pipeline.generate(policy, traces=100) # Resumes from checkpoint
449
+ ```
450
+
451
+ ### Dataset Operations
452
+
453
+ ```python
454
+ # Filter by quality
455
+ high_quality = dataset.filter(passed=True)
456
+
457
+ # Remove duplicates
458
+ unique = dataset.dedupe(threshold=0.85)
459
+
460
+ # Check pass rate
461
+ print(f"Pass rate: {dataset.passing_rate:.1%}")
462
+ ```
463
+
464
+ ### Folder Loading
465
+
466
+ Generate from multiple documents at once:
467
+
468
+ ```python
469
+ from synkro.core.policy import Policy
470
+
471
+ policy = Policy.from_file("policies/") # Loads all PDF, DOCX, TXT, MD files
472
+ dataset = pipeline.generate(policy, traces=100)
473
+ ```
474
+
475
+ ### Thinking Mode
476
+
477
+ Generate training data with explicit reasoning in `<think>` tags, compatible with Qwen3 and DeepSeek-R1:
478
+
479
+ ```python
480
+ pipeline = create_pipeline(thinking=True)
481
+ dataset = pipeline.generate(policy, traces=50)
482
+ ```
483
+
484
+ Output:
485
+ ```json
486
+ {"messages": [
487
+ {"role": "user", "content": "Can I expense a $350 team dinner?"},
488
+ {"role": "assistant", "content": "<think>\nLet me check the expense policy...\n- Rule: Expenses over $50 require manager approval\n- $350 exceeds the $50 threshold\n- Manager approval is required\n</think>\n\nFor a $350 team dinner, you'll need manager approval since it exceeds the $50 threshold. Please submit your expense report with the receipt and request approval from your manager."}
489
+ ]}
490
+ ```
491
+
492
+ Works with the `CONVERSATION`, `INSTRUCTION`, and `TOOL_CALL` dataset types.
493
+
494
+ ## Logic Map Inspection
495
+
496
+ Access the extracted rules programmatically:
497
+
498
+ ```python
499
+ result = pipeline.generate(policy, traces=50, return_logic_map=True)
500
+
501
+ # Inspect extracted rules
502
+ for rule in result.logic_map.rules:
503
+ print(f"{rule.rule_id}: {rule.text}")
504
+
505
+ # Get the dataset
506
+ dataset = result.dataset
507
+ ```