@bgicli/bgicli 2.2.8 → 2.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113) hide show
  1. package/data/skills/anthropic-algorithmic-art/SKILL.md +405 -0
  2. package/data/skills/anthropic-canvas-design/SKILL.md +130 -0
  3. package/data/skills/anthropic-claude-api/SKILL.md +243 -0
  4. package/data/skills/anthropic-doc-coauthoring/SKILL.md +375 -0
  5. package/data/skills/anthropic-docx/SKILL.md +590 -0
  6. package/data/skills/anthropic-frontend-design/SKILL.md +42 -0
  7. package/data/skills/anthropic-internal-comms/SKILL.md +32 -0
  8. package/data/skills/anthropic-mcp-builder/SKILL.md +236 -0
  9. package/data/skills/anthropic-pdf/SKILL.md +314 -0
  10. package/data/skills/anthropic-pptx/SKILL.md +232 -0
  11. package/data/skills/anthropic-skill-creator/SKILL.md +485 -0
  12. package/data/skills/anthropic-webapp-testing/SKILL.md +96 -0
  13. package/data/skills/anthropic-xlsx/SKILL.md +292 -0
  14. package/data/skills/arxiv-database/SKILL.md +362 -0
  15. package/data/skills/astropy/SKILL.md +329 -0
  16. package/data/skills/ctx-advanced-evaluation/SKILL.md +402 -0
  17. package/data/skills/ctx-bdi-mental-states/SKILL.md +311 -0
  18. package/data/skills/ctx-context-compression/SKILL.md +272 -0
  19. package/data/skills/ctx-context-degradation/SKILL.md +206 -0
  20. package/data/skills/ctx-context-fundamentals/SKILL.md +201 -0
  21. package/data/skills/ctx-context-optimization/SKILL.md +195 -0
  22. package/data/skills/ctx-evaluation/SKILL.md +251 -0
  23. package/data/skills/ctx-filesystem-context/SKILL.md +287 -0
  24. package/data/skills/ctx-hosted-agents/SKILL.md +260 -0
  25. package/data/skills/ctx-memory-systems/SKILL.md +225 -0
  26. package/data/skills/ctx-multi-agent-patterns/SKILL.md +257 -0
  27. package/data/skills/ctx-project-development/SKILL.md +291 -0
  28. package/data/skills/ctx-tool-design/SKILL.md +271 -0
  29. package/data/skills/dhdna-profiler/SKILL.md +162 -0
  30. package/data/skills/generate-image/SKILL.md +183 -0
  31. package/data/skills/geomaster/SKILL.md +365 -0
  32. package/data/skills/get-available-resources/SKILL.md +275 -0
  33. package/data/skills/hamelsmu-build-review-interface/SKILL.md +96 -0
  34. package/data/skills/hamelsmu-error-analysis/SKILL.md +164 -0
  35. package/data/skills/hamelsmu-eval-audit/SKILL.md +183 -0
  36. package/data/skills/hamelsmu-evaluate-rag/SKILL.md +177 -0
  37. package/data/skills/hamelsmu-generate-synthetic-data/SKILL.md +131 -0
  38. package/data/skills/hamelsmu-validate-evaluator/SKILL.md +212 -0
  39. package/data/skills/hamelsmu-write-judge-prompt/SKILL.md +144 -0
  40. package/data/skills/hf-cli/SKILL.md +174 -0
  41. package/data/skills/hf-mcp/SKILL.md +178 -0
  42. package/data/skills/hugging-face-dataset-viewer/SKILL.md +121 -0
  43. package/data/skills/hugging-face-datasets/SKILL.md +542 -0
  44. package/data/skills/hugging-face-evaluation/SKILL.md +651 -0
  45. package/data/skills/hugging-face-jobs/SKILL.md +1042 -0
  46. package/data/skills/hugging-face-model-trainer/SKILL.md +717 -0
  47. package/data/skills/hugging-face-paper-pages/SKILL.md +239 -0
  48. package/data/skills/hugging-face-paper-publisher/SKILL.md +624 -0
  49. package/data/skills/hugging-face-tool-builder/SKILL.md +110 -0
  50. package/data/skills/hugging-face-trackio/SKILL.md +115 -0
  51. package/data/skills/hugging-face-vision-trainer/SKILL.md +593 -0
  52. package/data/skills/huggingface-gradio/SKILL.md +245 -0
  53. package/data/skills/matlab/SKILL.md +376 -0
  54. package/data/skills/modal/SKILL.md +381 -0
  55. package/data/skills/openai-cloudflare-deploy/SKILL.md +224 -0
  56. package/data/skills/openai-develop-web-game/SKILL.md +149 -0
  57. package/data/skills/openai-doc/SKILL.md +80 -0
  58. package/data/skills/openai-figma/SKILL.md +42 -0
  59. package/data/skills/openai-figma-implement-design/SKILL.md +264 -0
  60. package/data/skills/openai-gh-address-comments/SKILL.md +25 -0
  61. package/data/skills/openai-gh-fix-ci/SKILL.md +69 -0
  62. package/data/skills/openai-imagegen/SKILL.md +174 -0
  63. package/data/skills/openai-jupyter-notebook/SKILL.md +107 -0
  64. package/data/skills/openai-linear/SKILL.md +87 -0
  65. package/data/skills/openai-netlify-deploy/SKILL.md +247 -0
  66. package/data/skills/openai-notion-knowledge-capture/SKILL.md +56 -0
  67. package/data/skills/openai-notion-meeting-intelligence/SKILL.md +60 -0
  68. package/data/skills/openai-notion-research-documentation/SKILL.md +59 -0
  69. package/data/skills/openai-notion-spec-to-implementation/SKILL.md +58 -0
  70. package/data/skills/openai-openai-docs/SKILL.md +69 -0
  71. package/data/skills/openai-pdf/SKILL.md +67 -0
  72. package/data/skills/openai-playwright/SKILL.md +147 -0
  73. package/data/skills/openai-render-deploy/SKILL.md +479 -0
  74. package/data/skills/openai-screenshot/SKILL.md +267 -0
  75. package/data/skills/openai-security-best-practices/SKILL.md +86 -0
  76. package/data/skills/openai-security-ownership-map/SKILL.md +206 -0
  77. package/data/skills/openai-security-threat-model/SKILL.md +81 -0
  78. package/data/skills/openai-sentry/SKILL.md +123 -0
  79. package/data/skills/openai-sora/SKILL.md +178 -0
  80. package/data/skills/openai-speech/SKILL.md +144 -0
  81. package/data/skills/openai-spreadsheet/SKILL.md +145 -0
  82. package/data/skills/openai-transcribe/SKILL.md +81 -0
  83. package/data/skills/openai-vercel-deploy/SKILL.md +77 -0
  84. package/data/skills/openai-yeet/SKILL.md +28 -0
  85. package/data/skills/pennylane/SKILL.md +224 -0
  86. package/data/skills/polars-bio/SKILL.md +374 -0
  87. package/data/skills/primekg/SKILL.md +97 -0
  88. package/data/skills/pymatgen/SKILL.md +689 -0
  89. package/data/skills/qiskit/SKILL.md +273 -0
  90. package/data/skills/qutip/SKILL.md +316 -0
  91. package/data/skills/recursive-decomposition/SKILL.md +185 -0
  92. package/data/skills/rowan/SKILL.md +427 -0
  93. package/data/skills/scholar-evaluation/SKILL.md +298 -0
  94. package/data/skills/sentry-create-alert/SKILL.md +210 -0
  95. package/data/skills/sentry-fix-issues/SKILL.md +126 -0
  96. package/data/skills/sentry-pr-code-review/SKILL.md +105 -0
  97. package/data/skills/sentry-python-sdk/SKILL.md +317 -0
  98. package/data/skills/sentry-setup-ai-monitoring/SKILL.md +217 -0
  99. package/data/skills/stable-baselines3/SKILL.md +297 -0
  100. package/data/skills/sympy/SKILL.md +498 -0
  101. package/data/skills/trailofbits-ask-questions-if-underspecified/SKILL.md +85 -0
  102. package/data/skills/trailofbits-audit-context-building/SKILL.md +302 -0
  103. package/data/skills/trailofbits-differential-review/SKILL.md +220 -0
  104. package/data/skills/trailofbits-insecure-defaults/SKILL.md +117 -0
  105. package/data/skills/trailofbits-modern-python/SKILL.md +333 -0
  106. package/data/skills/trailofbits-property-based-testing/SKILL.md +123 -0
  107. package/data/skills/trailofbits-semgrep-rule-creator/SKILL.md +172 -0
  108. package/data/skills/trailofbits-sharp-edges/SKILL.md +292 -0
  109. package/data/skills/trailofbits-variant-analysis/SKILL.md +142 -0
  110. package/data/skills/transformers.js/SKILL.md +637 -0
  111. package/data/skills/writing/SKILL.md +419 -0
  112. package/dist/bgi.js +2 -2
  113. package/package.json +1 -1
@@ -0,0 +1,542 @@
1
+ ---
2
+ name: hugging-face-datasets
3
+ description: Create and manage datasets on Hugging Face Hub. Supports initializing repos, defining configs/system prompts, streaming row updates, and SQL-based dataset querying/transformation. Designed to work alongside HF MCP server for comprehensive dataset workflows.
4
+ ---
5
+
6
+ # Overview
7
+ This skill provides tools to manage datasets on the Hugging Face Hub with a focus on creation, configuration, content management, and SQL-based data manipulation. It is designed to complement the existing Hugging Face MCP server by providing dataset editing and querying capabilities.
8
+
9
+ ## Integration with HF MCP Server
10
+ - **Use HF MCP Server for**: Dataset discovery, search, and metadata retrieval
11
+ - **Use This Skill for**: Dataset creation, content editing, SQL queries, data transformation, and structured data formatting
12
+
13
+ # Version
14
+ 2.1.0
15
+
16
+ # Dependencies
17
+ This skill uses PEP 723 scripts with inline dependency management.
18
+ Scripts auto-install their requirements when run with: `uv run scripts/script_name.py`
19
+
20
+ - uv (Python package manager)
21
+ - Getting Started: See "Usage Instructions" below for PEP 723 usage
22
+
23
+ # Core Capabilities
24
+
25
+ ## 1. Dataset Lifecycle Management
26
+ - **Initialize**: Create new dataset repositories with proper structure
27
+ - **Configure**: Store detailed configuration including system prompts and metadata
28
+ - **Stream Updates**: Add rows efficiently without downloading entire datasets
29
+
30
+ ## 2. SQL-Based Dataset Querying (NEW)
31
+ Query any Hugging Face dataset using DuckDB SQL via `scripts/sql_manager.py`:
32
+ - **Direct Queries**: Run SQL on datasets using the `hf://` protocol
33
+ - **Schema Discovery**: Describe dataset structure and column types
34
+ - **Data Sampling**: Get random samples for exploration
35
+ - **Aggregations**: Count, histogram, unique values analysis
36
+ - **Transformations**: Filter, join, reshape data with SQL
37
+ - **Export & Push**: Save results locally or push to new Hub repos
38
+
39
+ ## 3. Multi-Format Dataset Support
40
+ Supports diverse dataset types through a template system:
41
+ - **Chat/Conversational**: Chat templating, multi-turn dialogues, tool usage examples
42
+ - **Text Classification**: Sentiment analysis, intent detection, topic classification
43
+ - **Question-Answering**: Reading comprehension, factual QA, knowledge bases
44
+ - **Text Completion**: Language modeling, code completion, creative writing
45
+ - **Tabular Data**: Structured data for regression/classification tasks
46
+ - **Custom Formats**: Flexible schema definition for specialized needs
47
+
48
+ ## 4. Quality Assurance Features
49
+ - **JSON Validation**: Ensures data integrity during uploads
50
+ - **Batch Processing**: Efficient handling of large datasets
51
+ - **Error Recovery**: Graceful handling of upload failures and conflicts
52
+
53
+ # Usage Instructions
54
+
55
+ The skill includes two Python scripts that use PEP 723 inline dependency management:
56
+
57
+ > **All paths are relative to the directory containing this SKILL.md
58
+ file.**
59
+ > Scripts are run with: `uv run scripts/script_name.py [arguments]`
60
+
61
+ - `scripts/dataset_manager.py` - Dataset creation and management
62
+ - `scripts/sql_manager.py` - SQL-based dataset querying and transformation
63
+
64
+ ### Prerequisites
65
+ - `uv` package manager installed
66
+ - `HF_TOKEN` environment variable must be set with a Write-access token
67
+
68
+ ---
69
+
70
+ # SQL Dataset Querying (sql_manager.py)
71
+
72
+ Query, transform, and push Hugging Face datasets using DuckDB SQL. The `hf://` protocol provides direct access to any public dataset (or private with token).
73
+
74
+ ## Quick Start
75
+
76
+ ```bash
77
+ # Query a dataset
78
+ uv run scripts/sql_manager.py query \
79
+ --dataset "cais/mmlu" \
80
+ --sql "SELECT * FROM data WHERE subject='nutrition' LIMIT 10"
81
+
82
+ # Get dataset schema
83
+ uv run scripts/sql_manager.py describe --dataset "cais/mmlu"
84
+
85
+ # Sample random rows
86
+ uv run scripts/sql_manager.py sample --dataset "cais/mmlu" --n 5
87
+
88
+ # Count rows with filter
89
+ uv run scripts/sql_manager.py count --dataset "cais/mmlu" --where "subject='nutrition'"
90
+ ```
91
+
92
+ ## SQL Query Syntax
93
+
94
+ Use `data` as the table name in your SQL - it gets replaced with the actual `hf://` path:
95
+
96
+ ```sql
97
+ -- Basic select
98
+ SELECT * FROM data LIMIT 10
99
+
100
+ -- Filtering
101
+ SELECT * FROM data WHERE subject='nutrition'
102
+
103
+ -- Aggregations
104
+ SELECT subject, COUNT(*) as cnt FROM data GROUP BY subject ORDER BY cnt DESC
105
+
106
+ -- Column selection and transformation
107
+ SELECT question, choices[answer + 1] AS correct_answer FROM data  -- DuckDB lists are 1-based; MMLU's answer index is 0-based
108
+
109
+ -- Regex matching
110
+ SELECT * FROM data WHERE regexp_matches(question, 'nutrition|diet')
111
+
112
+ -- String functions
113
+ SELECT regexp_replace(question, '\n', '') AS cleaned FROM data
114
+ ```
115
+
116
+ ## Common Operations
117
+
118
+ ### 1. Explore Dataset Structure
119
+ ```bash
120
+ # Get schema
121
+ uv run scripts/sql_manager.py describe --dataset "cais/mmlu"
122
+
123
+ # Get unique values in column
124
+ uv run scripts/sql_manager.py unique --dataset "cais/mmlu" --column "subject"
125
+
126
+ # Get value distribution
127
+ uv run scripts/sql_manager.py histogram --dataset "cais/mmlu" --column "subject" --bins 20
128
+ ```
129
+
130
+ ### 2. Filter and Transform
131
+ ```bash
132
+ # Complex filtering with SQL
133
+ uv run scripts/sql_manager.py query \
134
+ --dataset "cais/mmlu" \
135
+ --sql "SELECT subject, COUNT(*) as cnt FROM data GROUP BY subject HAVING cnt > 100"
136
+
137
+ # Using transform command
138
+ uv run scripts/sql_manager.py transform \
139
+ --dataset "cais/mmlu" \
140
+ --select "subject, COUNT(*) as cnt" \
141
+ --group-by "subject" \
142
+ --order-by "cnt DESC" \
143
+ --limit 10
144
+ ```
145
+
146
+ ### 3. Create Subsets and Push to Hub
147
+ ```bash
148
+ # Query and push to new dataset
149
+ uv run scripts/sql_manager.py query \
150
+ --dataset "cais/mmlu" \
151
+ --sql "SELECT * FROM data WHERE subject='nutrition'" \
152
+ --push-to "username/mmlu-nutrition-subset" \
153
+ --private
154
+
155
+ # Transform and push
156
+ uv run scripts/sql_manager.py transform \
157
+ --dataset "ibm/duorc" \
158
+ --config "ParaphraseRC" \
159
+ --select "question, answers" \
160
+ --where "LENGTH(question) > 50" \
161
+ --push-to "username/duorc-long-questions"
162
+ ```
163
+
164
+ ### 4. Export to Local Files
165
+ ```bash
166
+ # Export to Parquet
167
+ uv run scripts/sql_manager.py export \
168
+ --dataset "cais/mmlu" \
169
+ --sql "SELECT * FROM data WHERE subject='nutrition'" \
170
+ --output "nutrition.parquet" \
171
+ --format parquet
172
+
173
+ # Export to JSONL
174
+ uv run scripts/sql_manager.py export \
175
+ --dataset "cais/mmlu" \
176
+ --sql "SELECT * FROM data LIMIT 100" \
177
+ --output "sample.jsonl" \
178
+ --format jsonl
179
+ ```
180
+
181
+ ### 5. Working with Dataset Configs/Splits
182
+ ```bash
183
+ # Specify config (subset)
184
+ uv run scripts/sql_manager.py query \
185
+ --dataset "ibm/duorc" \
186
+ --config "ParaphraseRC" \
187
+ --sql "SELECT * FROM data LIMIT 5"
188
+
189
+ # Specify split
190
+ uv run scripts/sql_manager.py query \
191
+ --dataset "cais/mmlu" \
192
+ --split "test" \
193
+ --sql "SELECT COUNT(*) FROM data"
194
+
195
+ # Query all splits
196
+ uv run scripts/sql_manager.py query \
197
+ --dataset "cais/mmlu" \
198
+ --split "*" \
199
+ --sql "SELECT * FROM data LIMIT 10"
200
+ ```
201
+
202
+ ### 6. Raw SQL with Full Paths
203
+ For complex queries or joining datasets:
204
+ ```bash
205
+ uv run scripts/sql_manager.py raw --sql "
206
+ SELECT a.*, b.*
207
+ FROM 'hf://datasets/dataset1@~parquet/default/train/*.parquet' a
208
+ JOIN 'hf://datasets/dataset2@~parquet/default/train/*.parquet' b
209
+ ON a.id = b.id
210
+ LIMIT 100
211
+ "
212
+ ```
213
+
214
+ ## Python API Usage
215
+
216
+ ```python
217
+ from sql_manager import HFDatasetSQL
218
+
219
+ sql = HFDatasetSQL()
220
+
221
+ # Query
222
+ results = sql.query("cais/mmlu", "SELECT * FROM data WHERE subject='nutrition' LIMIT 10")
223
+
224
+ # Get schema
225
+ schema = sql.describe("cais/mmlu")
226
+
227
+ # Sample
228
+ samples = sql.sample("cais/mmlu", n=5, seed=42)
229
+
230
+ # Count
231
+ count = sql.count("cais/mmlu", where="subject='nutrition'")
232
+
233
+ # Histogram
234
+ dist = sql.histogram("cais/mmlu", "subject")
235
+
236
+ # Filter and transform
237
+ results = sql.filter_and_transform(
238
+ "cais/mmlu",
239
+ select="subject, COUNT(*) as cnt",
240
+ group_by="subject",
241
+ order_by="cnt DESC",
242
+ limit=10
243
+ )
244
+
245
+ # Push to Hub
246
+ url = sql.push_to_hub(
247
+ "cais/mmlu",
248
+ "username/nutrition-subset",
249
+ sql="SELECT * FROM data WHERE subject='nutrition'",
250
+ private=True
251
+ )
252
+
253
+ # Export locally
254
+ sql.export_to_parquet("cais/mmlu", "output.parquet", sql="SELECT * FROM data LIMIT 100")
255
+
256
+ sql.close()
257
+ ```
258
+
259
+ ## HF Path Format
260
+
261
+ DuckDB uses the `hf://` protocol to access datasets:
262
+ ```
263
+ hf://datasets/{dataset_id}@{revision}/{config}/{split}/*.parquet
264
+ ```
265
+
266
+ Examples:
267
+ - `hf://datasets/cais/mmlu@~parquet/default/train/*.parquet`
268
+ - `hf://datasets/ibm/duorc@~parquet/ParaphraseRC/test/*.parquet`
269
+
270
+ The `@~parquet` revision provides auto-converted Parquet files for any dataset format.
271
+
272
+ ## Useful DuckDB SQL Functions
273
+
274
+ ```sql
275
+ -- String functions
276
+ LENGTH(column) -- String length
277
+ regexp_replace(col, '\n', '') -- Regex replace
278
+ regexp_matches(col, 'pattern') -- Regex match
279
+ LOWER(col), UPPER(col) -- Case conversion
280
+
281
+ -- Array functions
282
+ choices[1] -- Array indexing (1-based)
283
+ array_length(choices) -- Array length
284
+ unnest(choices) -- Expand array to rows
285
+
286
+ -- Aggregations
287
+ COUNT(*), SUM(col), AVG(col)
288
+ GROUP BY col HAVING condition
289
+
290
+ -- Sampling
291
+ USING SAMPLE 10 -- Random sample
292
+ USING SAMPLE 10 (RESERVOIR, 42) -- Reproducible sample
293
+
294
+ -- Window functions
295
+ ROW_NUMBER() OVER (PARTITION BY col ORDER BY col2)
296
+ ```
297
+
298
+ ---
299
+
300
+ # Dataset Creation (dataset_manager.py)
301
+
302
+ ### Recommended Workflow
303
+
304
+ **1. Discovery (Use HF MCP Server):**
305
+ ```python
306
+ # Use HF MCP tools to find existing datasets
307
+ search_datasets("conversational AI training")
308
+ get_dataset_details("username/dataset-name")
309
+ ```
310
+
311
+ **2. Creation (Use This Skill):**
312
+ ```bash
313
+ # Initialize new dataset
314
+ uv run scripts/dataset_manager.py init --repo_id "your-username/dataset-name" [--private]
315
+
316
+ # Configure with detailed system prompt
317
+ uv run scripts/dataset_manager.py config --repo_id "your-username/dataset-name" --system_prompt "$(cat system_prompt.txt)"
318
+ ```
319
+
320
+ **3. Content Management (Use This Skill):**
321
+ ```bash
322
+ # Quick setup with any template
323
+ uv run scripts/dataset_manager.py quick_setup \
324
+ --repo_id "your-username/dataset-name" \
325
+ --template classification
326
+
327
+ # Add data with template validation
328
+ uv run scripts/dataset_manager.py add_rows \
329
+ --repo_id "your-username/dataset-name" \
330
+ --template qa \
331
+ --rows_json "$(cat your_qa_data.json)"
332
+ ```
333
+
334
+ ### Template-Based Data Structures
335
+
336
+ **1. Chat Template (`--template chat`)**
337
+ ```json
338
+ {
339
+ "messages": [
340
+ {"role": "user", "content": "Natural user request"},
341
+ {"role": "assistant", "content": "Response with tool usage"},
342
+ {"role": "tool", "content": "Tool response", "tool_call_id": "call_123"}
343
+ ],
344
+ "scenario": "Description of use case",
345
+ "complexity": "simple|intermediate|advanced"
346
+ }
347
+ ```
348
+
349
+ **2. Classification Template (`--template classification`)**
350
+ ```json
351
+ {
352
+ "text": "Input text to be classified",
353
+ "label": "classification_label",
354
+ "confidence": 0.95,
355
+ "metadata": {"domain": "technology", "language": "en"}
356
+ }
357
+ ```
358
+
359
+ **3. QA Template (`--template qa`)**
360
+ ```json
361
+ {
362
+ "question": "What is the question being asked?",
363
+ "answer": "The complete answer",
364
+ "context": "Additional context if needed",
365
+ "answer_type": "factual|explanatory|opinion",
366
+ "difficulty": "easy|medium|hard"
367
+ }
368
+ ```
369
+
370
+ **4. Completion Template (`--template completion`)**
371
+ ```json
372
+ {
373
+ "prompt": "The beginning text or context",
374
+ "completion": "The expected continuation",
375
+ "domain": "code|creative|technical|conversational",
376
+ "style": "description of writing style"
377
+ }
378
+ ```
379
+
380
+ **5. Tabular Template (`--template tabular`)**
381
+ ```json
382
+ {
383
+ "columns": [
384
+ {"name": "feature1", "type": "numeric", "description": "First feature"},
385
+ {"name": "target", "type": "categorical", "description": "Target variable"}
386
+ ],
387
+ "data": [
388
+ {"feature1": 123, "target": "class_a"},
389
+ {"feature1": 456, "target": "class_b"}
390
+ ]
391
+ }
392
+ ```
393
+
394
+ ### Advanced System Prompt Template
395
+
396
+ For high-quality training data generation:
397
+ ```text
398
+ You are an AI assistant expert at using MCP tools effectively.
399
+
400
+ ## MCP SERVER DEFINITIONS
401
+ [Define available servers and tools]
402
+
403
+ ## TRAINING EXAMPLE STRUCTURE
404
+ [Specify exact JSON schema for chat templating]
405
+
406
+ ## QUALITY GUIDELINES
407
+ [Detail requirements for realistic scenarios, progressive complexity, proper tool usage]
408
+
409
+ ## EXAMPLE CATEGORIES
410
+ [List development workflows, debugging scenarios, data management tasks]
411
+ ```
412
+
413
+ ### Example Categories & Templates
414
+
415
+ The skill includes diverse training examples beyond just MCP usage:
416
+
417
+ **Available Example Sets:**
418
+ - `training_examples.json` - MCP tool usage examples (debugging, project setup, database analysis)
419
+ - `diverse_training_examples.json` - Broader scenarios including:
420
+ - **Educational Chat** - Explaining programming concepts, tutorials
421
+ - **Git Workflows** - Feature branches, version control guidance
422
+ - **Code Analysis** - Performance optimization, architecture review
423
+ - **Content Generation** - Professional writing, creative brainstorming
424
+ - **Codebase Navigation** - Legacy code exploration, systematic analysis
425
+ - **Conversational Support** - Problem-solving, technical discussions
426
+
427
+ **Using Different Example Sets:**
428
+ ```bash
429
+ # Add MCP-focused examples
430
+ uv run scripts/dataset_manager.py add_rows --repo_id "your-username/dataset-name" \
431
+ --rows_json "$(cat examples/training_examples.json)"
432
+
433
+ # Add diverse conversational examples
434
+ uv run scripts/dataset_manager.py add_rows --repo_id "your-username/dataset-name" \
435
+ --rows_json "$(cat examples/diverse_training_examples.json)"
436
+
437
+ # Mix both for comprehensive training data
438
+ uv run scripts/dataset_manager.py add_rows --repo_id "your-username/dataset-name" \
439
+ --rows_json "$(jq -s '.[0] + .[1]' examples/training_examples.json examples/diverse_training_examples.json)"
440
+ ```
441
+
442
+ ### Commands Reference
443
+
444
+ **List Available Templates:**
445
+ ```bash
446
+ uv run scripts/dataset_manager.py list_templates
447
+ ```
448
+
449
+ **Quick Setup (Recommended):**
450
+ ```bash
451
+ uv run scripts/dataset_manager.py quick_setup --repo_id "your-username/dataset-name" --template classification
452
+ ```
453
+
454
+ **Manual Setup:**
455
+ ```bash
456
+ # Initialize repository
457
+ uv run scripts/dataset_manager.py init --repo_id "your-username/dataset-name" [--private]
458
+
459
+ # Configure with system prompt
460
+ uv run scripts/dataset_manager.py config --repo_id "your-username/dataset-name" --system_prompt "Your prompt here"
461
+
462
+ # Add data with validation
463
+ uv run scripts/dataset_manager.py add_rows \
464
+ --repo_id "your-username/dataset-name" \
465
+ --template qa \
466
+ --rows_json '[{"question": "What is AI?", "answer": "Artificial Intelligence..."}]'
467
+ ```
468
+
469
+ **View Dataset Statistics:**
470
+ ```bash
471
+ uv run scripts/dataset_manager.py stats --repo_id "your-username/dataset-name"
472
+ ```
473
+
474
+ ### Error Handling
475
+ - **Repository exists**: Script will notify and continue with configuration
476
+ - **Invalid JSON**: Clear error message with parsing details
477
+ - **Network issues**: Automatic retry for transient failures
478
+ - **Token permissions**: Validation before operations begin
479
+
480
+ ---
481
+
482
+ # Combined Workflow Examples
483
+
484
+ ## Example 1: Create Training Subset from Existing Dataset
485
+ ```bash
486
+ # 1. Explore the source dataset
487
+ uv run scripts/sql_manager.py describe --dataset "cais/mmlu"
488
+ uv run scripts/sql_manager.py histogram --dataset "cais/mmlu" --column "subject"
489
+
490
+ # 2. Query and create subset
491
+ uv run scripts/sql_manager.py query \
492
+ --dataset "cais/mmlu" \
493
+ --sql "SELECT * FROM data WHERE subject IN ('nutrition', 'anatomy', 'clinical_knowledge')" \
494
+ --push-to "username/mmlu-medical-subset" \
495
+ --private
496
+ ```
497
+
498
+ ## Example 2: Transform and Reshape Data
499
+ ```bash
500
+ # Transform MMLU to QA format with correct answers extracted
501
+ uv run scripts/sql_manager.py query \
502
+ --dataset "cais/mmlu" \
503
+ --sql "SELECT question, choices[answer + 1] as correct_answer, subject FROM data" \
504
+ --push-to "username/mmlu-qa-format"
505
+ ```
506
+
507
+ ## Example 3: Merge Multiple Dataset Splits
508
+ ```bash
509
+ # Export multiple splits and combine
510
+ uv run scripts/sql_manager.py export \
511
+ --dataset "cais/mmlu" \
512
+ --split "*" \
513
+ --output "mmlu_all.parquet"
514
+ ```
515
+
516
+ ## Example 4: Quality Filtering
517
+ ```bash
518
+ # Filter for high-quality examples
519
+ uv run scripts/sql_manager.py query \
520
+ --dataset "rajpurkar/squad" \
521
+ --sql "SELECT * FROM data WHERE LENGTH(context) > 500 AND LENGTH(question) > 20" \
522
+ --push-to "username/squad-filtered"
523
+ ```
524
+
525
+ ## Example 5: Create Custom Training Dataset
526
+ ```bash
527
+ # 1. Query source data
528
+ uv run scripts/sql_manager.py export \
529
+ --dataset "cais/mmlu" \
530
+ --sql "SELECT question, subject FROM data WHERE subject='nutrition'" \
531
+ --output "nutrition_source.jsonl" \
532
+ --format jsonl
533
+
534
+ # 2. Process with your pipeline (add answers, format, etc.)
535
+
536
+ # 3. Push processed data
537
+ uv run scripts/dataset_manager.py init --repo_id "username/nutrition-training"
538
+ uv run scripts/dataset_manager.py add_rows \
539
+ --repo_id "username/nutrition-training" \
540
+ --template qa \
541
+ --rows_json "$(cat processed_data.json)"
542
+ ```