aigroup-workflow 2.2.1 → 2.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/commands/fix-build.md +10 -5
- package/.claude/commands/init-project.md +13 -8
- package/.claude/commands/plan.md +15 -8
- package/.claude/commands/review.md +12 -6
- package/.claude/commands/tdd.md +11 -5
- package/.claude/commands/workflow-start.md +20 -11
- package/.claude/settings.json +28 -0
- package/.codex/agents/architect.toml +207 -0
- package/.codex/agents/build-error-resolver.toml +110 -0
- package/.codex/agents/code-reviewer.toml +233 -0
- package/.codex/agents/doc-updater.toml +103 -0
- package/.codex/agents/e2e-runner.toml +103 -0
- package/.codex/agents/get-current-datetime.toml +23 -0
- package/.codex/agents/init-architect.toml +181 -0
- package/.codex/agents/planner.toml +208 -0
- package/.codex/agents/refactor-cleaner.toml +81 -0
- package/.codex/agents/rust-reviewer.toml +90 -0
- package/.codex/agents/security-reviewer.toml +104 -0
- package/.codex/agents/tdd-guide.toml +87 -0
- package/AGENTS.md +2 -2
- package/CLAUDE.md +23 -1
- package/LICENSE +20 -20
- package/README.md +333 -333
- package/agents/a11y-architect.md +141 -141
- package/agents/architect.md +211 -211
- package/agents/build-error-resolver.md +114 -114
- package/agents/chief-of-staff.md +151 -151
- package/agents/code-architect.md +71 -71
- package/agents/code-explorer.md +69 -69
- package/agents/code-reviewer.md +237 -237
- package/agents/code-simplifier.md +47 -47
- package/agents/comment-analyzer.md +45 -45
- package/agents/conversation-analyzer.md +52 -52
- package/agents/cpp-build-resolver.md +90 -90
- package/agents/cpp-reviewer.md +72 -72
- package/agents/csharp-reviewer.md +101 -101
- package/agents/dart-build-resolver.md +201 -201
- package/agents/database-reviewer.md +91 -91
- package/agents/doc-updater.md +107 -107
- package/agents/docs-lookup.md +68 -68
- package/agents/e2e-runner.md +107 -107
- package/agents/flutter-reviewer.md +243 -243
- package/agents/gan-evaluator.md +209 -209
- package/agents/gan-generator.md +131 -131
- package/agents/gan-planner.md +99 -99
- package/agents/get-current-datetime.md +26 -26
- package/agents/go-build-resolver.md +94 -94
- package/agents/go-reviewer.md +76 -76
- package/agents/harness-optimizer.md +35 -35
- package/agents/healthcare-reviewer.md +83 -83
- package/agents/java-build-resolver.md +153 -153
- package/agents/java-reviewer.md +92 -92
- package/agents/kotlin-build-resolver.md +118 -118
- package/agents/kotlin-reviewer.md +159 -159
- package/agents/loop-operator.md +36 -36
- package/agents/opensource-forker.md +198 -198
- package/agents/opensource-packager.md +249 -249
- package/agents/opensource-sanitizer.md +188 -188
- package/agents/performance-optimizer.md +446 -446
- package/agents/planner.md +212 -212
- package/agents/pr-test-analyzer.md +45 -45
- package/agents/python-reviewer.md +98 -98
- package/agents/pytorch-build-resolver.md +120 -120
- package/agents/refactor-cleaner.md +85 -85
- package/agents/rust-build-resolver.md +148 -148
- package/agents/rust-reviewer.md +94 -94
- package/agents/security-reviewer.md +108 -108
- package/agents/seo-specialist.md +59 -59
- package/agents/silent-failure-hunter.md +50 -50
- package/agents/tdd-guide.md +91 -91
- package/agents/type-design-analyzer.md +41 -41
- package/agents/typescript-reviewer.md +112 -112
- package/cli/commands/update.mjs +1 -1
- package/cli/utils/scaffold.mjs +53 -0
- package/docs/rules/agents.md +166 -50
- package/docs/rules/cpp/coding-style.md +44 -44
- package/docs/rules/cpp/hooks.md +39 -39
- package/docs/rules/cpp/patterns.md +51 -51
- package/docs/rules/cpp/security.md +51 -51
- package/docs/rules/cpp/testing.md +44 -44
- package/docs/rules/csharp/coding-style.md +72 -72
- package/docs/rules/csharp/hooks.md +25 -25
- package/docs/rules/csharp/patterns.md +50 -50
- package/docs/rules/csharp/security.md +58 -58
- package/docs/rules/csharp/testing.md +46 -46
- package/docs/rules/dart/coding-style.md +159 -159
- package/docs/rules/dart/hooks.md +66 -66
- package/docs/rules/dart/patterns.md +261 -261
- package/docs/rules/dart/security.md +135 -135
- package/docs/rules/dart/testing.md +215 -215
- package/docs/rules/golang/coding-style.md +32 -32
- package/docs/rules/golang/hooks.md +17 -17
- package/docs/rules/golang/patterns.md +45 -45
- package/docs/rules/golang/security.md +34 -34
- package/docs/rules/golang/testing.md +31 -31
- package/docs/rules/java/coding-style.md +114 -114
- package/docs/rules/java/hooks.md +18 -18
- package/docs/rules/java/patterns.md +146 -146
- package/docs/rules/java/security.md +100 -100
- package/docs/rules/java/testing.md +131 -131
- package/docs/rules/kotlin/coding-style.md +86 -86
- package/docs/rules/kotlin/hooks.md +17 -17
- package/docs/rules/kotlin/patterns.md +146 -146
- package/docs/rules/kotlin/security.md +82 -82
- package/docs/rules/kotlin/testing.md +128 -128
- package/docs/rules/perl/coding-style.md +46 -46
- package/docs/rules/perl/hooks.md +22 -22
- package/docs/rules/perl/patterns.md +76 -76
- package/docs/rules/perl/security.md +69 -69
- package/docs/rules/perl/testing.md +54 -54
- package/docs/rules/php/coding-style.md +40 -40
- package/docs/rules/php/hooks.md +24 -24
- package/docs/rules/php/patterns.md +33 -33
- package/docs/rules/php/security.md +37 -37
- package/docs/rules/php/testing.md +39 -39
- package/docs/rules/python/coding-style.md +42 -42
- package/docs/rules/python/hooks.md +19 -19
- package/docs/rules/python/patterns.md +39 -39
- package/docs/rules/python/security.md +30 -30
- package/docs/rules/python/testing.md +38 -38
- package/docs/rules/rust/coding-style.md +151 -151
- package/docs/rules/rust/hooks.md +16 -16
- package/docs/rules/rust/patterns.md +168 -168
- package/docs/rules/rust/security.md +141 -141
- package/docs/rules/rust/testing.md +154 -154
- package/docs/rules/swift/coding-style.md +47 -47
- package/docs/rules/swift/hooks.md +20 -20
- package/docs/rules/swift/patterns.md +66 -66
- package/docs/rules/swift/security.md +33 -33
- package/docs/rules/swift/testing.md +45 -45
- package/docs/rules/typescript/coding-style.md +199 -199
- package/docs/rules/typescript/hooks.md +22 -22
- package/docs/rules/typescript/patterns.md +52 -52
- package/docs/rules/typescript/security.md +28 -28
- package/docs/rules/typescript/testing.md +18 -18
- package/docs/rules/web/coding-style.md +96 -96
- package/docs/rules/web/design-quality.md +62 -62
- package/docs/rules/web/hooks.md +120 -120
- package/docs/rules/web/patterns.md +79 -79
- package/docs/rules/web/performance.md +64 -64
- package/docs/rules/web/security.md +57 -57
- package/docs/rules/web/testing.md +55 -55
- package/docs/templates/README.md +36 -36
- package/docs/templates/ai-project-final.md +124 -124
- package/docs/templates/ai-project.md +105 -105
- package/docs/templates/api.md +157 -157
- package/docs/templates/bug.md +62 -62
- package/docs/templates/code-review.md +87 -87
- package/docs/templates/generic.md +116 -116
- package/docs/templates/implementation-plan.md +1 -1
- package/docs/templates/meeting.md +68 -68
- package/docs/templates/prd.md +98 -98
- package/docs/templates/ui.md +134 -134
- package/docs/workflow-pipeline.md +5 -5
- package/package.json +40 -39
- package/skills/SUPERPOWERS-LICENSE +21 -21
- package/skills/ai-ml/fine-tuning-expert/SKILL.md +162 -162
- package/skills/ai-ml/fine-tuning-expert/references/dataset-preparation.md +540 -540
- package/skills/ai-ml/fine-tuning-expert/references/deployment-optimization.md +673 -673
- package/skills/ai-ml/fine-tuning-expert/references/evaluation-metrics.md +597 -597
- package/skills/ai-ml/fine-tuning-expert/references/hyperparameter-tuning.md +565 -565
- package/skills/ai-ml/fine-tuning-expert/references/lora-peft.md +347 -347
- package/skills/ai-ml/ml-pipeline/SKILL.md +159 -159
- package/skills/ai-ml/ml-pipeline/references/experiment-tracking.md +833 -833
- package/skills/ai-ml/ml-pipeline/references/feature-engineering.md +631 -631
- package/skills/ai-ml/ml-pipeline/references/model-validation.md +978 -978
- package/skills/ai-ml/ml-pipeline/references/pipeline-orchestration.md +907 -907
- package/skills/ai-ml/ml-pipeline/references/training-pipelines.md +782 -782
- package/skills/ai-ml/rag-architect/SKILL.md +194 -194
- package/skills/ai-ml/rag-architect/references/chunking-strategies.md +878 -878
- package/skills/ai-ml/rag-architect/references/embedding-models.md +561 -561
- package/skills/ai-ml/rag-architect/references/rag-evaluation.md +833 -833
- package/skills/ai-ml/rag-architect/references/retrieval-optimization.md +795 -795
- package/skills/ai-ml/rag-architect/references/vector-databases.md +589 -589
- package/skills/ai-ml/spark-engineer/SKILL.md +148 -148
- package/skills/ai-ml/spark-engineer/references/partitioning-caching.md +543 -543
- package/skills/ai-ml/spark-engineer/references/performance-tuning.md +544 -544
- package/skills/ai-ml/spark-engineer/references/rdd-operations.md +599 -599
- package/skills/ai-ml/spark-engineer/references/spark-sql-dataframes.md +474 -474
- package/skills/ai-ml/spark-engineer/references/streaming-patterns.md +786 -786
- package/skills/backend/api-designer/SKILL.md +217 -217
- package/skills/backend/api-designer/references/error-handling.md +541 -541
- package/skills/backend/api-designer/references/openapi.md +824 -824
- package/skills/backend/api-designer/references/pagination.md +494 -494
- package/skills/backend/api-designer/references/rest-patterns.md +335 -335
- package/skills/backend/api-designer/references/versioning.md +391 -391
- package/skills/backend/architecture-designer/SKILL.md +117 -117
- package/skills/backend/architecture-designer/references/adr-template.md +116 -116
- package/skills/backend/architecture-designer/references/architecture-patterns.md +111 -111
- package/skills/backend/architecture-designer/references/database-selection.md +102 -102
- package/skills/backend/architecture-designer/references/nfr-checklist.md +112 -112
- package/skills/backend/architecture-designer/references/system-design.md +100 -100
- package/skills/backend/code-documenter/SKILL.md +147 -147
- package/skills/backend/code-documenter/references/api-docs-fastapi-django.md +166 -166
- package/skills/backend/code-documenter/references/api-docs-nestjs-express.md +220 -220
- package/skills/backend/code-documenter/references/coverage-reports.md +125 -125
- package/skills/backend/code-documenter/references/documentation-systems.md +333 -333
- package/skills/backend/code-documenter/references/interactive-api-docs.md +531 -531
- package/skills/backend/code-documenter/references/python-docstrings.md +121 -121
- package/skills/backend/code-documenter/references/typescript-jsdoc.md +145 -145
- package/skills/backend/code-documenter/references/user-guides-tutorials.md +530 -530
- package/skills/backend/debugging-wizard/SKILL.md +105 -105
- package/skills/backend/debugging-wizard/references/common-patterns.md +132 -132
- package/skills/backend/debugging-wizard/references/debugging-tools.md +140 -140
- package/skills/backend/debugging-wizard/references/quick-fixes.md +177 -177
- package/skills/backend/debugging-wizard/references/strategies.md +142 -142
- package/skills/backend/debugging-wizard/references/systematic-debugging.md +367 -367
- package/skills/backend/feature-forge/SKILL.md +98 -98
- package/skills/backend/feature-forge/references/acceptance-criteria.md +104 -104
- package/skills/backend/feature-forge/references/ears-syntax.md +99 -99
- package/skills/backend/feature-forge/references/interview-questions.md +150 -150
- package/skills/backend/feature-forge/references/pre-discovery-subagents.md +54 -54
- package/skills/backend/feature-forge/references/specification-template.md +103 -103
- package/skills/backend/fullstack-guardian/SKILL.md +105 -105
- package/skills/backend/fullstack-guardian/references/api-design-standards.md +307 -307
- package/skills/backend/fullstack-guardian/references/architecture-decisions.md +350 -350
- package/skills/backend/fullstack-guardian/references/backend-patterns.md +237 -237
- package/skills/backend/fullstack-guardian/references/common-patterns.md +134 -134
- package/skills/backend/fullstack-guardian/references/deliverables-checklist.md +354 -354
- package/skills/backend/fullstack-guardian/references/design-template.md +91 -91
- package/skills/backend/fullstack-guardian/references/error-handling.md +135 -135
- package/skills/backend/fullstack-guardian/references/frontend-patterns.md +340 -340
- package/skills/backend/fullstack-guardian/references/integration-patterns.md +333 -333
- package/skills/backend/fullstack-guardian/references/security-checklist.md +106 -106
- package/skills/backend/graphql-architect/SKILL.md +146 -146
- package/skills/backend/graphql-architect/references/federation.md +418 -418
- package/skills/backend/graphql-architect/references/migration-from-rest.md +1141 -1141
- package/skills/backend/graphql-architect/references/resolvers.md +425 -425
- package/skills/backend/graphql-architect/references/schema-design.md +393 -393
- package/skills/backend/graphql-architect/references/security.md +569 -569
- package/skills/backend/graphql-architect/references/subscriptions.md +510 -510
- package/skills/backend/legacy-modernizer/SKILL.md +137 -137
- package/skills/backend/legacy-modernizer/references/legacy-testing.md +381 -381
- package/skills/backend/legacy-modernizer/references/migration-strategies.md +423 -423
- package/skills/backend/legacy-modernizer/references/refactoring-patterns.md +395 -395
- package/skills/backend/legacy-modernizer/references/strangler-fig-pattern.md +281 -281
- package/skills/backend/legacy-modernizer/references/system-assessment.md +487 -487
- package/skills/backend/microservices-architect/SKILL.md +164 -164
- package/skills/backend/microservices-architect/references/communication.md +499 -499
- package/skills/backend/microservices-architect/references/data.md +721 -721
- package/skills/backend/microservices-architect/references/decomposition.md +344 -344
- package/skills/backend/microservices-architect/references/observability.md +805 -805
- package/skills/backend/microservices-architect/references/patterns.md +603 -603
- package/skills/database/database-optimizer/SKILL.md +147 -147
- package/skills/database/database-optimizer/references/index-strategies.md +331 -331
- package/skills/database/database-optimizer/references/monitoring-analysis.md +501 -501
- package/skills/database/database-optimizer/references/mysql-tuning.md +452 -452
- package/skills/database/database-optimizer/references/postgresql-tuning.md +413 -413
- package/skills/database/database-optimizer/references/query-optimization.md +251 -251
- package/skills/database/postgres-pro/SKILL.md +152 -152
- package/skills/database/postgres-pro/references/extensions.md +404 -404
- package/skills/database/postgres-pro/references/jsonb.md +321 -321
- package/skills/database/postgres-pro/references/maintenance.md +481 -481
- package/skills/database/postgres-pro/references/performance.md +265 -265
- package/skills/database/postgres-pro/references/replication.md +446 -446
- package/skills/database/sql-pro/SKILL.md +129 -129
- package/skills/database/sql-pro/references/database-design.md +402 -402
- package/skills/database/sql-pro/references/dialect-differences.md +419 -419
- package/skills/database/sql-pro/references/optimization.md +384 -384
- package/skills/database/sql-pro/references/query-patterns.md +285 -285
- package/skills/database/sql-pro/references/window-functions.md +328 -328
- package/skills/dotnet/csharp-developer/SKILL.md +125 -125
- package/skills/dotnet/csharp-developer/references/aspnet-core.md +394 -394
- package/skills/dotnet/csharp-developer/references/blazor.md +553 -553
- package/skills/dotnet/csharp-developer/references/entity-framework.md +409 -409
- package/skills/dotnet/csharp-developer/references/modern-csharp.md +248 -248
- package/skills/dotnet/csharp-developer/references/performance.md +498 -498
- package/skills/dotnet/dotnet-core-expert/SKILL.md +138 -138
- package/skills/dotnet/dotnet-core-expert/references/authentication.md +546 -546
- package/skills/dotnet/dotnet-core-expert/references/clean-architecture.md +455 -455
- package/skills/dotnet/dotnet-core-expert/references/cloud-native.md +548 -548
- package/skills/dotnet/dotnet-core-expert/references/entity-framework.md +440 -440
- package/skills/dotnet/dotnet-core-expert/references/minimal-apis.md +319 -319
- package/skills/frontend/angular-architect/SKILL.md +152 -152
- package/skills/frontend/angular-architect/references/components.md +297 -297
- package/skills/frontend/angular-architect/references/ngrx.md +401 -401
- package/skills/frontend/angular-architect/references/routing.md +361 -361
- package/skills/frontend/angular-architect/references/rxjs.md +319 -319
- package/skills/frontend/angular-architect/references/testing.md +405 -405
- package/skills/frontend/design-commands/design.md +91 -91
- package/skills/frontend/design-commands/handoff.md +97 -97
- package/skills/frontend/design-commands/prototype.md +120 -120
- package/skills/frontend/design-commands/spec.md +160 -160
- package/skills/frontend/design-commands/style.md +78 -78
- package/skills/frontend/flutter-expert/SKILL.md +138 -138
- package/skills/frontend/flutter-expert/references/bloc-state.md +259 -259
- package/skills/frontend/flutter-expert/references/gorouter-navigation.md +119 -119
- package/skills/frontend/flutter-expert/references/performance.md +99 -99
- package/skills/frontend/flutter-expert/references/project-structure.md +118 -118
- package/skills/frontend/flutter-expert/references/riverpod-state.md +130 -130
- package/skills/frontend/flutter-expert/references/widget-patterns.md +123 -123
- package/skills/frontend/nextjs-developer/SKILL.md +143 -143
- package/skills/frontend/nextjs-developer/references/app-router.md +311 -311
- package/skills/frontend/nextjs-developer/references/data-fetching.md +482 -482
- package/skills/frontend/nextjs-developer/references/deployment.md +545 -545
- package/skills/frontend/nextjs-developer/references/server-actions.md +462 -462
- package/skills/frontend/nextjs-developer/references/server-components.md +384 -384
- package/skills/frontend/react-expert/SKILL.md +149 -149
- package/skills/frontend/react-expert/references/hooks-patterns.md +162 -162
- package/skills/frontend/react-expert/references/migration-class-to-modern.md +1119 -1119
- package/skills/frontend/react-expert/references/performance.md +168 -168
- package/skills/frontend/react-expert/references/react-19-features.md +174 -174
- package/skills/frontend/react-expert/references/server-components.md +143 -143
- package/skills/frontend/react-expert/references/state-management.md +171 -171
- package/skills/frontend/react-expert/references/testing-react.md +174 -174
- package/skills/frontend/react-native-expert/SKILL.md +185 -185
- package/skills/frontend/react-native-expert/references/expo-router.md +187 -187
- package/skills/frontend/react-native-expert/references/list-optimization.md +204 -204
- package/skills/frontend/react-native-expert/references/platform-handling.md +188 -188
- package/skills/frontend/react-native-expert/references/project-structure.md +171 -171
- package/skills/frontend/react-native-expert/references/storage-hooks.md +173 -173
- package/skills/frontend/senior-frontend/SKILL.md +477 -477
- package/skills/frontend/senior-frontend/references/frontend_best_practices.md +806 -806
- package/skills/frontend/senior-frontend/references/nextjs_optimization_guide.md +724 -724
- package/skills/frontend/senior-frontend/references/react_patterns.md +746 -746
- package/skills/frontend/senior-frontend/scripts/bundle_analyzer.py +407 -407
- package/skills/frontend/senior-frontend/scripts/component_generator.py +329 -329
- package/skills/frontend/senior-frontend/scripts/frontend_scaffolder.py +1005 -1005
- package/skills/frontend/ui-ux-pro-max/SKILL.md +386 -386
- package/skills/frontend/ui-ux-pro-max/data/charts.csv +26 -26
- package/skills/frontend/ui-ux-pro-max/data/colors.csv +97 -97
- package/skills/frontend/ui-ux-pro-max/data/icons.csv +101 -101
- package/skills/frontend/ui-ux-pro-max/data/landing.csv +31 -31
- package/skills/frontend/ui-ux-pro-max/data/products.csv +96 -96
- package/skills/frontend/ui-ux-pro-max/data/react-performance.csv +45 -45
- package/skills/frontend/ui-ux-pro-max/data/stacks/astro.csv +54 -54
- package/skills/frontend/ui-ux-pro-max/data/stacks/flutter.csv +53 -53
- package/skills/frontend/ui-ux-pro-max/data/stacks/html-tailwind.csv +56 -56
- package/skills/frontend/ui-ux-pro-max/data/stacks/jetpack-compose.csv +53 -53
- package/skills/frontend/ui-ux-pro-max/data/stacks/nextjs.csv +53 -53
- package/skills/frontend/ui-ux-pro-max/data/stacks/nuxt-ui.csv +51 -51
- package/skills/frontend/ui-ux-pro-max/data/stacks/nuxtjs.csv +59 -59
- package/skills/frontend/ui-ux-pro-max/data/stacks/react-native.csv +52 -52
- package/skills/frontend/ui-ux-pro-max/data/stacks/react.csv +54 -54
- package/skills/frontend/ui-ux-pro-max/data/stacks/shadcn.csv +61 -61
- package/skills/frontend/ui-ux-pro-max/data/stacks/svelte.csv +54 -54
- package/skills/frontend/ui-ux-pro-max/data/stacks/swiftui.csv +51 -51
- package/skills/frontend/ui-ux-pro-max/data/stacks/vue.csv +50 -50
- package/skills/frontend/ui-ux-pro-max/data/styles.csv +68 -68
- package/skills/frontend/ui-ux-pro-max/data/typography.csv +57 -57
- package/skills/frontend/ui-ux-pro-max/data/ui-reasoning.csv +101 -101
- package/skills/frontend/ui-ux-pro-max/data/ux-guidelines.csv +99 -99
- package/skills/frontend/ui-ux-pro-max/data/web-interface.csv +31 -31
- package/skills/frontend/ui-ux-pro-max/scripts/core.py +253 -253
- package/skills/frontend/ui-ux-pro-max/scripts/design_system.py +1067 -1067
- package/skills/frontend/ui-ux-pro-max/scripts/search.py +114 -114
- package/skills/frontend/vue-expert/SKILL.md +98 -98
- package/skills/frontend/vue-expert/references/build-tooling.md +480 -480
- package/skills/frontend/vue-expert/references/components.md +448 -448
- package/skills/frontend/vue-expert/references/composition-api.md +299 -299
- package/skills/frontend/vue-expert/references/mobile-hybrid.md +636 -636
- package/skills/frontend/vue-expert/references/nuxt.md +669 -669
- package/skills/frontend/vue-expert/references/state-management.md +449 -449
- package/skills/frontend/vue-expert/references/typescript.md +584 -584
- package/skills/frontend/vue-expert-js/SKILL.md +167 -167
- package/skills/frontend/vue-expert-js/references/component-architecture.md +219 -219
- package/skills/frontend/vue-expert-js/references/composables-patterns.md +183 -183
- package/skills/frontend/vue-expert-js/references/jsdoc-typing.md +535 -535
- package/skills/frontend/vue-expert-js/references/state-management.md +249 -249
- package/skills/frontend/vue-expert-js/references/testing-patterns.md +237 -237
- package/skills/go-rust-cpp/cpp-pro/SKILL.md +115 -115
- package/skills/go-rust-cpp/cpp-pro/references/build-tooling.md +440 -440
- package/skills/go-rust-cpp/cpp-pro/references/concurrency.md +437 -437
- package/skills/go-rust-cpp/cpp-pro/references/memory-performance.md +397 -397
- package/skills/go-rust-cpp/cpp-pro/references/modern-cpp.md +304 -304
- package/skills/go-rust-cpp/cpp-pro/references/templates.md +357 -357
- package/skills/go-rust-cpp/golang-pro/SKILL.md +122 -122
- package/skills/go-rust-cpp/golang-pro/references/concurrency.md +329 -329
- package/skills/go-rust-cpp/golang-pro/references/generics.md +442 -442
- package/skills/go-rust-cpp/golang-pro/references/interfaces.md +432 -432
- package/skills/go-rust-cpp/golang-pro/references/project-structure.md +477 -477
- package/skills/go-rust-cpp/golang-pro/references/testing.md +451 -451
- package/skills/go-rust-cpp/rust-engineer/SKILL.md +167 -167
- package/skills/go-rust-cpp/rust-engineer/references/async.md +458 -458
- package/skills/go-rust-cpp/rust-engineer/references/error-handling.md +334 -334
- package/skills/go-rust-cpp/rust-engineer/references/ownership.md +278 -278
- package/skills/go-rust-cpp/rust-engineer/references/testing.md +470 -470
- package/skills/go-rust-cpp/rust-engineer/references/traits.md +413 -413
- package/skills/infra/cli-developer/SKILL.md +113 -113
- package/skills/infra/cli-developer/references/design-patterns.md +221 -221
- package/skills/infra/cli-developer/references/go-cli.md +540 -540
- package/skills/infra/cli-developer/references/node-cli.md +383 -383
- package/skills/infra/cli-developer/references/python-cli.md +422 -422
- package/skills/infra/cli-developer/references/ux-patterns.md +448 -448
- package/skills/infra/cloud-architect/SKILL.md +216 -216
- package/skills/infra/cloud-architect/references/aws.md +394 -394
- package/skills/infra/cloud-architect/references/azure.md +562 -562
- package/skills/infra/cloud-architect/references/cost.md +582 -582
- package/skills/infra/cloud-architect/references/gcp.md +633 -633
- package/skills/infra/cloud-architect/references/multi-cloud.md +483 -483
- package/skills/infra/devops-engineer/SKILL.md +144 -144
- package/skills/infra/devops-engineer/references/deployment-strategies.md +241 -241
- package/skills/infra/devops-engineer/references/docker-patterns.md +113 -113
- package/skills/infra/devops-engineer/references/github-actions.md +139 -139
- package/skills/infra/devops-engineer/references/incident-response.md +331 -331
- package/skills/infra/devops-engineer/references/kubernetes.md +154 -154
- package/skills/infra/devops-engineer/references/platform-engineering.md +417 -417
- package/skills/infra/devops-engineer/references/release-automation.md +527 -527
- package/skills/infra/devops-engineer/references/terraform-iac.md +141 -141
- package/skills/infra/kubernetes-specialist/SKILL.md +241 -241
- package/skills/infra/kubernetes-specialist/references/configuration.md +452 -452
- package/skills/infra/kubernetes-specialist/references/cost-optimization.md +458 -458
- package/skills/infra/kubernetes-specialist/references/custom-operators.md +563 -563
- package/skills/infra/kubernetes-specialist/references/gitops.md +530 -530
- package/skills/infra/kubernetes-specialist/references/helm-charts.md +912 -912
- package/skills/infra/kubernetes-specialist/references/multi-cluster.md +507 -507
- package/skills/infra/kubernetes-specialist/references/networking.md +447 -447
- package/skills/infra/kubernetes-specialist/references/service-mesh.md +459 -459
- package/skills/infra/kubernetes-specialist/references/storage.md +535 -535
- package/skills/infra/kubernetes-specialist/references/troubleshooting.md +414 -414
- package/skills/infra/kubernetes-specialist/references/workloads.md +377 -377
- package/skills/infra/mcp-developer/SKILL.md +143 -143
- package/skills/infra/mcp-developer/references/protocol.md +244 -244
- package/skills/infra/mcp-developer/references/python-sdk.md +367 -367
- package/skills/infra/mcp-developer/references/resources.md +554 -554
- package/skills/infra/mcp-developer/references/tools.md +480 -480
- package/skills/infra/mcp-developer/references/typescript-sdk.md +350 -350
- package/skills/infra/monitoring-expert/SKILL.md +176 -176
- package/skills/infra/monitoring-expert/references/alerting-rules.md +141 -141
- package/skills/infra/monitoring-expert/references/application-profiling.md +331 -331
- package/skills/infra/monitoring-expert/references/capacity-planning.md +344 -344
- package/skills/infra/monitoring-expert/references/dashboards.md +126 -126
- package/skills/infra/monitoring-expert/references/opentelemetry.md +123 -123
- package/skills/infra/monitoring-expert/references/performance-testing.md +269 -269
- package/skills/infra/monitoring-expert/references/prometheus-metrics.md +136 -136
- package/skills/infra/monitoring-expert/references/structured-logging.md +142 -142
- package/skills/infra/sre-engineer/SKILL.md +181 -181
- package/skills/infra/sre-engineer/references/automation-toil.md +492 -492
- package/skills/infra/sre-engineer/references/error-budget-policy.md +334 -334
- package/skills/infra/sre-engineer/references/incident-chaos.md +576 -576
- package/skills/infra/sre-engineer/references/monitoring-alerting.md +424 -424
- package/skills/infra/sre-engineer/references/slo-sli-management.md +238 -238
- package/skills/infra/terraform-engineer/SKILL.md +143 -143
- package/skills/infra/terraform-engineer/references/best-practices.md +583 -583
- package/skills/infra/terraform-engineer/references/module-patterns.md +297 -297
- package/skills/infra/terraform-engineer/references/providers.md +452 -452
- package/skills/infra/terraform-engineer/references/state-management.md +371 -371
- package/skills/infra/terraform-engineer/references/testing.md +486 -486
- package/skills/infra/websocket-engineer/SKILL.md +168 -168
- package/skills/infra/websocket-engineer/references/alternatives.md +391 -391
- package/skills/infra/websocket-engineer/references/patterns.md +400 -400
- package/skills/infra/websocket-engineer/references/protocol.md +195 -195
- package/skills/infra/websocket-engineer/references/scaling.md +333 -333
- package/skills/infra/websocket-engineer/references/security.md +474 -474
- package/skills/java/java-architect/SKILL.md +132 -132
- package/skills/java/java-architect/references/jpa-optimization.md +393 -393
- package/skills/java/java-architect/references/reactive-webflux.md +356 -356
- package/skills/java/java-architect/references/spring-boot-setup.md +269 -269
- package/skills/java/java-architect/references/spring-security.md +445 -445
- package/skills/java/java-architect/references/testing-patterns.md +500 -500
- package/skills/java/kotlin-specialist/SKILL.md +147 -147
- package/skills/java/kotlin-specialist/references/android-compose.md +419 -419
- package/skills/java/kotlin-specialist/references/coroutines-flow.md +276 -276
- package/skills/java/kotlin-specialist/references/dsl-idioms.md +421 -421
- package/skills/java/kotlin-specialist/references/ktor-server.md +426 -426
- package/skills/java/kotlin-specialist/references/multiplatform-kmp.md +380 -380
- package/skills/java/spring-boot-engineer/SKILL.md +195 -195
- package/skills/java/spring-boot-engineer/references/cloud.md +498 -498
- package/skills/java/spring-boot-engineer/references/data.md +381 -381
- package/skills/java/spring-boot-engineer/references/security.md +459 -459
- package/skills/java/spring-boot-engineer/references/testing.md +545 -545
- package/skills/java/spring-boot-engineer/references/web.md +295 -295
- package/skills/javascript/javascript-pro/SKILL.md +132 -132
- package/skills/javascript/javascript-pro/references/async-patterns.md +334 -334
- package/skills/javascript/javascript-pro/references/browser-apis.md +398 -398
- package/skills/javascript/javascript-pro/references/modern-syntax.md +272 -272
- package/skills/javascript/javascript-pro/references/modules.md +357 -357
- package/skills/javascript/javascript-pro/references/node-essentials.md +471 -471
- package/skills/javascript/nestjs-expert/SKILL.md +206 -206
- package/skills/javascript/nestjs-expert/references/authentication.md +166 -166
- package/skills/javascript/nestjs-expert/references/controllers-routing.md +111 -111
- package/skills/javascript/nestjs-expert/references/dtos-validation.md +153 -153
- package/skills/javascript/nestjs-expert/references/migration-from-express.md +1237 -1237
- package/skills/javascript/nestjs-expert/references/services-di.md +140 -140
- package/skills/javascript/nestjs-expert/references/testing-patterns.md +186 -186
- package/skills/javascript/typescript-pro/SKILL.md +145 -145
- package/skills/javascript/typescript-pro/references/advanced-types.md +259 -259
- package/skills/javascript/typescript-pro/references/configuration.md +445 -445
- package/skills/javascript/typescript-pro/references/patterns.md +484 -484
- package/skills/javascript/typescript-pro/references/type-guards.md +352 -352
- package/skills/javascript/typescript-pro/references/utility-types.md +329 -329
- package/skills/php/laravel-specialist/SKILL.md +262 -262
- package/skills/php/laravel-specialist/references/eloquent.md +351 -351
- package/skills/php/laravel-specialist/references/livewire.md +512 -512
- package/skills/php/laravel-specialist/references/queues.md +423 -423
- package/skills/php/laravel-specialist/references/routing.md +362 -362
- package/skills/php/laravel-specialist/references/testing.md +522 -522
- package/skills/php/php-pro/SKILL.md +206 -206
- package/skills/php/php-pro/references/async-patterns.md +412 -412
- package/skills/php/php-pro/references/laravel-patterns.md +377 -377
- package/skills/php/php-pro/references/modern-php-features.md +323 -323
- package/skills/php/php-pro/references/symfony-patterns.md +466 -466
- package/skills/php/php-pro/references/testing-quality.md +466 -466
- package/skills/product/competitive-analysis/SKILL.md +257 -257
- package/skills/product/meeting-notes/SKILL.md +266 -266
- package/skills/product/prd-template/SKILL.md +150 -150
- package/skills/product/stakeholder-update/SKILL.md +225 -225
- package/skills/product/user-research-synthesis/SKILL.md +235 -235
- package/skills/python/django-expert/SKILL.md +162 -162
- package/skills/python/django-expert/references/authentication.md +145 -145
- package/skills/python/django-expert/references/drf-serializers.md +148 -148
- package/skills/python/django-expert/references/models-orm.md +151 -151
- package/skills/python/django-expert/references/testing-django.md +204 -204
- package/skills/python/django-expert/references/viewsets-views.md +153 -153
- package/skills/python/fastapi-expert/SKILL.md +185 -185
- package/skills/python/fastapi-expert/references/async-sqlalchemy.md +146 -146
- package/skills/python/fastapi-expert/references/authentication.md +159 -159
- package/skills/python/fastapi-expert/references/endpoints-routing.md +142 -142
- package/skills/python/fastapi-expert/references/migration-from-django.md +996 -996
- package/skills/python/fastapi-expert/references/pydantic-v2.md +135 -135
- package/skills/python/fastapi-expert/references/testing-async.md +159 -159
- package/skills/python/pandas-pro/SKILL.md +178 -178
- package/skills/python/pandas-pro/references/aggregation-groupby.md +545 -545
- package/skills/python/pandas-pro/references/data-cleaning.md +500 -500
- package/skills/python/pandas-pro/references/dataframe-operations.md +420 -420
- package/skills/python/pandas-pro/references/merging-joining.md +596 -596
- package/skills/python/pandas-pro/references/performance-optimization.md +597 -597
- package/skills/python/python-pro/SKILL.md +177 -177
- package/skills/python/python-pro/references/async-patterns.md +356 -356
- package/skills/python/python-pro/references/packaging.md +460 -460
- package/skills/python/python-pro/references/standard-library.md +378 -378
- package/skills/python/python-pro/references/testing.md +404 -404
- package/skills/python/python-pro/references/type-system.md +290 -290
- package/skills/quality/chaos-engineer/SKILL.md +182 -182
- package/skills/quality/chaos-engineer/references/chaos-tools.md +511 -511
- package/skills/quality/chaos-engineer/references/experiment-design.md +229 -229
- package/skills/quality/chaos-engineer/references/game-days.md +434 -434
- package/skills/quality/chaos-engineer/references/infrastructure-chaos.md +348 -348
- package/skills/quality/chaos-engineer/references/kubernetes-chaos.md +432 -432
- package/skills/quality/code-reviewer/SKILL.md +119 -119
- package/skills/quality/code-reviewer/references/common-issues.md +142 -142
- package/skills/quality/code-reviewer/references/feedback-examples.md +144 -144
- package/skills/quality/code-reviewer/references/receiving-feedback.md +238 -238
- package/skills/quality/code-reviewer/references/report-template.md +109 -109
- package/skills/quality/code-reviewer/references/review-checklist.md +88 -88
- package/skills/quality/code-reviewer/references/spec-compliance-review.md +258 -258
- package/skills/quality/playwright-expert/SKILL.md +169 -169
- package/skills/quality/playwright-expert/references/api-mocking.md +140 -140
- package/skills/quality/playwright-expert/references/configuration.md +155 -155
- package/skills/quality/playwright-expert/references/debugging-flaky.md +150 -150
- package/skills/quality/playwright-expert/references/page-object-model.md +152 -152
- package/skills/quality/playwright-expert/references/selectors-locators.md +119 -119
- package/skills/quality/secure-code-guardian/SKILL.md +191 -191
- package/skills/quality/secure-code-guardian/references/authentication.md +136 -136
- package/skills/quality/secure-code-guardian/references/input-validation.md +146 -146
- package/skills/quality/secure-code-guardian/references/owasp-prevention.md +135 -135
- package/skills/quality/secure-code-guardian/references/security-headers.md +133 -133
- package/skills/quality/secure-code-guardian/references/xss-csrf.md +157 -157
- package/skills/quality/security-reviewer/SKILL.md +103 -103
- package/skills/quality/security-reviewer/references/infrastructure-security.md +268 -268
- package/skills/quality/security-reviewer/references/penetration-testing.md +268 -268
- package/skills/quality/security-reviewer/references/report-template.md +170 -170
- package/skills/quality/security-reviewer/references/sast-tools.md +117 -117
- package/skills/quality/security-reviewer/references/secret-scanning.md +125 -125
- package/skills/quality/security-reviewer/references/vulnerability-patterns.md +152 -152
- package/skills/quality/senior-qa/README.md +196 -196
- package/skills/quality/senior-qa/SKILL.md +399 -399
- package/skills/quality/senior-qa/references/qa_best_practices.md +964 -964
- package/skills/quality/senior-qa/references/test_automation_patterns.md +1009 -1009
- package/skills/quality/senior-qa/references/testing_strategies.md +649 -649
- package/skills/quality/senior-qa/scripts/coverage_analyzer.py +836 -836
- package/skills/quality/senior-qa/scripts/e2e_test_scaffolder.py +820 -820
- package/skills/quality/senior-qa/scripts/test_suite_generator.py +605 -605
- package/skills/quality/tdd-guide/HOW_TO_USE.md +313 -313
- package/skills/quality/tdd-guide/README.md +680 -680
- package/skills/quality/tdd-guide/SKILL.md +122 -122
- package/skills/quality/tdd-guide/assets/expected_output.json +77 -77
- package/skills/quality/tdd-guide/assets/sample_input_python.json +39 -39
- package/skills/quality/tdd-guide/assets/sample_input_typescript.json +36 -36
- package/skills/quality/tdd-guide/references/ci-integration.md +195 -195
- package/skills/quality/tdd-guide/references/framework-guide.md +206 -206
- package/skills/quality/tdd-guide/references/tdd-best-practices.md +128 -128
- package/skills/quality/tdd-guide/scripts/coverage_analyzer.py +434 -434
- package/skills/quality/tdd-guide/scripts/fixture_generator.py +440 -440
- package/skills/quality/tdd-guide/scripts/format_detector.py +384 -384
- package/skills/quality/tdd-guide/scripts/framework_adapter.py +428 -428
- package/skills/quality/tdd-guide/scripts/metrics_calculator.py +456 -456
- package/skills/quality/tdd-guide/scripts/output_formatter.py +354 -354
- package/skills/quality/tdd-guide/scripts/tdd_workflow.py +474 -474
- package/skills/quality/tdd-guide/scripts/test_generator.py +438 -438
- package/skills/quality/test-master/SKILL.md +94 -94
- package/skills/quality/test-master/references/automation-frameworks.md +294 -294
- package/skills/quality/test-master/references/e2e-testing.md +128 -128
- package/skills/quality/test-master/references/integration-testing.md +120 -120
- package/skills/quality/test-master/references/performance-testing.md +118 -118
- package/skills/quality/test-master/references/qa-methodology.md +247 -247
- package/skills/quality/test-master/references/security-testing.md +127 -127
- package/skills/quality/test-master/references/tdd-iron-laws.md +174 -174
- package/skills/quality/test-master/references/test-reports.md +104 -104
- package/skills/quality/test-master/references/testing-anti-patterns.md +231 -231
- package/skills/quality/test-master/references/unit-testing.md +113 -113
- package/skills/ruby/rails-expert/SKILL.md +154 -154
- package/skills/ruby/rails-expert/references/active-record.md +244 -244
- package/skills/ruby/rails-expert/references/api-development.md +401 -401
- package/skills/ruby/rails-expert/references/background-jobs.md +272 -272
- package/skills/ruby/rails-expert/references/hotwire-turbo.md +228 -228
- package/skills/ruby/rails-expert/references/rspec-testing.md +367 -367
- package/skills/swift/swift-expert/SKILL.md +163 -163
- package/skills/swift/swift-expert/references/async-concurrency.md +360 -360
- package/skills/swift/swift-expert/references/memory-performance.md +377 -377
- package/skills/swift/swift-expert/references/protocol-oriented.md +354 -354
- package/skills/swift/swift-expert/references/swiftui-patterns.md +291 -291
- package/skills/swift/swift-expert/references/testing-patterns.md +399 -399
- package/skills/workflow/brainstorming/SKILL.md +164 -164
- package/skills/workflow/brainstorming/scripts/frame-template.html +214 -214
- package/skills/workflow/brainstorming/scripts/helper.js +88 -88
- package/skills/workflow/brainstorming/scripts/server.cjs +354 -354
- package/skills/workflow/brainstorming/scripts/start-server.sh +148 -148
- package/skills/workflow/brainstorming/scripts/stop-server.sh +56 -56
- package/skills/workflow/brainstorming/spec-document-reviewer-prompt.md +49 -49
- package/skills/workflow/brainstorming/visual-companion.md +287 -287
- package/skills/workflow/documentation/SKILL.md +45 -45
- package/skills/workflow/entropy-management/SKILL.md +115 -115
- package/skills/workflow/executing-plans/SKILL.md +70 -70
- package/skills/workflow/finishing-a-development-branch/SKILL.md +200 -200
- package/skills/workflow/receiving-code-review/SKILL.md +213 -213
- package/skills/workflow/requesting-code-review/SKILL.md +105 -105
- package/skills/workflow/requesting-code-review/code-reviewer.md +146 -146
- package/skills/workflow/requirement-engineering/SKILL.md +111 -111
- package/skills/workflow/systematic-debugging/CREATION-LOG.md +119 -119
- package/skills/workflow/systematic-debugging/SKILL.md +296 -296
- package/skills/workflow/systematic-debugging/condition-based-waiting-example.ts +158 -158
- package/skills/workflow/systematic-debugging/condition-based-waiting.md +115 -115
- package/skills/workflow/systematic-debugging/defense-in-depth.md +122 -122
- package/skills/workflow/systematic-debugging/find-polluter.sh +63 -63
- package/skills/workflow/systematic-debugging/root-cause-tracing.md +169 -169
- package/skills/workflow/systematic-debugging/test-academic.md +14 -14
- package/skills/workflow/systematic-debugging/test-pressure-1.md +58 -58
- package/skills/workflow/systematic-debugging/test-pressure-2.md +68 -68
- package/skills/workflow/systematic-debugging/test-pressure-3.md +69 -69
- package/skills/workflow/using-git-worktrees/SKILL.md +218 -218
- package/skills/workflow/verification-before-completion/SKILL.md +139 -139
- package/skills/workflow/writing-plans/SKILL.md +151 -151
- package/skills/workflow/writing-plans/plan-document-reviewer-prompt.md +49 -49
- package/skills/workflow/writing-skills/SKILL.md +655 -655
- package/skills/workflow/writing-skills/anthropic-best-practices.md +1150 -1150
- package/skills/workflow/writing-skills/examples/CLAUDE_MD_TESTING.md +189 -189
- package/skills/workflow/writing-skills/persuasion-principles.md +187 -187
- package/skills/workflow/writing-skills/render-graphs.js +168 -168
- package/skills/workflow/writing-skills/testing-skills-with-subagents.md +384 -384
|
@@ -1,540 +1,540 @@
|
|
|
1
|
-
# Dataset Preparation for Fine-Tuning
|
|
2
|
-
|
|
3
|
-
---
|
|
4
|
-
|
|
5
|
-
## Overview
|
|
6
|
-
|
|
7
|
-
Dataset quality is the most important factor in fine-tuning success. This reference covers data formatting, validation, cleaning, and best practices for creating high-quality training data.
|
|
8
|
-
|
|
9
|
-
## Dataset Formats
|
|
10
|
-
|
|
11
|
-
### Alpaca Format (Instruction-Response)
|
|
12
|
-
|
|
13
|
-
```python
|
|
14
|
-
# Single-turn instruction format
|
|
15
|
-
alpaca_example = {
|
|
16
|
-
"instruction": "Summarize the following article in 2-3 sentences.",
|
|
17
|
-
"input": "The article text goes here...",
|
|
18
|
-
"output": "The summary goes here."
|
|
19
|
-
}
|
|
20
|
-
|
|
21
|
-
# Without input field
|
|
22
|
-
alpaca_no_input = {
|
|
23
|
-
"instruction": "What are the three primary colors?",
|
|
24
|
-
"input": "",
|
|
25
|
-
"output": "The three primary colors are red, blue, and yellow."
|
|
26
|
-
}
|
|
27
|
-
```
|
|
28
|
-
|
|
29
|
-
### ShareGPT Format (Multi-Turn Conversations)
|
|
30
|
-
|
|
31
|
-
```python
|
|
32
|
-
# Multi-turn conversation format
|
|
33
|
-
sharegpt_example = {
|
|
34
|
-
"conversations": [
|
|
35
|
-
{"from": "human", "value": "What is machine learning?"},
|
|
36
|
-
{"from": "gpt", "value": "Machine learning is a subset of AI that enables..."},
|
|
37
|
-
{"from": "human", "value": "Can you give me an example?"},
|
|
38
|
-
{"from": "gpt", "value": "A common example is email spam filtering..."}
|
|
39
|
-
]
|
|
40
|
-
}
|
|
41
|
-
|
|
42
|
-
# Alternative format with roles
|
|
43
|
-
openai_format = {
|
|
44
|
-
"messages": [
|
|
45
|
-
{"role": "system", "content": "You are a helpful assistant."},
|
|
46
|
-
{"role": "user", "content": "What is machine learning?"},
|
|
47
|
-
{"role": "assistant", "content": "Machine learning is..."}
|
|
48
|
-
]
|
|
49
|
-
}
|
|
50
|
-
```
|
|
51
|
-
|
|
52
|
-
### Converting Between Formats
|
|
53
|
-
|
|
54
|
-
```python
|
|
55
|
-
from typing import TypedDict
|
|
56
|
-
from datasets import Dataset
|
|
57
|
-
|
|
58
|
-
class AlpacaExample(TypedDict):
|
|
59
|
-
instruction: str
|
|
60
|
-
input: str
|
|
61
|
-
output: str
|
|
62
|
-
|
|
63
|
-
class ShareGPTExample(TypedDict):
|
|
64
|
-
conversations: list[dict[str, str]]
|
|
65
|
-
|
|
66
|
-
def alpaca_to_sharegpt(example: AlpacaExample) -> ShareGPTExample:
|
|
67
|
-
"""Convert Alpaca format to ShareGPT multi-turn format."""
|
|
68
|
-
user_content = example["instruction"]
|
|
69
|
-
if example.get("input"):
|
|
70
|
-
user_content += f"\n\n{example['input']}"
|
|
71
|
-
|
|
72
|
-
return {
|
|
73
|
-
"conversations": [
|
|
74
|
-
{"from": "human", "value": user_content},
|
|
75
|
-
{"from": "gpt", "value": example["output"]}
|
|
76
|
-
]
|
|
77
|
-
}
|
|
78
|
-
|
|
79
|
-
def sharegpt_to_messages(example: ShareGPTExample, system_prompt: str = "") -> dict:
|
|
80
|
-
"""Convert ShareGPT to OpenAI messages format."""
|
|
81
|
-
messages = []
|
|
82
|
-
if system_prompt:
|
|
83
|
-
messages.append({"role": "system", "content": system_prompt})
|
|
84
|
-
|
|
85
|
-
role_map = {"human": "user", "gpt": "assistant", "system": "system"}
|
|
86
|
-
for turn in example["conversations"]:
|
|
87
|
-
messages.append({
|
|
88
|
-
"role": role_map.get(turn["from"], turn["from"]),
|
|
89
|
-
"content": turn["value"]
|
|
90
|
-
})
|
|
91
|
-
|
|
92
|
-
return {"messages": messages}
|
|
93
|
-
```
|
|
94
|
-
|
|
95
|
-
## Formatting for Training
|
|
96
|
-
|
|
97
|
-
```python
|
|
98
|
-
from transformers import AutoTokenizer
|
|
99
|
-
|
|
100
|
-
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
|
|
101
|
-
|
|
102
|
-
def format_instruction_prompt(
|
|
103
|
-
instruction: str,
|
|
104
|
-
input_text: str = "",
|
|
105
|
-
response: str = "",
|
|
106
|
-
system_prompt: str = "You are a helpful assistant."
|
|
107
|
-
) -> str:
|
|
108
|
-
"""Format for Llama 3.1 Instruct chat template."""
|
|
109
|
-
messages = [
|
|
110
|
-
{"role": "system", "content": system_prompt},
|
|
111
|
-
{"role": "user", "content": f"{instruction}\n{input_text}".strip()},
|
|
112
|
-
]
|
|
113
|
-
if response:
|
|
114
|
-
messages.append({"role": "assistant", "content": response})
|
|
115
|
-
|
|
116
|
-
return tokenizer.apply_chat_template(
|
|
117
|
-
messages,
|
|
118
|
-
tokenize=False,
|
|
119
|
-
add_generation_prompt=not response # Add prompt if no response
|
|
120
|
-
)
|
|
121
|
-
|
|
122
|
-
# Example usage
|
|
123
|
-
formatted = format_instruction_prompt(
|
|
124
|
-
instruction="Translate to French:",
|
|
125
|
-
input_text="Hello, how are you?",
|
|
126
|
-
response="Bonjour, comment allez-vous?"
|
|
127
|
-
)
|
|
128
|
-
```
|
|
129
|
-
|
|
130
|
-
## Dataset Validation
|
|
131
|
-
|
|
132
|
-
```python
|
|
133
|
-
from dataclasses import dataclass
|
|
134
|
-
from collections import Counter
|
|
135
|
-
import re
|
|
136
|
-
|
|
137
|
-
@dataclass
|
|
138
|
-
class DatasetStats:
|
|
139
|
-
total_examples: int
|
|
140
|
-
avg_input_length: float
|
|
141
|
-
avg_output_length: float
|
|
142
|
-
max_input_length: int
|
|
143
|
-
max_output_length: int
|
|
144
|
-
empty_inputs: int
|
|
145
|
-
empty_outputs: int
|
|
146
|
-
duplicate_count: int
|
|
147
|
-
language_distribution: dict
|
|
148
|
-
|
|
149
|
-
def validate_dataset(examples: list[dict], tokenizer) -> tuple[DatasetStats, list[str]]:
|
|
150
|
-
"""
|
|
151
|
-
Validate dataset and return statistics and warnings.
|
|
152
|
-
|
|
153
|
-
Args:
|
|
154
|
-
examples: List of training examples
|
|
155
|
-
tokenizer: Tokenizer for length calculations
|
|
156
|
-
|
|
157
|
-
Returns:
|
|
158
|
-
Tuple of (stats, warnings)
|
|
159
|
-
"""
|
|
160
|
-
warnings = []
|
|
161
|
-
input_lengths = []
|
|
162
|
-
output_lengths = []
|
|
163
|
-
seen_inputs = set()
|
|
164
|
-
duplicates = 0
|
|
165
|
-
|
|
166
|
-
for i, ex in enumerate(examples):
|
|
167
|
-
# Check for required fields
|
|
168
|
-
if "instruction" not in ex and "messages" not in ex:
|
|
169
|
-
warnings.append(f"Example {i}: Missing instruction or messages field")
|
|
170
|
-
continue
|
|
171
|
-
|
|
172
|
-
# Get input/output text
|
|
173
|
-
if "instruction" in ex:
|
|
174
|
-
input_text = f"{ex.get('instruction', '')} {ex.get('input', '')}".strip()
|
|
175
|
-
output_text = ex.get("output", "")
|
|
176
|
-
else:
|
|
177
|
-
input_text = " ".join(m["content"] for m in ex["messages"] if m["role"] == "user")
|
|
178
|
-
output_text = " ".join(m["content"] for m in ex["messages"] if m["role"] == "assistant")
|
|
179
|
-
|
|
180
|
-
# Check for empty fields
|
|
181
|
-
if not input_text:
|
|
182
|
-
warnings.append(f"Example {i}: Empty input")
|
|
183
|
-
if not output_text:
|
|
184
|
-
warnings.append(f"Example {i}: Empty output")
|
|
185
|
-
|
|
186
|
-
# Check lengths
|
|
187
|
-
input_len = len(tokenizer.encode(input_text))
|
|
188
|
-
output_len = len(tokenizer.encode(output_text))
|
|
189
|
-
input_lengths.append(input_len)
|
|
190
|
-
output_lengths.append(output_len)
|
|
191
|
-
|
|
192
|
-
if input_len + output_len > 4096:
|
|
193
|
-
warnings.append(f"Example {i}: Total length {input_len + output_len} exceeds 4096")
|
|
194
|
-
|
|
195
|
-
# Check for duplicates
|
|
196
|
-
input_hash = hash(input_text)
|
|
197
|
-
if input_hash in seen_inputs:
|
|
198
|
-
duplicates += 1
|
|
199
|
-
seen_inputs.add(input_hash)
|
|
200
|
-
|
|
201
|
-
stats = DatasetStats(
|
|
202
|
-
total_examples=len(examples),
|
|
203
|
-
avg_input_length=sum(input_lengths) / len(input_lengths) if input_lengths else 0,
|
|
204
|
-
avg_output_length=sum(output_lengths) / len(output_lengths) if output_lengths else 0,
|
|
205
|
-
max_input_length=max(input_lengths) if input_lengths else 0,
|
|
206
|
-
max_output_length=max(output_lengths) if output_lengths else 0,
|
|
207
|
-
empty_inputs=sum(1 for w in warnings if "Empty input" in w),
|
|
208
|
-
empty_outputs=sum(1 for w in warnings if "Empty output" in w),
|
|
209
|
-
duplicate_count=duplicates,
|
|
210
|
-
language_distribution={} # Implement language detection if needed
|
|
211
|
-
)
|
|
212
|
-
|
|
213
|
-
return stats, warnings
|
|
214
|
-
```
|
|
215
|
-
|
|
216
|
-
## Data Quality Checks
|
|
217
|
-
|
|
218
|
-
```python
|
|
219
|
-
import re
|
|
220
|
-
from typing import Callable
|
|
221
|
-
|
|
222
|
-
def create_quality_filter(
|
|
223
|
-
min_input_tokens: int = 10,
|
|
224
|
-
max_input_tokens: int = 2048,
|
|
225
|
-
min_output_tokens: int = 5,
|
|
226
|
-
max_output_tokens: int = 2048,
|
|
227
|
-
custom_filters: list[Callable[[dict], bool]] = None
|
|
228
|
-
) -> Callable[[dict, AutoTokenizer], bool]:
|
|
229
|
-
"""
|
|
230
|
-
Create a quality filter function for dataset examples.
|
|
231
|
-
|
|
232
|
-
Returns True if example passes all quality checks.
|
|
233
|
-
"""
|
|
234
|
-
def quality_filter(example: dict, tokenizer) -> bool:
|
|
235
|
-
if "instruction" in example:
|
|
236
|
-
input_text = f"{example.get('instruction', '')} {example.get('input', '')}".strip()
|
|
237
|
-
output_text = example.get("output", "")
|
|
238
|
-
else:
|
|
239
|
-
input_text = " ".join(m["content"] for m in example.get("messages", []) if m["role"] == "user")
|
|
240
|
-
output_text = " ".join(m["content"] for m in example.get("messages", []) if m["role"] == "assistant")
|
|
241
|
-
|
|
242
|
-
# Length checks
|
|
243
|
-
input_tokens = len(tokenizer.encode(input_text))
|
|
244
|
-
output_tokens = len(tokenizer.encode(output_text))
|
|
245
|
-
|
|
246
|
-
if not (min_input_tokens <= input_tokens <= max_input_tokens):
|
|
247
|
-
return False
|
|
248
|
-
if not (min_output_tokens <= output_tokens <= max_output_tokens):
|
|
249
|
-
return False
|
|
250
|
-
|
|
251
|
-
# Quality checks
|
|
252
|
-
if not output_text.strip():
|
|
253
|
-
return False
|
|
254
|
-
|
|
255
|
-
# Check for common issues
|
|
256
|
-
bad_patterns = [
|
|
257
|
-
r"I cannot",
|
|
258
|
-
r"I'm sorry, but",
|
|
259
|
-
r"As an AI",
|
|
260
|
-
r"I don't have access",
|
|
261
|
-
r"\[.*\]$", # Trailing brackets
|
|
262
|
-
]
|
|
263
|
-
for pattern in bad_patterns:
|
|
264
|
-
if re.search(pattern, output_text, re.IGNORECASE):
|
|
265
|
-
return False
|
|
266
|
-
|
|
267
|
-
# Custom filters
|
|
268
|
-
if custom_filters:
|
|
269
|
-
for filter_fn in custom_filters:
|
|
270
|
-
if not filter_fn(example):
|
|
271
|
-
return False
|
|
272
|
-
|
|
273
|
-
return True
|
|
274
|
-
|
|
275
|
-
return quality_filter
|
|
276
|
-
|
|
277
|
-
# Usage
|
|
278
|
-
quality_filter = create_quality_filter(min_output_tokens=20)
|
|
279
|
-
filtered_dataset = [ex for ex in dataset if quality_filter(ex, tokenizer)]
|
|
280
|
-
```
|
|
281
|
-
|
|
282
|
-
## Deduplication
|
|
283
|
-
|
|
284
|
-
```python
|
|
285
|
-
from datasketch import MinHash, MinHashLSH
|
|
286
|
-
import hashlib
|
|
287
|
-
|
|
288
|
-
def exact_dedup(examples: list[dict], key_field: str = "instruction") -> list[dict]:
|
|
289
|
-
"""Remove exact duplicates based on a key field."""
|
|
290
|
-
seen = set()
|
|
291
|
-
unique = []
|
|
292
|
-
for ex in examples:
|
|
293
|
-
key = ex.get(key_field, "")
|
|
294
|
-
if key not in seen:
|
|
295
|
-
seen.add(key)
|
|
296
|
-
unique.append(ex)
|
|
297
|
-
return unique
|
|
298
|
-
|
|
299
|
-
def fuzzy_dedup(
|
|
300
|
-
examples: list[dict],
|
|
301
|
-
key_field: str = "output",
|
|
302
|
-
threshold: float = 0.8,
|
|
303
|
-
num_perm: int = 128
|
|
304
|
-
) -> list[dict]:
|
|
305
|
-
"""
|
|
306
|
-
Remove near-duplicate examples using MinHash LSH.
|
|
307
|
-
|
|
308
|
-
Args:
|
|
309
|
-
examples: List of examples
|
|
310
|
-
key_field: Field to check for similarity
|
|
311
|
-
threshold: Jaccard similarity threshold (0-1)
|
|
312
|
-
num_perm: Number of permutations for MinHash
|
|
313
|
-
"""
|
|
314
|
-
lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
|
|
315
|
-
unique = []
|
|
316
|
-
|
|
317
|
-
for i, ex in enumerate(examples):
|
|
318
|
-
text = ex.get(key_field, "")
|
|
319
|
-
words = text.lower().split()
|
|
320
|
-
|
|
321
|
-
# Create MinHash
|
|
322
|
-
m = MinHash(num_perm=num_perm)
|
|
323
|
-
for word in words:
|
|
324
|
-
m.update(word.encode('utf-8'))
|
|
325
|
-
|
|
326
|
-
# Check for near-duplicates
|
|
327
|
-
result = lsh.query(m)
|
|
328
|
-
if not result:
|
|
329
|
-
lsh.insert(str(i), m)
|
|
330
|
-
unique.append(ex)
|
|
331
|
-
|
|
332
|
-
return unique
|
|
333
|
-
|
|
334
|
-
# Combined deduplication pipeline
|
|
335
|
-
def deduplicate_dataset(examples: list[dict]) -> list[dict]:
|
|
336
|
-
"""Full deduplication pipeline."""
|
|
337
|
-
print(f"Starting examples: {len(examples)}")
|
|
338
|
-
|
|
339
|
-
# Step 1: Exact deduplication on input
|
|
340
|
-
examples = exact_dedup(examples, key_field="instruction")
|
|
341
|
-
print(f"After exact dedup on instruction: {len(examples)}")
|
|
342
|
-
|
|
343
|
-
# Step 2: Fuzzy deduplication on output
|
|
344
|
-
examples = fuzzy_dedup(examples, key_field="output", threshold=0.85)
|
|
345
|
-
print(f"After fuzzy dedup on output: {len(examples)}")
|
|
346
|
-
|
|
347
|
-
return examples
|
|
348
|
-
```
|
|
349
|
-
|
|
350
|
-
## Train/Validation Split
|
|
351
|
-
|
|
352
|
-
```python
|
|
353
|
-
from datasets import Dataset, DatasetDict
|
|
354
|
-
from sklearn.model_selection import train_test_split
|
|
355
|
-
import random
|
|
356
|
-
|
|
357
|
-
def create_stratified_split(
|
|
358
|
-
examples: list[dict],
|
|
359
|
-
test_size: float = 0.1,
|
|
360
|
-
stratify_field: str = None,
|
|
361
|
-
seed: int = 42
|
|
362
|
-
) -> DatasetDict:
|
|
363
|
-
"""
|
|
364
|
-
Create train/validation split with optional stratification.
|
|
365
|
-
|
|
366
|
-
Args:
|
|
367
|
-
examples: List of examples
|
|
368
|
-
test_size: Fraction for validation set
|
|
369
|
-
stratify_field: Field to stratify by (e.g., "category")
|
|
370
|
-
seed: Random seed for reproducibility
|
|
371
|
-
"""
|
|
372
|
-
if stratify_field and all(stratify_field in ex for ex in examples):
|
|
373
|
-
stratify = [ex[stratify_field] for ex in examples]
|
|
374
|
-
train_examples, val_examples = train_test_split(
|
|
375
|
-
examples,
|
|
376
|
-
test_size=test_size,
|
|
377
|
-
stratify=stratify,
|
|
378
|
-
random_state=seed
|
|
379
|
-
)
|
|
380
|
-
else:
|
|
381
|
-
random.seed(seed)
|
|
382
|
-
shuffled = examples.copy()
|
|
383
|
-
random.shuffle(shuffled)
|
|
384
|
-
split_idx = int(len(shuffled) * (1 - test_size))
|
|
385
|
-
train_examples = shuffled[:split_idx]
|
|
386
|
-
val_examples = shuffled[split_idx:]
|
|
387
|
-
|
|
388
|
-
return DatasetDict({
|
|
389
|
-
"train": Dataset.from_list(train_examples),
|
|
390
|
-
"validation": Dataset.from_list(val_examples)
|
|
391
|
-
})
|
|
392
|
-
```
|
|
393
|
-
|
|
394
|
-
## Data Augmentation
|
|
395
|
-
|
|
396
|
-
```python
|
|
397
|
-
import random
|
|
398
|
-
|
|
399
|
-
def augment_instruction(example: dict) -> list[dict]:
|
|
400
|
-
"""Generate augmented versions of an instruction example."""
|
|
401
|
-
augmented = [example]
|
|
402
|
-
|
|
403
|
-
instruction = example.get("instruction", "")
|
|
404
|
-
input_text = example.get("input", "")
|
|
405
|
-
output = example.get("output", "")
|
|
406
|
-
|
|
407
|
-
# Instruction paraphrasing templates
|
|
408
|
-
prefixes = [
|
|
409
|
-
"",
|
|
410
|
-
"Please ",
|
|
411
|
-
"Can you ",
|
|
412
|
-
"I need you to ",
|
|
413
|
-
"Your task is to ",
|
|
414
|
-
]
|
|
415
|
-
suffixes = [
|
|
416
|
-
"",
|
|
417
|
-
" Be concise.",
|
|
418
|
-
" Provide a detailed response.",
|
|
419
|
-
" Think step by step.",
|
|
420
|
-
]
|
|
421
|
-
|
|
422
|
-
# Generate variations
|
|
423
|
-
for prefix in random.sample(prefixes, min(2, len(prefixes))):
|
|
424
|
-
for suffix in random.sample(suffixes, min(2, len(suffixes))):
|
|
425
|
-
new_instruction = f"{prefix}{instruction[0].lower() if prefix else instruction[0]}{instruction[1:]}{suffix}"
|
|
426
|
-
if new_instruction != instruction:
|
|
427
|
-
augmented.append({
|
|
428
|
-
"instruction": new_instruction.strip(),
|
|
429
|
-
"input": input_text,
|
|
430
|
-
"output": output
|
|
431
|
-
})
|
|
432
|
-
|
|
433
|
-
return augmented
|
|
434
|
-
|
|
435
|
-
def augment_dataset(examples: list[dict], augmentation_factor: float = 1.5) -> list[dict]:
|
|
436
|
-
"""
|
|
437
|
-
Augment dataset to reach target size.
|
|
438
|
-
|
|
439
|
-
Args:
|
|
440
|
-
examples: Original examples
|
|
441
|
-
augmentation_factor: Target size as multiple of original
|
|
442
|
-
"""
|
|
443
|
-
augmented = []
|
|
444
|
-
target_size = int(len(examples) * augmentation_factor)
|
|
445
|
-
|
|
446
|
-
for ex in examples:
|
|
447
|
-
variations = augment_instruction(ex)
|
|
448
|
-
augmented.extend(variations)
|
|
449
|
-
|
|
450
|
-
# Deduplicate and trim to target
|
|
451
|
-
augmented = exact_dedup(augmented, "instruction")
|
|
452
|
-
random.shuffle(augmented)
|
|
453
|
-
return augmented[:target_size]
|
|
454
|
-
```
|
|
455
|
-
|
|
456
|
-
## Loading and Saving Datasets
|
|
457
|
-
|
|
458
|
-
```python
|
|
459
|
-
from datasets import load_dataset, Dataset
|
|
460
|
-
import json
|
|
461
|
-
|
|
462
|
-
def load_custom_dataset(path: str) -> Dataset:
|
|
463
|
-
"""Load dataset from various formats."""
|
|
464
|
-
if path.endswith(".jsonl"):
|
|
465
|
-
return load_dataset("json", data_files=path, split="train")
|
|
466
|
-
elif path.endswith(".json"):
|
|
467
|
-
with open(path, "r") as f:
|
|
468
|
-
data = json.load(f)
|
|
469
|
-
return Dataset.from_list(data)
|
|
470
|
-
elif path.endswith(".parquet"):
|
|
471
|
-
return load_dataset("parquet", data_files=path, split="train")
|
|
472
|
-
else:
|
|
473
|
-
# Try loading from Hugging Face Hub
|
|
474
|
-
return load_dataset(path, split="train")
|
|
475
|
-
|
|
476
|
-
def save_dataset(dataset: Dataset, path: str, format: str = "jsonl"):
|
|
477
|
-
"""Save dataset in specified format."""
|
|
478
|
-
if format == "jsonl":
|
|
479
|
-
dataset.to_json(path, orient="records", lines=True)
|
|
480
|
-
elif format == "parquet":
|
|
481
|
-
dataset.to_parquet(path)
|
|
482
|
-
elif format == "json":
|
|
483
|
-
with open(path, "w") as f:
|
|
484
|
-
json.dump(list(dataset), f, indent=2)
|
|
485
|
-
```
|
|
486
|
-
|
|
487
|
-
## Dataset Size Guidelines
|
|
488
|
-
|
|
489
|
-
| Task Type | Minimum Examples | Recommended | Notes |
|
|
490
|
-
|-----------|------------------|-------------|-------|
|
|
491
|
-
| Classification | 100 per class | 500+ per class | Balance classes |
|
|
492
|
-
| Instruction Following | 1,000 | 5,000-10,000 | Diverse instructions |
|
|
493
|
-
| Domain Adaptation | 5,000 | 20,000+ | High-quality domain data |
|
|
494
|
-
| Code Generation | 2,000 | 10,000+ | Include edge cases |
|
|
495
|
-
| Multi-turn Chat | 1,000 conversations | 5,000+ | Varied conversation lengths |
|
|
496
|
-
|
|
497
|
-
## Quick Reference
|
|
498
|
-
|
|
499
|
-
```python
|
|
500
|
-
# Complete dataset preparation pipeline
|
|
501
|
-
from datasets import Dataset
|
|
502
|
-
|
|
503
|
-
def prepare_dataset(raw_data_path: str, output_path: str, tokenizer) -> "DatasetDict":
    """Run the full preparation pipeline and write the resulting splits.

    Steps: load -> validate (report only) -> quality filter -> deduplicate
    -> train/validation split -> save as ``train.jsonl`` / ``val.jsonl``
    inside ``output_path``.

    Args:
        raw_data_path: Path or dataset id passed to ``load_custom_dataset``.
        output_path: Directory that receives the two JSONL files; created if
            it does not exist.
        tokenizer: Tokenizer forwarded to validation and quality filtering.

    Returns:
        The ``DatasetDict`` returned by ``create_stratified_split``, holding
        the ``"train"`` and ``"validation"`` splits.
    """
    import os  # local import: this snippet's import block only brings in Dataset

    # 1. Load raw data and materialize it once so every later step sees the
    #    same list of plain dict records.
    examples = list(load_custom_dataset(raw_data_path))

    # 2. Validate (informational only; nothing is removed here)
    stats, warnings = validate_dataset(examples, tokenizer)
    print(f"Dataset stats: {stats}")
    if warnings:  # Show first 10 warnings
        print(f"Sample warnings: {warnings[:10]}")

    # 3. Filter for quality
    quality_filter = create_quality_filter()
    examples = [ex for ex in examples if quality_filter(ex, tokenizer)]
    print(f"After quality filter: {len(examples)}")

    # 4. Deduplicate
    examples = deduplicate_dataset(examples)
    print(f"After deduplication: {len(examples)}")

    # 5. Split
    dataset = create_stratified_split(examples, test_size=0.1)

    # 6. Save; make sure the target directory exists before writing into it.
    os.makedirs(output_path, exist_ok=True)
    dataset["train"].to_json(f"{output_path}/train.jsonl", lines=True)
    dataset["validation"].to_json(f"{output_path}/val.jsonl", lines=True)

    return dataset
|
|
531
|
-
|
|
532
|
-
# Usage
|
|
533
|
-
dataset = prepare_dataset("raw_data.jsonl", "./processed", tokenizer)
|
|
534
|
-
```
|
|
535
|
-
|
|
536
|
-
## Related References
|
|
537
|
-
|
|
538
|
-
- `lora-peft.md` - Training configuration
|
|
539
|
-
- `evaluation-metrics.md` - Measuring dataset quality impact
|
|
540
|
-
- `hyperparameter-tuning.md` - Adjusting training for dataset size
|
|
1
|
+
# Dataset Preparation for Fine-Tuning
|
|
2
|
+
|
|
3
|
+
---
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
Dataset quality is the most important factor in fine-tuning success. This reference covers data formatting, validation, cleaning, and best practices for creating high-quality training data.
|
|
8
|
+
|
|
9
|
+
## Dataset Formats
|
|
10
|
+
|
|
11
|
+
### Alpaca Format (Instruction-Response)
|
|
12
|
+
|
|
13
|
+
```python
|
|
14
|
+
# Single-turn instruction format
|
|
15
|
+
alpaca_example = {
|
|
16
|
+
"instruction": "Summarize the following article in 2-3 sentences.",
|
|
17
|
+
"input": "The article text goes here...",
|
|
18
|
+
"output": "The summary goes here."
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
# Without input field
|
|
22
|
+
alpaca_no_input = {
|
|
23
|
+
"instruction": "What are the three primary colors?",
|
|
24
|
+
"input": "",
|
|
25
|
+
"output": "The three primary colors are red, blue, and yellow."
|
|
26
|
+
}
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
### ShareGPT Format (Multi-Turn Conversations)
|
|
30
|
+
|
|
31
|
+
```python
|
|
32
|
+
# Multi-turn conversation format
|
|
33
|
+
sharegpt_example = {
|
|
34
|
+
"conversations": [
|
|
35
|
+
{"from": "human", "value": "What is machine learning?"},
|
|
36
|
+
{"from": "gpt", "value": "Machine learning is a subset of AI that enables..."},
|
|
37
|
+
{"from": "human", "value": "Can you give me an example?"},
|
|
38
|
+
{"from": "gpt", "value": "A common example is email spam filtering..."}
|
|
39
|
+
]
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
# Alternative format with roles
|
|
43
|
+
openai_format = {
|
|
44
|
+
"messages": [
|
|
45
|
+
{"role": "system", "content": "You are a helpful assistant."},
|
|
46
|
+
{"role": "user", "content": "What is machine learning?"},
|
|
47
|
+
{"role": "assistant", "content": "Machine learning is..."}
|
|
48
|
+
]
|
|
49
|
+
}
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
### Converting Between Formats
|
|
53
|
+
|
|
54
|
+
```python
|
|
55
|
+
from typing import TypedDict
|
|
56
|
+
from datasets import Dataset
|
|
57
|
+
|
|
58
|
+
class _AlpacaRequiredFields(TypedDict):
    """Keys every Alpaca record must carry."""

    instruction: str
    output: str


class AlpacaExample(_AlpacaRequiredFields, total=False):
    """Single-turn instruction record in Alpaca format.

    ``instruction`` and ``output`` are required. ``input`` is optional extra
    context: ``alpaca_to_sharegpt`` reads it with ``.get("input")`` and skips
    it when absent or empty, so the type allows it to be omitted.
    """

    input: str
|
|
62
|
+
|
|
63
|
+
class ShareGPTExample(TypedDict):
    """Multi-turn conversation record in ShareGPT format."""

    # Ordered list of turns; the converters in this file read and write the
    # "from" (speaker) and "value" (text) keys of each turn.
    conversations: list[dict[str, str]]
|
|
65
|
+
|
|
66
|
+
def alpaca_to_sharegpt(example: AlpacaExample) -> ShareGPTExample:
    """Turn one Alpaca record into a two-turn ShareGPT conversation.

    The human turn is the instruction, followed by a blank line and the
    ``input`` text when that field is present and non-empty. The gpt turn is
    the record's ``output``.
    """
    prompt = example["instruction"]
    extra_context = example.get("input")
    if extra_context:
        prompt = prompt + f"\n\n{extra_context}"

    human_turn = {"from": "human", "value": prompt}
    gpt_turn = {"from": "gpt", "value": example["output"]}
    return {"conversations": [human_turn, gpt_turn]}
|
|
78
|
+
|
|
79
|
+
def sharegpt_to_messages(example: ShareGPTExample, system_prompt: str = "") -> dict:
    """Re-express a ShareGPT conversation as an OpenAI-style ``messages`` dict.

    A non-empty ``system_prompt`` becomes the first message. Speaker names
    ``human`` / ``gpt`` / ``system`` map to ``user`` / ``assistant`` /
    ``system``; any other speaker name is passed through unchanged.
    """
    speaker_to_role = {"human": "user", "gpt": "assistant", "system": "system"}

    preamble = [{"role": "system", "content": system_prompt}] if system_prompt else []
    converted = [
        {
            "role": speaker_to_role.get(turn["from"], turn["from"]),
            "content": turn["value"],
        }
        for turn in example["conversations"]
    ]
    return {"messages": preamble + converted}
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
## Formatting for Training
|
|
96
|
+
|
|
97
|
+
```python
|
|
98
|
+
from transformers import AutoTokenizer
|
|
99
|
+
|
|
100
|
+
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
|
|
101
|
+
|
|
102
|
+
def format_instruction_prompt(
    instruction: str,
    input_text: str = "",
    response: str = "",
    system_prompt: str = "You are a helpful assistant."
) -> str:
    """Render one example as text using the module-level tokenizer's chat template.

    The user turn is ``instruction`` and ``input_text`` joined by a newline
    and stripped. With a non-empty ``response`` an assistant turn is appended;
    without one, the template is asked to add a generation prompt instead.
    """
    user_turn = f"{instruction}\n{input_text}".strip()
    chat = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_turn},
    ]

    has_response = bool(response)
    if has_response:
        chat.append({"role": "assistant", "content": response})

    # Only ask for a generation prompt when there is no assistant turn yet.
    return tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=not has_response,
    )
|
|
121
|
+
|
|
122
|
+
# Example usage
|
|
123
|
+
formatted = format_instruction_prompt(
|
|
124
|
+
instruction="Translate to French:",
|
|
125
|
+
input_text="Hello, how are you?",
|
|
126
|
+
response="Bonjour, comment allez-vous?"
|
|
127
|
+
)
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
## Dataset Validation
|
|
131
|
+
|
|
132
|
+
```python
|
|
133
|
+
from dataclasses import dataclass
|
|
134
|
+
from collections import Counter
|
|
135
|
+
import re
|
|
136
|
+
|
|
137
|
+
@dataclass
class DatasetStats:
    """Aggregate statistics produced by ``validate_dataset``."""

    total_examples: int  # number of examples passed in, including skipped ones
    avg_input_length: float  # mean input token count over measured examples
    avg_output_length: float  # mean output token count over measured examples
    max_input_length: int
    max_output_length: int
    empty_inputs: int  # examples whose input text was empty
    empty_outputs: int  # examples whose output text was empty
    duplicate_count: int  # examples whose input text was already seen
    language_distribution: dict  # left empty by validate_dataset


def validate_dataset(
    examples: list[dict],
    tokenizer,
    max_total_tokens: int = 4096,
) -> tuple[DatasetStats, list[str]]:
    """
    Validate dataset and return statistics and warnings.

    Supports instruction-style records (``instruction`` / ``input`` /
    ``output``) and chat-style records (``messages`` with ``role`` /
    ``content``). Records with neither key are reported and excluded from the
    length statistics, but still count towards ``total_examples``.

    Args:
        examples: List of training examples
        tokenizer: Tokenizer for length calculations; ``encode(text)`` must
            return something with a length
        max_total_tokens: Warn when input + output tokens exceed this value

    Returns:
        Tuple of (stats, warnings)
    """
    warnings = []
    input_lengths = []
    output_lengths = []
    seen_inputs = set()
    duplicates = 0
    # Counted directly rather than recovered by substring-matching the warning
    # text, so rewording a warning can never skew the statistics.
    empty_inputs = 0
    empty_outputs = 0

    for i, ex in enumerate(examples):
        # Check for required fields
        if "instruction" not in ex and "messages" not in ex:
            warnings.append(f"Example {i}: Missing instruction or messages field")
            continue

        # Get input/output text
        if "instruction" in ex:
            input_text = f"{ex.get('instruction', '')} {ex.get('input', '')}".strip()
            output_text = ex.get("output", "")
        else:
            input_text = " ".join(m["content"] for m in ex["messages"] if m["role"] == "user")
            output_text = " ".join(m["content"] for m in ex["messages"] if m["role"] == "assistant")

        # Check for empty fields
        if not input_text:
            empty_inputs += 1
            warnings.append(f"Example {i}: Empty input")
        if not output_text:
            empty_outputs += 1
            warnings.append(f"Example {i}: Empty output")

        # Check lengths
        input_len = len(tokenizer.encode(input_text))
        output_len = len(tokenizer.encode(output_text))
        input_lengths.append(input_len)
        output_lengths.append(output_len)

        total_len = input_len + output_len
        if total_len > max_total_tokens:
            warnings.append(f"Example {i}: Total length {total_len} exceeds {max_total_tokens}")

        # Check for duplicates. The text itself is stored (not its hash) so a
        # hash collision can never be miscounted as a duplicate.
        if input_text in seen_inputs:
            duplicates += 1
        else:
            seen_inputs.add(input_text)

    stats = DatasetStats(
        total_examples=len(examples),
        avg_input_length=sum(input_lengths) / len(input_lengths) if input_lengths else 0,
        avg_output_length=sum(output_lengths) / len(output_lengths) if output_lengths else 0,
        max_input_length=max(input_lengths) if input_lengths else 0,
        max_output_length=max(output_lengths) if output_lengths else 0,
        empty_inputs=empty_inputs,
        empty_outputs=empty_outputs,
        duplicate_count=duplicates,
        language_distribution={}  # Implement language detection if needed
    )

    return stats, warnings
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
## Data Quality Checks
|
|
217
|
+
|
|
218
|
+
```python
|
|
219
|
+
import re
|
|
220
|
+
from typing import Callable
|
|
221
|
+
|
|
222
|
+
def create_quality_filter(
    min_input_tokens: int = 10,
    max_input_tokens: int = 2048,
    min_output_tokens: int = 5,
    max_output_tokens: int = 2048,
    custom_filters: "list[Callable[[dict], bool]] | None" = None
) -> Callable[[dict, "AutoTokenizer"], bool]:
    """
    Create a quality filter function for dataset examples.

    Args:
        min_input_tokens: Smallest accepted input length in tokens
        max_input_tokens: Largest accepted input length in tokens
        min_output_tokens: Smallest accepted output length in tokens
        max_output_tokens: Largest accepted output length in tokens
        custom_filters: Optional extra predicates; each receives the example
            and must return True for the example to be kept

    Returns:
        A function ``quality_filter(example, tokenizer)`` that returns True if
        the example passes all quality checks.
    """
    # Compiled once per filter instead of once per example and pattern.
    # AutoTokenizer is quoted in the return annotation because this snippet
    # does not import it.
    bad_patterns = [
        re.compile(pattern, re.IGNORECASE)
        for pattern in (
            r"I cannot",
            r"I'm sorry, but",
            r"As an AI",
            r"I don't have access",
            r"\[.*\]$",  # Trailing brackets
        )
    ]

    def quality_filter(example: dict, tokenizer) -> bool:
        if "instruction" in example:
            input_text = f"{example.get('instruction', '')} {example.get('input', '')}".strip()
            output_text = example.get("output", "")
        else:
            messages = example.get("messages", [])
            input_text = " ".join(m["content"] for m in messages if m["role"] == "user")
            output_text = " ".join(m["content"] for m in messages if m["role"] == "assistant")

        # Cheapest rejection first: a blank output never needs tokenizing.
        if not output_text.strip():
            return False

        # Length checks
        input_tokens = len(tokenizer.encode(input_text))
        output_tokens = len(tokenizer.encode(output_text))

        if not (min_input_tokens <= input_tokens <= max_input_tokens):
            return False
        if not (min_output_tokens <= output_tokens <= max_output_tokens):
            return False

        # Check for common issues (refusal boilerplate, trailing brackets)
        if any(pattern.search(output_text) for pattern in bad_patterns):
            return False

        # Custom filters
        if custom_filters:
            for filter_fn in custom_filters:
                if not filter_fn(example):
                    return False

        return True

    return quality_filter
|
|
276
|
+
|
|
277
|
+
# Usage
|
|
278
|
+
quality_filter = create_quality_filter(min_output_tokens=20)
|
|
279
|
+
filtered_dataset = [ex for ex in dataset if quality_filter(ex, tokenizer)]
|
|
280
|
+
```
|
|
281
|
+
|
|
282
|
+
## Deduplication
|
|
283
|
+
|
|
284
|
+
```python
|
|
285
|
+
from datasketch import MinHash, MinHashLSH
|
|
286
|
+
import hashlib
|
|
287
|
+
|
|
288
|
+
def exact_dedup(examples: list[dict], key_field: str = "instruction") -> list[dict]:
    """Remove exact duplicates based on a key field.

    The first example for each distinct ``key_field`` value is kept, in input
    order. Examples that do not contain ``key_field`` at all are always kept:
    they cannot be compared, and treating them as sharing one empty key would
    silently drop all but the first (e.g. every ``messages``-style record when
    deduplicating on ``instruction``).

    Args:
        examples: Examples to deduplicate
        key_field: Field whose value identifies a duplicate

    Returns:
        A new list; the input list is not modified.
    """
    seen = set()
    unique = []
    for ex in examples:
        if key_field not in ex:
            unique.append(ex)
            continue
        key = ex[key_field]
        if key not in seen:
            seen.add(key)
            unique.append(ex)
    return unique
|
|
298
|
+
|
|
299
|
+
def fuzzy_dedup(
    examples: list[dict],
    key_field: str = "output",
    threshold: float = 0.8,
    num_perm: int = 128
) -> list[dict]:
    """
    Remove near-duplicate examples using MinHash LSH.

    Each example's ``key_field`` text is lower-cased and split on whitespace;
    the resulting word set is MinHashed and looked up in an LSH index. An
    example is kept when the index returns no candidate, and is then added to
    the index itself.

    Examples whose ``key_field`` is missing, not a string, or contains no
    words are always kept and never indexed: an empty MinHash is identical
    for all of them, so they would otherwise be discarded as "duplicates" of
    each other.

    Args:
        examples: List of examples
        key_field: Field to check for similarity
        threshold: Jaccard similarity threshold (0-1)
        num_perm: Number of permutations for MinHash

    Returns:
        A new list with the retained examples in input order.
    """
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    unique = []

    for i, ex in enumerate(examples):
        text = ex.get(key_field, "")
        words = text.lower().split() if isinstance(text, str) else []

        if not words:
            # Nothing to compare on; keep rather than collide on empty hashes.
            unique.append(ex)
            continue

        # Create MinHash (repeated words do not change it, so hash each once)
        m = MinHash(num_perm=num_perm)
        for word in set(words):
            m.update(word.encode('utf-8'))

        # Check for near-duplicates
        if not lsh.query(m):
            lsh.insert(str(i), m)
            unique.append(ex)

    return unique
|
|
333
|
+
|
|
334
|
+
# Combined deduplication pipeline
|
|
335
|
+
def deduplicate_dataset(examples: list[dict]) -> list[dict]:
    """Full deduplication pipeline.

    Step 1 removes exact duplicates of the (``instruction``, ``input``) pair,
    so examples that reuse an instruction template with different input text
    are all kept. Examples without an ``instruction`` field skip this step.
    Step 2 removes near-duplicate outputs with ``fuzzy_dedup``.

    Progress counts are printed after each step.

    Args:
        examples: Examples to deduplicate

    Returns:
        The deduplicated examples, in their original relative order.
    """
    print(f"Starting examples: {len(examples)}")

    # Step 1: Exact deduplication on the full prompt (instruction + input).
    # Keying on the instruction alone would collapse e.g. every
    # "Translate to French:" example into a single record.
    seen_prompts = set()
    exact_unique = []
    for ex in examples:
        if "instruction" not in ex:
            exact_unique.append(ex)
            continue
        prompt_key = (ex["instruction"], ex.get("input", ""))
        if prompt_key not in seen_prompts:
            seen_prompts.add(prompt_key)
            exact_unique.append(ex)
    examples = exact_unique
    print(f"After exact dedup on instruction: {len(examples)}")

    # Step 2: Fuzzy deduplication on output
    examples = fuzzy_dedup(examples, key_field="output", threshold=0.85)
    print(f"After fuzzy dedup on output: {len(examples)}")

    return examples
|
|
348
|
+
```
|
|
349
|
+
|
|
350
|
+
## Train/Validation Split
|
|
351
|
+
|
|
352
|
+
```python
|
|
353
|
+
from datasets import Dataset, DatasetDict
|
|
354
|
+
from sklearn.model_selection import train_test_split
|
|
355
|
+
import random
|
|
356
|
+
|
|
357
|
+
def create_stratified_split(
    examples: list[dict],
    test_size: float = 0.1,
    stratify_field: "str | None" = None,
    seed: int = 42
) -> DatasetDict:
    """
    Create train/validation split with optional stratification.

    When ``stratify_field`` is given and present in every example, the split
    is delegated to ``train_test_split`` with that field's values as strata.
    Otherwise the examples are shuffled with a seeded generator and cut at
    ``int(len(examples) * (1 - test_size))``.

    Args:
        examples: Examples to split; any iterable of dicts is accepted
        test_size: Fraction for validation set
        stratify_field: Field to stratify by (e.g., "category")
        seed: Random seed for reproducibility

    Returns:
        ``DatasetDict`` with ``"train"`` and ``"validation"`` splits.
    """
    # Work on a private list: accepts non-list iterables and never mutates
    # the caller's data.
    examples = list(examples)

    # `examples and ...` keeps an empty input out of the stratified branch,
    # where all() over nothing would otherwise be True.
    if stratify_field and examples and all(stratify_field in ex for ex in examples):
        stratify = [ex[stratify_field] for ex in examples]
        train_examples, val_examples = train_test_split(
            examples,
            test_size=test_size,
            stratify=stratify,
            random_state=seed
        )
    else:
        # A dedicated generator yields the same shuffle as seeding the module
        # RNG, without resetting global random state for the rest of the
        # program.
        rng = random.Random(seed)
        shuffled = list(examples)
        rng.shuffle(shuffled)
        split_idx = int(len(shuffled) * (1 - test_size))
        train_examples = shuffled[:split_idx]
        val_examples = shuffled[split_idx:]

    return DatasetDict({
        "train": Dataset.from_list(train_examples),
        "validation": Dataset.from_list(val_examples)
    })
|
|
392
|
+
```
|
|
393
|
+
|
|
394
|
+
## Data Augmentation
|
|
395
|
+
|
|
396
|
+
```python
|
|
397
|
+
import random
|
|
398
|
+
|
|
399
|
+
def augment_instruction(example: dict) -> list[dict]:
    """Generate augmented versions of an instruction example.

    Two prefixes and two suffixes are sampled at random and combined with the
    instruction; when a prefix is used, the instruction's first character is
    lower-cased. Variants identical to the original instruction are skipped.
    ``input`` and ``output`` are copied unchanged.

    Args:
        example: Record with ``instruction`` and optionally ``input`` /
            ``output``

    Returns:
        A list whose first element is the original ``example`` followed by up
        to four variants. An example with a missing or empty instruction is
        returned alone, since there is nothing to paraphrase.
    """
    augmented = [example]

    instruction = example.get("instruction", "")
    input_text = example.get("input", "")
    output = example.get("output", "")

    # Guard: indexing instruction[0] below would raise IndexError on "".
    if not instruction:
        return augmented

    # Instruction paraphrasing templates
    prefixes = [
        "",
        "Please ",
        "Can you ",
        "I need you to ",
        "Your task is to ",
    ]
    suffixes = [
        "",
        " Be concise.",
        " Provide a detailed response.",
        " Think step by step.",
    ]

    # Generate variations
    for prefix in random.sample(prefixes, min(2, len(prefixes))):
        for suffix in random.sample(suffixes, min(2, len(suffixes))):
            # Lower-case the first letter only when something precedes it.
            first_char = instruction[0].lower() if prefix else instruction[0]
            new_instruction = f"{prefix}{first_char}{instruction[1:]}{suffix}"
            if new_instruction != instruction:
                augmented.append({
                    "instruction": new_instruction.strip(),
                    "input": input_text,
                    "output": output
                })

    return augmented
|
|
434
|
+
|
|
435
|
+
def augment_dataset(examples: list[dict], augmentation_factor: float = 1.5) -> list[dict]:
    """
    Augment dataset towards a target size.

    Original examples take priority: paraphrased variants from
    ``augment_instruction`` only fill the room left between the number of
    originals and the target size. Duplicates are detected on the
    (``instruction``, ``input``) pair, so examples that share an instruction
    but differ in input are kept. Examples without an instruction are kept
    but not paraphrased. The result is shuffled, and may be smaller than the
    target when too few distinct variants exist.

    Args:
        examples: Original examples
        augmentation_factor: Target size as multiple of original

    Returns:
        A shuffled list of at most ``int(len(examples) * augmentation_factor)``
        examples.
    """
    target_size = int(len(examples) * augmentation_factor)

    def prompt_key(ex: dict) -> tuple:
        return (ex.get("instruction", ""), ex.get("input", ""))

    seen = set()
    originals = []
    augmentable = []
    for ex in examples:
        if not ex.get("instruction"):
            # No instruction to key on or paraphrase; keep untouched.
            originals.append(ex)
            continue
        key = prompt_key(ex)
        if key not in seen:
            seen.add(key)
            originals.append(ex)
            augmentable.append(ex)

    # Collect variants only after every original is registered, so a variant
    # can never displace an original with the same prompt.
    extras = []
    for ex in augmentable:
        # augment_instruction returns the original first, then its variants.
        for variant in augment_instruction(ex)[1:]:
            key = prompt_key(variant)
            if key not in seen:
                seen.add(key)
                extras.append(variant)

    random.shuffle(extras)
    room_for_extras = max(0, target_size - len(originals))
    combined = originals + extras[:room_for_extras]

    # Final shuffle mixes originals with variants; the trim only removes
    # anything when the target is below the number of originals.
    random.shuffle(combined)
    return combined[:target_size]
|
|
454
|
+
```
|
|
455
|
+
|
|
456
|
+
## Loading and Saving Datasets
|
|
457
|
+
|
|
458
|
+
```python
|
|
459
|
+
from datasets import load_dataset, Dataset
|
|
460
|
+
import json
|
|
461
|
+
|
|
462
|
+
def load_custom_dataset(path: str) -> Dataset:
    """Load the ``train`` split of a dataset from a file or dataset id.

    The loader is chosen from the file extension:

    * ``.jsonl``   -> ``load_dataset("json", ...)``
    * ``.json``    -> parsed with ``json.load``; must be a top-level array
    * ``.parquet`` -> ``load_dataset("parquet", ...)``
    * anything else is handed to ``load_dataset`` unchanged (Hub lookup)

    Args:
        path: Local file path or dataset identifier.

    Returns:
        The loaded ``Dataset``.

    Raises:
        ValueError: If a ``.json`` file does not contain a top-level list.
    """
    if path.endswith(".jsonl"):
        return load_dataset("json", data_files=path, split="train")

    if path.endswith(".json"):
        # Read as UTF-8 explicitly instead of relying on the platform locale.
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        # Dataset.from_list needs a list of records; fail early with a clear
        # message when the file holds a single object or a scalar instead.
        if not isinstance(data, list):
            raise ValueError(
                f"Expected a top-level JSON array in {path!r}, "
                f"got {type(data).__name__}"
            )
        return Dataset.from_list(data)

    if path.endswith(".parquet"):
        return load_dataset("parquet", data_files=path, split="train")

    # Try loading from Hugging Face Hub
    return load_dataset(path, split="train")
|
|
475
|
+
|
|
476
|
+
def save_dataset(dataset: Dataset, path: str, format: str = "jsonl"):
    """Write ``dataset`` to ``path`` in the requested format.

    Args:
        dataset: Dataset to persist. The ``"json"`` branch only iterates it;
            the other branches call its ``to_json`` / ``to_parquet`` methods.
        path: Destination file path.
        format: One of ``"jsonl"``, ``"parquet"`` or ``"json"``.

    Raises:
        ValueError: If ``format`` is not one of the supported values.
    """
    if format == "jsonl":
        dataset.to_json(path, orient="records", lines=True)
    elif format == "parquet":
        dataset.to_parquet(path)
    elif format == "json":
        # Explicit UTF-8 plus ensure_ascii=False keeps non-ASCII training text
        # readable in the file and independent of the platform locale.
        with open(path, "w", encoding="utf-8") as f:
            json.dump(list(dataset), f, indent=2, ensure_ascii=False)
    else:
        # Previously an unknown format fell through and wrote nothing.
        raise ValueError(
            f"Unsupported format {format!r}; expected 'jsonl', 'parquet' or 'json'"
        )
|
|
485
|
+
```
|
|
486
|
+
|
|
487
|
+
## Dataset Size Guidelines
|
|
488
|
+
|
|
489
|
+
| Task Type | Minimum Examples | Recommended | Notes |
|
|
490
|
+
|-----------|------------------|-------------|-------|
|
|
491
|
+
| Classification | 100 per class | 500+ per class | Balance classes |
|
|
492
|
+
| Instruction Following | 1,000 | 5,000-10,000 | Diverse instructions |
|
|
493
|
+
| Domain Adaptation | 5,000 | 20,000+ | High-quality domain data |
|
|
494
|
+
| Code Generation | 2,000 | 10,000+ | Include edge cases |
|
|
495
|
+
| Multi-turn Chat | 1,000 conversations | 5,000+ | Varied conversation lengths |
|
|
496
|
+
|
|
497
|
+
## Quick Reference
|
|
498
|
+
|
|
499
|
+
```python
|
|
500
|
+
# Complete dataset preparation pipeline
|
|
501
|
+
from datasets import Dataset
|
|
502
|
+
|
|
503
|
+
def prepare_dataset(raw_data_path: str, output_path: str, tokenizer) -> "DatasetDict":
    """Run the full preparation pipeline and write the resulting splits.

    Steps: load -> validate (report only) -> quality filter -> deduplicate
    -> train/validation split -> save as ``train.jsonl`` / ``val.jsonl``
    inside ``output_path``.

    Args:
        raw_data_path: Path or dataset id passed to ``load_custom_dataset``.
        output_path: Directory that receives the two JSONL files; created if
            it does not exist.
        tokenizer: Tokenizer forwarded to validation and quality filtering.

    Returns:
        The ``DatasetDict`` returned by ``create_stratified_split``, holding
        the ``"train"`` and ``"validation"`` splits.
    """
    import os  # local import: this snippet's import block only brings in Dataset

    # 1. Load raw data and materialize it once so every later step sees the
    #    same list of plain dict records.
    examples = list(load_custom_dataset(raw_data_path))

    # 2. Validate (informational only; nothing is removed here)
    stats, warnings = validate_dataset(examples, tokenizer)
    print(f"Dataset stats: {stats}")
    if warnings:  # Show first 10 warnings
        print(f"Sample warnings: {warnings[:10]}")

    # 3. Filter for quality
    quality_filter = create_quality_filter()
    examples = [ex for ex in examples if quality_filter(ex, tokenizer)]
    print(f"After quality filter: {len(examples)}")

    # 4. Deduplicate
    examples = deduplicate_dataset(examples)
    print(f"After deduplication: {len(examples)}")

    # 5. Split
    dataset = create_stratified_split(examples, test_size=0.1)

    # 6. Save; make sure the target directory exists before writing into it.
    os.makedirs(output_path, exist_ok=True)
    dataset["train"].to_json(f"{output_path}/train.jsonl", lines=True)
    dataset["validation"].to_json(f"{output_path}/val.jsonl", lines=True)

    return dataset
|
|
531
|
+
|
|
532
|
+
# Usage
|
|
533
|
+
dataset = prepare_dataset("raw_data.jsonl", "./processed", tokenizer)
|
|
534
|
+
```
|
|
535
|
+
|
|
536
|
+
## Related References
|
|
537
|
+
|
|
538
|
+
- `lora-peft.md` - Training configuration
|
|
539
|
+
- `evaluation-metrics.md` - Measuring dataset quality impact
|
|
540
|
+
- `hyperparameter-tuning.md` - Adjusting training for dataset size
|