aigroup-workflow 2.2.1 → 2.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/commands/fix-build.md +10 -5
- package/.claude/commands/init-project.md +13 -8
- package/.claude/commands/plan.md +15 -8
- package/.claude/commands/review.md +12 -6
- package/.claude/commands/tdd.md +11 -5
- package/.claude/commands/workflow-start.md +20 -11
- package/.claude/settings.json +28 -0
- package/.codex/agents/architect.toml +207 -0
- package/.codex/agents/build-error-resolver.toml +110 -0
- package/.codex/agents/code-reviewer.toml +233 -0
- package/.codex/agents/doc-updater.toml +103 -0
- package/.codex/agents/e2e-runner.toml +103 -0
- package/.codex/agents/get-current-datetime.toml +23 -0
- package/.codex/agents/init-architect.toml +181 -0
- package/.codex/agents/planner.toml +208 -0
- package/.codex/agents/refactor-cleaner.toml +81 -0
- package/.codex/agents/rust-reviewer.toml +90 -0
- package/.codex/agents/security-reviewer.toml +104 -0
- package/.codex/agents/tdd-guide.toml +87 -0
- package/AGENTS.md +2 -2
- package/CLAUDE.md +23 -1
- package/LICENSE +20 -20
- package/README.md +333 -333
- package/agents/a11y-architect.md +141 -141
- package/agents/architect.md +211 -211
- package/agents/build-error-resolver.md +114 -114
- package/agents/chief-of-staff.md +151 -151
- package/agents/code-architect.md +71 -71
- package/agents/code-explorer.md +69 -69
- package/agents/code-reviewer.md +237 -237
- package/agents/code-simplifier.md +47 -47
- package/agents/comment-analyzer.md +45 -45
- package/agents/conversation-analyzer.md +52 -52
- package/agents/cpp-build-resolver.md +90 -90
- package/agents/cpp-reviewer.md +72 -72
- package/agents/csharp-reviewer.md +101 -101
- package/agents/dart-build-resolver.md +201 -201
- package/agents/database-reviewer.md +91 -91
- package/agents/doc-updater.md +107 -107
- package/agents/docs-lookup.md +68 -68
- package/agents/e2e-runner.md +107 -107
- package/agents/flutter-reviewer.md +243 -243
- package/agents/gan-evaluator.md +209 -209
- package/agents/gan-generator.md +131 -131
- package/agents/gan-planner.md +99 -99
- package/agents/get-current-datetime.md +26 -26
- package/agents/go-build-resolver.md +94 -94
- package/agents/go-reviewer.md +76 -76
- package/agents/harness-optimizer.md +35 -35
- package/agents/healthcare-reviewer.md +83 -83
- package/agents/java-build-resolver.md +153 -153
- package/agents/java-reviewer.md +92 -92
- package/agents/kotlin-build-resolver.md +118 -118
- package/agents/kotlin-reviewer.md +159 -159
- package/agents/loop-operator.md +36 -36
- package/agents/opensource-forker.md +198 -198
- package/agents/opensource-packager.md +249 -249
- package/agents/opensource-sanitizer.md +188 -188
- package/agents/performance-optimizer.md +446 -446
- package/agents/planner.md +212 -212
- package/agents/pr-test-analyzer.md +45 -45
- package/agents/python-reviewer.md +98 -98
- package/agents/pytorch-build-resolver.md +120 -120
- package/agents/refactor-cleaner.md +85 -85
- package/agents/rust-build-resolver.md +148 -148
- package/agents/rust-reviewer.md +94 -94
- package/agents/security-reviewer.md +108 -108
- package/agents/seo-specialist.md +59 -59
- package/agents/silent-failure-hunter.md +50 -50
- package/agents/tdd-guide.md +91 -91
- package/agents/type-design-analyzer.md +41 -41
- package/agents/typescript-reviewer.md +112 -112
- package/cli/commands/update.mjs +1 -1
- package/cli/utils/scaffold.mjs +53 -0
- package/docs/rules/agents.md +166 -50
- package/docs/rules/cpp/coding-style.md +44 -44
- package/docs/rules/cpp/hooks.md +39 -39
- package/docs/rules/cpp/patterns.md +51 -51
- package/docs/rules/cpp/security.md +51 -51
- package/docs/rules/cpp/testing.md +44 -44
- package/docs/rules/csharp/coding-style.md +72 -72
- package/docs/rules/csharp/hooks.md +25 -25
- package/docs/rules/csharp/patterns.md +50 -50
- package/docs/rules/csharp/security.md +58 -58
- package/docs/rules/csharp/testing.md +46 -46
- package/docs/rules/dart/coding-style.md +159 -159
- package/docs/rules/dart/hooks.md +66 -66
- package/docs/rules/dart/patterns.md +261 -261
- package/docs/rules/dart/security.md +135 -135
- package/docs/rules/dart/testing.md +215 -215
- package/docs/rules/golang/coding-style.md +32 -32
- package/docs/rules/golang/hooks.md +17 -17
- package/docs/rules/golang/patterns.md +45 -45
- package/docs/rules/golang/security.md +34 -34
- package/docs/rules/golang/testing.md +31 -31
- package/docs/rules/java/coding-style.md +114 -114
- package/docs/rules/java/hooks.md +18 -18
- package/docs/rules/java/patterns.md +146 -146
- package/docs/rules/java/security.md +100 -100
- package/docs/rules/java/testing.md +131 -131
- package/docs/rules/kotlin/coding-style.md +86 -86
- package/docs/rules/kotlin/hooks.md +17 -17
- package/docs/rules/kotlin/patterns.md +146 -146
- package/docs/rules/kotlin/security.md +82 -82
- package/docs/rules/kotlin/testing.md +128 -128
- package/docs/rules/perl/coding-style.md +46 -46
- package/docs/rules/perl/hooks.md +22 -22
- package/docs/rules/perl/patterns.md +76 -76
- package/docs/rules/perl/security.md +69 -69
- package/docs/rules/perl/testing.md +54 -54
- package/docs/rules/php/coding-style.md +40 -40
- package/docs/rules/php/hooks.md +24 -24
- package/docs/rules/php/patterns.md +33 -33
- package/docs/rules/php/security.md +37 -37
- package/docs/rules/php/testing.md +39 -39
- package/docs/rules/python/coding-style.md +42 -42
- package/docs/rules/python/hooks.md +19 -19
- package/docs/rules/python/patterns.md +39 -39
- package/docs/rules/python/security.md +30 -30
- package/docs/rules/python/testing.md +38 -38
- package/docs/rules/rust/coding-style.md +151 -151
- package/docs/rules/rust/hooks.md +16 -16
- package/docs/rules/rust/patterns.md +168 -168
- package/docs/rules/rust/security.md +141 -141
- package/docs/rules/rust/testing.md +154 -154
- package/docs/rules/swift/coding-style.md +47 -47
- package/docs/rules/swift/hooks.md +20 -20
- package/docs/rules/swift/patterns.md +66 -66
- package/docs/rules/swift/security.md +33 -33
- package/docs/rules/swift/testing.md +45 -45
- package/docs/rules/typescript/coding-style.md +199 -199
- package/docs/rules/typescript/hooks.md +22 -22
- package/docs/rules/typescript/patterns.md +52 -52
- package/docs/rules/typescript/security.md +28 -28
- package/docs/rules/typescript/testing.md +18 -18
- package/docs/rules/web/coding-style.md +96 -96
- package/docs/rules/web/design-quality.md +62 -62
- package/docs/rules/web/hooks.md +120 -120
- package/docs/rules/web/patterns.md +79 -79
- package/docs/rules/web/performance.md +64 -64
- package/docs/rules/web/security.md +57 -57
- package/docs/rules/web/testing.md +55 -55
- package/docs/templates/README.md +36 -36
- package/docs/templates/ai-project-final.md +124 -124
- package/docs/templates/ai-project.md +105 -105
- package/docs/templates/api.md +157 -157
- package/docs/templates/bug.md +62 -62
- package/docs/templates/code-review.md +87 -87
- package/docs/templates/generic.md +116 -116
- package/docs/templates/implementation-plan.md +1 -1
- package/docs/templates/meeting.md +68 -68
- package/docs/templates/prd.md +98 -98
- package/docs/templates/ui.md +134 -134
- package/docs/workflow-pipeline.md +5 -5
- package/package.json +40 -39
- package/skills/SUPERPOWERS-LICENSE +21 -21
- package/skills/ai-ml/fine-tuning-expert/SKILL.md +162 -162
- package/skills/ai-ml/fine-tuning-expert/references/dataset-preparation.md +540 -540
- package/skills/ai-ml/fine-tuning-expert/references/deployment-optimization.md +673 -673
- package/skills/ai-ml/fine-tuning-expert/references/evaluation-metrics.md +597 -597
- package/skills/ai-ml/fine-tuning-expert/references/hyperparameter-tuning.md +565 -565
- package/skills/ai-ml/fine-tuning-expert/references/lora-peft.md +347 -347
- package/skills/ai-ml/ml-pipeline/SKILL.md +159 -159
- package/skills/ai-ml/ml-pipeline/references/experiment-tracking.md +833 -833
- package/skills/ai-ml/ml-pipeline/references/feature-engineering.md +631 -631
- package/skills/ai-ml/ml-pipeline/references/model-validation.md +978 -978
- package/skills/ai-ml/ml-pipeline/references/pipeline-orchestration.md +907 -907
- package/skills/ai-ml/ml-pipeline/references/training-pipelines.md +782 -782
- package/skills/ai-ml/rag-architect/SKILL.md +194 -194
- package/skills/ai-ml/rag-architect/references/chunking-strategies.md +878 -878
- package/skills/ai-ml/rag-architect/references/embedding-models.md +561 -561
- package/skills/ai-ml/rag-architect/references/rag-evaluation.md +833 -833
- package/skills/ai-ml/rag-architect/references/retrieval-optimization.md +795 -795
- package/skills/ai-ml/rag-architect/references/vector-databases.md +589 -589
- package/skills/ai-ml/spark-engineer/SKILL.md +148 -148
- package/skills/ai-ml/spark-engineer/references/partitioning-caching.md +543 -543
- package/skills/ai-ml/spark-engineer/references/performance-tuning.md +544 -544
- package/skills/ai-ml/spark-engineer/references/rdd-operations.md +599 -599
- package/skills/ai-ml/spark-engineer/references/spark-sql-dataframes.md +474 -474
- package/skills/ai-ml/spark-engineer/references/streaming-patterns.md +786 -786
- package/skills/backend/api-designer/SKILL.md +217 -217
- package/skills/backend/api-designer/references/error-handling.md +541 -541
- package/skills/backend/api-designer/references/openapi.md +824 -824
- package/skills/backend/api-designer/references/pagination.md +494 -494
- package/skills/backend/api-designer/references/rest-patterns.md +335 -335
- package/skills/backend/api-designer/references/versioning.md +391 -391
- package/skills/backend/architecture-designer/SKILL.md +117 -117
- package/skills/backend/architecture-designer/references/adr-template.md +116 -116
- package/skills/backend/architecture-designer/references/architecture-patterns.md +111 -111
- package/skills/backend/architecture-designer/references/database-selection.md +102 -102
- package/skills/backend/architecture-designer/references/nfr-checklist.md +112 -112
- package/skills/backend/architecture-designer/references/system-design.md +100 -100
- package/skills/backend/code-documenter/SKILL.md +147 -147
- package/skills/backend/code-documenter/references/api-docs-fastapi-django.md +166 -166
- package/skills/backend/code-documenter/references/api-docs-nestjs-express.md +220 -220
- package/skills/backend/code-documenter/references/coverage-reports.md +125 -125
- package/skills/backend/code-documenter/references/documentation-systems.md +333 -333
- package/skills/backend/code-documenter/references/interactive-api-docs.md +531 -531
- package/skills/backend/code-documenter/references/python-docstrings.md +121 -121
- package/skills/backend/code-documenter/references/typescript-jsdoc.md +145 -145
- package/skills/backend/code-documenter/references/user-guides-tutorials.md +530 -530
- package/skills/backend/debugging-wizard/SKILL.md +105 -105
- package/skills/backend/debugging-wizard/references/common-patterns.md +132 -132
- package/skills/backend/debugging-wizard/references/debugging-tools.md +140 -140
- package/skills/backend/debugging-wizard/references/quick-fixes.md +177 -177
- package/skills/backend/debugging-wizard/references/strategies.md +142 -142
- package/skills/backend/debugging-wizard/references/systematic-debugging.md +367 -367
- package/skills/backend/feature-forge/SKILL.md +98 -98
- package/skills/backend/feature-forge/references/acceptance-criteria.md +104 -104
- package/skills/backend/feature-forge/references/ears-syntax.md +99 -99
- package/skills/backend/feature-forge/references/interview-questions.md +150 -150
- package/skills/backend/feature-forge/references/pre-discovery-subagents.md +54 -54
- package/skills/backend/feature-forge/references/specification-template.md +103 -103
- package/skills/backend/fullstack-guardian/SKILL.md +105 -105
- package/skills/backend/fullstack-guardian/references/api-design-standards.md +307 -307
- package/skills/backend/fullstack-guardian/references/architecture-decisions.md +350 -350
- package/skills/backend/fullstack-guardian/references/backend-patterns.md +237 -237
- package/skills/backend/fullstack-guardian/references/common-patterns.md +134 -134
- package/skills/backend/fullstack-guardian/references/deliverables-checklist.md +354 -354
- package/skills/backend/fullstack-guardian/references/design-template.md +91 -91
- package/skills/backend/fullstack-guardian/references/error-handling.md +135 -135
- package/skills/backend/fullstack-guardian/references/frontend-patterns.md +340 -340
- package/skills/backend/fullstack-guardian/references/integration-patterns.md +333 -333
- package/skills/backend/fullstack-guardian/references/security-checklist.md +106 -106
- package/skills/backend/graphql-architect/SKILL.md +146 -146
- package/skills/backend/graphql-architect/references/federation.md +418 -418
- package/skills/backend/graphql-architect/references/migration-from-rest.md +1141 -1141
- package/skills/backend/graphql-architect/references/resolvers.md +425 -425
- package/skills/backend/graphql-architect/references/schema-design.md +393 -393
- package/skills/backend/graphql-architect/references/security.md +569 -569
- package/skills/backend/graphql-architect/references/subscriptions.md +510 -510
- package/skills/backend/legacy-modernizer/SKILL.md +137 -137
- package/skills/backend/legacy-modernizer/references/legacy-testing.md +381 -381
- package/skills/backend/legacy-modernizer/references/migration-strategies.md +423 -423
- package/skills/backend/legacy-modernizer/references/refactoring-patterns.md +395 -395
- package/skills/backend/legacy-modernizer/references/strangler-fig-pattern.md +281 -281
- package/skills/backend/legacy-modernizer/references/system-assessment.md +487 -487
- package/skills/backend/microservices-architect/SKILL.md +164 -164
- package/skills/backend/microservices-architect/references/communication.md +499 -499
- package/skills/backend/microservices-architect/references/data.md +721 -721
- package/skills/backend/microservices-architect/references/decomposition.md +344 -344
- package/skills/backend/microservices-architect/references/observability.md +805 -805
- package/skills/backend/microservices-architect/references/patterns.md +603 -603
- package/skills/database/database-optimizer/SKILL.md +147 -147
- package/skills/database/database-optimizer/references/index-strategies.md +331 -331
- package/skills/database/database-optimizer/references/monitoring-analysis.md +501 -501
- package/skills/database/database-optimizer/references/mysql-tuning.md +452 -452
- package/skills/database/database-optimizer/references/postgresql-tuning.md +413 -413
- package/skills/database/database-optimizer/references/query-optimization.md +251 -251
- package/skills/database/postgres-pro/SKILL.md +152 -152
- package/skills/database/postgres-pro/references/extensions.md +404 -404
- package/skills/database/postgres-pro/references/jsonb.md +321 -321
- package/skills/database/postgres-pro/references/maintenance.md +481 -481
- package/skills/database/postgres-pro/references/performance.md +265 -265
- package/skills/database/postgres-pro/references/replication.md +446 -446
- package/skills/database/sql-pro/SKILL.md +129 -129
- package/skills/database/sql-pro/references/database-design.md +402 -402
- package/skills/database/sql-pro/references/dialect-differences.md +419 -419
- package/skills/database/sql-pro/references/optimization.md +384 -384
- package/skills/database/sql-pro/references/query-patterns.md +285 -285
- package/skills/database/sql-pro/references/window-functions.md +328 -328
- package/skills/dotnet/csharp-developer/SKILL.md +125 -125
- package/skills/dotnet/csharp-developer/references/aspnet-core.md +394 -394
- package/skills/dotnet/csharp-developer/references/blazor.md +553 -553
- package/skills/dotnet/csharp-developer/references/entity-framework.md +409 -409
- package/skills/dotnet/csharp-developer/references/modern-csharp.md +248 -248
- package/skills/dotnet/csharp-developer/references/performance.md +498 -498
- package/skills/dotnet/dotnet-core-expert/SKILL.md +138 -138
- package/skills/dotnet/dotnet-core-expert/references/authentication.md +546 -546
- package/skills/dotnet/dotnet-core-expert/references/clean-architecture.md +455 -455
- package/skills/dotnet/dotnet-core-expert/references/cloud-native.md +548 -548
- package/skills/dotnet/dotnet-core-expert/references/entity-framework.md +440 -440
- package/skills/dotnet/dotnet-core-expert/references/minimal-apis.md +319 -319
- package/skills/frontend/angular-architect/SKILL.md +152 -152
- package/skills/frontend/angular-architect/references/components.md +297 -297
- package/skills/frontend/angular-architect/references/ngrx.md +401 -401
- package/skills/frontend/angular-architect/references/routing.md +361 -361
- package/skills/frontend/angular-architect/references/rxjs.md +319 -319
- package/skills/frontend/angular-architect/references/testing.md +405 -405
- package/skills/frontend/design-commands/design.md +91 -91
- package/skills/frontend/design-commands/handoff.md +97 -97
- package/skills/frontend/design-commands/prototype.md +120 -120
- package/skills/frontend/design-commands/spec.md +160 -160
- package/skills/frontend/design-commands/style.md +78 -78
- package/skills/frontend/flutter-expert/SKILL.md +138 -138
- package/skills/frontend/flutter-expert/references/bloc-state.md +259 -259
- package/skills/frontend/flutter-expert/references/gorouter-navigation.md +119 -119
- package/skills/frontend/flutter-expert/references/performance.md +99 -99
- package/skills/frontend/flutter-expert/references/project-structure.md +118 -118
- package/skills/frontend/flutter-expert/references/riverpod-state.md +130 -130
- package/skills/frontend/flutter-expert/references/widget-patterns.md +123 -123
- package/skills/frontend/nextjs-developer/SKILL.md +143 -143
- package/skills/frontend/nextjs-developer/references/app-router.md +311 -311
- package/skills/frontend/nextjs-developer/references/data-fetching.md +482 -482
- package/skills/frontend/nextjs-developer/references/deployment.md +545 -545
- package/skills/frontend/nextjs-developer/references/server-actions.md +462 -462
- package/skills/frontend/nextjs-developer/references/server-components.md +384 -384
- package/skills/frontend/react-expert/SKILL.md +149 -149
- package/skills/frontend/react-expert/references/hooks-patterns.md +162 -162
- package/skills/frontend/react-expert/references/migration-class-to-modern.md +1119 -1119
- package/skills/frontend/react-expert/references/performance.md +168 -168
- package/skills/frontend/react-expert/references/react-19-features.md +174 -174
- package/skills/frontend/react-expert/references/server-components.md +143 -143
- package/skills/frontend/react-expert/references/state-management.md +171 -171
- package/skills/frontend/react-expert/references/testing-react.md +174 -174
- package/skills/frontend/react-native-expert/SKILL.md +185 -185
- package/skills/frontend/react-native-expert/references/expo-router.md +187 -187
- package/skills/frontend/react-native-expert/references/list-optimization.md +204 -204
- package/skills/frontend/react-native-expert/references/platform-handling.md +188 -188
- package/skills/frontend/react-native-expert/references/project-structure.md +171 -171
- package/skills/frontend/react-native-expert/references/storage-hooks.md +173 -173
- package/skills/frontend/senior-frontend/SKILL.md +477 -477
- package/skills/frontend/senior-frontend/references/frontend_best_practices.md +806 -806
- package/skills/frontend/senior-frontend/references/nextjs_optimization_guide.md +724 -724
- package/skills/frontend/senior-frontend/references/react_patterns.md +746 -746
- package/skills/frontend/senior-frontend/scripts/bundle_analyzer.py +407 -407
- package/skills/frontend/senior-frontend/scripts/component_generator.py +329 -329
- package/skills/frontend/senior-frontend/scripts/frontend_scaffolder.py +1005 -1005
- package/skills/frontend/ui-ux-pro-max/SKILL.md +386 -386
- package/skills/frontend/ui-ux-pro-max/data/charts.csv +26 -26
- package/skills/frontend/ui-ux-pro-max/data/colors.csv +97 -97
- package/skills/frontend/ui-ux-pro-max/data/icons.csv +101 -101
- package/skills/frontend/ui-ux-pro-max/data/landing.csv +31 -31
- package/skills/frontend/ui-ux-pro-max/data/products.csv +96 -96
- package/skills/frontend/ui-ux-pro-max/data/react-performance.csv +45 -45
- package/skills/frontend/ui-ux-pro-max/data/stacks/astro.csv +54 -54
- package/skills/frontend/ui-ux-pro-max/data/stacks/flutter.csv +53 -53
- package/skills/frontend/ui-ux-pro-max/data/stacks/html-tailwind.csv +56 -56
- package/skills/frontend/ui-ux-pro-max/data/stacks/jetpack-compose.csv +53 -53
- package/skills/frontend/ui-ux-pro-max/data/stacks/nextjs.csv +53 -53
- package/skills/frontend/ui-ux-pro-max/data/stacks/nuxt-ui.csv +51 -51
- package/skills/frontend/ui-ux-pro-max/data/stacks/nuxtjs.csv +59 -59
- package/skills/frontend/ui-ux-pro-max/data/stacks/react-native.csv +52 -52
- package/skills/frontend/ui-ux-pro-max/data/stacks/react.csv +54 -54
- package/skills/frontend/ui-ux-pro-max/data/stacks/shadcn.csv +61 -61
- package/skills/frontend/ui-ux-pro-max/data/stacks/svelte.csv +54 -54
- package/skills/frontend/ui-ux-pro-max/data/stacks/swiftui.csv +51 -51
- package/skills/frontend/ui-ux-pro-max/data/stacks/vue.csv +50 -50
- package/skills/frontend/ui-ux-pro-max/data/styles.csv +68 -68
- package/skills/frontend/ui-ux-pro-max/data/typography.csv +57 -57
- package/skills/frontend/ui-ux-pro-max/data/ui-reasoning.csv +101 -101
- package/skills/frontend/ui-ux-pro-max/data/ux-guidelines.csv +99 -99
- package/skills/frontend/ui-ux-pro-max/data/web-interface.csv +31 -31
- package/skills/frontend/ui-ux-pro-max/scripts/core.py +253 -253
- package/skills/frontend/ui-ux-pro-max/scripts/design_system.py +1067 -1067
- package/skills/frontend/ui-ux-pro-max/scripts/search.py +114 -114
- package/skills/frontend/vue-expert/SKILL.md +98 -98
- package/skills/frontend/vue-expert/references/build-tooling.md +480 -480
- package/skills/frontend/vue-expert/references/components.md +448 -448
- package/skills/frontend/vue-expert/references/composition-api.md +299 -299
- package/skills/frontend/vue-expert/references/mobile-hybrid.md +636 -636
- package/skills/frontend/vue-expert/references/nuxt.md +669 -669
- package/skills/frontend/vue-expert/references/state-management.md +449 -449
- package/skills/frontend/vue-expert/references/typescript.md +584 -584
- package/skills/frontend/vue-expert-js/SKILL.md +167 -167
- package/skills/frontend/vue-expert-js/references/component-architecture.md +219 -219
- package/skills/frontend/vue-expert-js/references/composables-patterns.md +183 -183
- package/skills/frontend/vue-expert-js/references/jsdoc-typing.md +535 -535
- package/skills/frontend/vue-expert-js/references/state-management.md +249 -249
- package/skills/frontend/vue-expert-js/references/testing-patterns.md +237 -237
- package/skills/go-rust-cpp/cpp-pro/SKILL.md +115 -115
- package/skills/go-rust-cpp/cpp-pro/references/build-tooling.md +440 -440
- package/skills/go-rust-cpp/cpp-pro/references/concurrency.md +437 -437
- package/skills/go-rust-cpp/cpp-pro/references/memory-performance.md +397 -397
- package/skills/go-rust-cpp/cpp-pro/references/modern-cpp.md +304 -304
- package/skills/go-rust-cpp/cpp-pro/references/templates.md +357 -357
- package/skills/go-rust-cpp/golang-pro/SKILL.md +122 -122
- package/skills/go-rust-cpp/golang-pro/references/concurrency.md +329 -329
- package/skills/go-rust-cpp/golang-pro/references/generics.md +442 -442
- package/skills/go-rust-cpp/golang-pro/references/interfaces.md +432 -432
- package/skills/go-rust-cpp/golang-pro/references/project-structure.md +477 -477
- package/skills/go-rust-cpp/golang-pro/references/testing.md +451 -451
- package/skills/go-rust-cpp/rust-engineer/SKILL.md +167 -167
- package/skills/go-rust-cpp/rust-engineer/references/async.md +458 -458
- package/skills/go-rust-cpp/rust-engineer/references/error-handling.md +334 -334
- package/skills/go-rust-cpp/rust-engineer/references/ownership.md +278 -278
- package/skills/go-rust-cpp/rust-engineer/references/testing.md +470 -470
- package/skills/go-rust-cpp/rust-engineer/references/traits.md +413 -413
- package/skills/infra/cli-developer/SKILL.md +113 -113
- package/skills/infra/cli-developer/references/design-patterns.md +221 -221
- package/skills/infra/cli-developer/references/go-cli.md +540 -540
- package/skills/infra/cli-developer/references/node-cli.md +383 -383
- package/skills/infra/cli-developer/references/python-cli.md +422 -422
- package/skills/infra/cli-developer/references/ux-patterns.md +448 -448
- package/skills/infra/cloud-architect/SKILL.md +216 -216
- package/skills/infra/cloud-architect/references/aws.md +394 -394
- package/skills/infra/cloud-architect/references/azure.md +562 -562
- package/skills/infra/cloud-architect/references/cost.md +582 -582
- package/skills/infra/cloud-architect/references/gcp.md +633 -633
- package/skills/infra/cloud-architect/references/multi-cloud.md +483 -483
- package/skills/infra/devops-engineer/SKILL.md +144 -144
- package/skills/infra/devops-engineer/references/deployment-strategies.md +241 -241
- package/skills/infra/devops-engineer/references/docker-patterns.md +113 -113
- package/skills/infra/devops-engineer/references/github-actions.md +139 -139
- package/skills/infra/devops-engineer/references/incident-response.md +331 -331
- package/skills/infra/devops-engineer/references/kubernetes.md +154 -154
- package/skills/infra/devops-engineer/references/platform-engineering.md +417 -417
- package/skills/infra/devops-engineer/references/release-automation.md +527 -527
- package/skills/infra/devops-engineer/references/terraform-iac.md +141 -141
- package/skills/infra/kubernetes-specialist/SKILL.md +241 -241
- package/skills/infra/kubernetes-specialist/references/configuration.md +452 -452
- package/skills/infra/kubernetes-specialist/references/cost-optimization.md +458 -458
- package/skills/infra/kubernetes-specialist/references/custom-operators.md +563 -563
- package/skills/infra/kubernetes-specialist/references/gitops.md +530 -530
- package/skills/infra/kubernetes-specialist/references/helm-charts.md +912 -912
- package/skills/infra/kubernetes-specialist/references/multi-cluster.md +507 -507
- package/skills/infra/kubernetes-specialist/references/networking.md +447 -447
- package/skills/infra/kubernetes-specialist/references/service-mesh.md +459 -459
- package/skills/infra/kubernetes-specialist/references/storage.md +535 -535
- package/skills/infra/kubernetes-specialist/references/troubleshooting.md +414 -414
- package/skills/infra/kubernetes-specialist/references/workloads.md +377 -377
- package/skills/infra/mcp-developer/SKILL.md +143 -143
- package/skills/infra/mcp-developer/references/protocol.md +244 -244
- package/skills/infra/mcp-developer/references/python-sdk.md +367 -367
- package/skills/infra/mcp-developer/references/resources.md +554 -554
- package/skills/infra/mcp-developer/references/tools.md +480 -480
- package/skills/infra/mcp-developer/references/typescript-sdk.md +350 -350
- package/skills/infra/monitoring-expert/SKILL.md +176 -176
- package/skills/infra/monitoring-expert/references/alerting-rules.md +141 -141
- package/skills/infra/monitoring-expert/references/application-profiling.md +331 -331
- package/skills/infra/monitoring-expert/references/capacity-planning.md +344 -344
- package/skills/infra/monitoring-expert/references/dashboards.md +126 -126
- package/skills/infra/monitoring-expert/references/opentelemetry.md +123 -123
- package/skills/infra/monitoring-expert/references/performance-testing.md +269 -269
- package/skills/infra/monitoring-expert/references/prometheus-metrics.md +136 -136
- package/skills/infra/monitoring-expert/references/structured-logging.md +142 -142
- package/skills/infra/sre-engineer/SKILL.md +181 -181
- package/skills/infra/sre-engineer/references/automation-toil.md +492 -492
- package/skills/infra/sre-engineer/references/error-budget-policy.md +334 -334
- package/skills/infra/sre-engineer/references/incident-chaos.md +576 -576
- package/skills/infra/sre-engineer/references/monitoring-alerting.md +424 -424
- package/skills/infra/sre-engineer/references/slo-sli-management.md +238 -238
- package/skills/infra/terraform-engineer/SKILL.md +143 -143
- package/skills/infra/terraform-engineer/references/best-practices.md +583 -583
- package/skills/infra/terraform-engineer/references/module-patterns.md +297 -297
- package/skills/infra/terraform-engineer/references/providers.md +452 -452
- package/skills/infra/terraform-engineer/references/state-management.md +371 -371
- package/skills/infra/terraform-engineer/references/testing.md +486 -486
- package/skills/infra/websocket-engineer/SKILL.md +168 -168
- package/skills/infra/websocket-engineer/references/alternatives.md +391 -391
- package/skills/infra/websocket-engineer/references/patterns.md +400 -400
- package/skills/infra/websocket-engineer/references/protocol.md +195 -195
- package/skills/infra/websocket-engineer/references/scaling.md +333 -333
- package/skills/infra/websocket-engineer/references/security.md +474 -474
- package/skills/java/java-architect/SKILL.md +132 -132
- package/skills/java/java-architect/references/jpa-optimization.md +393 -393
- package/skills/java/java-architect/references/reactive-webflux.md +356 -356
- package/skills/java/java-architect/references/spring-boot-setup.md +269 -269
- package/skills/java/java-architect/references/spring-security.md +445 -445
- package/skills/java/java-architect/references/testing-patterns.md +500 -500
- package/skills/java/kotlin-specialist/SKILL.md +147 -147
- package/skills/java/kotlin-specialist/references/android-compose.md +419 -419
- package/skills/java/kotlin-specialist/references/coroutines-flow.md +276 -276
- package/skills/java/kotlin-specialist/references/dsl-idioms.md +421 -421
- package/skills/java/kotlin-specialist/references/ktor-server.md +426 -426
- package/skills/java/kotlin-specialist/references/multiplatform-kmp.md +380 -380
- package/skills/java/spring-boot-engineer/SKILL.md +195 -195
- package/skills/java/spring-boot-engineer/references/cloud.md +498 -498
- package/skills/java/spring-boot-engineer/references/data.md +381 -381
- package/skills/java/spring-boot-engineer/references/security.md +459 -459
- package/skills/java/spring-boot-engineer/references/testing.md +545 -545
- package/skills/java/spring-boot-engineer/references/web.md +295 -295
- package/skills/javascript/javascript-pro/SKILL.md +132 -132
- package/skills/javascript/javascript-pro/references/async-patterns.md +334 -334
- package/skills/javascript/javascript-pro/references/browser-apis.md +398 -398
- package/skills/javascript/javascript-pro/references/modern-syntax.md +272 -272
- package/skills/javascript/javascript-pro/references/modules.md +357 -357
- package/skills/javascript/javascript-pro/references/node-essentials.md +471 -471
- package/skills/javascript/nestjs-expert/SKILL.md +206 -206
- package/skills/javascript/nestjs-expert/references/authentication.md +166 -166
- package/skills/javascript/nestjs-expert/references/controllers-routing.md +111 -111
- package/skills/javascript/nestjs-expert/references/dtos-validation.md +153 -153
- package/skills/javascript/nestjs-expert/references/migration-from-express.md +1237 -1237
- package/skills/javascript/nestjs-expert/references/services-di.md +140 -140
- package/skills/javascript/nestjs-expert/references/testing-patterns.md +186 -186
- package/skills/javascript/typescript-pro/SKILL.md +145 -145
- package/skills/javascript/typescript-pro/references/advanced-types.md +259 -259
- package/skills/javascript/typescript-pro/references/configuration.md +445 -445
- package/skills/javascript/typescript-pro/references/patterns.md +484 -484
- package/skills/javascript/typescript-pro/references/type-guards.md +352 -352
- package/skills/javascript/typescript-pro/references/utility-types.md +329 -329
- package/skills/php/laravel-specialist/SKILL.md +262 -262
- package/skills/php/laravel-specialist/references/eloquent.md +351 -351
- package/skills/php/laravel-specialist/references/livewire.md +512 -512
- package/skills/php/laravel-specialist/references/queues.md +423 -423
- package/skills/php/laravel-specialist/references/routing.md +362 -362
- package/skills/php/laravel-specialist/references/testing.md +522 -522
- package/skills/php/php-pro/SKILL.md +206 -206
- package/skills/php/php-pro/references/async-patterns.md +412 -412
- package/skills/php/php-pro/references/laravel-patterns.md +377 -377
- package/skills/php/php-pro/references/modern-php-features.md +323 -323
- package/skills/php/php-pro/references/symfony-patterns.md +466 -466
- package/skills/php/php-pro/references/testing-quality.md +466 -466
- package/skills/product/competitive-analysis/SKILL.md +257 -257
- package/skills/product/meeting-notes/SKILL.md +266 -266
- package/skills/product/prd-template/SKILL.md +150 -150
- package/skills/product/stakeholder-update/SKILL.md +225 -225
- package/skills/product/user-research-synthesis/SKILL.md +235 -235
- package/skills/python/django-expert/SKILL.md +162 -162
- package/skills/python/django-expert/references/authentication.md +145 -145
- package/skills/python/django-expert/references/drf-serializers.md +148 -148
- package/skills/python/django-expert/references/models-orm.md +151 -151
- package/skills/python/django-expert/references/testing-django.md +204 -204
- package/skills/python/django-expert/references/viewsets-views.md +153 -153
- package/skills/python/fastapi-expert/SKILL.md +185 -185
- package/skills/python/fastapi-expert/references/async-sqlalchemy.md +146 -146
- package/skills/python/fastapi-expert/references/authentication.md +159 -159
- package/skills/python/fastapi-expert/references/endpoints-routing.md +142 -142
- package/skills/python/fastapi-expert/references/migration-from-django.md +996 -996
- package/skills/python/fastapi-expert/references/pydantic-v2.md +135 -135
- package/skills/python/fastapi-expert/references/testing-async.md +159 -159
- package/skills/python/pandas-pro/SKILL.md +178 -178
- package/skills/python/pandas-pro/references/aggregation-groupby.md +545 -545
- package/skills/python/pandas-pro/references/data-cleaning.md +500 -500
- package/skills/python/pandas-pro/references/dataframe-operations.md +420 -420
- package/skills/python/pandas-pro/references/merging-joining.md +596 -596
- package/skills/python/pandas-pro/references/performance-optimization.md +597 -597
- package/skills/python/python-pro/SKILL.md +177 -177
- package/skills/python/python-pro/references/async-patterns.md +356 -356
- package/skills/python/python-pro/references/packaging.md +460 -460
- package/skills/python/python-pro/references/standard-library.md +378 -378
- package/skills/python/python-pro/references/testing.md +404 -404
- package/skills/python/python-pro/references/type-system.md +290 -290
- package/skills/quality/chaos-engineer/SKILL.md +182 -182
- package/skills/quality/chaos-engineer/references/chaos-tools.md +511 -511
- package/skills/quality/chaos-engineer/references/experiment-design.md +229 -229
- package/skills/quality/chaos-engineer/references/game-days.md +434 -434
- package/skills/quality/chaos-engineer/references/infrastructure-chaos.md +348 -348
- package/skills/quality/chaos-engineer/references/kubernetes-chaos.md +432 -432
- package/skills/quality/code-reviewer/SKILL.md +119 -119
- package/skills/quality/code-reviewer/references/common-issues.md +142 -142
- package/skills/quality/code-reviewer/references/feedback-examples.md +144 -144
- package/skills/quality/code-reviewer/references/receiving-feedback.md +238 -238
- package/skills/quality/code-reviewer/references/report-template.md +109 -109
- package/skills/quality/code-reviewer/references/review-checklist.md +88 -88
- package/skills/quality/code-reviewer/references/spec-compliance-review.md +258 -258
- package/skills/quality/playwright-expert/SKILL.md +169 -169
- package/skills/quality/playwright-expert/references/api-mocking.md +140 -140
- package/skills/quality/playwright-expert/references/configuration.md +155 -155
- package/skills/quality/playwright-expert/references/debugging-flaky.md +150 -150
- package/skills/quality/playwright-expert/references/page-object-model.md +152 -152
- package/skills/quality/playwright-expert/references/selectors-locators.md +119 -119
- package/skills/quality/secure-code-guardian/SKILL.md +191 -191
- package/skills/quality/secure-code-guardian/references/authentication.md +136 -136
- package/skills/quality/secure-code-guardian/references/input-validation.md +146 -146
- package/skills/quality/secure-code-guardian/references/owasp-prevention.md +135 -135
- package/skills/quality/secure-code-guardian/references/security-headers.md +133 -133
- package/skills/quality/secure-code-guardian/references/xss-csrf.md +157 -157
- package/skills/quality/security-reviewer/SKILL.md +103 -103
- package/skills/quality/security-reviewer/references/infrastructure-security.md +268 -268
- package/skills/quality/security-reviewer/references/penetration-testing.md +268 -268
- package/skills/quality/security-reviewer/references/report-template.md +170 -170
- package/skills/quality/security-reviewer/references/sast-tools.md +117 -117
- package/skills/quality/security-reviewer/references/secret-scanning.md +125 -125
- package/skills/quality/security-reviewer/references/vulnerability-patterns.md +152 -152
- package/skills/quality/senior-qa/README.md +196 -196
- package/skills/quality/senior-qa/SKILL.md +399 -399
- package/skills/quality/senior-qa/references/qa_best_practices.md +964 -964
- package/skills/quality/senior-qa/references/test_automation_patterns.md +1009 -1009
- package/skills/quality/senior-qa/references/testing_strategies.md +649 -649
- package/skills/quality/senior-qa/scripts/coverage_analyzer.py +836 -836
- package/skills/quality/senior-qa/scripts/e2e_test_scaffolder.py +820 -820
- package/skills/quality/senior-qa/scripts/test_suite_generator.py +605 -605
- package/skills/quality/tdd-guide/HOW_TO_USE.md +313 -313
- package/skills/quality/tdd-guide/README.md +680 -680
- package/skills/quality/tdd-guide/SKILL.md +122 -122
- package/skills/quality/tdd-guide/assets/expected_output.json +77 -77
- package/skills/quality/tdd-guide/assets/sample_input_python.json +39 -39
- package/skills/quality/tdd-guide/assets/sample_input_typescript.json +36 -36
- package/skills/quality/tdd-guide/references/ci-integration.md +195 -195
- package/skills/quality/tdd-guide/references/framework-guide.md +206 -206
- package/skills/quality/tdd-guide/references/tdd-best-practices.md +128 -128
- package/skills/quality/tdd-guide/scripts/coverage_analyzer.py +434 -434
- package/skills/quality/tdd-guide/scripts/fixture_generator.py +440 -440
- package/skills/quality/tdd-guide/scripts/format_detector.py +384 -384
- package/skills/quality/tdd-guide/scripts/framework_adapter.py +428 -428
- package/skills/quality/tdd-guide/scripts/metrics_calculator.py +456 -456
- package/skills/quality/tdd-guide/scripts/output_formatter.py +354 -354
- package/skills/quality/tdd-guide/scripts/tdd_workflow.py +474 -474
- package/skills/quality/tdd-guide/scripts/test_generator.py +438 -438
- package/skills/quality/test-master/SKILL.md +94 -94
- package/skills/quality/test-master/references/automation-frameworks.md +294 -294
- package/skills/quality/test-master/references/e2e-testing.md +128 -128
- package/skills/quality/test-master/references/integration-testing.md +120 -120
- package/skills/quality/test-master/references/performance-testing.md +118 -118
- package/skills/quality/test-master/references/qa-methodology.md +247 -247
- package/skills/quality/test-master/references/security-testing.md +127 -127
- package/skills/quality/test-master/references/tdd-iron-laws.md +174 -174
- package/skills/quality/test-master/references/test-reports.md +104 -104
- package/skills/quality/test-master/references/testing-anti-patterns.md +231 -231
- package/skills/quality/test-master/references/unit-testing.md +113 -113
- package/skills/ruby/rails-expert/SKILL.md +154 -154
- package/skills/ruby/rails-expert/references/active-record.md +244 -244
- package/skills/ruby/rails-expert/references/api-development.md +401 -401
- package/skills/ruby/rails-expert/references/background-jobs.md +272 -272
- package/skills/ruby/rails-expert/references/hotwire-turbo.md +228 -228
- package/skills/ruby/rails-expert/references/rspec-testing.md +367 -367
- package/skills/swift/swift-expert/SKILL.md +163 -163
- package/skills/swift/swift-expert/references/async-concurrency.md +360 -360
- package/skills/swift/swift-expert/references/memory-performance.md +377 -377
- package/skills/swift/swift-expert/references/protocol-oriented.md +354 -354
- package/skills/swift/swift-expert/references/swiftui-patterns.md +291 -291
- package/skills/swift/swift-expert/references/testing-patterns.md +399 -399
- package/skills/workflow/brainstorming/SKILL.md +164 -164
- package/skills/workflow/brainstorming/scripts/frame-template.html +214 -214
- package/skills/workflow/brainstorming/scripts/helper.js +88 -88
- package/skills/workflow/brainstorming/scripts/server.cjs +354 -354
- package/skills/workflow/brainstorming/scripts/start-server.sh +148 -148
- package/skills/workflow/brainstorming/scripts/stop-server.sh +56 -56
- package/skills/workflow/brainstorming/spec-document-reviewer-prompt.md +49 -49
- package/skills/workflow/brainstorming/visual-companion.md +287 -287
- package/skills/workflow/documentation/SKILL.md +45 -45
- package/skills/workflow/entropy-management/SKILL.md +115 -115
- package/skills/workflow/executing-plans/SKILL.md +70 -70
- package/skills/workflow/finishing-a-development-branch/SKILL.md +200 -200
- package/skills/workflow/receiving-code-review/SKILL.md +213 -213
- package/skills/workflow/requesting-code-review/SKILL.md +105 -105
- package/skills/workflow/requesting-code-review/code-reviewer.md +146 -146
- package/skills/workflow/requirement-engineering/SKILL.md +111 -111
- package/skills/workflow/systematic-debugging/CREATION-LOG.md +119 -119
- package/skills/workflow/systematic-debugging/SKILL.md +296 -296
- package/skills/workflow/systematic-debugging/condition-based-waiting-example.ts +158 -158
- package/skills/workflow/systematic-debugging/condition-based-waiting.md +115 -115
- package/skills/workflow/systematic-debugging/defense-in-depth.md +122 -122
- package/skills/workflow/systematic-debugging/find-polluter.sh +63 -63
- package/skills/workflow/systematic-debugging/root-cause-tracing.md +169 -169
- package/skills/workflow/systematic-debugging/test-academic.md +14 -14
- package/skills/workflow/systematic-debugging/test-pressure-1.md +58 -58
- package/skills/workflow/systematic-debugging/test-pressure-2.md +68 -68
- package/skills/workflow/systematic-debugging/test-pressure-3.md +69 -69
- package/skills/workflow/using-git-worktrees/SKILL.md +218 -218
- package/skills/workflow/verification-before-completion/SKILL.md +139 -139
- package/skills/workflow/writing-plans/SKILL.md +151 -151
- package/skills/workflow/writing-plans/plan-document-reviewer-prompt.md +49 -49
- package/skills/workflow/writing-skills/SKILL.md +655 -655
- package/skills/workflow/writing-skills/anthropic-best-practices.md +1150 -1150
- package/skills/workflow/writing-skills/examples/CLAUDE_MD_TESTING.md +189 -189
- package/skills/workflow/writing-skills/persuasion-principles.md +187 -187
- package/skills/workflow/writing-skills/render-graphs.js +168 -168
- package/skills/workflow/writing-skills/testing-skills-with-subagents.md +384 -384
|
@@ -1,878 +1,878 @@
|
|
|
1
|
-
# Chunking Strategies
|
|
2
|
-
|
|
3
|
-
---
|
|
4
|
-
|
|
5
|
-
## Strategy Comparison Matrix
|
|
6
|
-
|
|
7
|
-
| Strategy | Best For | Chunk Quality | Implementation Complexity |
|
|
8
|
-
|----------|----------|---------------|---------------------------|
|
|
9
|
-
| **Fixed-size** | Simple documents, logs | Low-Medium | Simple |
|
|
10
|
-
| **Recursive character** | General text, articles | Medium | Simple |
|
|
11
|
-
| **Sentence-based** | Conversational, Q&A | Medium-High | Medium |
|
|
12
|
-
| **Semantic** | Technical docs, manuals | High | Medium |
|
|
13
|
-
| **Document-aware** | Structured content (MD, HTML) | High | Medium |
|
|
14
|
-
| **Agentic/Contextual** | Complex documents | Very High | Complex |
|
|
15
|
-
| **Late chunking** | Long-context embeddings | High | Medium |
|
|
16
|
-
|
|
17
|
-
---
|
|
18
|
-
|
|
19
|
-
## When to Use Each Strategy
|
|
20
|
-
|
|
21
|
-
### Fixed-Size Chunking
|
|
22
|
-
```
|
|
23
|
-
Best For:
|
|
24
|
-
- Log files and structured data
|
|
25
|
-
- Quick prototyping
|
|
26
|
-
- When content has no natural structure
|
|
27
|
-
- Baseline comparison
|
|
28
|
-
|
|
29
|
-
When to Avoid:
|
|
30
|
-
- Technical documentation
|
|
31
|
-
- Content with semantic units (paragraphs, sections)
|
|
32
|
-
- When context preservation matters
|
|
33
|
-
```
|
|
34
|
-
|
|
35
|
-
### Recursive Character Splitting
|
|
36
|
-
```
|
|
37
|
-
Best For:
|
|
38
|
-
- General articles and blog posts
|
|
39
|
-
- Mixed content types
|
|
40
|
-
- Default starting point for most RAG
|
|
41
|
-
- LangChain/LlamaIndex default
|
|
42
|
-
|
|
43
|
-
When to Avoid:
|
|
44
|
-
- Highly structured documents
|
|
45
|
-
- Code-heavy content
|
|
46
|
-
- Tables and lists
|
|
47
|
-
```
|
|
48
|
-
|
|
49
|
-
### Semantic Chunking
|
|
50
|
-
```
|
|
51
|
-
Best For:
|
|
52
|
-
- Technical documentation
|
|
53
|
-
- Research papers
|
|
54
|
-
- Content with natural topic boundaries
|
|
55
|
-
- When retrieval precision is critical
|
|
56
|
-
|
|
57
|
-
When to Avoid:
|
|
58
|
-
- Real-time ingestion (slower)
|
|
59
|
-
- Very short documents
|
|
60
|
-
- Cost-sensitive pipelines (requires embeddings)
|
|
61
|
-
```
|
|
62
|
-
|
|
63
|
-
### Document-Aware Chunking
|
|
64
|
-
```
|
|
65
|
-
Best For:
|
|
66
|
-
- Markdown documentation
|
|
67
|
-
- HTML pages
|
|
68
|
-
- LaTeX papers
|
|
69
|
-
- Code files
|
|
70
|
-
|
|
71
|
-
When to Avoid:
|
|
72
|
-
- Plain text without structure
|
|
73
|
-
- Inconsistent formatting
|
|
74
|
-
```
|
|
75
|
-
|
|
76
|
-
---
|
|
77
|
-
|
|
78
|
-
## Fixed-Size Chunking
|
|
79
|
-
|
|
80
|
-
```python
|
|
81
|
-
def fixed_size_chunk(
    text: str,
    chunk_size: int = 500,
    overlap: int = 50
) -> list[str]:
    """Split ``text`` into chunks of at most ``chunk_size`` characters.

    Consecutive chunks share up to ``overlap`` characters. When a chunk
    would end in the middle of the text, the cut is moved back to the last
    space, but only if that space lies in the final 20% of the window so
    chunks do not become much shorter than requested.

    Args:
        text: The text to split.
        chunk_size: Maximum number of characters per chunk (must be > 0).
        overlap: Number of characters shared between neighbouring chunks
            (must satisfy ``0 <= overlap < chunk_size``).

    Returns:
        The stripped, non-empty chunks in document order.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must be non-negative and smaller than chunk_size")

    chunks = []
    text_length = len(text)
    start = 0

    while start < text_length:
        end = min(start + chunk_size, text_length)
        chunk = text[start:end]

        # Try to break at a word boundary, but only when more text follows.
        if end < text_length:
            last_space = chunk.rfind(' ')
            if last_space > chunk_size * 0.8:  # Only if reasonably far in
                chunk = chunk[:last_space]
                end = start + last_space

        chunk = chunk.strip()
        if chunk:  # skip whitespace-only windows
            chunks.append(chunk)

        # Once the end of the text is reached we are done; stepping back by
        # ``overlap`` here would only re-emit the tail of the last chunk.
        if end >= text_length:
            break

        # A word-boundary cut can shorten the window below ``overlap``;
        # always advance by at least one character so the loop terminates.
        start = max(end - overlap, start + 1)

    return chunks
|
|
105
|
-
|
|
106
|
-
# Usage
|
|
107
|
-
chunks = fixed_size_chunk(document_text, chunk_size=500, overlap=50)
|
|
108
|
-
```
|
|
109
|
-
|
|
110
|
-
---
|
|
111
|
-
|
|
112
|
-
## Recursive Character Splitting (LangChain Style)
|
|
113
|
-
|
|
114
|
-
```python
|
|
115
|
-
from typing import Callable
|
|
116
|
-
|
|
117
|
-
class RecursiveCharacterSplitter:
|
|
118
|
-
"""Split text recursively using multiple separators."""
|
|
119
|
-
|
|
120
|
-
def __init__(
|
|
121
|
-
self,
|
|
122
|
-
chunk_size: int = 1000,
|
|
123
|
-
chunk_overlap: int = 200,
|
|
124
|
-
separators: list[str] | None = None,
|
|
125
|
-
length_function: Callable[[str], int] = len
|
|
126
|
-
):
|
|
127
|
-
self.chunk_size = chunk_size
|
|
128
|
-
self.chunk_overlap = chunk_overlap
|
|
129
|
-
self.separators = separators or ["\n\n", "\n", ". ", " ", ""]
|
|
130
|
-
self.length_function = length_function
|
|
131
|
-
|
|
132
|
-
def split_text(self, text: str) -> list[str]:
|
|
133
|
-
"""Split text into chunks."""
|
|
134
|
-
return self._split_text(text, self.separators)
|
|
135
|
-
|
|
136
|
-
def _split_text(self, text: str, separators: list[str]) -> list[str]:
|
|
137
|
-
final_chunks = []
|
|
138
|
-
separator = separators[-1]
|
|
139
|
-
|
|
140
|
-
for i, sep in enumerate(separators):
|
|
141
|
-
if sep == "":
|
|
142
|
-
separator = sep
|
|
143
|
-
break
|
|
144
|
-
if sep in text:
|
|
145
|
-
separator = sep
|
|
146
|
-
break
|
|
147
|
-
|
|
148
|
-
splits = text.split(separator) if separator else list(text)
|
|
149
|
-
|
|
150
|
-
good_splits = []
|
|
151
|
-
for split in splits:
|
|
152
|
-
if self.length_function(split) < self.chunk_size:
|
|
153
|
-
good_splits.append(split)
|
|
154
|
-
else:
|
|
155
|
-
if good_splits:
|
|
156
|
-
merged = self._merge_splits(good_splits, separator)
|
|
157
|
-
final_chunks.extend(merged)
|
|
158
|
-
good_splits = []
|
|
159
|
-
# Recursively split large chunks
|
|
160
|
-
other_chunks = self._split_text(split, separators[separators.index(separator) + 1:])
|
|
161
|
-
final_chunks.extend(other_chunks)
|
|
162
|
-
|
|
163
|
-
if good_splits:
|
|
164
|
-
merged = self._merge_splits(good_splits, separator)
|
|
165
|
-
final_chunks.extend(merged)
|
|
166
|
-
|
|
167
|
-
return final_chunks
|
|
168
|
-
|
|
169
|
-
def _merge_splits(self, splits: list[str], separator: str) -> list[str]:
|
|
170
|
-
"""Merge splits into chunks respecting size limits."""
|
|
171
|
-
chunks = []
|
|
172
|
-
current_chunk = []
|
|
173
|
-
current_length = 0
|
|
174
|
-
|
|
175
|
-
for split in splits:
|
|
176
|
-
split_length = self.length_function(split)
|
|
177
|
-
|
|
178
|
-
if current_length + split_length > self.chunk_size:
|
|
179
|
-
if current_chunk:
|
|
180
|
-
chunks.append(separator.join(current_chunk))
|
|
181
|
-
# Keep overlap
|
|
182
|
-
while current_length > self.chunk_overlap and current_chunk:
|
|
183
|
-
current_length -= self.length_function(current_chunk[0])
|
|
184
|
-
current_chunk = current_chunk[1:]
|
|
185
|
-
|
|
186
|
-
current_chunk.append(split)
|
|
187
|
-
current_length += split_length
|
|
188
|
-
|
|
189
|
-
if current_chunk:
|
|
190
|
-
chunks.append(separator.join(current_chunk))
|
|
191
|
-
|
|
192
|
-
return chunks
|
|
193
|
-
|
|
194
|
-
# Usage
|
|
195
|
-
splitter = RecursiveCharacterSplitter(
|
|
196
|
-
chunk_size=1000,
|
|
197
|
-
chunk_overlap=200,
|
|
198
|
-
separators=["\n\n", "\n", ". ", " "]
|
|
199
|
-
)
|
|
200
|
-
chunks = splitter.split_text(document_text)
|
|
201
|
-
```
|
|
202
|
-
|
|
203
|
-
### Token-Based Splitting
|
|
204
|
-
|
|
205
|
-
```python
|
|
206
|
-
import tiktoken
|
|
207
|
-
|
|
208
|
-
def create_token_splitter(
    model: str = "gpt-4",
    chunk_size: int = 500,
    chunk_overlap: int = 50
) -> "RecursiveCharacterSplitter":
    """Create a splitter that counts tokens instead of characters.

    Args:
        model: Model name used to look up the tiktoken encoding.
        chunk_size: Maximum number of tokens per chunk.
        chunk_overlap: Maximum number of tokens shared between chunks.

    Returns:
        A ``RecursiveCharacterSplitter`` whose length function is the
        token count of the text under the selected encoding.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # NOTE(review): tiktoken raises KeyError for model names it cannot
        # map; falling back to a general-purpose encoding keeps the
        # splitter usable, at the cost of approximate counts. Confirm the
        # fallback encoding suits your models.
        encoding = tiktoken.get_encoding("cl100k_base")

    def token_length(text: str) -> int:
        # Documents may literally contain special-token text such as
        # "<|endoftext|>"; disallowed_special=() encodes it as plain text
        # instead of raising.
        return len(encoding.encode(text, disallowed_special=()))

    return RecursiveCharacterSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=token_length
    )
|
|
224
|
-
|
|
225
|
-
# Usage
|
|
226
|
-
token_splitter = create_token_splitter(chunk_size=500, chunk_overlap=50)
|
|
227
|
-
chunks = token_splitter.split_text(document_text)
|
|
228
|
-
```
|
|
229
|
-
|
|
230
|
-
---
|
|
231
|
-
|
|
232
|
-
## Sentence-Based Chunking
|
|
233
|
-
|
|
234
|
-
```python
|
|
235
|
-
import re
|
|
236
|
-
from dataclasses import dataclass
|
|
237
|
-
|
|
238
|
-
@dataclass
class SentenceChunk:
    """A run of consecutive sentences produced by ``sentence_chunk``."""

    text: str  # the chunk's sentences joined by single spaces
    sentences: list[str]  # the individual sentences in this chunk
    start_sentence: int  # 0-based index of the first sentence
    end_sentence: int  # 0-based index of the last sentence (inclusive)


def sentence_chunk(
    text: str,
    sentences_per_chunk: int = 5,
    overlap_sentences: int = 1
) -> list[SentenceChunk]:
    """Chunk text by sentence count with overlap.

    Sentences are detected with a simple regex: whitespace that follows
    ``.``, ``!`` or ``?`` ends a sentence.

    Args:
        text: The text to split.
        sentences_per_chunk: Maximum sentences per chunk (must be > 0).
        overlap_sentences: Sentences repeated at the start of the next
            chunk (must satisfy ``0 <= overlap < sentences_per_chunk``).

    Returns:
        Chunks in document order; empty when ``text`` has no sentences.

    Raises:
        ValueError: If the arguments would prevent forward progress.
    """
    if sentences_per_chunk <= 0:
        raise ValueError("sentences_per_chunk must be positive")
    if not 0 <= overlap_sentences < sentences_per_chunk:
        raise ValueError(
            "overlap_sentences must be non-negative and smaller than sentences_per_chunk"
        )

    # Split into sentences
    sentence_pattern = r'(?<=[.!?])\s+'
    sentences = [
        stripped
        for stripped in (piece.strip() for piece in re.split(sentence_pattern, text))
        if stripped
    ]

    total = len(sentences)
    step = sentences_per_chunk - overlap_sentences  # > 0 after validation
    chunks = []

    for start in range(0, total, step):
        end = min(start + sentences_per_chunk, total)
        chunk_sentences = sentences[start:end]

        chunks.append(SentenceChunk(
            text=" ".join(chunk_sentences),
            sentences=chunk_sentences,
            start_sentence=start,
            end_sentence=end - 1
        ))

        # The last sentence has been emitted; a further window would hold
        # only overlap sentences that are already covered.
        if end == total:
            break

    return chunks
|
|
273
|
-
|
|
274
|
-
# Better sentence splitting with NLTK
|
|
275
|
-
import nltk
|
|
276
|
-
nltk.download('punkt')
|
|
277
|
-
from nltk.tokenize import sent_tokenize
|
|
278
|
-
|
|
279
|
-
def sentence_chunk_nltk(
    text: str,
    max_chunk_size: int = 1000,
    overlap_sentences: int = 2
) -> list[str]:
    """Chunk by sentences up to a maximum character size.

    Sentences come from NLTK's ``sent_tokenize``. Sentences are packed
    into a chunk until adding the next one (plus its joining space) would
    exceed ``max_chunk_size``; the last ``overlap_sentences`` sentences are
    then carried into the next chunk when they leave room for it.

    Args:
        text: The text to split.
        max_chunk_size: Maximum characters per chunk. A single sentence
            longer than this is still emitted as its own chunk.
        overlap_sentences: Sentences repeated at the start of the next
            chunk (must be >= 0).

    Returns:
        The chunks in document order.

    Raises:
        ValueError: If ``overlap_sentences`` is negative.
    """
    if overlap_sentences < 0:
        raise ValueError("overlap_sentences must be non-negative")

    chunks = []
    current_chunk: list[str] = []
    current_size = 0  # == len(" ".join(current_chunk))

    for sentence in sent_tokenize(text):
        sentence_size = len(sentence)

        # +1 accounts for the space that join() inserts before the sentence.
        if current_chunk and current_size + 1 + sentence_size > max_chunk_size:
            chunks.append(" ".join(current_chunk))

            # Keep overlap sentences, then shed the oldest ones until the
            # new sentence fits, so overlap never causes an oversized chunk.
            current_chunk = current_chunk[-overlap_sentences:] if overlap_sentences else []
            while current_chunk and (
                len(" ".join(current_chunk)) + 1 + sentence_size > max_chunk_size
            ):
                current_chunk.pop(0)
            current_size = len(" ".join(current_chunk))

        if current_chunk:
            current_size += 1  # joining space
        current_chunk.append(sentence)
        current_size += sentence_size

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks
|
|
306
|
-
```
|
|
307
|
-
|
|
308
|
-
---
|
|
309
|
-
|
|
310
|
-
## Semantic Chunking
|
|
311
|
-
|
|
312
|
-
```python
|
|
313
|
-
import numpy as np
|
|
314
|
-
from sentence_transformers import SentenceTransformer
|
|
315
|
-
from sklearn.metrics.pairwise import cosine_similarity
|
|
316
|
-
|
|
317
|
-
class SemanticChunker:
    """Chunk text at points where adjacent sentences stop being similar."""

    def __init__(
        self,
        model_name: str = "all-MiniLM-L6-v2",
        similarity_threshold: float = 0.5,
        min_chunk_size: int = 100,
        max_chunk_size: int = 1500
    ):
        """Load the embedding model and store the chunking limits.

        Args:
            model_name: Name passed to ``SentenceTransformer``.
            similarity_threshold: A boundary is placed between two adjacent
                sentences whose cosine similarity is below this value.
            min_chunk_size: Groups shorter than this (in characters) are
                merged into the previous chunk when one exists.
            max_chunk_size: Groups longer than this (in characters) are
                split further at sentence boundaries.
        """
        self.model = SentenceTransformer(model_name)
        self.similarity_threshold = similarity_threshold
        self.min_chunk_size = min_chunk_size
        self.max_chunk_size = max_chunk_size

    def chunk(self, text: str) -> list[str]:
        """Split ``text`` at semantic boundaries.

        Returns ``[text]`` unchanged when it contains at most one sentence.
        """
        # Split into sentences
        sentences = self._split_sentences(text)
        if len(sentences) <= 1:
            return [text]

        # Get embeddings (presumably one vector per sentence — TODO confirm
        # against the sentence-transformers API)
        embeddings = self.model.encode(sentences)

        # Find breakpoints based on similarity drops
        breakpoints = self._find_breakpoints(embeddings)

        chunks = []
        start = 0

        # Appending len(sentences) as a final boundary lets the trailing
        # group go through the same size rules as every other group.
        for boundary in [*breakpoints, len(sentences)]:
            group = sentences[start:boundary]
            start = boundary
            if not group:
                continue

            chunk_text = " ".join(group)

            if len(chunk_text) > self.max_chunk_size:
                # Split large groups at sentence boundaries
                chunks.extend(self._split_large_chunk(group))
            elif len(chunk_text) >= self.min_chunk_size or not chunks:
                # Large enough, or nothing earlier to merge into
                chunks.append(chunk_text)
            else:
                # Merge small group with previous chunk
                chunks[-1] += " " + chunk_text

        return chunks

    def _split_sentences(self, text: str) -> list[str]:
        """Split text into stripped, non-empty sentences.

        A sentence ends at whitespace that follows ``.``, ``!`` or ``?``.
        """
        import re
        pieces = re.split(r'(?<=[.!?])\s+', text)
        return [piece.strip() for piece in pieces if piece.strip()]

    def _find_breakpoints(self, embeddings: np.ndarray) -> list[int]:
        """Return indices ``i`` where sentence ``i`` starts a new chunk.

        Index ``i`` is a breakpoint when the cosine similarity between
        embeddings ``i - 1`` and ``i`` is below ``similarity_threshold``.
        """
        breakpoints = []

        for i in range(1, len(embeddings)):
            # Slices keep the 2-D shape cosine_similarity is called with;
            # [0][0] extracts the single pairwise value.
            similarity = cosine_similarity(
                embeddings[i - 1:i],
                embeddings[i:i + 1]
            )[0][0]

            if similarity < self.similarity_threshold:
                breakpoints.append(i)

        return breakpoints

    def _split_large_chunk(self, sentences: list[str]) -> list[str]:
        """Pack sentences greedily into chunks of at most ``max_chunk_size``.

        A single sentence longer than ``max_chunk_size`` is emitted on its
        own; no empty chunks are produced.
        """
        chunks = []
        current: list[str] = []
        current_length = 0  # == len(" ".join(current))

        for sentence in sentences:
            # The joining space only exists when something precedes it.
            added = len(sentence) + (1 if current else 0)
            if current and current_length + added > self.max_chunk_size:
                chunks.append(" ".join(current))
                current = []
                current_length = 0
                added = len(sentence)
            current.append(sentence)
            current_length += added

        if current:
            chunks.append(" ".join(current))

        return chunks
|
|
405
|
-
|
|
406
|
-
# Usage
|
|
407
|
-
chunker = SemanticChunker(
|
|
408
|
-
similarity_threshold=0.5,
|
|
409
|
-
min_chunk_size=200,
|
|
410
|
-
max_chunk_size=1000
|
|
411
|
-
)
|
|
412
|
-
semantic_chunks = chunker.chunk(document_text)
|
|
413
|
-
```
|
|
414
|
-
|
|
415
|
-
### Percentile-Based Breakpoints
|
|
416
|
-
|
|
417
|
-
```python
|
|
418
|
-
def find_breakpoints_percentile(
    embeddings: np.ndarray,
    percentile: int = 25
) -> list[int]:
    """Find breakpoints at similarity drops below a percentile threshold.

    The cosine similarity of every adjacent pair of embeddings is computed;
    the threshold is the given percentile of those similarities, so it
    adapts to each document instead of being a fixed constant.

    Args:
        embeddings: Sequence of embedding vectors in document order.
        percentile: Percentile (0-100) of the adjacent similarities used
            as the cut-off.

    Returns:
        Indices ``i`` for which the similarity between embeddings ``i - 1``
        and ``i`` is strictly below the threshold. Empty when fewer than
        two embeddings are given.
    """
    # With fewer than two embeddings there are no adjacent pairs, and
    # taking a percentile of an empty list is not meaningful.
    if len(embeddings) < 2:
        return []

    # similarities[k] compares embeddings k and k + 1.
    similarities = [
        cosine_similarity(embeddings[i - 1:i], embeddings[i:i + 1])[0][0]
        for i in range(1, len(embeddings))
    ]

    # Dynamic threshold based on distribution
    threshold = np.percentile(similarities, percentile)

    return [i for i, sim in enumerate(similarities, start=1) if sim < threshold]
|
|
437
|
-
```
|
|
438
|
-
|
|
439
|
-
---
|
|
440
|
-
|
|
441
|
-
## Document-Aware Chunking
|
|
442
|
-
|
|
443
|
-
### Markdown Chunking
|
|
444
|
-
|
|
445
|
-
```python
|
|
446
|
-
import re
|
|
447
|
-
from dataclasses import dataclass
|
|
448
|
-
|
|
449
|
-
@dataclass
|
|
450
|
-
class MarkdownChunk:
|
|
451
|
-
text: str
|
|
452
|
-
heading: str | None
|
|
453
|
-
heading_level: int
|
|
454
|
-
metadata: dict
|
|
455
|
-
|
|
456
|
-
def chunk_markdown(
|
|
457
|
-
text: str,
|
|
458
|
-
max_chunk_size: int = 1500,
|
|
459
|
-
include_heading_in_chunk: bool = True
|
|
460
|
-
) -> list[MarkdownChunk]:
|
|
461
|
-
"""Chunk markdown by headers while respecting structure."""
|
|
462
|
-
# Pattern to match headers
|
|
463
|
-
header_pattern = r'^(#{1,6})\s+(.+)$'
|
|
464
|
-
|
|
465
|
-
lines = text.split('\n')
|
|
466
|
-
chunks = []
|
|
467
|
-
current_chunk_lines = []
|
|
468
|
-
current_heading = None
|
|
469
|
-
current_level = 0
|
|
470
|
-
heading_stack = [] # For breadcrumb context
|
|
471
|
-
|
|
472
|
-
for line in lines:
|
|
473
|
-
header_match = re.match(header_pattern, line)
|
|
474
|
-
|
|
475
|
-
if header_match:
|
|
476
|
-
# Save current chunk if exists
|
|
477
|
-
if current_chunk_lines:
|
|
478
|
-
chunk_text = '\n'.join(current_chunk_lines)
|
|
479
|
-
if len(chunk_text.strip()) > 0:
|
|
480
|
-
prefix = f"# {current_heading}\n\n" if include_heading_in_chunk and current_heading else ""
|
|
481
|
-
chunks.append(MarkdownChunk(
|
|
482
|
-
text=prefix + chunk_text,
|
|
483
|
-
heading=current_heading,
|
|
484
|
-
heading_level=current_level,
|
|
485
|
-
metadata={"breadcrumb": " > ".join(heading_stack)}
|
|
486
|
-
))
|
|
487
|
-
|
|
488
|
-
# Update heading context
|
|
489
|
-
level = len(header_match.group(1))
|
|
490
|
-
heading = header_match.group(2).strip()
|
|
491
|
-
|
|
492
|
-
# Maintain heading stack for breadcrumbs
|
|
493
|
-
while heading_stack and current_level >= level:
|
|
494
|
-
heading_stack.pop()
|
|
495
|
-
current_level -= 1
|
|
496
|
-
|
|
497
|
-
heading_stack.append(heading)
|
|
498
|
-
current_heading = heading
|
|
499
|
-
current_level = level
|
|
500
|
-
current_chunk_lines = []
|
|
501
|
-
|
|
502
|
-
else:
|
|
503
|
-
current_chunk_lines.append(line)
|
|
504
|
-
|
|
505
|
-
# Check chunk size
|
|
506
|
-
current_text = '\n'.join(current_chunk_lines)
|
|
507
|
-
if len(current_text) > max_chunk_size:
|
|
508
|
-
# Split at paragraph boundary
|
|
509
|
-
paragraphs = current_text.split('\n\n')
|
|
510
|
-
if len(paragraphs) > 1:
|
|
511
|
-
split_point = len('\n\n'.join(paragraphs[:-1]))
|
|
512
|
-
chunk_text = current_text[:split_point]
|
|
513
|
-
prefix = f"# {current_heading}\n\n" if include_heading_in_chunk and current_heading else ""
|
|
514
|
-
chunks.append(MarkdownChunk(
|
|
515
|
-
text=prefix + chunk_text,
|
|
516
|
-
heading=current_heading,
|
|
517
|
-
heading_level=current_level,
|
|
518
|
-
metadata={"breadcrumb": " > ".join(heading_stack)}
|
|
519
|
-
))
|
|
520
|
-
current_chunk_lines = [current_text[split_point:].strip()]
|
|
521
|
-
|
|
522
|
-
# Don't forget the last chunk
|
|
523
|
-
if current_chunk_lines:
|
|
524
|
-
chunk_text = '\n'.join(current_chunk_lines)
|
|
525
|
-
if len(chunk_text.strip()) > 0:
|
|
526
|
-
prefix = f"# {current_heading}\n\n" if include_heading_in_chunk and current_heading else ""
|
|
527
|
-
chunks.append(MarkdownChunk(
|
|
528
|
-
text=prefix + chunk_text,
|
|
529
|
-
heading=current_heading,
|
|
530
|
-
heading_level=current_level,
|
|
531
|
-
metadata={"breadcrumb": " > ".join(heading_stack)}
|
|
532
|
-
))
|
|
533
|
-
|
|
534
|
-
return chunks
|
|
535
|
-
```
|
|
536
|
-
|
|
537
|
-
### Code-Aware Chunking
|
|
538
|
-
|
|
539
|
-
```python
|
|
540
|
-
import re
|
|
541
|
-
from dataclasses import dataclass
|
|
542
|
-
|
|
543
|
-
@dataclass
|
|
544
|
-
class CodeChunk:
|
|
545
|
-
text: str
|
|
546
|
-
language: str | None
|
|
547
|
-
chunk_type: str # "code", "text", "mixed"
|
|
548
|
-
|
|
549
|
-
def chunk_with_code_blocks(
|
|
550
|
-
text: str,
|
|
551
|
-
max_chunk_size: int = 1500
|
|
552
|
-
) -> list[CodeChunk]:
|
|
553
|
-
"""Chunk text while keeping code blocks intact."""
|
|
554
|
-
# Pattern to match code blocks
|
|
555
|
-
code_block_pattern = r'```(\w+)?\n(.*?)```'
|
|
556
|
-
|
|
557
|
-
chunks = []
|
|
558
|
-
last_end = 0
|
|
559
|
-
|
|
560
|
-
for match in re.finditer(code_block_pattern, text, re.DOTALL):
|
|
561
|
-
# Text before code block
|
|
562
|
-
text_before = text[last_end:match.start()].strip()
|
|
563
|
-
if text_before:
|
|
564
|
-
# Chunk the text portion
|
|
565
|
-
text_chunks = recursive_chunk(text_before, max_chunk_size)
|
|
566
|
-
chunks.extend([
|
|
567
|
-
CodeChunk(text=t, language=None, chunk_type="text")
|
|
568
|
-
for t in text_chunks
|
|
569
|
-
])
|
|
570
|
-
|
|
571
|
-
# Code block (keep intact if possible)
|
|
572
|
-
language = match.group(1)
|
|
573
|
-
code_content = match.group(2)
|
|
574
|
-
full_block = match.group(0)
|
|
575
|
-
|
|
576
|
-
if len(full_block) <= max_chunk_size:
|
|
577
|
-
chunks.append(CodeChunk(
|
|
578
|
-
text=full_block,
|
|
579
|
-
language=language,
|
|
580
|
-
chunk_type="code"
|
|
581
|
-
))
|
|
582
|
-
else:
|
|
583
|
-
# Split large code blocks by function/class
|
|
584
|
-
code_chunks = split_code_block(code_content, language, max_chunk_size)
|
|
585
|
-
chunks.extend(code_chunks)
|
|
586
|
-
|
|
587
|
-
last_end = match.end()
|
|
588
|
-
|
|
589
|
-
# Remaining text after last code block
|
|
590
|
-
remaining = text[last_end:].strip()
|
|
591
|
-
if remaining:
|
|
592
|
-
text_chunks = recursive_chunk(remaining, max_chunk_size)
|
|
593
|
-
chunks.extend([
|
|
594
|
-
CodeChunk(text=t, language=None, chunk_type="text")
|
|
595
|
-
for t in text_chunks
|
|
596
|
-
])
|
|
597
|
-
|
|
598
|
-
return chunks
|
|
599
|
-
|
|
600
|
-
def split_code_block(code: str, language: str, max_size: int) -> list[CodeChunk]:
    """Split the body of a fenced code block at logical boundaries.

    Args:
        code: The code inside the fence (without the fence lines).
        language: Fence language tag; may be ``None`` for an untagged fence.
        max_size: Soft limit, in characters of code, for each emitted chunk.

    Returns:
        A list of ``CodeChunk`` objects, each re-wrapped in its own fence.
        A single segment longer than ``max_size`` is emitted as-is.
    """
    if language == "python":
        boundary = r'\n(?=def |class |async def )'
    elif language in ("javascript", "typescript"):
        boundary = r'\n(?=function |class |const |export )'
    else:
        boundary = r'\n\n'

    # An untagged fence must not be rendered as "```None".
    fence_tag = language or ""

    def fenced(body: str) -> CodeChunk:
        # The closing fence has to start on its own line.
        newline = "" if body.endswith("\n") else "\n"
        return CodeChunk(
            text=f"```{fence_tag}\n{body}{newline}```",
            language=language,
            chunk_type="code",
        )

    # Capturing the boundary keeps the separators, so segments merged into the
    # same chunk are re-joined with the newline(s) that originally divided them.
    tokens = re.split(f"({boundary})", code)
    segments, separators = tokens[0::2], tokens[1::2]

    chunks = []
    current = segments[0]
    for separator, segment in zip(separators, segments[1:]):
        if not current:
            current = segment
        elif len(current) + len(separator) + len(segment) > max_size:
            chunks.append(fenced(current))
            current = segment
        else:
            current += separator + segment

    if current:
        chunks.append(fenced(current))

    return chunks
|
|
633
|
-
```
|
|
634
|
-
|
|
635
|
-
---
|
|
636
|
-
|
|
637
|
-
## Contextual/Agentic Chunking
|
|
638
|
-
|
|
639
|
-
```python
|
|
640
|
-
from openai import OpenAI
|
|
641
|
-
|
|
642
|
-
def contextual_chunk(
    document: str,
    max_chunk_size: int = 1500
) -> list[dict]:
    """Chunk a document and attach an LLM-written context blurb to each chunk.

    The document is first split with ``recursive_chunk``; one chat-completion
    request is then made per chunk. Each returned dict has the keys ``text``,
    ``context`` and ``text_with_context``.
    """
    system_prompt = (
        "Provide a brief context for this document chunk.\n"
        "Include: what topic it covers, how it relates to the broader document,\n"
        "and key concepts mentioned. Keep it under 100 words."
    )

    def describe(llm, piece):
        # One request per chunk; the reply text becomes the chunk's context.
        reply = llm.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"Document excerpt:\n\n{piece}"},
            ],
            max_tokens=150
        )
        return reply.choices[0].message.content

    # Structural chunking happens before the client is created.
    pieces = recursive_chunk(document, max_chunk_size)
    client = OpenAI()

    enriched = []
    for piece in pieces:
        summary = describe(client, piece)
        enriched.append({
            "text": piece,
            "context": summary,
            "text_with_context": f"Context: {summary}\n\nContent: {piece}",
        })
    return enriched
|
|
681
|
-
```
|
|
682
|
-
|
|
683
|
-
### Propositions-Based Chunking
|
|
684
|
-
|
|
685
|
-
```python
|
|
686
|
-
def extract_propositions(text: str) -> list[str]:
    """Extract atomic propositions from text using an LLM.

    Args:
        text: The passage to decompose.

    Returns:
        The list stored under the ``"propositions"`` key of the model's JSON
        reply, or an empty list when that key is absent.
    """
    import json  # local import: this snippet has no top-level json import

    client = OpenAI()

    # JSON mode (response_format=json_object) yields a JSON *object*, so the
    # prompt must name the key that the parsing code below reads.
    system_prompt = (
        "Extract atomic propositions from the text.\n"
        "Each proposition should:\n"
        "- Be a single, complete fact\n"
        "- Be self-contained (understandable without context)\n"
        "- Include necessary entity references\n"
        "\n"
        'Return a JSON object of the form {"propositions": ["...", "..."]}.'
    )

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": text},
        ],
        response_format={"type": "json_object"}
    )

    result = json.loads(response.choices[0].message.content)
    if isinstance(result, list):
        # Tolerate a bare array in case the reply is not wrapped in an object.
        return result
    return result.get("propositions", [])
|
|
714
|
-
|
|
715
|
-
# Usage: For very fine-grained retrieval
|
|
716
|
-
propositions = extract_propositions(document_text)
|
|
717
|
-
# Each proposition becomes its own retrievable unit
|
|
718
|
-
```
|
|
719
|
-
|
|
720
|
-
---
|
|
721
|
-
|
|
722
|
-
## Late Chunking (for Long-Context Embeddings)
|
|
723
|
-
|
|
724
|
-
```python
|
|
725
|
-
from transformers import AutoTokenizer, AutoModel
|
|
726
|
-
import torch
|
|
727
|
-
|
|
728
|
-
class LateChunker:
    """
    Late chunking: embed full document, then pool token embeddings into chunks.
    Preserves full document context while creating retrievable chunks.
    """

    def __init__(self, model_name: str = "jinaai/jina-embeddings-v2-base-en"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # NOTE(review): trust_remote_code=True presumably allows code from the
        # model repository to run locally - confirm the model source is trusted.
        self.model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
        self.model.eval()

    @staticmethod
    def _token_windows(
        seq_len: int,
        chunk_size: int,
        overlap: int
    ) -> list[tuple[int, int]]:
        """Return the (start, end) token spans covering ``seq_len`` tokens.

        Consecutive spans share ``overlap`` tokens. Iteration stops as soon as
        a span reaches the end of the sequence; otherwise ``end - overlap``
        would never advance past ``seq_len - overlap`` and loop forever.

        Raises:
            ValueError: if ``chunk_size`` is not positive or ``overlap`` is not
                in ``[0, chunk_size)``.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if not 0 <= overlap < chunk_size:
            raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

        windows = []
        start = 0
        while start < seq_len:
            end = min(start + chunk_size, seq_len)
            windows.append((start, end))
            if end == seq_len:
                break
            start = end - overlap
        return windows

    def chunk_and_embed(
        self,
        text: str,
        chunk_size: int = 512,
        overlap: int = 64
    ) -> list[dict]:
        """
        Embed full document, then create chunk embeddings via mean pooling.

        Returns one dict per token window with the keys ``text``,
        ``embedding``, ``start_token`` and ``end_token``.
        """
        # Tokenize full document
        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=8192  # Model's max context
        )

        # Get token-level embeddings
        with torch.no_grad():
            outputs = self.model(**inputs)
            token_embeddings = outputs.last_hidden_state[0]  # [seq_len, hidden_dim]

        input_ids = inputs["input_ids"][0]

        chunks = []
        for start, end in self._token_windows(
            token_embeddings.shape[0], chunk_size, overlap
        ):
            # Mean pool token embeddings for this chunk
            chunk_embedding = token_embeddings[start:end].mean(dim=0).numpy()

            # Reconstruct text for this chunk
            chunk_text = self.tokenizer.decode(
                input_ids[start:end], skip_special_tokens=True
            )

            chunks.append({
                "text": chunk_text,
                "embedding": chunk_embedding,
                "start_token": start,
                "end_token": end
            })

        return chunks
|
|
789
|
-
|
|
790
|
-
# Usage
|
|
791
|
-
late_chunker = LateChunker()
|
|
792
|
-
chunks_with_embeddings = late_chunker.chunk_and_embed(
|
|
793
|
-
long_document,
|
|
794
|
-
chunk_size=512,
|
|
795
|
-
overlap=64
|
|
796
|
-
)
|
|
797
|
-
```
|
|
798
|
-
|
|
799
|
-
---
|
|
800
|
-
|
|
801
|
-
## Metadata Enrichment
|
|
802
|
-
|
|
803
|
-
```python
|
|
804
|
-
from dataclasses import dataclass
|
|
805
|
-
from datetime import datetime
|
|
806
|
-
import hashlib
|
|
807
|
-
|
|
808
|
-
@dataclass
|
|
809
|
-
class EnrichedChunk:
|
|
810
|
-
text: str
|
|
811
|
-
embedding: list[float] | None
|
|
812
|
-
metadata: dict
|
|
813
|
-
|
|
814
|
-
def enrich_chunk(
|
|
815
|
-
text: str,
|
|
816
|
-
source_file: str,
|
|
817
|
-
chunk_index: int,
|
|
818
|
-
total_chunks: int,
|
|
819
|
-
additional_metadata: dict | None = None
|
|
820
|
-
) -> EnrichedChunk:
|
|
821
|
-
"""Add comprehensive metadata to chunk."""
|
|
822
|
-
metadata = {
|
|
823
|
-
# Source tracking
|
|
824
|
-
"source": source_file,
|
|
825
|
-
"chunk_index": chunk_index,
|
|
826
|
-
"total_chunks": total_chunks,
|
|
827
|
-
|
|
828
|
-
# Content characteristics
|
|
829
|
-
"char_count": len(text),
|
|
830
|
-
"word_count": len(text.split()),
|
|
831
|
-
"content_hash": hashlib.md5(text.encode()).hexdigest()[:12],
|
|
832
|
-
|
|
833
|
-
# Temporal
|
|
834
|
-
"indexed_at": datetime.utcnow().isoformat(),
|
|
835
|
-
|
|
836
|
-
# Position context
|
|
837
|
-
"position": "start" if chunk_index == 0 else (
|
|
838
|
-
"end" if chunk_index == total_chunks - 1 else "middle"
|
|
839
|
-
)
|
|
840
|
-
}
|
|
841
|
-
|
|
842
|
-
if additional_metadata:
|
|
843
|
-
metadata.update(additional_metadata)
|
|
844
|
-
|
|
845
|
-
return EnrichedChunk(text=text, embedding=None, metadata=metadata)
|
|
846
|
-
```
|
|
847
|
-
|
|
848
|
-
---
|
|
849
|
-
|
|
850
|
-
## Chunk Size Selection Guide
|
|
851
|
-
|
|
852
|
-
| Document Type | Recommended Size | Overlap | Rationale |
|
|
853
|
-
|--------------|------------------|---------|-----------|
|
|
854
|
-
| FAQ/Q&A | 200-400 tokens | 20-50 | Keep Q&A pairs together |
|
|
855
|
-
| Technical docs | 400-600 tokens | 50-100 | Balance context vs precision |
|
|
856
|
-
| Legal/contracts | 600-800 tokens | 100-150 | Preserve clause context |
|
|
857
|
-
| Code documentation | 300-500 tokens | 50-100 | Keep function docs together |
|
|
858
|
-
| Chat transcripts | 150-300 tokens | 25-50 | Natural turn boundaries |
|
|
859
|
-
| Research papers | 500-800 tokens | 100-200 | Section-level coherence |
|
|
860
|
-
|
|
861
|
-
---
|
|
862
|
-
|
|
863
|
-
## Quick Reference
|
|
864
|
-
|
|
865
|
-
| Strategy | Use Case | Code Pattern |
|
|
866
|
-
|----------|----------|--------------|
|
|
867
|
-
| Fixed-size | Logs, baseline | `text[i:i+chunk_size]` |
|
|
868
|
-
| Recursive | General text | Split by `["\n\n", "\n", ". "]` |
|
|
869
|
-
| Sentence | Q&A content | `sent_tokenize()` + merge |
|
|
870
|
-
| Semantic | Technical docs | Similarity-based breaks |
|
|
871
|
-
| Markdown | Documentation | Header-aware splitting |
|
|
872
|
-
| Late chunking | Long-context models | Embed full, pool chunks |
|
|
873
|
-
|
|
874
|
-
## Related Skills
|
|
875
|
-
|
|
876
|
-
- **RAG Architect** - Integration with vector databases
|
|
877
|
-
- **Python Pro** - Preprocessing pipelines
|
|
878
|
-
- **NLP Engineer** - Tokenization and text processing
|
|
1
|
+
# Chunking Strategies
|
|
2
|
+
|
|
3
|
+
---
|
|
4
|
+
|
|
5
|
+
## Strategy Comparison Matrix
|
|
6
|
+
|
|
7
|
+
| Strategy | Best For | Chunk Quality | Implementation Complexity |
|
|
8
|
+
|----------|----------|---------------|---------------------------|
|
|
9
|
+
| **Fixed-size** | Simple documents, logs | Low-Medium | Simple |
|
|
10
|
+
| **Recursive character** | General text, articles | Medium | Simple |
|
|
11
|
+
| **Sentence-based** | Conversational, Q&A | Medium-High | Medium |
|
|
12
|
+
| **Semantic** | Technical docs, manuals | High | Medium |
|
|
13
|
+
| **Document-aware** | Structured content (MD, HTML) | High | Medium |
|
|
14
|
+
| **Agentic/Contextual** | Complex documents | Very High | Complex |
|
|
15
|
+
| **Late chunking** | Long-context embeddings | High | Medium |
|
|
16
|
+
|
|
17
|
+
---
|
|
18
|
+
|
|
19
|
+
## When to Use Each Strategy
|
|
20
|
+
|
|
21
|
+
### Fixed-Size Chunking
|
|
22
|
+
```
|
|
23
|
+
Best For:
|
|
24
|
+
- Log files and structured data
|
|
25
|
+
- Quick prototyping
|
|
26
|
+
- When content has no natural structure
|
|
27
|
+
- Baseline comparison
|
|
28
|
+
|
|
29
|
+
When to Avoid:
|
|
30
|
+
- Technical documentation
|
|
31
|
+
- Content with semantic units (paragraphs, sections)
|
|
32
|
+
- When context preservation matters
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
### Recursive Character Splitting
|
|
36
|
+
```
|
|
37
|
+
Best For:
|
|
38
|
+
- General articles and blog posts
|
|
39
|
+
- Mixed content types
|
|
40
|
+
- Default starting point for most RAG
|
|
41
|
+
- LangChain/LlamaIndex default
|
|
42
|
+
|
|
43
|
+
When to Avoid:
|
|
44
|
+
- Highly structured documents
|
|
45
|
+
- Code-heavy content
|
|
46
|
+
- Tables and lists
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
### Semantic Chunking
|
|
50
|
+
```
|
|
51
|
+
Best For:
|
|
52
|
+
- Technical documentation
|
|
53
|
+
- Research papers
|
|
54
|
+
- Content with natural topic boundaries
|
|
55
|
+
- When retrieval precision is critical
|
|
56
|
+
|
|
57
|
+
When to Avoid:
|
|
58
|
+
- Real-time ingestion (slower)
|
|
59
|
+
- Very short documents
|
|
60
|
+
- Cost-sensitive pipelines (requires embeddings)
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
### Document-Aware Chunking
|
|
64
|
+
```
|
|
65
|
+
Best For:
|
|
66
|
+
- Markdown documentation
|
|
67
|
+
- HTML pages
|
|
68
|
+
- LaTeX papers
|
|
69
|
+
- Code files
|
|
70
|
+
|
|
71
|
+
When to Avoid:
|
|
72
|
+
- Plain text without structure
|
|
73
|
+
- Inconsistent formatting
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
---
|
|
77
|
+
|
|
78
|
+
## Fixed-Size Chunking
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
def fixed_size_chunk(
    text: str,
    chunk_size: int = 500,
    overlap: int = 50
) -> list[str]:
    """Simple fixed-size chunking with overlap.

    Args:
        text: Text to split.
        chunk_size: Maximum number of characters per chunk.
        overlap: Number of characters shared by consecutive chunks.

    Returns:
        Stripped, non-empty chunks in document order.

    Raises:
        ValueError: if ``chunk_size`` is not positive or ``overlap`` is not in
            ``[0, chunk_size)`` (the window could otherwise never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    chunks = []
    text_len = len(text)
    start = 0

    while start < text_len:
        end = min(start + chunk_size, text_len)

        # Try to break at word boundary
        if end < text_len:
            last_space = text.rfind(' ', start, end)
            if last_space - start > chunk_size * 0.8:  # Only if reasonably far in
                end = last_space

        piece = text[start:end].strip()
        if piece:
            chunks.append(piece)

        # Once the end of the text is covered, another window would only
        # repeat a suffix of the chunk just emitted.
        if end >= text_len:
            break

        # A word-boundary cut can land within `overlap` of `start`; always
        # move forward by at least one character.
        start = max(end - overlap, start + 1)

    return chunks
|
|
105
|
+
|
|
106
|
+
# Usage
|
|
107
|
+
chunks = fixed_size_chunk(document_text, chunk_size=500, overlap=50)
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
---
|
|
111
|
+
|
|
112
|
+
## Recursive Character Splitting (LangChain Style)
|
|
113
|
+
|
|
114
|
+
```python
|
|
115
|
+
from typing import Callable
|
|
116
|
+
|
|
117
|
+
class RecursiveCharacterSplitter:
|
|
118
|
+
"""Split text recursively using multiple separators."""
|
|
119
|
+
|
|
120
|
+
def __init__(
|
|
121
|
+
self,
|
|
122
|
+
chunk_size: int = 1000,
|
|
123
|
+
chunk_overlap: int = 200,
|
|
124
|
+
separators: list[str] | None = None,
|
|
125
|
+
length_function: Callable[[str], int] = len
|
|
126
|
+
):
|
|
127
|
+
self.chunk_size = chunk_size
|
|
128
|
+
self.chunk_overlap = chunk_overlap
|
|
129
|
+
self.separators = separators or ["\n\n", "\n", ". ", " ", ""]
|
|
130
|
+
self.length_function = length_function
|
|
131
|
+
|
|
132
|
+
def split_text(self, text: str) -> list[str]:
|
|
133
|
+
"""Split text into chunks."""
|
|
134
|
+
return self._split_text(text, self.separators)
|
|
135
|
+
|
|
136
|
+
def _split_text(self, text: str, separators: list[str]) -> list[str]:
|
|
137
|
+
final_chunks = []
|
|
138
|
+
separator = separators[-1]
|
|
139
|
+
|
|
140
|
+
for i, sep in enumerate(separators):
|
|
141
|
+
if sep == "":
|
|
142
|
+
separator = sep
|
|
143
|
+
break
|
|
144
|
+
if sep in text:
|
|
145
|
+
separator = sep
|
|
146
|
+
break
|
|
147
|
+
|
|
148
|
+
splits = text.split(separator) if separator else list(text)
|
|
149
|
+
|
|
150
|
+
good_splits = []
|
|
151
|
+
for split in splits:
|
|
152
|
+
if self.length_function(split) < self.chunk_size:
|
|
153
|
+
good_splits.append(split)
|
|
154
|
+
else:
|
|
155
|
+
if good_splits:
|
|
156
|
+
merged = self._merge_splits(good_splits, separator)
|
|
157
|
+
final_chunks.extend(merged)
|
|
158
|
+
good_splits = []
|
|
159
|
+
# Recursively split large chunks
|
|
160
|
+
other_chunks = self._split_text(split, separators[separators.index(separator) + 1:])
|
|
161
|
+
final_chunks.extend(other_chunks)
|
|
162
|
+
|
|
163
|
+
if good_splits:
|
|
164
|
+
merged = self._merge_splits(good_splits, separator)
|
|
165
|
+
final_chunks.extend(merged)
|
|
166
|
+
|
|
167
|
+
return final_chunks
|
|
168
|
+
|
|
169
|
+
def _merge_splits(self, splits: list[str], separator: str) -> list[str]:
|
|
170
|
+
"""Merge splits into chunks respecting size limits."""
|
|
171
|
+
chunks = []
|
|
172
|
+
current_chunk = []
|
|
173
|
+
current_length = 0
|
|
174
|
+
|
|
175
|
+
for split in splits:
|
|
176
|
+
split_length = self.length_function(split)
|
|
177
|
+
|
|
178
|
+
if current_length + split_length > self.chunk_size:
|
|
179
|
+
if current_chunk:
|
|
180
|
+
chunks.append(separator.join(current_chunk))
|
|
181
|
+
# Keep overlap
|
|
182
|
+
while current_length > self.chunk_overlap and current_chunk:
|
|
183
|
+
current_length -= self.length_function(current_chunk[0])
|
|
184
|
+
current_chunk = current_chunk[1:]
|
|
185
|
+
|
|
186
|
+
current_chunk.append(split)
|
|
187
|
+
current_length += split_length
|
|
188
|
+
|
|
189
|
+
if current_chunk:
|
|
190
|
+
chunks.append(separator.join(current_chunk))
|
|
191
|
+
|
|
192
|
+
return chunks
|
|
193
|
+
|
|
194
|
+
# Usage
|
|
195
|
+
splitter = RecursiveCharacterSplitter(
|
|
196
|
+
chunk_size=1000,
|
|
197
|
+
chunk_overlap=200,
|
|
198
|
+
separators=["\n\n", "\n", ". ", " "]
|
|
199
|
+
)
|
|
200
|
+
chunks = splitter.split_text(document_text)
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
### Token-Based Splitting
|
|
204
|
+
|
|
205
|
+
```python
|
|
206
|
+
import tiktoken
|
|
207
|
+
|
|
208
|
+
def create_token_splitter(
    model: str = "gpt-4",
    chunk_size: int = 500,
    chunk_overlap: int = 50
):
    """Create splitter that counts tokens instead of characters.

    Args:
        model: Model name used to select the tiktoken encoding.
        chunk_size: Maximum chunk length, in tokens.
        chunk_overlap: Overlap between consecutive chunks, in tokens.

    Returns:
        A ``RecursiveCharacterSplitter`` whose length function is the token
        count under the selected encoding.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # tiktoken has no mapping for this model name; fall back to a general
        # encoding so sizes are still measured in tokens.
        encoding = tiktoken.get_encoding("cl100k_base")

    def token_length(text: str) -> int:
        return len(encoding.encode(text))

    return RecursiveCharacterSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=token_length
    )
|
|
224
|
+
|
|
225
|
+
# Usage
|
|
226
|
+
token_splitter = create_token_splitter(chunk_size=500, chunk_overlap=50)
|
|
227
|
+
chunks = token_splitter.split_text(document_text)
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
---
|
|
231
|
+
|
|
232
|
+
## Sentence-Based Chunking
|
|
233
|
+
|
|
234
|
+
```python
|
|
235
|
+
import re
|
|
236
|
+
from dataclasses import dataclass
|
|
237
|
+
|
|
238
|
+
@dataclass
class SentenceChunk:
    """A run of consecutive sentences and its inclusive index range."""

    text: str
    sentences: list[str]
    start_sentence: int
    end_sentence: int


def sentence_chunk(
    text: str,
    sentences_per_chunk: int = 5,
    overlap_sentences: int = 1
) -> list[SentenceChunk]:
    """Chunk by sentence count with overlap.

    Args:
        text: Text to split; sentences end at ``.``, ``!`` or ``?`` followed
            by whitespace.
        sentences_per_chunk: Maximum number of sentences per chunk.
        overlap_sentences: Sentences shared by consecutive chunks.

    Returns:
        Chunks in order; ``start_sentence``/``end_sentence`` are inclusive
        indices into the sentence list.

    Raises:
        ValueError: if ``sentences_per_chunk`` is not positive or
            ``overlap_sentences`` is not in ``[0, sentences_per_chunk)``
            (the window would otherwise never advance).
    """
    if sentences_per_chunk <= 0:
        raise ValueError("sentences_per_chunk must be positive")
    if not 0 <= overlap_sentences < sentences_per_chunk:
        raise ValueError(
            "overlap_sentences must satisfy 0 <= overlap_sentences < sentences_per_chunk"
        )

    # Split into sentences
    sentence_pattern = r'(?<=[.!?])\s+'
    sentences = [s.strip() for s in re.split(sentence_pattern, text) if s.strip()]

    step = sentences_per_chunk - overlap_sentences
    total = len(sentences)
    chunks = []

    for start in range(0, total, step):
        end = min(start + sentences_per_chunk, total)
        window = sentences[start:end]

        chunks.append(SentenceChunk(
            text=" ".join(window),
            sentences=window,
            start_sentence=start,
            end_sentence=end - 1
        ))

        # The last sentence is covered; a further window would contain only
        # overlap sentences already emitted.
        if end == total:
            break

    return chunks
|
|
273
|
+
|
|
274
|
+
# Better sentence splitting with NLTK
|
|
275
|
+
import nltk
|
|
276
|
+
nltk.download('punkt')
|
|
277
|
+
from nltk.tokenize import sent_tokenize
|
|
278
|
+
|
|
279
|
+
def sentence_chunk_nltk(
    text: str,
    max_chunk_size: int = 1000,
    overlap_sentences: int = 2
) -> list[str]:
    """Chunk by sentences up to max size.

    Args:
        text: Text to split with ``sent_tokenize``.
        max_chunk_size: Target maximum chunk length in characters, counting the
            single spaces used to join sentences.
        overlap_sentences: Trailing sentences of a finished chunk that are
            repeated at the start of the next one.

    Returns:
        The chunk strings in order. A single sentence longer than
        ``max_chunk_size`` is still emitted in a chunk.
    """
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = []
    current_size = 0  # always equals len(" ".join(current_chunk))

    for sentence in sentences:
        # A sentence appended to a non-empty chunk also costs one joining space.
        added = len(sentence) + (1 if current_chunk else 0)

        if current_chunk and current_size + added > max_chunk_size:
            chunks.append(" ".join(current_chunk))
            # Keep overlap sentences
            current_chunk = current_chunk[-overlap_sentences:] if overlap_sentences > 0 else []
            current_size = len(" ".join(current_chunk))
            added = len(sentence) + (1 if current_chunk else 0)

        current_chunk.append(sentence)
        current_size += added

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks
|
|
306
|
+
```
|
|
307
|
+
|
|
308
|
+
---
|
|
309
|
+
|
|
310
|
+
## Semantic Chunking
|
|
311
|
+
|
|
312
|
+
```python
|
|
313
|
+
import numpy as np
|
|
314
|
+
from sentence_transformers import SentenceTransformer
|
|
315
|
+
from sklearn.metrics.pairwise import cosine_similarity
|
|
316
|
+
|
|
317
|
+
class SemanticChunker:
    """Chunk based on semantic similarity between sentences."""

    def __init__(
        self,
        model_name: str = "all-MiniLM-L6-v2",
        similarity_threshold: float = 0.5,
        min_chunk_size: int = 100,
        max_chunk_size: int = 1500
    ):
        self.model = SentenceTransformer(model_name)
        self.similarity_threshold = similarity_threshold
        self.min_chunk_size = min_chunk_size
        self.max_chunk_size = max_chunk_size

    def chunk(self, text: str) -> list[str]:
        """Split text at semantic boundaries.

        Sentences are embedded, a boundary is placed wherever the similarity
        of adjacent sentences drops below ``similarity_threshold``, and the
        resulting groups are adjusted to the min/max character limits.
        """
        # Split into sentences
        sentences = self._split_sentences(text)
        if len(sentences) <= 1:
            return [text]

        # Get embeddings
        embeddings = self.model.encode(sentences)

        # Find breakpoints based on similarity drops
        breakpoints = self._find_breakpoints(embeddings)

        # Create chunks. The end of the document is treated as a final
        # breakpoint so the trailing group gets the same size handling
        # (including the max-size split) as every other group.
        chunks = []
        start = 0

        for bp in list(breakpoints) + [len(sentences)]:
            group = sentences[start:bp]
            chunk_text = " ".join(group)

            if len(chunk_text) > self.max_chunk_size:
                # Split large chunks
                chunks.extend(self._split_large_chunk(group))
            elif len(chunk_text) >= self.min_chunk_size:
                chunks.append(chunk_text)
            elif chunks:
                # Merge small chunk with previous
                chunks[-1] += " " + chunk_text
            else:
                chunks.append(chunk_text)

            start = bp

        return chunks

    def _split_sentences(self, text: str) -> list[str]:
        """Split text into sentences."""
        import re
        sentences = re.split(r'(?<=[.!?])\s+', text)
        return [s.strip() for s in sentences if s.strip()]

    def _find_breakpoints(self, embeddings: np.ndarray) -> list[int]:
        """Find semantic breakpoints using similarity drops.

        Returns the indices ``i`` where sentence ``i`` is less similar to
        sentence ``i - 1`` than ``similarity_threshold``.
        """
        breakpoints = []

        for i in range(1, len(embeddings)):
            similarity = cosine_similarity(
                embeddings[i-1:i],
                embeddings[i:i+1]
            )[0][0]

            if similarity < self.similarity_threshold:
                breakpoints.append(i)

        return breakpoints

    def _split_large_chunk(self, sentences: list[str]) -> list[str]:
        """Pack sentences greedily into chunks of at most ``max_chunk_size``.

        Never returns an empty chunk. A single sentence longer than the limit
        is kept whole, since it cannot be split at a sentence boundary.
        """
        chunks = []
        current = []
        current_len = 0  # always equals len(" ".join(current))

        for sentence in sentences:
            extra = len(sentence) + (1 if current else 0)
            if current and current_len + extra > self.max_chunk_size:
                chunks.append(" ".join(current))
                current, current_len = [], 0
                extra = len(sentence)
            current.append(sentence)
            current_len += extra

        if current:
            chunks.append(" ".join(current))

        return chunks
|
|
405
|
+
|
|
406
|
+
# Usage
|
|
407
|
+
chunker = SemanticChunker(
|
|
408
|
+
similarity_threshold=0.5,
|
|
409
|
+
min_chunk_size=200,
|
|
410
|
+
max_chunk_size=1000
|
|
411
|
+
)
|
|
412
|
+
semantic_chunks = chunker.chunk(document_text)
|
|
413
|
+
```
|
|
414
|
+
|
|
415
|
+
### Percentile-Based Breakpoints
|
|
416
|
+
|
|
417
|
+
```python
|
|
418
|
+
def find_breakpoints_percentile(
    embeddings: np.ndarray,
    percentile: int = 25
) -> list[int]:
    """Find breakpoints at similarity drops below percentile threshold.

    Args:
        embeddings: One embedding per sentence, indexable along the first axis.
        percentile: Percentile of the adjacent-pair similarities used as the
            cut-off.

    Returns:
        Indices ``i`` whose similarity to sentence ``i - 1`` is strictly below
        the threshold; an empty list when there are fewer than two embeddings.
    """
    # With fewer than two embeddings there are no adjacent pairs, and
    # np.percentile cannot be computed on an empty sequence.
    if len(embeddings) < 2:
        return []

    sim_values = [
        cosine_similarity(embeddings[i-1:i], embeddings[i:i+1])[0][0]
        for i in range(1, len(embeddings))
    ]

    # Dynamic threshold based on distribution
    threshold = np.percentile(sim_values, percentile)

    return [i for i, sim in enumerate(sim_values, start=1) if sim < threshold]
|
|
437
|
+
```
|
|
438
|
+
|
|
439
|
+
---
|
|
440
|
+
|
|
441
|
+
## Document-Aware Chunking
|
|
442
|
+
|
|
443
|
+
### Markdown Chunking
|
|
444
|
+
|
|
445
|
+
```python
|
|
446
|
+
import re
|
|
447
|
+
from dataclasses import dataclass
|
|
448
|
+
|
|
449
|
+
@dataclass
|
|
450
|
+
class MarkdownChunk:
|
|
451
|
+
text: str
|
|
452
|
+
heading: str | None
|
|
453
|
+
heading_level: int
|
|
454
|
+
metadata: dict
|
|
455
|
+
|
|
456
|
+
def chunk_markdown(
|
|
457
|
+
text: str,
|
|
458
|
+
max_chunk_size: int = 1500,
|
|
459
|
+
include_heading_in_chunk: bool = True
|
|
460
|
+
) -> list[MarkdownChunk]:
|
|
461
|
+
"""Chunk markdown by headers while respecting structure."""
|
|
462
|
+
# Pattern to match headers
|
|
463
|
+
header_pattern = r'^(#{1,6})\s+(.+)$'
|
|
464
|
+
|
|
465
|
+
lines = text.split('\n')
|
|
466
|
+
chunks = []
|
|
467
|
+
current_chunk_lines = []
|
|
468
|
+
current_heading = None
|
|
469
|
+
current_level = 0
|
|
470
|
+
heading_stack = [] # For breadcrumb context
|
|
471
|
+
|
|
472
|
+
for line in lines:
|
|
473
|
+
header_match = re.match(header_pattern, line)
|
|
474
|
+
|
|
475
|
+
if header_match:
|
|
476
|
+
# Save current chunk if exists
|
|
477
|
+
if current_chunk_lines:
|
|
478
|
+
chunk_text = '\n'.join(current_chunk_lines)
|
|
479
|
+
if len(chunk_text.strip()) > 0:
|
|
480
|
+
prefix = f"# {current_heading}\n\n" if include_heading_in_chunk and current_heading else ""
|
|
481
|
+
chunks.append(MarkdownChunk(
|
|
482
|
+
text=prefix + chunk_text,
|
|
483
|
+
heading=current_heading,
|
|
484
|
+
heading_level=current_level,
|
|
485
|
+
metadata={"breadcrumb": " > ".join(heading_stack)}
|
|
486
|
+
))
|
|
487
|
+
|
|
488
|
+
# Update heading context
|
|
489
|
+
level = len(header_match.group(1))
|
|
490
|
+
heading = header_match.group(2).strip()
|
|
491
|
+
|
|
492
|
+
# Maintain heading stack for breadcrumbs
|
|
493
|
+
while heading_stack and current_level >= level:
|
|
494
|
+
heading_stack.pop()
|
|
495
|
+
current_level -= 1
|
|
496
|
+
|
|
497
|
+
heading_stack.append(heading)
|
|
498
|
+
current_heading = heading
|
|
499
|
+
current_level = level
|
|
500
|
+
current_chunk_lines = []
|
|
501
|
+
|
|
502
|
+
else:
|
|
503
|
+
current_chunk_lines.append(line)
|
|
504
|
+
|
|
505
|
+
# Check chunk size
|
|
506
|
+
current_text = '\n'.join(current_chunk_lines)
|
|
507
|
+
if len(current_text) > max_chunk_size:
|
|
508
|
+
# Split at paragraph boundary
|
|
509
|
+
paragraphs = current_text.split('\n\n')
|
|
510
|
+
if len(paragraphs) > 1:
|
|
511
|
+
split_point = len('\n\n'.join(paragraphs[:-1]))
|
|
512
|
+
chunk_text = current_text[:split_point]
|
|
513
|
+
prefix = f"# {current_heading}\n\n" if include_heading_in_chunk and current_heading else ""
|
|
514
|
+
chunks.append(MarkdownChunk(
|
|
515
|
+
text=prefix + chunk_text,
|
|
516
|
+
heading=current_heading,
|
|
517
|
+
heading_level=current_level,
|
|
518
|
+
metadata={"breadcrumb": " > ".join(heading_stack)}
|
|
519
|
+
))
|
|
520
|
+
current_chunk_lines = [current_text[split_point:].strip()]
|
|
521
|
+
|
|
522
|
+
# Don't forget the last chunk
|
|
523
|
+
if current_chunk_lines:
|
|
524
|
+
chunk_text = '\n'.join(current_chunk_lines)
|
|
525
|
+
if len(chunk_text.strip()) > 0:
|
|
526
|
+
prefix = f"# {current_heading}\n\n" if include_heading_in_chunk and current_heading else ""
|
|
527
|
+
chunks.append(MarkdownChunk(
|
|
528
|
+
text=prefix + chunk_text,
|
|
529
|
+
heading=current_heading,
|
|
530
|
+
heading_level=current_level,
|
|
531
|
+
metadata={"breadcrumb": " > ".join(heading_stack)}
|
|
532
|
+
))
|
|
533
|
+
|
|
534
|
+
return chunks
|
|
535
|
+
```
|
|
536
|
+
|
|
537
|
+
### Code-Aware Chunking
|
|
538
|
+
|
|
539
|
+
```python
|
|
540
|
+
import re
|
|
541
|
+
from dataclasses import dataclass
|
|
542
|
+
|
|
543
|
+
@dataclass
|
|
544
|
+
class CodeChunk:
|
|
545
|
+
text: str
|
|
546
|
+
language: str | None
|
|
547
|
+
chunk_type: str # "code", "text", "mixed"
|
|
548
|
+
|
|
549
|
+
def chunk_with_code_blocks(
|
|
550
|
+
text: str,
|
|
551
|
+
max_chunk_size: int = 1500
|
|
552
|
+
) -> list[CodeChunk]:
|
|
553
|
+
"""Chunk text while keeping code blocks intact."""
|
|
554
|
+
# Pattern to match code blocks
|
|
555
|
+
code_block_pattern = r'```(\w+)?\n(.*?)```'
|
|
556
|
+
|
|
557
|
+
chunks = []
|
|
558
|
+
last_end = 0
|
|
559
|
+
|
|
560
|
+
for match in re.finditer(code_block_pattern, text, re.DOTALL):
|
|
561
|
+
# Text before code block
|
|
562
|
+
text_before = text[last_end:match.start()].strip()
|
|
563
|
+
if text_before:
|
|
564
|
+
# Chunk the text portion
|
|
565
|
+
text_chunks = recursive_chunk(text_before, max_chunk_size)
|
|
566
|
+
chunks.extend([
|
|
567
|
+
CodeChunk(text=t, language=None, chunk_type="text")
|
|
568
|
+
for t in text_chunks
|
|
569
|
+
])
|
|
570
|
+
|
|
571
|
+
# Code block (keep intact if possible)
|
|
572
|
+
language = match.group(1)
|
|
573
|
+
code_content = match.group(2)
|
|
574
|
+
full_block = match.group(0)
|
|
575
|
+
|
|
576
|
+
if len(full_block) <= max_chunk_size:
|
|
577
|
+
chunks.append(CodeChunk(
|
|
578
|
+
text=full_block,
|
|
579
|
+
language=language,
|
|
580
|
+
chunk_type="code"
|
|
581
|
+
))
|
|
582
|
+
else:
|
|
583
|
+
# Split large code blocks by function/class
|
|
584
|
+
code_chunks = split_code_block(code_content, language, max_chunk_size)
|
|
585
|
+
chunks.extend(code_chunks)
|
|
586
|
+
|
|
587
|
+
last_end = match.end()
|
|
588
|
+
|
|
589
|
+
# Remaining text after last code block
|
|
590
|
+
remaining = text[last_end:].strip()
|
|
591
|
+
if remaining:
|
|
592
|
+
text_chunks = recursive_chunk(remaining, max_chunk_size)
|
|
593
|
+
chunks.extend([
|
|
594
|
+
CodeChunk(text=t, language=None, chunk_type="text")
|
|
595
|
+
for t in text_chunks
|
|
596
|
+
])
|
|
597
|
+
|
|
598
|
+
return chunks
|
|
599
|
+
|
|
600
|
+
def split_code_block(code: str, language: str, max_size: int) -> list[CodeChunk]:
|
|
601
|
+
"""Split code block at logical boundaries."""
|
|
602
|
+
# Simple function/class boundary splitting for Python
|
|
603
|
+
if language == "python":
|
|
604
|
+
pattern = r'\n(?=def |class |async def )'
|
|
605
|
+
elif language in ["javascript", "typescript"]:
|
|
606
|
+
pattern = r'\n(?=function |class |const |export )'
|
|
607
|
+
else:
|
|
608
|
+
pattern = r'\n\n'
|
|
609
|
+
|
|
610
|
+
parts = re.split(pattern, code)
|
|
611
|
+
chunks = []
|
|
612
|
+
current = ""
|
|
613
|
+
|
|
614
|
+
for part in parts:
|
|
615
|
+
if len(current) + len(part) > max_size and current:
|
|
616
|
+
chunks.append(CodeChunk(
|
|
617
|
+
text=f"```{language}\n{current}```",
|
|
618
|
+
language=language,
|
|
619
|
+
chunk_type="code"
|
|
620
|
+
))
|
|
621
|
+
current = part
|
|
622
|
+
else:
|
|
623
|
+
current += part
|
|
624
|
+
|
|
625
|
+
if current:
|
|
626
|
+
chunks.append(CodeChunk(
|
|
627
|
+
text=f"```{language}\n{current}```",
|
|
628
|
+
language=language,
|
|
629
|
+
chunk_type="code"
|
|
630
|
+
))
|
|
631
|
+
|
|
632
|
+
return chunks
|
|
633
|
+
```
|
|
634
|
+
|
|
635
|
+
---
|
|
636
|
+
|
|
637
|
+
## Contextual/Agentic Chunking
|
|
638
|
+
|
|
639
|
+
```python
|
|
640
|
+
from openai import OpenAI
|
|
641
|
+
|
|
642
|
+
def contextual_chunk(
    document: str,
    max_chunk_size: int = 1500
) -> list[dict]:
    """Chunk a document and attach an LLM-written context blurb to each chunk.

    The document is first split with ``recursive_chunk``; one chat-completion
    request is then made per chunk to obtain a short description of it.

    Args:
        document: Full text to chunk.
        max_chunk_size: Size limit forwarded to ``recursive_chunk``.

    Returns:
        One dict per chunk with the keys ``"text"`` (the raw chunk),
        ``"context"`` (the generated description, ``""`` if the model
        returned no text) and ``"text_with_context"`` (the string intended
        for embedding/indexing).
    """
    # First, do structural chunking
    base_chunks = recursive_chunk(document, max_chunk_size)
    if not base_chunks:
        # Nothing to describe: avoid constructing a client (and requiring
        # credentials) for an empty document.
        return []

    system_prompt = """Provide a brief context for this document chunk.
Include: what topic it covers, how it relates to the broader document,
and key concepts mentioned. Keep it under 100 words."""

    client = OpenAI()
    contextualized_chunks = []

    for chunk in base_chunks:
        # Generate contextual summary
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"Document excerpt:\n\n{chunk}"},
            ],
            max_tokens=150
        )

        # ``content`` can be None (e.g. a filtered or empty completion);
        # never let the literal string "None" end up in the indexed text.
        context = response.choices[0].message.content or ""

        if context:
            text_with_context = f"Context: {context}\n\nContent: {chunk}"
        else:
            text_with_context = chunk

        contextualized_chunks.append({
            "text": chunk,
            "context": context,
            "text_with_context": text_with_context
        })

    return contextualized_chunks
|
|
681
|
+
```
|
|
682
|
+
|
|
683
|
+
### Propositions-Based Chunking
|
|
684
|
+
|
|
685
|
+
```python
|
|
686
|
+
def extract_propositions(text: str) -> list[str]:
    """Extract atomic propositions from text using an LLM.

    Args:
        text: Passage to decompose.

    Returns:
        The propositions as a list of strings; empty when ``text`` is blank
        or the model returns nothing usable.

    Raises:
        json.JSONDecodeError: If the model reply is not valid JSON.
    """
    import json

    if not text.strip():
        return []

    client = OpenAI()

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                # JSON mode always yields an *object*, so the prompt must
                # name the key that is read back below; asking for a bare
                # array leaves the key up to the model.
                "content": """Extract atomic propositions from the text.
Each proposition should:
- Be a single, complete fact
- Be self-contained (understandable without context)
- Include necessary entity references

Return a JSON object of the form {"propositions": ["...", "..."]}."""
            },
            {
                "role": "user",
                "content": text
            }
        ],
        response_format={"type": "json_object"}
    )

    payload = json.loads(response.choices[0].message.content or "{}")

    # Tolerate a bare array as well as the requested object wrapper.
    if isinstance(payload, dict):
        payload = payload.get("propositions", [])
    if not isinstance(payload, list):
        return []

    return [item for item in payload if isinstance(item, str)]
|
|
714
|
+
|
|
715
|
+
# Usage: For very fine-grained retrieval
|
|
716
|
+
propositions = extract_propositions(document_text)
|
|
717
|
+
# Each proposition becomes its own retrievable unit
|
|
718
|
+
```
|
|
719
|
+
|
|
720
|
+
---
|
|
721
|
+
|
|
722
|
+
## Late Chunking (for Long-Context Embeddings)
|
|
723
|
+
|
|
724
|
+
```python
|
|
725
|
+
from transformers import AutoTokenizer, AutoModel
|
|
726
|
+
import torch
|
|
727
|
+
|
|
728
|
+
class LateChunker:
    """
    Late chunking: embed full document, then pool token embeddings into chunks.
    Preserves full document context while creating retrievable chunks.
    """

    def __init__(self, model_name: str = "jinaai/jina-embeddings-v2-base-en"):
        """Load the tokenizer and model for ``model_name`` and switch to eval mode."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # NOTE(security): trust_remote_code=True runs Python code shipped in
        # the model repository; only point this at repositories you trust.
        self.model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
        self.model.eval()

    def chunk_and_embed(
        self,
        text: str,
        chunk_size: int = 512,
        overlap: int = 64
    ) -> list[dict]:
        """
        Embed full document, then create chunk embeddings via mean pooling.

        Args:
            text: Document to embed. Input is truncated to 8192 tokens, so
                anything beyond that is not represented in the output.
            chunk_size: Window length in tokens.
            overlap: Number of tokens shared by consecutive windows.

        Returns:
            One dict per window with ``"text"`` (decoded window),
            ``"embedding"`` (mean of the window's token embeddings as a NumPy
            array), ``"start_token"`` and ``"end_token"`` (half-open range).

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``overlap`` is
                not in ``[0, chunk_size)``.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if not 0 <= overlap < chunk_size:
            # overlap >= chunk_size would make the window never advance.
            raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

        # Tokenize full document
        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=8192  # Model's max context
        )

        # Get token-level embeddings
        with torch.no_grad():
            outputs = self.model(**inputs)
            token_embeddings = outputs.last_hidden_state[0]  # [seq_len, hidden_dim]

        input_ids = inputs["input_ids"][0]
        seq_len = token_embeddings.shape[0]
        stride = chunk_size - overlap

        # Create chunks from token embeddings
        chunks = []
        for start in range(0, seq_len, stride):
            end = min(start + chunk_size, seq_len)

            # Mean pool token embeddings for this chunk
            chunk_embedding = token_embeddings[start:end].mean(dim=0).cpu().numpy()

            # Reconstruct text for this chunk
            chunk_text = self.tokenizer.decode(
                input_ids[start:end], skip_special_tokens=True
            )

            chunks.append({
                "text": chunk_text,
                "embedding": chunk_embedding,
                "start_token": start,
                "end_token": end
            })

            if end == seq_len:
                # The window reached the end of the sequence. Stepping back
                # by ``overlap`` here would re-emit this window forever.
                break

        return chunks
|
|
789
|
+
|
|
790
|
+
# Usage
|
|
791
|
+
late_chunker = LateChunker()
|
|
792
|
+
chunks_with_embeddings = late_chunker.chunk_and_embed(
|
|
793
|
+
long_document,
|
|
794
|
+
chunk_size=512,
|
|
795
|
+
overlap=64
|
|
796
|
+
)
|
|
797
|
+
```
|
|
798
|
+
|
|
799
|
+
---
|
|
800
|
+
|
|
801
|
+
## Metadata Enrichment
|
|
802
|
+
|
|
803
|
+
```python
|
|
804
|
+
from dataclasses import dataclass
|
|
805
|
+
from datetime import datetime
|
|
806
|
+
import hashlib
|
|
807
|
+
|
|
808
|
+
@dataclass
|
|
809
|
+
class EnrichedChunk:
|
|
810
|
+
text: str
|
|
811
|
+
embedding: list[float] | None
|
|
812
|
+
metadata: dict
|
|
813
|
+
|
|
814
|
+
def enrich_chunk(
|
|
815
|
+
text: str,
|
|
816
|
+
source_file: str,
|
|
817
|
+
chunk_index: int,
|
|
818
|
+
total_chunks: int,
|
|
819
|
+
additional_metadata: dict | None = None
|
|
820
|
+
) -> EnrichedChunk:
|
|
821
|
+
"""Add comprehensive metadata to chunk."""
|
|
822
|
+
metadata = {
|
|
823
|
+
# Source tracking
|
|
824
|
+
"source": source_file,
|
|
825
|
+
"chunk_index": chunk_index,
|
|
826
|
+
"total_chunks": total_chunks,
|
|
827
|
+
|
|
828
|
+
# Content characteristics
|
|
829
|
+
"char_count": len(text),
|
|
830
|
+
"word_count": len(text.split()),
|
|
831
|
+
"content_hash": hashlib.md5(text.encode()).hexdigest()[:12],
|
|
832
|
+
|
|
833
|
+
# Temporal
|
|
834
|
+
"indexed_at": datetime.utcnow().isoformat(),
|
|
835
|
+
|
|
836
|
+
# Position context
|
|
837
|
+
"position": "start" if chunk_index == 0 else (
|
|
838
|
+
"end" if chunk_index == total_chunks - 1 else "middle"
|
|
839
|
+
)
|
|
840
|
+
}
|
|
841
|
+
|
|
842
|
+
if additional_metadata:
|
|
843
|
+
metadata.update(additional_metadata)
|
|
844
|
+
|
|
845
|
+
return EnrichedChunk(text=text, embedding=None, metadata=metadata)
|
|
846
|
+
```
|
|
847
|
+
|
|
848
|
+
---
|
|
849
|
+
|
|
850
|
+
## Chunk Size Selection Guide
|
|
851
|
+
|
|
852
|
+
| Document Type | Recommended Size | Overlap (tokens) | Rationale |
|
|
853
|
+
|--------------|------------------|---------|-----------|
|
|
854
|
+
| FAQ/Q&A | 200-400 tokens | 20-50 | Keep Q&A pairs together |
|
|
855
|
+
| Technical docs | 400-600 tokens | 50-100 | Balance context vs precision |
|
|
856
|
+
| Legal/contracts | 600-800 tokens | 100-150 | Preserve clause context |
|
|
857
|
+
| Code documentation | 300-500 tokens | 50-100 | Keep function docs together |
|
|
858
|
+
| Chat transcripts | 150-300 tokens | 25-50 | Natural turn boundaries |
|
|
859
|
+
| Research papers | 500-800 tokens | 100-200 | Section-level coherence |
|
|
860
|
+
|
|
861
|
+
---
|
|
862
|
+
|
|
863
|
+
## Quick Reference
|
|
864
|
+
|
|
865
|
+
| Strategy | Use Case | Code Pattern |
|
|
866
|
+
|----------|----------|--------------|
|
|
867
|
+
| Fixed-size | Logs, baseline | `text[i:i+chunk_size]` |
|
|
868
|
+
| Recursive | General text | Split by `["\n\n", "\n", ". "]` |
|
|
869
|
+
| Sentence | Q&A content | `sent_tokenize()` + merge |
|
|
870
|
+
| Semantic | Technical docs | Similarity-based breaks |
|
|
871
|
+
| Markdown | Documentation | Header-aware splitting |
|
|
872
|
+
| Late chunking | Long-context models | Embed full, pool chunks |
|
|
873
|
+
|
|
874
|
+
## Related Skills
|
|
875
|
+
|
|
876
|
+
- **RAG Architect** - Integration with vector databases
|
|
877
|
+
- **Python Pro** - Preprocessing pipelines
|
|
878
|
+
- **NLP Engineer** - Tokenization and text processing
|