mdcontext 0.0.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.changeset/README.md +28 -0
- package/.changeset/config.json +11 -0
- package/.claude/settings.local.json +25 -0
- package/.github/workflows/ci.yml +83 -0
- package/.github/workflows/claude-code-review.yml +44 -0
- package/.github/workflows/claude.yml +85 -0
- package/.github/workflows/release.yml +113 -0
- package/.tldrignore +112 -0
- package/BACKLOG.md +338 -0
- package/CONTRIBUTING.md +186 -0
- package/NOTES/NOTES +44 -0
- package/README.md +434 -11
- package/biome.json +36 -0
- package/cspell.config.yaml +14 -0
- package/dist/chunk-23UPXDNL.js +3044 -0
- package/dist/chunk-2W7MO2DL.js +1366 -0
- package/dist/chunk-3NUAZGMA.js +1689 -0
- package/dist/chunk-7TOWB2XB.js +366 -0
- package/dist/chunk-7XOTOADQ.js +3065 -0
- package/dist/chunk-AH2PDM2K.js +3042 -0
- package/dist/chunk-BNXWSZ63.js +3742 -0
- package/dist/chunk-BTL5DJVU.js +3222 -0
- package/dist/chunk-HDHYG7E4.js +104 -0
- package/dist/chunk-HLR4KZBP.js +3234 -0
- package/dist/chunk-IP3FRFEB.js +1045 -0
- package/dist/chunk-KHU56VDO.js +3042 -0
- package/dist/chunk-KRYIFLQR.js +88 -0
- package/dist/chunk-LBSDNLEM.js +287 -0
- package/dist/chunk-MNTQ7HCP.js +2643 -0
- package/dist/chunk-MUJELQQ6.js +1387 -0
- package/dist/chunk-MXJGMSLV.js +2199 -0
- package/dist/chunk-N6QJGC3Z.js +2636 -0
- package/dist/chunk-OBELGBPM.js +1713 -0
- package/dist/chunk-OT7R5XTA.js +3192 -0
- package/dist/chunk-P7X4RA2T.js +106 -0
- package/dist/chunk-PIDUQNC2.js +3185 -0
- package/dist/chunk-POGCDIH4.js +3187 -0
- package/dist/chunk-PSIEOQGZ.js +3043 -0
- package/dist/chunk-PVRT3IHA.js +3238 -0
- package/dist/chunk-QNN4TT23.js +1430 -0
- package/dist/chunk-RE3R45RJ.js +3042 -0
- package/dist/chunk-S7E6TFX6.js +803 -0
- package/dist/chunk-SG6GLU4U.js +1378 -0
- package/dist/chunk-SJCDV2ST.js +274 -0
- package/dist/chunk-SYE5XLF3.js +104 -0
- package/dist/chunk-T5VLYBZD.js +103 -0
- package/dist/chunk-TOQB7VWU.js +3238 -0
- package/dist/chunk-VFNMZ4ZQ.js +3228 -0
- package/dist/chunk-VVTGZNBT.js +1629 -0
- package/dist/chunk-W7Q4RFEV.js +104 -0
- package/dist/chunk-XTYYVRLO.js +3190 -0
- package/dist/chunk-Y6MDYVJD.js +3063 -0
- package/dist/cli/main.d.ts +1 -0
- package/dist/cli/main.js +5458 -0
- package/dist/index.d.ts +653 -0
- package/dist/index.js +79 -0
- package/dist/mcp/server.d.ts +1 -0
- package/dist/mcp/server.js +472 -0
- package/dist/schema-BAWSG7KY.js +22 -0
- package/dist/schema-E3QUPL26.js +20 -0
- package/dist/schema-EHL7WUT6.js +20 -0
- package/docs/019-USAGE.md +625 -0
- package/docs/020-current-implementation.md +364 -0
- package/docs/021-DOGFOODING-FINDINGS.md +175 -0
- package/docs/BACKLOG.md +80 -0
- package/docs/CONFIG.md +1123 -0
- package/docs/DESIGN.md +439 -0
- package/docs/ERRORS.md +383 -0
- package/docs/PROJECT.md +88 -0
- package/docs/ROADMAP.md +407 -0
- package/docs/summarization.md +320 -0
- package/docs/test-links.md +9 -0
- package/justfile +40 -0
- package/package.json +74 -9
- package/pnpm-workspace.yaml +5 -0
- package/research/INDEX.md +315 -0
- package/research/code-review/README.md +90 -0
- package/research/code-review/cli-error-handling-review.md +979 -0
- package/research/code-review/code-review-validation-report.md +464 -0
- package/research/code-review/main-ts-review.md +1128 -0
- package/research/config-analysis/01-current-implementation.md +470 -0
- package/research/config-analysis/02-strategy-recommendation.md +428 -0
- package/research/config-analysis/03-task-candidates.md +715 -0
- package/research/config-analysis/033-research-configuration-management.md +828 -0
- package/research/config-analysis/034-research-effect-cli-config.md +1504 -0
- package/research/config-analysis/04-consolidated-task-candidates.md +277 -0
- package/research/config-docs/SUMMARY.md +357 -0
- package/research/config-docs/TEST-RESULTS.md +776 -0
- package/research/config-docs/TODO.md +542 -0
- package/research/config-docs/analysis.md +744 -0
- package/research/config-docs/fix-validation.md +502 -0
- package/research/config-docs/help-audit.md +264 -0
- package/research/config-docs/help-system-analysis.md +890 -0
- package/research/dogfood/consolidated-tool-evaluation.md +373 -0
- package/research/dogfood/strategy-a/a-synthesis.md +184 -0
- package/research/dogfood/strategy-a/a1-docs.md +226 -0
- package/research/dogfood/strategy-a/a2-amorphic.md +156 -0
- package/research/dogfood/strategy-a/a3-llm.md +164 -0
- package/research/dogfood/strategy-b/b-synthesis.md +228 -0
- package/research/dogfood/strategy-b/b1-architecture.md +207 -0
- package/research/dogfood/strategy-b/b2-gaps.md +258 -0
- package/research/dogfood/strategy-b/b3-workflows.md +250 -0
- package/research/dogfood/strategy-c/c-synthesis.md +451 -0
- package/research/dogfood/strategy-c/c1-explorer.md +192 -0
- package/research/dogfood/strategy-c/c2-diver-memory.md +145 -0
- package/research/dogfood/strategy-c/c3-diver-control.md +148 -0
- package/research/dogfood/strategy-c/c4-diver-failure.md +151 -0
- package/research/dogfood/strategy-c/c5-diver-execution.md +221 -0
- package/research/dogfood/strategy-c/c6-diver-org.md +221 -0
- package/research/effect-cli-error-handling.md +845 -0
- package/research/effect-errors-as-values.md +943 -0
- package/research/errors-task-analysis/00-consolidated-tasks.md +207 -0
- package/research/errors-task-analysis/cli-commands-analysis.md +909 -0
- package/research/errors-task-analysis/embeddings-analysis.md +709 -0
- package/research/errors-task-analysis/index-search-analysis.md +812 -0
- package/research/frontmatter/COMMENTS-ARE-SKIPPED.md +149 -0
- package/research/frontmatter/LLM-CODE-NAVIGATION.md +276 -0
- package/research/issue-review.md +603 -0
- package/research/llm-summarization/agent-cli-tools-2026.md +1082 -0
- package/research/llm-summarization/alternative-providers-2026.md +1428 -0
- package/research/llm-summarization/anthropic-2026.md +367 -0
- package/research/llm-summarization/claude-cli-integration.md +1706 -0
- package/research/llm-summarization/cli-integration-patterns.md +3155 -0
- package/research/llm-summarization/openai-2026.md +473 -0
- package/research/llm-summarization/openai-compatible-providers-2026.md +1022 -0
- package/research/llm-summarization/opencode-cli-integration.md +1552 -0
- package/research/llm-summarization/prompt-engineering-2026.md +1426 -0
- package/research/llm-summarization/prototype-results.md +56 -0
- package/research/llm-summarization/provider-switching-patterns-2026.md +2153 -0
- package/research/llm-summarization/typescript-llm-libraries-2026.md +2436 -0
- package/research/mdcontext-error-analysis.md +521 -0
- package/research/mdcontext-pudding/00-EXECUTIVE-SUMMARY.md +282 -0
- package/research/mdcontext-pudding/01-index-embed.md +956 -0
- package/research/mdcontext-pudding/02-search-COMMANDS.md +142 -0
- package/research/mdcontext-pudding/02-search-SUMMARY.md +146 -0
- package/research/mdcontext-pudding/02-search.md +970 -0
- package/research/mdcontext-pudding/03-context.md +779 -0
- package/research/mdcontext-pudding/04-navigation-and-analytics.md +803 -0
- package/research/mdcontext-pudding/04-tree.md +704 -0
- package/research/mdcontext-pudding/05-config.md +1038 -0
- package/research/mdcontext-pudding/06-links-summary.txt +87 -0
- package/research/mdcontext-pudding/06-links.md +679 -0
- package/research/mdcontext-pudding/07-stats.md +693 -0
- package/research/mdcontext-pudding/BUG-FIX-PLAN.md +388 -0
- package/research/mdcontext-pudding/P0-BUG-VALIDATION.md +167 -0
- package/research/mdcontext-pudding/README.md +168 -0
- package/research/mdcontext-pudding/TESTING-SUMMARY.md +128 -0
- package/research/npm_publish/011-npm-workflow-research-agent2.md +792 -0
- package/research/npm_publish/012-npm-workflow-research-agent1.md +530 -0
- package/research/npm_publish/013-npm-workflow-research-agent3.md +722 -0
- package/research/npm_publish/014-npm-workflow-synthesis.md +556 -0
- package/research/npm_publish/031-npm-workflow-task-analysis.md +134 -0
- package/research/research-quality-review.md +834 -0
- package/research/semantic-search/002-research-embedding-models.md +490 -0
- package/research/semantic-search/003-research-rag-alternatives.md +523 -0
- package/research/semantic-search/004-research-vector-search.md +841 -0
- package/research/semantic-search/032-research-semantic-search.md +427 -0
- package/research/semantic-search/embedding-text-analysis.md +156 -0
- package/research/semantic-search/multi-word-failure-reproduction.md +171 -0
- package/research/semantic-search/query-processing-analysis.md +207 -0
- package/research/semantic-search/root-cause-and-solution.md +114 -0
- package/research/semantic-search/threshold-validation-report.md +69 -0
- package/research/semantic-search/vector-search-analysis.md +63 -0
- package/research/task-management-2026/00-synthesis-recommendations.md +295 -0
- package/research/task-management-2026/01-ai-workflow-tools.md +416 -0
- package/research/task-management-2026/02-agent-framework-patterns.md +476 -0
- package/research/task-management-2026/03-lightweight-file-based.md +567 -0
- package/research/task-management-2026/04-established-tools-ai-features.md +541 -0
- package/research/task-management-2026/linear/01-core-features-workflow.md +771 -0
- package/research/task-management-2026/linear/02-api-integrations.md +930 -0
- package/research/task-management-2026/linear/03-ai-features.md +368 -0
- package/research/task-management-2026/linear/04-pricing-setup.md +205 -0
- package/research/task-management-2026/linear/05-usage-patterns-best-practices.md +605 -0
- package/research/test-path-issues.md +276 -0
- package/review/ALP-76/1-error-type-design.md +962 -0
- package/review/ALP-76/2-error-handling-patterns.md +906 -0
- package/review/ALP-76/3-error-presentation.md +624 -0
- package/review/ALP-76/4-test-coverage.md +625 -0
- package/review/ALP-76/5-migration-completeness.md +440 -0
- package/review/ALP-76/6-effect-best-practices.md +755 -0
- package/scripts/apply-branch-protection.sh +47 -0
- package/scripts/branch-protection-templates.json +79 -0
- package/scripts/prototype-summarization.ts +346 -0
- package/scripts/rebuild-hnswlib.js +58 -0
- package/scripts/setup-branch-protection.sh +64 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/active-provider.json +7 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/bm25.json +541 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/bm25.meta.json +5 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/config.json +8 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.bin +0 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.meta.bin +0 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/indexes/documents.json +60 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/indexes/links.json +13 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/indexes/sections.json +1197 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/configuration-management.md +99 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/distributed-systems.md +92 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/error-handling.md +78 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/failure-automation.md +55 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/job-context.md +69 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/process-orchestration.md +99 -0
- package/src/cli/argv-preprocessor.test.ts +210 -0
- package/src/cli/argv-preprocessor.ts +202 -0
- package/src/cli/cli.test.ts +627 -0
- package/src/cli/commands/backlinks.ts +54 -0
- package/src/cli/commands/config-cmd.ts +642 -0
- package/src/cli/commands/context.ts +285 -0
- package/src/cli/commands/duplicates.ts +122 -0
- package/src/cli/commands/embeddings.ts +529 -0
- package/src/cli/commands/index-cmd.ts +480 -0
- package/src/cli/commands/index.ts +16 -0
- package/src/cli/commands/links.ts +52 -0
- package/src/cli/commands/search.ts +1281 -0
- package/src/cli/commands/stats.ts +149 -0
- package/src/cli/commands/tree.ts +128 -0
- package/src/cli/config-layer.ts +176 -0
- package/src/cli/error-handler.test.ts +235 -0
- package/src/cli/error-handler.ts +655 -0
- package/src/cli/flag-schemas.ts +341 -0
- package/src/cli/help.ts +588 -0
- package/src/cli/index.ts +9 -0
- package/src/cli/main.ts +435 -0
- package/src/cli/options.ts +41 -0
- package/src/cli/shared-error-handling.ts +199 -0
- package/src/cli/typo-suggester.test.ts +105 -0
- package/src/cli/typo-suggester.ts +130 -0
- package/src/cli/utils.ts +259 -0
- package/src/config/file-provider.test.ts +320 -0
- package/src/config/file-provider.ts +273 -0
- package/src/config/index.ts +72 -0
- package/src/config/integration.test.ts +667 -0
- package/src/config/precedence.test.ts +277 -0
- package/src/config/precedence.ts +451 -0
- package/src/config/schema.test.ts +414 -0
- package/src/config/schema.ts +603 -0
- package/src/config/service.test.ts +320 -0
- package/src/config/service.ts +243 -0
- package/src/config/testing.test.ts +264 -0
- package/src/config/testing.ts +110 -0
- package/src/core/index.ts +1 -0
- package/src/core/types.ts +113 -0
- package/src/duplicates/detector.test.ts +183 -0
- package/src/duplicates/detector.ts +414 -0
- package/src/duplicates/index.ts +18 -0
- package/src/embeddings/embedding-namespace.test.ts +300 -0
- package/src/embeddings/embedding-namespace.ts +947 -0
- package/src/embeddings/heading-boost.test.ts +222 -0
- package/src/embeddings/hnsw-build-options.test.ts +198 -0
- package/src/embeddings/hyde.test.ts +272 -0
- package/src/embeddings/hyde.ts +264 -0
- package/src/embeddings/index.ts +10 -0
- package/src/embeddings/openai-provider.ts +414 -0
- package/src/embeddings/pricing.json +22 -0
- package/src/embeddings/provider-constants.ts +204 -0
- package/src/embeddings/provider-errors.test.ts +967 -0
- package/src/embeddings/provider-errors.ts +565 -0
- package/src/embeddings/provider-factory.test.ts +240 -0
- package/src/embeddings/provider-factory.ts +225 -0
- package/src/embeddings/provider-integration.test.ts +788 -0
- package/src/embeddings/query-preprocessing.test.ts +187 -0
- package/src/embeddings/semantic-search-threshold.test.ts +508 -0
- package/src/embeddings/semantic-search.ts +1270 -0
- package/src/embeddings/types.ts +359 -0
- package/src/embeddings/vector-store.ts +708 -0
- package/src/embeddings/voyage-provider.ts +313 -0
- package/src/errors/errors.test.ts +845 -0
- package/src/errors/index.ts +533 -0
- package/src/index/ignore-patterns.test.ts +354 -0
- package/src/index/ignore-patterns.ts +305 -0
- package/src/index/index.ts +4 -0
- package/src/index/indexer.ts +684 -0
- package/src/index/storage.ts +260 -0
- package/src/index/types.ts +147 -0
- package/src/index/watcher.ts +189 -0
- package/src/index.ts +30 -0
- package/src/integration/search-keyword.test.ts +678 -0
- package/src/mcp/server.ts +612 -0
- package/src/parser/index.ts +1 -0
- package/src/parser/parser.test.ts +291 -0
- package/src/parser/parser.ts +394 -0
- package/src/parser/section-filter.test.ts +277 -0
- package/src/parser/section-filter.ts +392 -0
- package/src/search/__tests__/hybrid-search.test.ts +650 -0
- package/src/search/bm25-store.ts +366 -0
- package/src/search/cross-encoder.test.ts +253 -0
- package/src/search/cross-encoder.ts +406 -0
- package/src/search/fuzzy-search.test.ts +419 -0
- package/src/search/fuzzy-search.ts +273 -0
- package/src/search/hybrid-search.ts +448 -0
- package/src/search/path-matcher.test.ts +276 -0
- package/src/search/path-matcher.ts +33 -0
- package/src/search/query-parser.test.ts +260 -0
- package/src/search/query-parser.ts +319 -0
- package/src/search/searcher.test.ts +280 -0
- package/src/search/searcher.ts +724 -0
- package/src/search/wink-bm25.d.ts +30 -0
- package/src/summarization/cli-providers/claude.ts +202 -0
- package/src/summarization/cli-providers/detection.test.ts +273 -0
- package/src/summarization/cli-providers/detection.ts +118 -0
- package/src/summarization/cli-providers/index.ts +8 -0
- package/src/summarization/cost.test.ts +139 -0
- package/src/summarization/cost.ts +102 -0
- package/src/summarization/error-handler.test.ts +127 -0
- package/src/summarization/error-handler.ts +111 -0
- package/src/summarization/index.ts +102 -0
- package/src/summarization/pipeline.test.ts +498 -0
- package/src/summarization/pipeline.ts +231 -0
- package/src/summarization/prompts.test.ts +269 -0
- package/src/summarization/prompts.ts +133 -0
- package/src/summarization/provider-factory.test.ts +396 -0
- package/src/summarization/provider-factory.ts +178 -0
- package/src/summarization/types.ts +184 -0
- package/src/summarize/budget-bugs.test.ts +620 -0
- package/src/summarize/formatters.ts +419 -0
- package/src/summarize/index.ts +20 -0
- package/src/summarize/summarizer.test.ts +275 -0
- package/src/summarize/summarizer.ts +597 -0
- package/src/summarize/verify-bugs.test.ts +238 -0
- package/src/types/huggingface-transformers.d.ts +66 -0
- package/src/utils/index.ts +1 -0
- package/src/utils/tokens.test.ts +142 -0
- package/src/utils/tokens.ts +186 -0
- package/tests/fixtures/cli/.mdcontext/active-provider.json +7 -0
- package/tests/fixtures/cli/.mdcontext/config.json +8 -0
- package/tests/fixtures/cli/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.bin +0 -0
- package/tests/fixtures/cli/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.meta.bin +0 -0
- package/tests/fixtures/cli/.mdcontext/indexes/documents.json +33 -0
- package/tests/fixtures/cli/.mdcontext/indexes/links.json +12 -0
- package/tests/fixtures/cli/.mdcontext/indexes/sections.json +247 -0
- package/tests/fixtures/cli/README.md +9 -0
- package/tests/fixtures/cli/api-reference.md +11 -0
- package/tests/fixtures/cli/getting-started.md +11 -0
- package/tests/integration/embed-index.test.ts +712 -0
- package/tests/integration/search-context.test.ts +469 -0
- package/tests/integration/search-semantic.test.ts +522 -0
- package/tsconfig.json +26 -0
- package/vitest.config.ts +16 -0
- package/vitest.setup.ts +12 -0
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Verification tests for the three token budget bugs
|
|
3
|
+
*
|
|
4
|
+
* Bug 1: Orphaned children - if parent is truncated, children are lost
|
|
5
|
+
* Bug 2: Token estimation inaccuracy - 4 chars/token can be ±30% off
|
|
6
|
+
* Bug 3: Formatting overhead under-estimated - 50 token reserve insufficient
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import { Effect } from 'effect'
|
|
10
|
+
import { describe, expect, it } from 'vitest'
|
|
11
|
+
import { countTokens, countTokensApprox } from '../utils/tokens.js'
|
|
12
|
+
import { formatSummary } from './formatters.js'
|
|
13
|
+
import type { DocumentSummary } from './summarizer.js'
|
|
14
|
+
|
|
15
|
+
describe('verify token budget bugs', () => {
|
|
16
|
+
describe('Bug 1: Orphaned children', () => {
  it('should rescue children when parent is too large', () => {
    // The parent section carries a deliberately oversized summary (the
    // original author estimates ~400 tokens) while its only child is tiny.
    const oversizedSummary = 'Large content '.repeat(100)

    // Chosen so the parent cannot fit but the child can; per the original
    // note it was raised to leave room for the truncation warning that
    // lists sections.
    const budget = 180

    const doc: DocumentSummary = {
      path: '/test/file.md',
      title: 'Test',
      originalTokens: 1000,
      summaryTokens: 500,
      compressionRatio: 0.5,
      sections: [
        {
          heading: 'Huge Parent',
          level: 2,
          originalTokens: 500,
          summaryTokens: 400,
          summary: oversizedSummary,
          children: [
            {
              heading: 'Tiny Child',
              level: 3,
              originalTokens: 20,
              summaryTokens: 5,
              summary: 'Small.',
              children: [],
              hasCode: false,
              hasList: false,
              hasTable: false,
            },
          ],
          hasCode: false,
          hasList: false,
          hasTable: false,
        },
      ],
      keyTopics: [],
    }

    const rendered = formatSummary(doc, { maxTokens: budget })

    // The child must survive even though its parent was dropped...
    expect(rendered).toContain('Tiny Child')
    // ...and neither the parent heading nor its body may leak through.
    expect(rendered).not.toContain('Huge Parent')
    expect(rendered).not.toContain('Large content')
  })
})
|
|
65
|
+
|
|
66
|
+
describe('Bug 2: Token estimation accuracy', () => {
  /**
   * Compares the synchronous approximation (`countTokensApprox`) with the
   * Effect-based `countTokens` for a range of inputs.
   *
   * Each assertion carries the offending input in its failure message,
   * because the checks run inside a loop and a bare `expect` would not say
   * which case failed.
   */
  it('approximation should never under-count vs tiktoken', async () => {
    const testCases = [
      'Hello world, this is a simple test.',
      '```typescript\nfunction foo() { return 42; }\n```',
      '/very/long/path/to/deeply/nested/directory/structure/file.md',
      '# Title\n\nSome prose with `code` and path /src/utils.ts.\n\n```js\nconst x = 1;\n```',
      'Hello, world! How are you? Fine, thanks... Well: good! (Yes, really.)',
      '这是中文文本测试。',
      '👋 Hello 🌍 World 🎉',
    ]

    for (const text of testCases) {
      const approx = countTokensApprox(text)
      const actual = await Effect.runPromise(countTokens(text))

      // CRITICAL: the approximation must NEVER be less than the actual
      // count; under-counting causes budget violations.
      expect(
        approx,
        `approximation under-counts for ${JSON.stringify(text)}`,
      ).toBeGreaterThanOrEqual(actual)
    }
  })

  it('approximation should be reasonably close (within 2x)', async () => {
    const testCases = [
      'Hello world, this is a simple test.',
      '```typescript\nfunction foo() { return 42; }\n```',
      '/very/long/path/to/deeply/nested/directory/structure/file.md',
    ]

    for (const text of testCases) {
      const approx = countTokensApprox(text)
      const actual = await Effect.runPromise(countTokens(text))

      // Over-estimation is tolerated, but only up to twice the actual count.
      expect(
        approx,
        `approximation exceeds 2x actual for ${JSON.stringify(text)}`,
      ).toBeLessThanOrEqual(actual * 2)
    }
  })
})
|
|
104
|
+
|
|
105
|
+
describe('Bug 3: Formatting overhead', () => {
  /**
   * Formats summaries whose metadata (path, title, topics) is long and checks
   * that the approximate token count of the output stays within `maxTokens`.
   */
  it('output should stay within budget even with long paths', () => {
    const mockSummary: DocumentSummary = {
      path: '/very/long/path/to/some/deeply/nested/directory/structure/with/many/segments/file.md',
      title:
        'A Document With A Very Long Title That Takes Up Many Tokens In The Output',
      originalTokens: 2000,
      summaryTokens: 100,
      compressionRatio: 0.95,
      sections: [
        {
          heading: 'Section 1',
          level: 2,
          originalTokens: 100,
          summaryTokens: 20,
          summary: 'Some content.',
          children: [],
          hasCode: false,
          hasList: false,
          hasTable: false,
        },
      ],
      keyTopics: [
        'topic1',
        'topic2',
        'topic3',
        'another-very-long-topic-name',
        'yet-another-long-topic',
      ],
    }

    const budgets = [100, 150, 200, 300]

    for (const budget of budgets) {
      const output = formatSummary(mockSummary, { maxTokens: budget })
      const actualTokens = countTokensApprox(output)

      // Name the budget in the failure message: the assertion runs once per
      // budget and would otherwise not say which one was exceeded.
      expect(
        actualTokens,
        `output exceeds budget of ${budget} tokens`,
      ).toBeLessThanOrEqual(budget)
    }
  })

  it('output should stay within budget with many topics', () => {
    const mockSummary: DocumentSummary = {
      path: '/test/file.md',
      title: 'Test Document',
      originalTokens: 500,
      summaryTokens: 50,
      compressionRatio: 0.9,
      sections: [],
      keyTopics: [
        'topic-one',
        'topic-two',
        'topic-three',
        'topic-four',
        'topic-five',
        'topic-six',
        'topic-seven',
        'topic-eight',
        'topic-nine',
        'topic-ten',
      ],
    }

    const output = formatSummary(mockSummary, { maxTokens: 100 })
    const actualTokens = countTokensApprox(output)

    expect(actualTokens).toBeLessThanOrEqual(100)
  })
})
|
|
174
|
+
|
|
175
|
+
describe('strict budget enforcement', () => {
  /**
   * Runs every scenario against every budget and requires the approximate
   * token count of the formatted output to stay at or below that budget.
   *
   * The failure message names the scenario path and the budget, since the
   * assertion sits inside two nested loops.
   */
  it('MUST stay within budget for realistic scenarios', () => {
    const scenarios: DocumentSummary[] = [
      {
        // Scenario 1: Long path and title
        path: '/project/src/components/deeply/nested/module/submodule/component.tsx',
        title: 'A React Component With Authentication And Session Management',
        originalTokens: 2000,
        summaryTokens: 800,
        compressionRatio: 0.6,
        sections: [
          {
            heading: 'Overview',
            level: 2,
            originalTokens: 200,
            summaryTokens: 80,
            summary:
              'This component handles user authentication and session management.',
            children: [],
            hasCode: false,
            hasList: false,
            hasTable: false,
          },
        ],
        keyTopics: ['react', 'authentication', 'session', 'security'],
      },
      {
        // Scenario 2: Code-heavy content
        path: '/src/utils/parser.ts',
        title: 'Parser Utilities',
        originalTokens: 1500,
        summaryTokens: 600,
        compressionRatio: 0.6,
        sections: [
          {
            heading: 'parse',
            level: 2,
            originalTokens: 500,
            summaryTokens: 200,
            summary:
              '```typescript\nfunction parse(input: string): AST {\n const tokens = tokenize(input);\n return buildTree(tokens);\n}\n```',
            children: [],
            hasCode: true,
            hasList: false,
            hasTable: false,
          },
        ],
        keyTopics: ['parser', 'ast', 'typescript'],
      },
    ]

    const budgets = [100, 150, 200, 300, 500]

    for (const scenario of scenarios) {
      for (const budget of budgets) {
        const output = formatSummary(scenario, { maxTokens: budget })
        const actualTokens = countTokensApprox(output)

        expect(
          actualTokens,
          `${scenario.path} exceeds budget of ${budget} tokens`,
        ).toBeLessThanOrEqual(budget)
      }
    }
  })
})
|
|
238
|
+
})
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Type declarations for @huggingface/transformers (optional dependency)
|
|
3
|
+
*
|
|
4
|
+
* This package is an optional peer dependency used for cross-encoder re-ranking.
|
|
5
|
+
* Users who want re-ranking can install it with: npm install @huggingface/transformers
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
declare module '@huggingface/transformers' {
  /** Settings object exported by the package; only `cacheDir` is typed here. */
  export const env: { cacheDir: string }

  /** Data passed to a progress callback; both fields may be absent. */
  export interface ProgressCallbackData {
    file?: string
    progress?: number
  }

  /** Callback that receives progress data; its return value is ignored. */
  export type ProgressCallback = (data: ProgressCallbackData) => void

  /** Options accepted when loading a tokenizer. */
  export interface AutoTokenizerOptions {
    progress_callback?: ProgressCallback | undefined
  }

  /** Options accepted when loading a model. */
  export interface AutoModelOptions {
    progress_callback?: ProgressCallback | undefined
  }

  /** Tokenizer result; the field contents are deliberately left opaque. */
  export interface TokenizerOutput {
    input_ids: unknown
    attention_mask: unknown
  }

  /** Model result exposing its logits as a flat `Float32Array`. */
  export interface ModelOutput {
    logits: { data: Float32Array }
  }

  /** A loaded tokenizer: callable with texts plus encoding options. */
  export type AutoTokenizerInstance = {
    (
      texts: string[],
      options: {
        text_pair?: string[]
        padding?: boolean
        truncation?: boolean
        max_length?: number
      },
    ): TokenizerOutput
  }

  /** A loaded model: callable with tokenizer output, resolving asynchronously. */
  export type AutoModelInstance = {
    (input: TokenizerOutput): Promise<ModelOutput>
  }

  /** Loader that resolves to a callable tokenizer. */
  export const AutoTokenizer: {
    from_pretrained(
      model: string,
      options?: AutoTokenizerOptions,
    ): Promise<AutoTokenizerInstance>
  }

  /** Loader that resolves to a callable sequence-classification model. */
  export const AutoModelForSequenceClassification: {
    from_pretrained(
      model: string,
      options?: AutoModelOptions,
    ): Promise<AutoModelInstance>
  }
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export * from './tokens.js'
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
import { Effect } from 'effect'
|
|
2
|
+
import { describe, expect, it } from 'vitest'
|
|
3
|
+
import { countTokens, countTokensApprox, countWords } from './tokens.js'
|
|
4
|
+
|
|
5
|
+
describe('token utilities', () => {
|
|
6
|
+
describe('countWords', () => {
  /**
   * Checks `countWords` against plain, empty, and whitespace-heavy input.
   *
   * The whitespace cases use runs of several spaces so that they exercise
   * what their titles describe; a single space would not distinguish
   * "whitespace only" or "multiple spaces" from the simple-sentence case.
   */
  it('counts words in a simple sentence', () => {
    expect(countWords('Hello world')).toBe(2)
  })

  it('handles empty string', () => {
    expect(countWords('')).toBe(0)
  })

  it('handles whitespace only', () => {
    expect(countWords('   ')).toBe(0)
  })

  it('handles multiple spaces between words', () => {
    expect(countWords('hello   world')).toBe(2)
  })

  it('handles newlines and tabs', () => {
    expect(countWords('hello\nworld\there')).toBe(3)
  })
})
|
|
27
|
+
|
|
28
|
+
describe('countTokensApprox', () => {
  it('estimates tokens for short text', () => {
    // 'Hello world' is 11 characters. The approximation includes a safety
    // margin (the original note cites 20%, i.e. roughly 4-5 tokens), so only
    // a loose window is asserted.
    const tokens = countTokensApprox('Hello world')

    expect(tokens).toBeGreaterThan(0)
    expect(tokens).toBeGreaterThanOrEqual(3)
    expect(tokens).toBeLessThanOrEqual(10)
  })

  it('handles empty string', () => {
    const tokens = countTokensApprox('')
    expect(tokens).toBe(0)
  })

  it('estimates longer text', () => {
    const tokens = countTokensApprox(
      'This is a longer piece of text that should have more tokens.',
    )
    expect(tokens).toBeGreaterThan(10)
  })

  it('is conservative for code blocks', () => {
    // Per the original note, tiktoken counts 13 tokens for this snippet.
    // The estimate must never fall below that, and may exceed it by at most
    // 100%, because budget enforcement favours over-counting.
    const tiktokenCount = 13
    const snippet = '```javascript\nfunction foo() {\n return bar;\n}\n```'

    const tokens = countTokensApprox(snippet)

    expect(tokens).toBeGreaterThanOrEqual(tiktokenCount)
    expect(tokens).toBeLessThanOrEqual(tiktokenCount * 2) // up to 2x actual
  })

  it('accounts for punctuation', () => {
    // The same words with and without punctuation: the punctuated form must
    // be estimated strictly higher.
    const punctuated = countTokensApprox(
      'Hello, world! How are you? Fine, thanks.',
    )
    const bare = countTokensApprox('Hello world How are you Fine thanks')

    expect(punctuated).toBeGreaterThan(bare)
  })

  it('accounts for newlines', () => {
    // Replacing the space with a newline must not lower the estimate.
    const spaced = countTokensApprox('Hello world')
    const broken = countTokensApprox('Hello\nworld')

    expect(broken).toBeGreaterThanOrEqual(spaced)
  })
})
|
|
78
|
+
|
|
79
|
+
describe('countTokensApprox accuracy vs tiktoken', () => {
  // Contract under test: the approximation is conservative. It must never be
  // lower than the exact tiktoken count (that would allow budget violations),
  // and it may overshoot only up to `maxRatio` times the exact count
  // (overshooting merely wastes some budget).
  const samples = [
    {
      title: 'is conservative (never under-estimates) for prose',
      input:
        'This is a simple sentence with some common words that form a typical paragraph.',
      maxRatio: 2,
    },
    {
      title: 'is conservative for code blocks',
      input:
        '```typescript\nfunction parseDocument(input: string): AST {\n const tokens = tokenize(input);\n return buildTree(tokens);\n}\n```',
      // Code blocks are the hardest to estimate, so they get a looser ceiling.
      maxRatio: 2.5,
    },
    {
      title: 'is conservative for inline code',
      input: 'Use the `countTokens` function to count tokens in a `string`.',
      maxRatio: 2,
    },
    {
      title: 'is conservative for file paths',
      input:
        '/very/long/path/to/deeply/nested/directory/structure/that/keeps/going/file.md',
      maxRatio: 2,
    },
    {
      title: 'is conservative for mixed content',
      input:
        '# Title\n\nSome prose with `code` and a path `/src/utils.ts`.\n\n```js\nconst x = 1;\n```',
      maxRatio: 2,
    },
    {
      title: 'is conservative for punctuation-heavy text',
      input:
        'Hello, world! How are you? Fine, thanks... Well: good! (Yes, really.)',
      maxRatio: 2,
    },
  ]

  for (const { title, input, maxRatio } of samples) {
    it(title, async () => {
      const approx = countTokensApprox(input)
      const exact = await Effect.runPromise(countTokens(input))
      // Lower bound is the critical requirement; upper bound keeps the
      // estimator from becoming uselessly pessimistic.
      expect(approx).toBeGreaterThanOrEqual(exact)
      expect(approx).toBeLessThanOrEqual(exact * maxRatio)
    })
  }
})
|
|
142
|
+
})
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Token counting utilities using tiktoken
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import { Effect } from 'effect'
|
|
6
|
+
|
|
7
|
+
// Lazy-loaded tiktoken encoder
|
|
8
|
+
let encoder: Awaited<
|
|
9
|
+
ReturnType<typeof import('tiktoken').get_encoding>
|
|
10
|
+
> | null = null
|
|
11
|
+
|
|
12
|
+
/**
 * Effect that yields the shared `cl100k_base` encoder, creating it on first use.
 *
 * tiktoken is imported dynamically, so the module is only loaded once an exact
 * count is requested. The created encoder is cached in the module-level
 * `encoder` variable and reused until `freeEncoder` resets it.
 */
const getEncoder = Effect.gen(function* () {
  if (encoder === null) {
    const { get_encoding } = yield* Effect.promise(() => import('tiktoken'))
    // The dynamic import suspends this fiber, so a concurrent caller may have
    // finished initialising in the meantime. Re-check before creating another
    // encoder: overwriting the cached one would drop it without `free()`.
    if (encoder === null) {
      encoder = get_encoding('cl100k_base')
    }
  }
  return encoder
})
|
|
19
|
+
|
|
20
|
+
/**
 * Exact token count for `text` using tiktoken's `cl100k_base` encoding.
 *
 * The encoder comes from `getEncoder`, which loads tiktoken lazily and caches
 * the instance, so only the first run pays the loading cost.
 *
 * @param text - The string to tokenize.
 * @returns An Effect that succeeds with the number of tokens `text` encodes to.
 */
export const countTokens = (
  text: string,
): Effect.Effect<number, never, never> =>
  Effect.map(getEncoder, (enc) => enc.encode(text).length)
|
|
32
|
+
|
|
33
|
+
/**
 * Synchronous token estimate that needs no tokenizer; intended for budget checks.
 *
 * The text is split into regions, each priced with its own heuristic. The sum
 * is padded with a 10% safety margin and rounded up:
 * - Fenced code blocks: 4 tokens of fixed overhead (6 with a language tag)
 *   + 1 token per content newline + content at 2.5 chars/token
 * - Inline code: 2 tokens for the backtick pair + content at 2.5 chars/token
 * - Paths (`/seg/seg`): 1 token per slash + segment text at 3.5 chars/token
 * - Remaining prose: 3.5 chars/token, 1 token per newline and 0.8 tokens per
 *   punctuation/symbol character
 * - CJK characters: 1.2 tokens each; emoji/symbols: 2.5 tokens each;
 *   variation selectors: 1 token each. These are counted over the WHOLE text,
 *   including code regions, on top of the per-region pricing.
 *
 * The ratios are deliberately pessimistic: over-estimating only wastes some
 * budget, while under-estimating can violate it.
 *
 * @param text - Markdown or plain text to estimate.
 * @returns Estimated token count, rounded up; `0` for the empty string.
 */
export const countTokensApprox = (text: string): number => {
  if (text.length === 0) return 0

  // `String.prototype.match` with a global regex starts from index 0 on every
  // call, so the patterns below can safely be reused on several strings.
  const countMatches = (input: string, pattern: RegExp): number =>
    (input.match(pattern) ?? []).length

  // Unicode ranges: CJK Unified Ideographs, Hiragana, Katakana, Hangul
  const cjkPattern =
    /[\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af\u3400-\u4dbf]/g
  // Common emoji, pictographs, arrows, geometric shapes and dingbats
  const emojiPattern =
    /[\u{1F300}-\u{1F9FF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}\u{1F600}-\u{1F64F}\u{1F680}-\u{1F6FF}\u{2300}-\u{23FF}\u{2190}-\u{21FF}\u{25A0}-\u{25FF}\u{2B00}-\u{2BFF}]/gu
  // Variation selectors (FE0E/FE0F) attached to a base character
  const variationSelectorPattern = /[\uFE0E\uFE0F]/g

  // Whole-text counts: these drive the per-character surcharges below.
  const cjkCount = countMatches(text, cjkPattern)
  const emojiCount = countMatches(text, emojiPattern)
  const variationCount = countMatches(text, variationSelectorPattern)

  // Regions are priced and then removed from `workingText`, so whatever is
  // left at the end is plain prose.
  let workingText = text

  // Fenced code blocks
  let codeBlockTokens = 0
  for (const block of text.match(/```[\s\S]*?```/g) ?? []) {
    // Fixed cost of the fence markers, plus the language tag when present
    const overhead = /^```\w+/.test(block) ? 6 : 4
    // Content between the fences, without the markers themselves
    const content = block.replace(/^```\w*\n?/, '').replace(/\n?```$/, '')
    codeBlockTokens +=
      overhead + countMatches(content, /\n/g) + content.length / 2.5
    workingText = workingText.replace(block, '')
  }

  // Inline code (single backticks) in what remains
  let inlineCodeTokens = 0
  for (const span of workingText.match(/`[^`]+`/g) ?? []) {
    // 2 tokens for the backtick pair, content priced like code
    inlineCodeTokens += 2 + (span.length - 2) / 2.5
    workingText = workingText.replace(span, '')
  }

  // Path-like sequences (consecutive /segment patterns)
  let pathTokens = 0
  for (const path of workingText.match(/(?:\/[\w.-]+)+/g) ?? []) {
    const slashCount = countMatches(path, /\//g)
    pathTokens += slashCount + (path.length - slashCount) / 3.5
    workingText = workingText.replace(path, '')
  }

  // Punctuation and newlines in the remaining prose are priced per character.
  const punctuationCount = countMatches(
    workingText,
    /[!?,.:;'"()[\]{}@#$%^&*+=|\\<>~\-/]/g,
  )
  const proseNewlines = countMatches(workingText, /\n/g)

  // Plain-prose length = remaining characters minus those priced individually.
  // Only specials that are actually present in the remaining prose are
  // subtracted. Subtracting the whole-text counts instead would let CJK/emoji
  // inside code regions cancel out unrelated prose characters.
  const proseLength = Math.max(
    0,
    workingText.length -
      proseNewlines -
      countMatches(workingText, cjkPattern) -
      countMatches(workingText, emojiPattern) -
      countMatches(workingText, variationSelectorPattern) -
      punctuationCount,
  )

  const proseTokens = proseLength / 3.5
  const proseNewlineTokens = proseNewlines * 1
  const punctuationBonus = punctuationCount * 0.8
  const cjkTokens = cjkCount * 1.2
  const emojiTokens = emojiCount * 2.5
  const variationTokens = variationCount * 1

  const estimate =
    proseTokens +
    proseNewlineTokens +
    codeBlockTokens +
    inlineCodeTokens +
    pathTokens +
    punctuationBonus +
    cjkTokens +
    emojiTokens +
    variationTokens

  // 10% safety margin, rounded up, so the result errs on the high side.
  return Math.ceil(estimate * 1.1)
}
|
|
167
|
+
|
|
168
|
+
/**
 * Number of whitespace-separated words in `text`.
 *
 * Any run of whitespace (spaces, tabs, newlines) acts as a single separator;
 * leading and trailing whitespace is ignored.
 *
 * @param text - The text to measure.
 * @returns The word count; `0` when the text is empty or only whitespace.
 */
export const countWords = (text: string): number => {
  // Each maximal run of non-whitespace characters is one word.
  const words = text.match(/\S+/g)
  return words === null ? 0 : words.length
}
|
|
176
|
+
|
|
177
|
+
/**
 * Release the cached tiktoken encoder, if one has been created.
 *
 * Calls `free()` on the encoder and clears the cache, so the next exact count
 * creates a fresh one. Does nothing when no encoder is cached. Intended for
 * test teardown, where a live encoder can keep the process from exiting.
 */
export const freeEncoder = (): void => {
  if (encoder === null) return
  encoder.free()
  encoder = null
}
|
package/tests/fixtures/cli/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.bin
ADDED
|
Binary file
|
package/tests/fixtures/cli/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.meta.bin
ADDED
|
Binary file
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
{
|
|
2
|
+
"version": 1,
|
|
3
|
+
"rootPath": "tests/fixtures/cli",
|
|
4
|
+
"documents": {
|
|
5
|
+
"README.md": {
|
|
6
|
+
"id": "04c6e90faac2",
|
|
7
|
+
"path": "README.md",
|
|
8
|
+
"title": "Test Project",
|
|
9
|
+
"mtime": 1769492431399,
|
|
10
|
+
"hash": "54872b0fdbf6858a",
|
|
11
|
+
"tokenCount": 76,
|
|
12
|
+
"sectionCount": 2
|
|
13
|
+
},
|
|
14
|
+
"api-reference.md": {
|
|
15
|
+
"id": "392e93c0f22d",
|
|
16
|
+
"path": "api-reference.md",
|
|
17
|
+
"title": "API Reference",
|
|
18
|
+
"mtime": 1769492431399,
|
|
19
|
+
"hash": "4a879da54a831235",
|
|
20
|
+
"tokenCount": 109,
|
|
21
|
+
"sectionCount": 2
|
|
22
|
+
},
|
|
23
|
+
"getting-started.md": {
|
|
24
|
+
"id": "b6885e1f8555",
|
|
25
|
+
"path": "getting-started.md",
|
|
26
|
+
"title": "Getting Started",
|
|
27
|
+
"mtime": 1769492431400,
|
|
28
|
+
"hash": "2d44a41d5d2579f2",
|
|
29
|
+
"tokenCount": 66,
|
|
30
|
+
"sectionCount": 3
|
|
31
|
+
}
|
|
32
|
+
}
|
|
33
|
+
}
|