mdcontext 0.0.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.changeset/README.md +28 -0
- package/.changeset/config.json +11 -0
- package/.claude/settings.local.json +25 -0
- package/.github/workflows/ci.yml +83 -0
- package/.github/workflows/claude-code-review.yml +44 -0
- package/.github/workflows/claude.yml +85 -0
- package/.github/workflows/release.yml +113 -0
- package/.tldrignore +112 -0
- package/BACKLOG.md +338 -0
- package/CONTRIBUTING.md +186 -0
- package/NOTES/NOTES +44 -0
- package/README.md +434 -11
- package/biome.json +36 -0
- package/cspell.config.yaml +14 -0
- package/dist/chunk-23UPXDNL.js +3044 -0
- package/dist/chunk-2W7MO2DL.js +1366 -0
- package/dist/chunk-3NUAZGMA.js +1689 -0
- package/dist/chunk-7TOWB2XB.js +366 -0
- package/dist/chunk-7XOTOADQ.js +3065 -0
- package/dist/chunk-AH2PDM2K.js +3042 -0
- package/dist/chunk-BNXWSZ63.js +3742 -0
- package/dist/chunk-BTL5DJVU.js +3222 -0
- package/dist/chunk-HDHYG7E4.js +104 -0
- package/dist/chunk-HLR4KZBP.js +3234 -0
- package/dist/chunk-IP3FRFEB.js +1045 -0
- package/dist/chunk-KHU56VDO.js +3042 -0
- package/dist/chunk-KRYIFLQR.js +88 -0
- package/dist/chunk-LBSDNLEM.js +287 -0
- package/dist/chunk-MNTQ7HCP.js +2643 -0
- package/dist/chunk-MUJELQQ6.js +1387 -0
- package/dist/chunk-MXJGMSLV.js +2199 -0
- package/dist/chunk-N6QJGC3Z.js +2636 -0
- package/dist/chunk-OBELGBPM.js +1713 -0
- package/dist/chunk-OT7R5XTA.js +3192 -0
- package/dist/chunk-P7X4RA2T.js +106 -0
- package/dist/chunk-PIDUQNC2.js +3185 -0
- package/dist/chunk-POGCDIH4.js +3187 -0
- package/dist/chunk-PSIEOQGZ.js +3043 -0
- package/dist/chunk-PVRT3IHA.js +3238 -0
- package/dist/chunk-QNN4TT23.js +1430 -0
- package/dist/chunk-RE3R45RJ.js +3042 -0
- package/dist/chunk-S7E6TFX6.js +803 -0
- package/dist/chunk-SG6GLU4U.js +1378 -0
- package/dist/chunk-SJCDV2ST.js +274 -0
- package/dist/chunk-SYE5XLF3.js +104 -0
- package/dist/chunk-T5VLYBZD.js +103 -0
- package/dist/chunk-TOQB7VWU.js +3238 -0
- package/dist/chunk-VFNMZ4ZQ.js +3228 -0
- package/dist/chunk-VVTGZNBT.js +1629 -0
- package/dist/chunk-W7Q4RFEV.js +104 -0
- package/dist/chunk-XTYYVRLO.js +3190 -0
- package/dist/chunk-Y6MDYVJD.js +3063 -0
- package/dist/cli/main.d.ts +1 -0
- package/dist/cli/main.js +5458 -0
- package/dist/index.d.ts +653 -0
- package/dist/index.js +79 -0
- package/dist/mcp/server.d.ts +1 -0
- package/dist/mcp/server.js +472 -0
- package/dist/schema-BAWSG7KY.js +22 -0
- package/dist/schema-E3QUPL26.js +20 -0
- package/dist/schema-EHL7WUT6.js +20 -0
- package/docs/019-USAGE.md +625 -0
- package/docs/020-current-implementation.md +364 -0
- package/docs/021-DOGFOODING-FINDINGS.md +175 -0
- package/docs/BACKLOG.md +80 -0
- package/docs/CONFIG.md +1123 -0
- package/docs/DESIGN.md +439 -0
- package/docs/ERRORS.md +383 -0
- package/docs/PROJECT.md +88 -0
- package/docs/ROADMAP.md +407 -0
- package/docs/summarization.md +320 -0
- package/docs/test-links.md +9 -0
- package/justfile +40 -0
- package/package.json +74 -9
- package/pnpm-workspace.yaml +5 -0
- package/research/INDEX.md +315 -0
- package/research/code-review/README.md +90 -0
- package/research/code-review/cli-error-handling-review.md +979 -0
- package/research/code-review/code-review-validation-report.md +464 -0
- package/research/code-review/main-ts-review.md +1128 -0
- package/research/config-analysis/01-current-implementation.md +470 -0
- package/research/config-analysis/02-strategy-recommendation.md +428 -0
- package/research/config-analysis/03-task-candidates.md +715 -0
- package/research/config-analysis/033-research-configuration-management.md +828 -0
- package/research/config-analysis/034-research-effect-cli-config.md +1504 -0
- package/research/config-analysis/04-consolidated-task-candidates.md +277 -0
- package/research/config-docs/SUMMARY.md +357 -0
- package/research/config-docs/TEST-RESULTS.md +776 -0
- package/research/config-docs/TODO.md +542 -0
- package/research/config-docs/analysis.md +744 -0
- package/research/config-docs/fix-validation.md +502 -0
- package/research/config-docs/help-audit.md +264 -0
- package/research/config-docs/help-system-analysis.md +890 -0
- package/research/dogfood/consolidated-tool-evaluation.md +373 -0
- package/research/dogfood/strategy-a/a-synthesis.md +184 -0
- package/research/dogfood/strategy-a/a1-docs.md +226 -0
- package/research/dogfood/strategy-a/a2-amorphic.md +156 -0
- package/research/dogfood/strategy-a/a3-llm.md +164 -0
- package/research/dogfood/strategy-b/b-synthesis.md +228 -0
- package/research/dogfood/strategy-b/b1-architecture.md +207 -0
- package/research/dogfood/strategy-b/b2-gaps.md +258 -0
- package/research/dogfood/strategy-b/b3-workflows.md +250 -0
- package/research/dogfood/strategy-c/c-synthesis.md +451 -0
- package/research/dogfood/strategy-c/c1-explorer.md +192 -0
- package/research/dogfood/strategy-c/c2-diver-memory.md +145 -0
- package/research/dogfood/strategy-c/c3-diver-control.md +148 -0
- package/research/dogfood/strategy-c/c4-diver-failure.md +151 -0
- package/research/dogfood/strategy-c/c5-diver-execution.md +221 -0
- package/research/dogfood/strategy-c/c6-diver-org.md +221 -0
- package/research/effect-cli-error-handling.md +845 -0
- package/research/effect-errors-as-values.md +943 -0
- package/research/errors-task-analysis/00-consolidated-tasks.md +207 -0
- package/research/errors-task-analysis/cli-commands-analysis.md +909 -0
- package/research/errors-task-analysis/embeddings-analysis.md +709 -0
- package/research/errors-task-analysis/index-search-analysis.md +812 -0
- package/research/frontmatter/COMMENTS-ARE-SKIPPED.md +149 -0
- package/research/frontmatter/LLM-CODE-NAVIGATION.md +276 -0
- package/research/issue-review.md +603 -0
- package/research/llm-summarization/agent-cli-tools-2026.md +1082 -0
- package/research/llm-summarization/alternative-providers-2026.md +1428 -0
- package/research/llm-summarization/anthropic-2026.md +367 -0
- package/research/llm-summarization/claude-cli-integration.md +1706 -0
- package/research/llm-summarization/cli-integration-patterns.md +3155 -0
- package/research/llm-summarization/openai-2026.md +473 -0
- package/research/llm-summarization/openai-compatible-providers-2026.md +1022 -0
- package/research/llm-summarization/opencode-cli-integration.md +1552 -0
- package/research/llm-summarization/prompt-engineering-2026.md +1426 -0
- package/research/llm-summarization/prototype-results.md +56 -0
- package/research/llm-summarization/provider-switching-patterns-2026.md +2153 -0
- package/research/llm-summarization/typescript-llm-libraries-2026.md +2436 -0
- package/research/mdcontext-error-analysis.md +521 -0
- package/research/mdcontext-pudding/00-EXECUTIVE-SUMMARY.md +282 -0
- package/research/mdcontext-pudding/01-index-embed.md +956 -0
- package/research/mdcontext-pudding/02-search-COMMANDS.md +142 -0
- package/research/mdcontext-pudding/02-search-SUMMARY.md +146 -0
- package/research/mdcontext-pudding/02-search.md +970 -0
- package/research/mdcontext-pudding/03-context.md +779 -0
- package/research/mdcontext-pudding/04-navigation-and-analytics.md +803 -0
- package/research/mdcontext-pudding/04-tree.md +704 -0
- package/research/mdcontext-pudding/05-config.md +1038 -0
- package/research/mdcontext-pudding/06-links-summary.txt +87 -0
- package/research/mdcontext-pudding/06-links.md +679 -0
- package/research/mdcontext-pudding/07-stats.md +693 -0
- package/research/mdcontext-pudding/BUG-FIX-PLAN.md +388 -0
- package/research/mdcontext-pudding/P0-BUG-VALIDATION.md +167 -0
- package/research/mdcontext-pudding/README.md +168 -0
- package/research/mdcontext-pudding/TESTING-SUMMARY.md +128 -0
- package/research/npm_publish/011-npm-workflow-research-agent2.md +792 -0
- package/research/npm_publish/012-npm-workflow-research-agent1.md +530 -0
- package/research/npm_publish/013-npm-workflow-research-agent3.md +722 -0
- package/research/npm_publish/014-npm-workflow-synthesis.md +556 -0
- package/research/npm_publish/031-npm-workflow-task-analysis.md +134 -0
- package/research/research-quality-review.md +834 -0
- package/research/semantic-search/002-research-embedding-models.md +490 -0
- package/research/semantic-search/003-research-rag-alternatives.md +523 -0
- package/research/semantic-search/004-research-vector-search.md +841 -0
- package/research/semantic-search/032-research-semantic-search.md +427 -0
- package/research/semantic-search/embedding-text-analysis.md +156 -0
- package/research/semantic-search/multi-word-failure-reproduction.md +171 -0
- package/research/semantic-search/query-processing-analysis.md +207 -0
- package/research/semantic-search/root-cause-and-solution.md +114 -0
- package/research/semantic-search/threshold-validation-report.md +69 -0
- package/research/semantic-search/vector-search-analysis.md +63 -0
- package/research/task-management-2026/00-synthesis-recommendations.md +295 -0
- package/research/task-management-2026/01-ai-workflow-tools.md +416 -0
- package/research/task-management-2026/02-agent-framework-patterns.md +476 -0
- package/research/task-management-2026/03-lightweight-file-based.md +567 -0
- package/research/task-management-2026/04-established-tools-ai-features.md +541 -0
- package/research/task-management-2026/linear/01-core-features-workflow.md +771 -0
- package/research/task-management-2026/linear/02-api-integrations.md +930 -0
- package/research/task-management-2026/linear/03-ai-features.md +368 -0
- package/research/task-management-2026/linear/04-pricing-setup.md +205 -0
- package/research/task-management-2026/linear/05-usage-patterns-best-practices.md +605 -0
- package/research/test-path-issues.md +276 -0
- package/review/ALP-76/1-error-type-design.md +962 -0
- package/review/ALP-76/2-error-handling-patterns.md +906 -0
- package/review/ALP-76/3-error-presentation.md +624 -0
- package/review/ALP-76/4-test-coverage.md +625 -0
- package/review/ALP-76/5-migration-completeness.md +440 -0
- package/review/ALP-76/6-effect-best-practices.md +755 -0
- package/scripts/apply-branch-protection.sh +47 -0
- package/scripts/branch-protection-templates.json +79 -0
- package/scripts/prototype-summarization.ts +346 -0
- package/scripts/rebuild-hnswlib.js +58 -0
- package/scripts/setup-branch-protection.sh +64 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/active-provider.json +7 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/bm25.json +541 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/bm25.meta.json +5 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/config.json +8 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.bin +0 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.meta.bin +0 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/indexes/documents.json +60 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/indexes/links.json +13 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/indexes/sections.json +1197 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/configuration-management.md +99 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/distributed-systems.md +92 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/error-handling.md +78 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/failure-automation.md +55 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/job-context.md +69 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/process-orchestration.md +99 -0
- package/src/cli/argv-preprocessor.test.ts +210 -0
- package/src/cli/argv-preprocessor.ts +202 -0
- package/src/cli/cli.test.ts +627 -0
- package/src/cli/commands/backlinks.ts +54 -0
- package/src/cli/commands/config-cmd.ts +642 -0
- package/src/cli/commands/context.ts +285 -0
- package/src/cli/commands/duplicates.ts +122 -0
- package/src/cli/commands/embeddings.ts +529 -0
- package/src/cli/commands/index-cmd.ts +480 -0
- package/src/cli/commands/index.ts +16 -0
- package/src/cli/commands/links.ts +52 -0
- package/src/cli/commands/search.ts +1281 -0
- package/src/cli/commands/stats.ts +149 -0
- package/src/cli/commands/tree.ts +128 -0
- package/src/cli/config-layer.ts +176 -0
- package/src/cli/error-handler.test.ts +235 -0
- package/src/cli/error-handler.ts +655 -0
- package/src/cli/flag-schemas.ts +341 -0
- package/src/cli/help.ts +588 -0
- package/src/cli/index.ts +9 -0
- package/src/cli/main.ts +435 -0
- package/src/cli/options.ts +41 -0
- package/src/cli/shared-error-handling.ts +199 -0
- package/src/cli/typo-suggester.test.ts +105 -0
- package/src/cli/typo-suggester.ts +130 -0
- package/src/cli/utils.ts +259 -0
- package/src/config/file-provider.test.ts +320 -0
- package/src/config/file-provider.ts +273 -0
- package/src/config/index.ts +72 -0
- package/src/config/integration.test.ts +667 -0
- package/src/config/precedence.test.ts +277 -0
- package/src/config/precedence.ts +451 -0
- package/src/config/schema.test.ts +414 -0
- package/src/config/schema.ts +603 -0
- package/src/config/service.test.ts +320 -0
- package/src/config/service.ts +243 -0
- package/src/config/testing.test.ts +264 -0
- package/src/config/testing.ts +110 -0
- package/src/core/index.ts +1 -0
- package/src/core/types.ts +113 -0
- package/src/duplicates/detector.test.ts +183 -0
- package/src/duplicates/detector.ts +414 -0
- package/src/duplicates/index.ts +18 -0
- package/src/embeddings/embedding-namespace.test.ts +300 -0
- package/src/embeddings/embedding-namespace.ts +947 -0
- package/src/embeddings/heading-boost.test.ts +222 -0
- package/src/embeddings/hnsw-build-options.test.ts +198 -0
- package/src/embeddings/hyde.test.ts +272 -0
- package/src/embeddings/hyde.ts +264 -0
- package/src/embeddings/index.ts +10 -0
- package/src/embeddings/openai-provider.ts +414 -0
- package/src/embeddings/pricing.json +22 -0
- package/src/embeddings/provider-constants.ts +204 -0
- package/src/embeddings/provider-errors.test.ts +967 -0
- package/src/embeddings/provider-errors.ts +565 -0
- package/src/embeddings/provider-factory.test.ts +240 -0
- package/src/embeddings/provider-factory.ts +225 -0
- package/src/embeddings/provider-integration.test.ts +788 -0
- package/src/embeddings/query-preprocessing.test.ts +187 -0
- package/src/embeddings/semantic-search-threshold.test.ts +508 -0
- package/src/embeddings/semantic-search.ts +1270 -0
- package/src/embeddings/types.ts +359 -0
- package/src/embeddings/vector-store.ts +708 -0
- package/src/embeddings/voyage-provider.ts +313 -0
- package/src/errors/errors.test.ts +845 -0
- package/src/errors/index.ts +533 -0
- package/src/index/ignore-patterns.test.ts +354 -0
- package/src/index/ignore-patterns.ts +305 -0
- package/src/index/index.ts +4 -0
- package/src/index/indexer.ts +684 -0
- package/src/index/storage.ts +260 -0
- package/src/index/types.ts +147 -0
- package/src/index/watcher.ts +189 -0
- package/src/index.ts +30 -0
- package/src/integration/search-keyword.test.ts +678 -0
- package/src/mcp/server.ts +612 -0
- package/src/parser/index.ts +1 -0
- package/src/parser/parser.test.ts +291 -0
- package/src/parser/parser.ts +394 -0
- package/src/parser/section-filter.test.ts +277 -0
- package/src/parser/section-filter.ts +392 -0
- package/src/search/__tests__/hybrid-search.test.ts +650 -0
- package/src/search/bm25-store.ts +366 -0
- package/src/search/cross-encoder.test.ts +253 -0
- package/src/search/cross-encoder.ts +406 -0
- package/src/search/fuzzy-search.test.ts +419 -0
- package/src/search/fuzzy-search.ts +273 -0
- package/src/search/hybrid-search.ts +448 -0
- package/src/search/path-matcher.test.ts +276 -0
- package/src/search/path-matcher.ts +33 -0
- package/src/search/query-parser.test.ts +260 -0
- package/src/search/query-parser.ts +319 -0
- package/src/search/searcher.test.ts +280 -0
- package/src/search/searcher.ts +724 -0
- package/src/search/wink-bm25.d.ts +30 -0
- package/src/summarization/cli-providers/claude.ts +202 -0
- package/src/summarization/cli-providers/detection.test.ts +273 -0
- package/src/summarization/cli-providers/detection.ts +118 -0
- package/src/summarization/cli-providers/index.ts +8 -0
- package/src/summarization/cost.test.ts +139 -0
- package/src/summarization/cost.ts +102 -0
- package/src/summarization/error-handler.test.ts +127 -0
- package/src/summarization/error-handler.ts +111 -0
- package/src/summarization/index.ts +102 -0
- package/src/summarization/pipeline.test.ts +498 -0
- package/src/summarization/pipeline.ts +231 -0
- package/src/summarization/prompts.test.ts +269 -0
- package/src/summarization/prompts.ts +133 -0
- package/src/summarization/provider-factory.test.ts +396 -0
- package/src/summarization/provider-factory.ts +178 -0
- package/src/summarization/types.ts +184 -0
- package/src/summarize/budget-bugs.test.ts +620 -0
- package/src/summarize/formatters.ts +419 -0
- package/src/summarize/index.ts +20 -0
- package/src/summarize/summarizer.test.ts +275 -0
- package/src/summarize/summarizer.ts +597 -0
- package/src/summarize/verify-bugs.test.ts +238 -0
- package/src/types/huggingface-transformers.d.ts +66 -0
- package/src/utils/index.ts +1 -0
- package/src/utils/tokens.test.ts +142 -0
- package/src/utils/tokens.ts +186 -0
- package/tests/fixtures/cli/.mdcontext/active-provider.json +7 -0
- package/tests/fixtures/cli/.mdcontext/config.json +8 -0
- package/tests/fixtures/cli/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.bin +0 -0
- package/tests/fixtures/cli/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.meta.bin +0 -0
- package/tests/fixtures/cli/.mdcontext/indexes/documents.json +33 -0
- package/tests/fixtures/cli/.mdcontext/indexes/links.json +12 -0
- package/tests/fixtures/cli/.mdcontext/indexes/sections.json +247 -0
- package/tests/fixtures/cli/README.md +9 -0
- package/tests/fixtures/cli/api-reference.md +11 -0
- package/tests/fixtures/cli/getting-started.md +11 -0
- package/tests/integration/embed-index.test.ts +712 -0
- package/tests/integration/search-context.test.ts +469 -0
- package/tests/integration/search-semantic.test.ts +522 -0
- package/tsconfig.json +26 -0
- package/vitest.config.ts +16 -0
- package/vitest.setup.ts +12 -0
|
@@ -0,0 +1,291 @@
|
|
|
1
|
+
import { Effect } from 'effect'
|
|
2
|
+
import { describe, expect, it } from 'vitest'
|
|
3
|
+
import { parse } from './parser.js'
|
|
4
|
+
|
|
5
|
+
describe('markdown parser', () => {
  // Run the Effect returned by `parse` to completion and resolve with the parsed document.
  const parseDoc = (markdown: string) => Effect.runPromise(parse(markdown))

  describe('basic parsing', () => {
    it('parses a simple markdown document', async () => {
      const markdown = `# Hello World

This is a paragraph.

## Section One

Content for section one.

## Section Two

Content for section two.
`

      const doc = await parseDoc(markdown)

      expect(doc.title).toBe('Hello World')
      // The single H1 is the only top-level section...
      expect(doc.sections).toHaveLength(1)
      expect(doc.sections[0]?.heading).toBe('Hello World')
      // ...and both H2 headings are nested beneath it.
      expect(doc.sections[0]?.children).toHaveLength(2)
    })

    it('extracts frontmatter', async () => {
      const markdown = `---
title: Custom Title
author: Test Author
tags:
- markdown
- parser
---

# Heading

Content here.
`

      const doc = await parseDoc(markdown)

      expect(doc.frontmatter).toEqual({
        title: 'Custom Title',
        author: 'Test Author',
        tags: ['markdown', 'parser'],
      })
      // The H1 text wins over the frontmatter `title` key.
      expect(doc.title).toBe('Heading')
    })

    it('uses frontmatter title when no H1 present', async () => {
      const markdown = `---
title: Frontmatter Title
---

## Only an H2

Some content.
`

      const doc = await parseDoc(markdown)

      expect(doc.title).toBe('Frontmatter Title')
    })

    it('handles malformed YAML frontmatter gracefully', async () => {
      const markdown = `---
title: Valid Start
But this is not valid YAML:
- missing colon here
invalid: [unclosed bracket
---

# Actual Content

This should still parse.
`

      const doc = await parseDoc(markdown)

      // Parsing must succeed; the broken frontmatter is expected to come back empty.
      expect(doc.frontmatter).toEqual({})
      expect(doc.title).toBe('Actual Content')
    })
  })

  describe('section hierarchy', () => {
    it('builds proper section hierarchy', async () => {
      const markdown = `# Root

## Level 2 A

### Level 3 A1

Content

### Level 3 A2

Content

## Level 2 B

Content
`

      const doc = await parseDoc(markdown)

      expect(doc.sections).toHaveLength(1)

      const rootSection = doc.sections[0]!
      expect(rootSection.heading).toBe('Root')
      expect(rootSection.children).toHaveLength(2)

      const firstH2 = rootSection.children[0]!
      expect(firstH2.heading).toBe('Level 2 A')
      expect(firstH2.children).toHaveLength(2)

      const firstH3 = firstH2.children[0]!
      expect(firstH3.heading).toBe('Level 3 A1')
      expect(firstH3.children).toHaveLength(0)
    })
  })

  describe('links', () => {
    it('extracts internal links', async () => {
      const markdown = `# Links

Check out [other doc](./other.md).

And [section link](#section).
`

      const { links } = await parseDoc(markdown)

      expect(links).toHaveLength(2)
      expect(links[0]?.type).toBe('internal')
      expect(links[0]?.href).toBe('./other.md')
      expect(links[1]?.type).toBe('internal')
      expect(links[1]?.href).toBe('#section')
    })

    it('extracts external links', async () => {
      const markdown = `# External Links

Visit [Google](https://google.com).
`

      const { links } = await parseDoc(markdown)

      expect(links).toHaveLength(1)
      expect(links[0]?.type).toBe('external')
      expect(links[0]?.href).toBe('https://google.com')
    })

    it('extracts image links', async () => {
      const markdown = `# Images


`

      const { links } = await parseDoc(markdown)

      expect(links).toHaveLength(1)
      expect(links[0]?.type).toBe('image')
      expect(links[0]?.text).toBe('Alt text')
    })
  })

  describe('code blocks', () => {
    it('extracts code blocks with language', async () => {
      const markdown = `# Code

\`\`\`typescript
const x = 1;
\`\`\`
`

      const { codeBlocks } = await parseDoc(markdown)

      expect(codeBlocks).toHaveLength(1)
      expect(codeBlocks[0]?.language).toBe('typescript')
      expect(codeBlocks[0]?.content).toBe('const x = 1;')
    })

    it('extracts code blocks without language', async () => {
      const markdown = `# Code

\`\`\`
plain text
\`\`\`
`

      const { codeBlocks } = await parseDoc(markdown)

      expect(codeBlocks).toHaveLength(1)
      expect(codeBlocks[0]?.language).toBeNull()
    })
  })

  describe('GFM features', () => {
    it('detects tables in sections', async () => {
      const markdown = `# Tables

| Header 1 | Header 2 |
| -------- | -------- |
| Cell 1 | Cell 2 |
`

      const { sections } = await parseDoc(markdown)

      expect(sections[0]?.metadata.hasTable).toBe(true)
    })

    it('detects lists in sections', async () => {
      const markdown = `# Lists

- Item 1
- Item 2
- Item 3
`

      const { sections } = await parseDoc(markdown)

      expect(sections[0]?.metadata.hasList).toBe(true)
    })

    it('detects task lists', async () => {
      const markdown = `# Tasks

- [ ] Todo item
- [x] Completed item
`

      const { sections } = await parseDoc(markdown)

      expect(sections[0]?.metadata.hasList).toBe(true)
    })
  })

  describe('metadata', () => {
    it('counts tokens and words', async () => {
      const markdown = `# Document

This is some text content for testing token counting.
`

      const { metadata } = await parseDoc(markdown)

      expect(metadata.tokenCount).toBeGreaterThan(0)
      expect(metadata.wordCount).toBeGreaterThan(0)
    })

    it('counts links and code blocks', async () => {
      const markdown = `# Test

[Link 1](./a.md)
[Link 2](./b.md)

\`\`\`js
code
\`\`\`

\`\`\`py
code
\`\`\`
`

      const { metadata } = await parseDoc(markdown)

      expect(metadata.linkCount).toBe(2)
      expect(metadata.codeBlockCount).toBe(2)
    })

    it('counts headings', async () => {
      const markdown = `# H1

## H2

### H3

## Another H2
`

      const { metadata } = await parseDoc(markdown)

      expect(metadata.headingCount).toBe(4)
    })
  })
})
|
|
@@ -0,0 +1,394 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Markdown parser using remark/unified
|
|
3
|
+
* Handles GFM (tables, task lists) and YAML frontmatter
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import * as crypto from 'node:crypto'
|
|
7
|
+
import { Effect } from 'effect'
|
|
8
|
+
import matter from 'gray-matter'
|
|
9
|
+
import type { Code, Heading, Image, Link, Parent, Root, Text } from 'mdast'
|
|
10
|
+
import remarkGfm from 'remark-gfm'
|
|
11
|
+
import remarkParse from 'remark-parse'
|
|
12
|
+
import { unified } from 'unified'
|
|
13
|
+
import { visit } from 'unist-util-visit'
|
|
14
|
+
|
|
15
|
+
import type {
|
|
16
|
+
DocumentMetadata,
|
|
17
|
+
HeadingLevel,
|
|
18
|
+
MdCodeBlock,
|
|
19
|
+
MdDocument,
|
|
20
|
+
MdLink,
|
|
21
|
+
MdSection,
|
|
22
|
+
ParseError,
|
|
23
|
+
} from '../core/types.js'
|
|
24
|
+
import { FileReadError } from '../errors/index.js'
|
|
25
|
+
import { countTokensApprox, countWords } from '../utils/tokens.js'
|
|
26
|
+
|
|
27
|
+
// ============================================================================
|
|
28
|
+
// Parser Configuration
|
|
29
|
+
// ============================================================================
|
|
30
|
+
|
|
31
|
+
const processor = unified().use(remarkParse).use(remarkGfm)
|
|
32
|
+
|
|
33
|
+
// ============================================================================
|
|
34
|
+
// Helper Functions
|
|
35
|
+
// ============================================================================
|
|
36
|
+
|
|
37
|
+
/**
 * Derive a short, deterministic identifier from a string.
 *
 * @param input - Text to hash.
 * @returns The first 12 hexadecimal characters of the MD5 digest of `input`.
 */
const generateId = (input: string): string => {
  const hexDigest = crypto.createHash('md5').update(input).digest('hex')
  return hexDigest.substring(0, 12)
}
|
|
40
|
+
|
|
41
|
+
/**
 * Convert free text (e.g. a heading) into a lowercase, hyphen-separated slug.
 *
 * Steps, in order:
 *  1. lowercase the text;
 *  2. drop every character that is not a word character (`[A-Za-z0-9_]`),
 *     whitespace, or a hyphen;
 *  3. trim surrounding whitespace;
 *  4. turn each whitespace run into a single hyphen;
 *  5. collapse consecutive hyphens.
 *
 * The trim must happen before whitespace is turned into hyphens: trimming
 * afterwards is a no-op and leaves stray leading/trailing hyphens for input
 * such as `"  Hello  "` or `"Foo !"`.
 *
 * @param text - Text to slugify.
 * @returns The slug; may be empty when `text` contains no word characters.
 */
const slugify = (text: string): string => {
  return text
    .toLowerCase()
    .replace(/[^\w\s-]/g, '')
    // Trim after punctuation removal so whitespace exposed by a stripped
    // trailing/leading symbol ("Foo !") is removed as well.
    .trim()
    .replace(/\s+/g, '-')
    .replace(/-+/g, '-')
}
|
|
49
|
+
|
|
50
|
+
/**
 * Decide whether a link target points inside the documentation set rather
 * than at an external resource.
 *
 * Rules, evaluated in order:
 *  - anything that starts with a URL scheme followed by `://`
 *    (`http://`, `https://`, `ftp://`, ... in any letter case) is external;
 *  - `mailto:` links are external;
 *  - pure fragment links (`#section`) are internal;
 *  - targets ending in `.md` or containing `.md#` are internal;
 *  - anything else is internal unless it contains `://`.
 *
 * The scheme test runs before the `.md` test so that remote markdown files
 * such as `ftp://host/readme.md` or `HTTPS://host/a.md` are not mistaken for
 * local documents.
 *
 * @param href - Raw link destination as written in the markdown source.
 * @returns `true` when the link is considered internal.
 */
const isInternalLink = (href: string): boolean => {
  // RFC 3986 scheme syntax followed by "//": an absolute URL, hence external.
  if (/^[a-z][a-z0-9+.-]*:\/\//i.test(href)) return false
  if (/^mailto:/i.test(href)) return false
  if (href.startsWith('#')) return true
  if (href.endsWith('.md') || href.includes('.md#')) return true
  return !href.includes('://')
}
|
|
57
|
+
|
|
58
|
+
/**
 * Collect the text of a subtree as a single string.
 *
 * Every `text` node found under `node` contributes its `value`; the values
 * are joined with a single space in visiting order.
 *
 * @param node - Root or parent node whose descendants are scanned.
 * @returns The space-joined text values, or `''` when no text node exists.
 */
const extractPlainText = (node: Parent | Root): string => {
  const fragments: string[] = []
  const collect = (leaf: Text): void => {
    fragments.push(leaf.value)
  }
  visit(node, 'text', collect)
  return fragments.join(' ')
}
|
|
65
|
+
|
|
66
|
+
/**
 * Read the line on which a node ends.
 *
 * @param node - Any value that may carry `position.end.line`.
 * @returns `node.position.end.line`, or `0` when any part of that path is
 *   null or undefined.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const getNodeEndLine = (node: any): number => {
  const endLine = node?.position?.end?.line
  return endLine ?? 0
}
|
|
70
|
+
|
|
71
|
+
/**
 * Read the line on which a node starts.
 *
 * @param node - Any value that may carry `position.start.line`.
 * @returns `node.position.start.line`, or `0` when any part of that path is
 *   null or undefined.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const getNodeStartLine = (node: any): number => {
  const startLine = node?.position?.start?.line
  return startLine ?? 0
}
|
|
75
|
+
|
|
76
|
+
// ============================================================================
|
|
77
|
+
// Section Extraction
|
|
78
|
+
// ============================================================================
|
|
79
|
+
|
|
80
|
+
/**
 * Flat, not-yet-nested description of one heading and the top-level nodes
 * that follow it, as produced by `extractRawSections`.
 */
interface RawSection {
  /** Plain text of the heading. */
  heading: string
  /** Heading depth taken from the AST node. */
  level: HeadingLevel
  /** Line on which the heading starts. */
  startLine: number
  /** End line of the last content node, or the heading line when there is no content. */
  endLine: number
  /** Line immediately after the heading's start line. */
  contentStartLine: number
  /** Top-level nodes between this heading and the next heading (or end of document). */
  contentNodes: unknown[]
}
|
|
88
|
+
|
|
89
|
+
/**
 * Split the top-level children of a markdown AST into flat sections, one per
 * heading.
 *
 * Only direct children of `tree` are inspected. Each section owns the nodes
 * between its heading and the next heading of any level (or the end of the
 * document); nodes that precede the first heading belong to no section.
 *
 * @param tree - Parsed markdown root.
 * @returns One `RawSection` per top-level heading, in document order.
 */
const extractRawSections = (tree: Root): RawSection[] => {
  // Pass 1: remember where every top-level heading sits.
  const markers: {
    heading: string
    level: HeadingLevel
    line: number
    index: number
  }[] = []

  for (const [index, node] of tree.children.entries()) {
    if (node.type !== 'heading') continue
    const headingNode = node as Heading
    markers.push({
      heading: extractPlainText(headingNode),
      level: headingNode.depth as HeadingLevel,
      line: getNodeStartLine(node),
      index,
    })
  }

  // Pass 2: each heading claims the nodes up to (not including) the next heading.
  return markers.map((marker, position) => {
    const following = markers[position + 1]
    const stopIndex = following ? following.index : tree.children.length
    const contentNodes = tree.children.slice(marker.index + 1, stopIndex)

    // A heading with no content ends on its own line.
    const tailNode = contentNodes[contentNodes.length - 1]
    const endLine = tailNode ? getNodeEndLine(tailNode) : marker.line

    return {
      heading: marker.heading,
      level: marker.level,
      startLine: marker.line,
      endLine,
      contentStartLine: marker.line + 1,
      contentNodes,
    }
  })
}
|
|
133
|
+
|
|
134
|
+
/**
 * Turn the flat list of raw sections into a tree of `MdSection`s.
 *
 * A section becomes a child of the closest preceding section whose level is
 * strictly smaller; sections with no such predecessor are returned as roots.
 *
 * @param rawSections - flat sections in document order
 * @param docId - prefix used when building each section id
 * @param lines - the document split into lines; section content is sliced
 *   from it using the raw section's start and end lines
 * @returns the root sections, each carrying its nested children
 */
const buildSectionHierarchy = (
  rawSections: RawSection[],
  docId: string,
  lines: string[],
): MdSection[] => {
  const roots: MdSection[] = []
  // Currently open ancestors, outermost first.
  const ancestors: { section: MdSection; level: number }[] = []

  rawSections.forEach((raw) => {
    // startLine is shifted by one to form the slice start; endLine is used as
    // the exclusive slice end unchanged.
    const content = lines.slice(raw.startLine - 1, raw.endLine).join('\n')
    const plainText = extractSectionPlainText(raw.contentNodes as Parent[])

    // One scan over the direct content nodes answers all three flags.
    const nodeTypes = new Set(
      (raw.contentNodes as { type: string }[]).map((n) => n.type),
    )

    const section: MdSection = {
      id: `${docId}-${slugify(raw.heading)}`,
      heading: raw.heading,
      level: raw.level,
      content,
      plainText,
      startLine: raw.startLine,
      endLine: raw.endLine,
      children: [],
      metadata: {
        wordCount: countWords(plainText),
        tokenCount: countTokensApprox(content),
        hasCode: nodeTypes.has('code'),
        hasList: nodeTypes.has('list'),
        hasTable: nodeTypes.has('table'),
      },
    }

    // Close every open section that is at the same depth or deeper.
    let nearest = ancestors[ancestors.length - 1]
    while (nearest !== undefined && nearest.level >= raw.level) {
      ancestors.pop()
      nearest = ancestors[ancestors.length - 1]
    }

    if (nearest === undefined) {
      roots.push(section)
    } else {
      ;(nearest.section.children as MdSection[]).push(section)
    }

    ancestors.push({ section, level: raw.level })
  })

  return roots
}
|
|
192
|
+
|
|
193
|
+
/**
 * Build the plain text of a section from its content nodes.
 *
 * A node with a string `value` contributes that value directly; otherwise a
 * node with `children` contributes the text gathered by `extractPlainText`.
 * Nodes with neither are skipped.
 *
 * @param nodes - content nodes of one section
 * @returns the per-node texts joined by single spaces
 */
const extractSectionPlainText = (nodes: Parent[]): string =>
  nodes
    .flatMap((node) => {
      if ('value' in node && typeof node.value === 'string') {
        return [node.value]
      }
      return 'children' in node ? [extractPlainText(node)] : []
    })
    .join(' ')
|
|
204
|
+
|
|
205
|
+
/**
 * Count every section in the tree, including nested children at any depth.
 *
 * @param sections - sections to count
 * @returns the number of sections in `sections` plus all of their descendants
 */
const countAllSections = (sections: MdSection[]): number =>
  sections.reduce(
    (total, section) =>
      total + 1 + countAllSections(section.children as MdSection[]),
    0,
  )
|
|
213
|
+
|
|
214
|
+
// ============================================================================
|
|
215
|
+
// Link Extraction
|
|
216
|
+
// ============================================================================
|
|
217
|
+
|
|
218
|
+
/**
 * Collect every link and image in the tree.
 *
 * While walking, the most recently visited heading determines the section id
 * attached to the entries that follow it; entries seen before any heading are
 * attributed to `docId` itself.
 *
 * @param tree - parsed markdown root
 * @param docId - document id, also the prefix of derived section ids
 * @returns links (internal or external) and images in the order visited
 */
const extractLinks = (tree: Root, docId: string): MdLink[] => {
  const collected: MdLink[] = []
  let activeSectionId = docId

  visit(tree, (node) => {
    switch (node.type) {
      case 'heading': {
        const headingText = extractPlainText(node as Heading)
        activeSectionId = `${docId}-${slugify(headingText)}`
        break
      }
      case 'link': {
        const linkNode = node as Link
        const kind = isInternalLink(linkNode.url) ? 'internal' : 'external'
        collected.push({
          type: kind,
          href: linkNode.url,
          text: extractPlainText(linkNode),
          sectionId: activeSectionId,
          line: getNodeStartLine(node),
        })
        break
      }
      case 'image': {
        const imageNode = node as Image
        collected.push({
          type: 'image',
          href: imageNode.url,
          // Images without alt text are recorded with an empty label.
          text: imageNode.alt ?? '',
          sectionId: activeSectionId,
          line: getNodeStartLine(node),
        })
        break
      }
      default:
        break
    }
  })

  return collected
}
|
|
253
|
+
|
|
254
|
+
// ============================================================================
|
|
255
|
+
// Code Block Extraction
|
|
256
|
+
// ============================================================================
|
|
257
|
+
|
|
258
|
+
/**
 * Collect every `code` node in the tree.
 *
 * Each block is tagged with the section id derived from the most recently
 * visited heading, or with `docId` when no heading has been seen yet.
 *
 * @param tree - parsed markdown root
 * @param docId - document id, also the prefix of derived section ids
 * @returns code blocks in the order visited
 */
const extractCodeBlocks = (tree: Root, docId: string): MdCodeBlock[] => {
  const found: MdCodeBlock[] = []
  let activeSectionId = docId

  visit(tree, (node) => {
    if (node.type === 'heading') {
      const headingText = extractPlainText(node as Heading)
      activeSectionId = `${docId}-${slugify(headingText)}`
      return
    }

    if (node.type !== 'code') return

    const { lang, value } = node as Code
    found.push({
      // A block without a language tag is stored with `null`.
      language: lang ?? null,
      content: value,
      sectionId: activeSectionId,
      startLine: getNodeStartLine(node),
      endLine: getNodeEndLine(node),
    })
  })

  return found
}
|
|
281
|
+
|
|
282
|
+
// ============================================================================
|
|
283
|
+
// Main Parser Function
|
|
284
|
+
// ============================================================================
|
|
285
|
+
|
|
286
|
+
/**
 * Optional inputs accepted by `parse`.
 */
export interface ParseOptions {
  /** Modification time to record for the document; `parse` uses the time of parsing when omitted. */
  readonly lastModified?: Date
  /** Source path of the document; `parse` uses `'unknown'` when omitted. */
  readonly path?: string
}
|
|
290
|
+
|
|
291
|
+
/**
 * Gather the `content` of every section in the tree, each parent before its
 * children.
 */
const collectSectionContent = (sections: readonly MdSection[]): string[] =>
  sections.flatMap((section) => [
    section.content,
    ...collectSectionContent(section.children as MdSection[]),
  ])

/**
 * Parse markdown text into an `MdDocument`.
 *
 * Steps: split off the frontmatter, parse the remaining markdown to an AST,
 * derive the section tree, links and code blocks, pick a title, and compute
 * document metadata.
 *
 * Frontmatter that cannot be parsed is not fatal: a warning is written to the
 * console and the whole input is treated as markdown.
 *
 * Line numbers on sections, links and code blocks are relative to the text
 * handed to the markdown processor, i.e. the content returned by the
 * frontmatter step rather than the raw `content` argument.
 *
 * The document word count covers the content of every section at every
 * nesting depth. Each section's `content` stops at the next heading, so
 * counting only the root sections would leave out everything under nested
 * headings.
 *
 * @param content - raw file text, optionally starting with frontmatter
 * @param options - optional path and modification time; see `ParseOptions`
 * @returns an Effect producing the parsed document. `ParseError` is part of
 *   the declared signature; this body does not fail with it explicitly.
 */
export const parse = (
  content: string,
  options: ParseOptions = {},
): Effect.Effect<MdDocument, ParseError> =>
  Effect.gen(function* () {
    const path = options.path ?? 'unknown'
    const docId = generateId(path)
    const now = new Date()

    // Extract frontmatter (graceful handling for malformed YAML)
    let frontmatter: Record<string, unknown> = {}
    let markdownContent: string = content

    try {
      const parsed = matter(content)
      frontmatter = parsed.data
      markdownContent = parsed.content
    } catch (error) {
      // Malformed frontmatter - treat entire content as markdown
      const msg = error instanceof Error ? error.message : String(error)
      console.warn(
        `Warning: Malformed frontmatter in ${path}, skipping: ${msg.split('\n')[0]}`,
      )
    }

    // Parse markdown to AST
    const tree = processor.parse(markdownContent) as Root

    // Split content into lines for reference
    const lines = markdownContent.split('\n')

    // Extract sections
    const rawSections = extractRawSections(tree)
    const sections = buildSectionHierarchy(rawSections, docId, lines)

    // Extract links and code blocks
    const links = extractLinks(tree, docId)
    const codeBlocks = extractCodeBlocks(tree, docId)

    // Determine title: first H1, then frontmatter title, then file name
    const firstH1 = sections.find((s) => s.level === 1)
    const title =
      firstH1?.heading ??
      (typeof frontmatter.title === 'string' ? frontmatter.title : null) ??
      path.split('/').pop()?.replace(/\.md$/, '') ??
      'Untitled'

    // Calculate metadata; word count spans nested sections too
    const totalContent = collectSectionContent(sections).join('\n')
    const metadata: DocumentMetadata = {
      wordCount: countWords(totalContent),
      tokenCount: countTokensApprox(content),
      headingCount: countAllSections(sections),
      linkCount: links.length,
      codeBlockCount: codeBlocks.length,
      lastModified: options.lastModified ?? now,
      indexedAt: now,
    }

    const document: MdDocument = {
      id: docId,
      path,
      title,
      frontmatter,
      sections,
      links,
      codeBlocks,
      metadata,
    }

    return document
  })
|
|
363
|
+
|
|
364
|
+
/**
 * Read a markdown file from disk and parse it.
 *
 * The file text and its stat record are requested together; the stat's
 * modification time is passed to `parse` as `lastModified`, and `filePath`
 * as `path`.
 *
 * @throws ParseError - File content cannot be parsed
 * @throws FileReadError - File cannot be read from filesystem
 */
export const parseFile = (
  filePath: string,
): Effect.Effect<MdDocument, ParseError | FileReadError> =>
  Effect.gen(function* () {
    const fs = yield* Effect.promise(() => import('node:fs/promises'))

    // Either rejection (read or stat) surfaces as a single FileReadError.
    const readTextAndStats = () =>
      Promise.all([
        fs.readFile(filePath, 'utf-8'),
        fs.stat(filePath),
      ] as const)

    const toFileReadError = (error: unknown) => {
      const message = error instanceof Error ? error.message : 'Unknown error'
      return new FileReadError({ path: filePath, message, cause: error })
    }

    const [text, fileStats] = yield* Effect.tryPromise({
      try: readTextAndStats,
      catch: toFileReadError,
    })

    const document = yield* parse(text, {
      path: filePath,
      lastModified: fileStats.mtime,
    })
    return document
  })
|