mdcontext 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.changeset/config.json +9 -9
- package/.claude/settings.local.json +25 -0
- package/.github/workflows/claude-code-review.yml +44 -0
- package/.github/workflows/claude.yml +85 -0
- package/CONTRIBUTING.md +186 -0
- package/NOTES/NOTES +44 -0
- package/README.md +206 -3
- package/biome.json +1 -1
- package/dist/chunk-23UPXDNL.js +3044 -0
- package/dist/chunk-2W7MO2DL.js +1366 -0
- package/dist/chunk-3NUAZGMA.js +1689 -0
- package/dist/chunk-7TOWB2XB.js +366 -0
- package/dist/chunk-7XOTOADQ.js +3065 -0
- package/dist/chunk-AH2PDM2K.js +3042 -0
- package/dist/chunk-BNXWSZ63.js +3742 -0
- package/dist/chunk-BTL5DJVU.js +3222 -0
- package/dist/chunk-HDHYG7E4.js +104 -0
- package/dist/chunk-HLR4KZBP.js +3234 -0
- package/dist/chunk-IP3FRFEB.js +1045 -0
- package/dist/chunk-KHU56VDO.js +3042 -0
- package/dist/chunk-KRYIFLQR.js +85 -89
- package/dist/chunk-LBSDNLEM.js +287 -0
- package/dist/chunk-MNTQ7HCP.js +2643 -0
- package/dist/chunk-MUJELQQ6.js +1387 -0
- package/dist/chunk-MXJGMSLV.js +2199 -0
- package/dist/chunk-N6QJGC3Z.js +2636 -0
- package/dist/chunk-OBELGBPM.js +1713 -0
- package/dist/chunk-OT7R5XTA.js +3192 -0
- package/dist/chunk-P7X4RA2T.js +106 -0
- package/dist/chunk-PIDUQNC2.js +3185 -0
- package/dist/chunk-POGCDIH4.js +3187 -0
- package/dist/chunk-PSIEOQGZ.js +3043 -0
- package/dist/chunk-PVRT3IHA.js +3238 -0
- package/dist/chunk-QNN4TT23.js +1430 -0
- package/dist/chunk-RE3R45RJ.js +3042 -0
- package/dist/chunk-S7E6TFX6.js +718 -657
- package/dist/chunk-SG6GLU4U.js +1378 -0
- package/dist/chunk-SJCDV2ST.js +274 -0
- package/dist/chunk-SYE5XLF3.js +104 -0
- package/dist/chunk-T5VLYBZD.js +103 -0
- package/dist/chunk-TOQB7VWU.js +3238 -0
- package/dist/chunk-VFNMZ4ZQ.js +3228 -0
- package/dist/chunk-VVTGZNBT.js +1533 -1423
- package/dist/chunk-W7Q4RFEV.js +104 -0
- package/dist/chunk-XTYYVRLO.js +3190 -0
- package/dist/chunk-Y6MDYVJD.js +3063 -0
- package/dist/cli/main.js +4072 -629
- package/dist/index.d.ts +420 -33
- package/dist/index.js +8 -15
- package/dist/mcp/server.js +103 -7
- package/dist/schema-BAWSG7KY.js +22 -0
- package/dist/schema-E3QUPL26.js +20 -0
- package/dist/schema-EHL7WUT6.js +20 -0
- package/docs/019-USAGE.md +44 -5
- package/docs/020-current-implementation.md +8 -8
- package/docs/021-DOGFOODING-FINDINGS.md +1 -1
- package/docs/CONFIG.md +1123 -0
- package/docs/ERRORS.md +383 -0
- package/docs/summarization.md +320 -0
- package/justfile +40 -0
- package/package.json +39 -33
- package/research/INDEX.md +315 -0
- package/research/code-review/README.md +90 -0
- package/research/code-review/cli-error-handling-review.md +979 -0
- package/research/code-review/code-review-validation-report.md +464 -0
- package/research/code-review/main-ts-review.md +1128 -0
- package/research/config-docs/SUMMARY.md +357 -0
- package/research/config-docs/TEST-RESULTS.md +776 -0
- package/research/config-docs/TODO.md +542 -0
- package/research/config-docs/analysis.md +744 -0
- package/research/config-docs/fix-validation.md +502 -0
- package/research/config-docs/help-audit.md +264 -0
- package/research/config-docs/help-system-analysis.md +890 -0
- package/research/frontmatter/COMMENTS-ARE-SKIPPED.md +149 -0
- package/research/frontmatter/LLM-CODE-NAVIGATION.md +276 -0
- package/research/issue-review.md +603 -0
- package/research/llm-summarization/agent-cli-tools-2026.md +1082 -0
- package/research/llm-summarization/alternative-providers-2026.md +1428 -0
- package/research/llm-summarization/anthropic-2026.md +367 -0
- package/research/llm-summarization/claude-cli-integration.md +1706 -0
- package/research/llm-summarization/cli-integration-patterns.md +3155 -0
- package/research/llm-summarization/openai-2026.md +473 -0
- package/research/llm-summarization/openai-compatible-providers-2026.md +1022 -0
- package/research/llm-summarization/opencode-cli-integration.md +1552 -0
- package/research/llm-summarization/prompt-engineering-2026.md +1426 -0
- package/research/llm-summarization/prototype-results.md +56 -0
- package/research/llm-summarization/provider-switching-patterns-2026.md +2153 -0
- package/research/llm-summarization/typescript-llm-libraries-2026.md +2436 -0
- package/research/mdcontext-pudding/00-EXECUTIVE-SUMMARY.md +282 -0
- package/research/mdcontext-pudding/01-index-embed.md +956 -0
- package/research/mdcontext-pudding/02-search-COMMANDS.md +142 -0
- package/research/mdcontext-pudding/02-search-SUMMARY.md +146 -0
- package/research/mdcontext-pudding/02-search.md +970 -0
- package/research/mdcontext-pudding/03-context.md +779 -0
- package/research/mdcontext-pudding/04-navigation-and-analytics.md +803 -0
- package/research/mdcontext-pudding/04-tree.md +704 -0
- package/research/mdcontext-pudding/05-config.md +1038 -0
- package/research/mdcontext-pudding/06-links-summary.txt +87 -0
- package/research/mdcontext-pudding/06-links.md +679 -0
- package/research/mdcontext-pudding/07-stats.md +693 -0
- package/research/mdcontext-pudding/BUG-FIX-PLAN.md +388 -0
- package/research/mdcontext-pudding/P0-BUG-VALIDATION.md +167 -0
- package/research/mdcontext-pudding/README.md +168 -0
- package/research/mdcontext-pudding/TESTING-SUMMARY.md +128 -0
- package/research/research-quality-review.md +834 -0
- package/research/semantic-search/embedding-text-analysis.md +156 -0
- package/research/semantic-search/multi-word-failure-reproduction.md +171 -0
- package/research/semantic-search/query-processing-analysis.md +207 -0
- package/research/semantic-search/root-cause-and-solution.md +114 -0
- package/research/semantic-search/threshold-validation-report.md +69 -0
- package/research/semantic-search/vector-search-analysis.md +63 -0
- package/research/test-path-issues.md +276 -0
- package/review/ALP-76/1-error-type-design.md +962 -0
- package/review/ALP-76/2-error-handling-patterns.md +906 -0
- package/review/ALP-76/3-error-presentation.md +624 -0
- package/review/ALP-76/4-test-coverage.md +625 -0
- package/review/ALP-76/5-migration-completeness.md +440 -0
- package/review/ALP-76/6-effect-best-practices.md +755 -0
- package/scripts/apply-branch-protection.sh +47 -0
- package/scripts/branch-protection-templates.json +79 -0
- package/scripts/prototype-summarization.ts +346 -0
- package/scripts/rebuild-hnswlib.js +32 -37
- package/scripts/setup-branch-protection.sh +64 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/active-provider.json +7 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/bm25.json +541 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/bm25.meta.json +5 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/config.json +8 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.bin +0 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.meta.bin +0 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/indexes/documents.json +60 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/indexes/links.json +13 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/indexes/sections.json +1197 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/configuration-management.md +99 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/distributed-systems.md +92 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/error-handling.md +78 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/failure-automation.md +55 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/job-context.md +69 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/process-orchestration.md +99 -0
- package/src/cli/argv-preprocessor.test.ts +2 -2
- package/src/cli/cli.test.ts +230 -33
- package/src/cli/commands/config-cmd.ts +642 -0
- package/src/cli/commands/context.ts +97 -9
- package/src/cli/commands/duplicates.ts +122 -0
- package/src/cli/commands/embeddings.ts +529 -0
- package/src/cli/commands/index-cmd.ts +210 -30
- package/src/cli/commands/index.ts +3 -0
- package/src/cli/commands/search.ts +894 -64
- package/src/cli/commands/stats.ts +3 -0
- package/src/cli/commands/tree.ts +26 -5
- package/src/cli/config-layer.ts +176 -0
- package/src/cli/error-handler.test.ts +235 -0
- package/src/cli/error-handler.ts +655 -0
- package/src/cli/flag-schemas.ts +66 -0
- package/src/cli/help.ts +209 -7
- package/src/cli/main.ts +348 -58
- package/src/cli/options.ts +10 -0
- package/src/cli/shared-error-handling.ts +199 -0
- package/src/cli/utils.ts +150 -17
- package/src/config/file-provider.test.ts +320 -0
- package/src/config/file-provider.ts +273 -0
- package/src/config/index.ts +72 -0
- package/src/config/integration.test.ts +667 -0
- package/src/config/precedence.test.ts +277 -0
- package/src/config/precedence.ts +451 -0
- package/src/config/schema.test.ts +414 -0
- package/src/config/schema.ts +603 -0
- package/src/config/service.test.ts +320 -0
- package/src/config/service.ts +243 -0
- package/src/config/testing.test.ts +264 -0
- package/src/config/testing.ts +110 -0
- package/src/core/types.ts +6 -33
- package/src/duplicates/detector.test.ts +183 -0
- package/src/duplicates/detector.ts +414 -0
- package/src/duplicates/index.ts +18 -0
- package/src/embeddings/embedding-namespace.test.ts +300 -0
- package/src/embeddings/embedding-namespace.ts +947 -0
- package/src/embeddings/heading-boost.test.ts +222 -0
- package/src/embeddings/hnsw-build-options.test.ts +198 -0
- package/src/embeddings/hyde.test.ts +272 -0
- package/src/embeddings/hyde.ts +264 -0
- package/src/embeddings/index.ts +2 -0
- package/src/embeddings/openai-provider.ts +332 -83
- package/src/embeddings/pricing.json +22 -0
- package/src/embeddings/provider-constants.ts +204 -0
- package/src/embeddings/provider-errors.test.ts +967 -0
- package/src/embeddings/provider-errors.ts +565 -0
- package/src/embeddings/provider-factory.test.ts +240 -0
- package/src/embeddings/provider-factory.ts +225 -0
- package/src/embeddings/provider-integration.test.ts +788 -0
- package/src/embeddings/query-preprocessing.test.ts +187 -0
- package/src/embeddings/semantic-search-threshold.test.ts +508 -0
- package/src/embeddings/semantic-search.ts +780 -93
- package/src/embeddings/types.ts +293 -16
- package/src/embeddings/vector-store.ts +486 -77
- package/src/embeddings/voyage-provider.ts +313 -0
- package/src/errors/errors.test.ts +845 -0
- package/src/errors/index.ts +533 -0
- package/src/index/ignore-patterns.test.ts +354 -0
- package/src/index/ignore-patterns.ts +305 -0
- package/src/index/indexer.ts +286 -48
- package/src/index/storage.ts +94 -30
- package/src/index/types.ts +40 -2
- package/src/index/watcher.ts +67 -9
- package/src/index.ts +22 -0
- package/src/integration/search-keyword.test.ts +678 -0
- package/src/mcp/server.ts +135 -6
- package/src/parser/parser.ts +18 -19
- package/src/parser/section-filter.test.ts +277 -0
- package/src/parser/section-filter.ts +125 -3
- package/src/search/__tests__/hybrid-search.test.ts +650 -0
- package/src/search/bm25-store.ts +366 -0
- package/src/search/cross-encoder.test.ts +253 -0
- package/src/search/cross-encoder.ts +406 -0
- package/src/search/fuzzy-search.test.ts +419 -0
- package/src/search/fuzzy-search.ts +273 -0
- package/src/search/hybrid-search.ts +448 -0
- package/src/search/path-matcher.test.ts +276 -0
- package/src/search/path-matcher.ts +33 -0
- package/src/search/searcher.test.ts +99 -1
- package/src/search/searcher.ts +189 -67
- package/src/search/wink-bm25.d.ts +30 -0
- package/src/summarization/cli-providers/claude.ts +202 -0
- package/src/summarization/cli-providers/detection.test.ts +273 -0
- package/src/summarization/cli-providers/detection.ts +118 -0
- package/src/summarization/cli-providers/index.ts +8 -0
- package/src/summarization/cost.test.ts +139 -0
- package/src/summarization/cost.ts +102 -0
- package/src/summarization/error-handler.test.ts +127 -0
- package/src/summarization/error-handler.ts +111 -0
- package/src/summarization/index.ts +102 -0
- package/src/summarization/pipeline.test.ts +498 -0
- package/src/summarization/pipeline.ts +231 -0
- package/src/summarization/prompts.test.ts +269 -0
- package/src/summarization/prompts.ts +133 -0
- package/src/summarization/provider-factory.test.ts +396 -0
- package/src/summarization/provider-factory.ts +178 -0
- package/src/summarization/types.ts +184 -0
- package/src/summarize/summarizer.ts +104 -35
- package/src/types/huggingface-transformers.d.ts +66 -0
- package/tests/fixtures/cli/.mdcontext/active-provider.json +7 -0
- package/tests/fixtures/cli/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.bin +0 -0
- package/tests/fixtures/cli/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.meta.bin +0 -0
- package/tests/fixtures/cli/.mdcontext/indexes/documents.json +4 -4
- package/tests/fixtures/cli/.mdcontext/indexes/sections.json +14 -0
- package/tests/integration/embed-index.test.ts +712 -0
- package/tests/integration/search-context.test.ts +469 -0
- package/tests/integration/search-semantic.test.ts +522 -0
- package/vitest.config.ts +1 -6
- package/AGENTS.md +0 -46
- package/tests/fixtures/cli/.mdcontext/vectors.bin +0 -0
- package/tests/fixtures/cli/.mdcontext/vectors.meta.json +0 -1264
|
@@ -0,0 +1,414 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Duplicate Content Detection
|
|
3
|
+
*
|
|
4
|
+
* Detects duplicate and near-duplicate content across markdown sections.
|
|
5
|
+
* Currently uses exact content-hash matching only; embedding-similarity detection of near-duplicates is planned but not yet implemented.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import * as crypto from 'node:crypto'
|
|
9
|
+
import * as fs from 'node:fs/promises'
|
|
10
|
+
import * as path from 'node:path'
|
|
11
|
+
import { Effect } from 'effect'
|
|
12
|
+
import type { FileReadError, IndexCorruptedError } from '../errors/index.js'
|
|
13
|
+
import { createStorage, loadSectionIndex } from '../index/storage.js'
|
|
14
|
+
|
|
15
|
+
// ============================================================================
|
|
16
|
+
// Types
|
|
17
|
+
// ============================================================================
|
|
18
|
+
|
|
19
|
+
/**
 * One cluster of sections that were judged to be duplicates of each other.
 *
 * A group always has a single primary section plus zero or more further
 * sections listed as its duplicates.
 */
export interface DuplicateGroup {
  /** Representative section of the group (first encountered or highest-ranked). */
  readonly primary: DuplicateSectionInfo
  /** Every other section considered a duplicate of `primary`. */
  readonly duplicates: ReadonlyArray<DuplicateSectionInfo>
  /** Which detection method produced this group. */
  readonly method: 'exact' | 'similar'
  /** Similarity score: 1.0 for exact matches, below 1.0 for similar ones. */
  readonly similarity: number
}
|
|
32
|
+
|
|
33
|
+
/**
 * Identifying details of a single section that belongs to a duplicate group.
 */
export interface DuplicateSectionInfo {
  /** Identifier of the section, taken from the section index entry. */
  readonly sectionId: string
  /** Path of the containing document; the detector resolves it against the root path. */
  readonly documentPath: string
  /** Heading text of the section. */
  readonly heading: string
  /** First line of the section within its document (1-based). */
  readonly startLine: number
  /** Last line of the section within its document (1-based, inclusive). */
  readonly endLine: number
  /** Token count recorded for the section in the section index. */
  readonly tokenCount: number
}
|
|
44
|
+
|
|
45
|
+
/**
 * Tuning knobs accepted by the duplicate detection entry points.
 *
 * Every field is optional; passing `undefined` is the same as omitting it.
 */
export interface DuplicateDetectionOptions {
  /**
   * Sections whose content is shorter than this many characters are not
   * considered. The exact-match detector falls back to 50 when unset.
   */
  readonly minContentLength?: number | undefined
  /** Similarity threshold for near-duplicate detection (0-1, default: 0.85) */
  readonly similarityThreshold?: number | undefined
  /** Report exact matches only, skipping similarity detection. */
  readonly exactOnly?: boolean | undefined
  /** Restrict detection to documents whose path matches this pattern. */
  readonly pathPattern?: string | undefined
}
|
|
58
|
+
|
|
59
|
+
/**
 * Outcome of a duplicate detection run: the groups found plus summary counts.
 */
export interface DuplicateDetectionResult {
  /** The duplicate groups that were found. */
  readonly groups: ReadonlyArray<DuplicateGroup>
  /** Total number of sections analyzed. */
  readonly sectionsAnalyzed: number
  /** Total number of duplicate pairs found. */
  readonly duplicatePairs: number
  /** Number of sections that take part in at least one duplicate relationship. */
  readonly sectionsWithDuplicates: number
}
|
|
72
|
+
|
|
73
|
+
/**
 * Options for collapsing search results.
 *
 * Both fields accept an explicit `undefined`, matching the convention used by
 * `DuplicateDetectionOptions`, so callers can forward possibly-unset values
 * without having to strip them first.
 */
export interface CollapseOptions {
  /** Show duplicate locations in output */
  readonly showLocations?: boolean | undefined
  /** Maximum duplicate locations to show (the collapser uses 3 when unset) */
  readonly maxLocations?: number | undefined
}
|
|
82
|
+
|
|
83
|
+
/**
 * A search result after duplicate collapsing, carrying information about the
 * duplicates that were folded into it.
 */
export interface CollapsedResult<T> {
  /** The result that was kept for its duplicate group. */
  readonly result: T
  /** How many duplicates were collapsed into `result`. */
  readonly duplicateCount: number
  /** Where the duplicates live; only populated when `showLocations` is enabled. */
  readonly duplicateLocations:
    | ReadonlyArray<{
        readonly documentPath: string
        readonly heading: string
      }>
    | undefined
}
|
|
99
|
+
|
|
100
|
+
// ============================================================================
|
|
101
|
+
// Content Hashing
|
|
102
|
+
// ============================================================================
|
|
103
|
+
|
|
104
|
+
/**
 * Normalize content for comparison so that purely cosmetic whitespace
 * differences do not affect the result.
 *
 * Steps, in order:
 * 1. strip leading/trailing whitespace of the whole text;
 * 2. convert CRLF and lone CR line endings to LF;
 * 3. collapse every run of spaces/tabs to a single space;
 * 4. drop the space left at the end of a line (this also empties
 *    whitespace-only lines so they can be collapsed in the next step);
 * 5. collapse three or more consecutive newlines to exactly two.
 *
 * Leading indentation is kept, reduced to a single space.
 *
 * @param content - Raw section text.
 * @returns The normalized text.
 */
const normalizeContent = (content: string): string => {
  return (
    content
      .trim()
      .replace(/\r\n?/g, '\n')
      .replace(/[ \t]+/g, ' ')
      // After the previous step at most one space can precede a newline.
      .replace(/ \n/g, '\n')
      .replace(/\n{3,}/g, '\n\n')
  )
}
|
|
115
|
+
|
|
116
|
+
/**
 * Produce the fingerprint used for exact duplicate detection.
 *
 * The text is first passed through `normalizeContent`; the SHA-256 digest of
 * the normalized text is returned as a hex string.
 *
 * @param content - Raw section text.
 * @returns Hex-encoded SHA-256 digest of the normalized text.
 */
const computeContentHash = (content: string): string => {
  const hasher = crypto.createHash('sha256')
  hasher.update(normalizeContent(content))
  return hasher.digest('hex')
}
|
|
123
|
+
|
|
124
|
+
// ============================================================================
|
|
125
|
+
// Section Content Loading
|
|
126
|
+
// ============================================================================
|
|
127
|
+
|
|
128
|
+
/**
 * Cache of file contents, so that several sections taken from the same file
 * only cause that file to be read once.
 */
interface FileContentCache {
  /** Backing store, keyed by document path; `null` marks a file that could not be read. */
  readonly cache: Map<string, string | null>
  /**
   * Look up the content of `documentPath` (resolved against `rootPath`).
   * The effect cannot fail: an unreadable file yields `null`.
   */
  get: (
    rootPath: string,
    documentPath: string,
  ) => Effect.Effect<string | null, never>
}
|
|
139
|
+
|
|
140
|
+
/**
 * Build a fresh, empty file content cache.
 *
 * Entries are keyed by `documentPath` alone. A file that cannot be read is
 * remembered as `null`, so the read is not retried on later lookups.
 */
const createFileContentCache = (): FileContentCache => {
  const cache = new Map<string, string | null>()

  // Best-effort read: any failure (including path resolution) becomes `null`.
  const readOrNull = async (
    rootPath: string,
    documentPath: string,
  ): Promise<string | null> => {
    try {
      const absolutePath = path.join(rootPath, documentPath)
      return await fs.readFile(absolutePath, 'utf-8')
    } catch {
      return null
    }
  }

  const get = (rootPath: string, documentPath: string) =>
    Effect.gen(function* () {
      // Stored values are `string | null`, so `undefined` can only mean "not cached yet".
      const cached = cache.get(documentPath)
      if (cached !== undefined) {
        return cached
      }

      const loaded = yield* Effect.promise(() =>
        readOrNull(rootPath, documentPath),
      )
      cache.set(documentPath, loaded)
      return loaded
    })

  return { cache, get }
}
|
|
166
|
+
|
|
167
|
+
/**
 * Extract the text of a section from the full content of its file.
 *
 * Lines are obtained by splitting on `\n`. `startLine` and `endLine` are
 * 1-based and inclusive. Out-of-range values are clamped rather than being
 * interpreted as offsets from the end of the file: a `startLine` below 1
 * starts at the first line, and a negative `endLine` selects nothing.
 *
 * @param content - Full file content.
 * @param startLine - First line of the section (1-based).
 * @param endLine - Last line of the section (1-based, inclusive).
 * @returns The selected lines joined with `\n`; empty when the range is empty.
 */
const extractSectionFromContent = (
  content: string,
  startLine: number,
  endLine: number,
): string => {
  const lines = content.split('\n')
  // Array.prototype.slice treats negative indices as "from the end"; clamp so
  // a bad range can never silently select text from the tail of the file.
  const from = Math.max(startLine - 1, 0)
  const to = Math.max(endLine, 0)
  return lines.slice(from, to).join('\n')
}
|
|
178
|
+
|
|
179
|
+
// ============================================================================
|
|
180
|
+
// Duplicate Detection
|
|
181
|
+
// ============================================================================
|
|
182
|
+
|
|
183
|
+
/**
 * Detect exact duplicate sections by hashing their normalized content.
 *
 * The section index is loaded from storage under `rootPath`; each section's
 * text is read from its source file and hashed, and sections sharing a hash
 * form one group. No embeddings are required.
 *
 * Files are read concurrently (at most 10 at a time), but the hashed sections
 * are merged in the order the files first appear in the section index. The
 * primary of every group is therefore deterministic and does not depend on
 * which file read happens to finish first.
 *
 * @param rootPath - Project root that contains the index and the documents.
 * @param options - Detection options; `minContentLength` (default 50) and
 *   `pathPattern` are honoured here.
 * @returns Groups sorted by number of duplicates (descending) plus summary
 *   counts. `sectionsAnalyzed` counts all sections that passed the path
 *   filter, including ones later skipped for being too short or unreadable.
 *   An absent section index yields an empty result.
 */
export const detectExactDuplicates = (
  rootPath: string,
  options: DuplicateDetectionOptions = {},
): Effect.Effect<
  DuplicateDetectionResult,
  FileReadError | IndexCorruptedError
> =>
  Effect.gen(function* () {
    const minContentLength = options.minContentLength ?? 50
    const pathPattern = options.pathPattern
    const storage = createStorage(rootPath)

    // Load section index
    const sectionIndex = yield* loadSectionIndex(storage)
    if (!sectionIndex) {
      return {
        groups: [],
        sectionsAnalyzed: 0,
        duplicatePairs: 0,
        sectionsWithDuplicates: 0,
      }
    }

    const sections = Object.values(sectionIndex.sections)

    // Filter sections by path pattern if specified
    const filteredSections = pathPattern
      ? sections.filter((s) => matchPathPattern(s.documentPath, pathPattern))
      : sections

    // Create file content cache to avoid re-reading files
    const fileCache = createFileContentCache()

    // Group sections by file so each file is loaded exactly once
    const sectionsByFile = new Map<string, typeof filteredSections>()
    for (const section of filteredSections) {
      const existing = sectionsByFile.get(section.documentPath)
      if (existing) {
        existing.push(section)
      } else {
        sectionsByFile.set(section.documentPath, [section])
      }
    }

    // Hash all files in parallel with a concurrency limit. Each effect only
    // returns its own results; nothing shared is mutated while running
    // concurrently.
    const hashedPerFile = yield* Effect.all(
      Array.from(sectionsByFile.entries()).map(
        ([documentPath, fileSections]) =>
          Effect.gen(function* () {
            const hashed: Array<{
              hash: string
              info: DuplicateSectionInfo
            }> = []

            // Load file content once (cached); unreadable or empty files
            // contribute nothing.
            const fileContent = yield* fileCache.get(rootPath, documentPath)
            if (!fileContent) return hashed

            for (const section of fileSections) {
              const content = extractSectionFromContent(
                fileContent,
                section.startLine,
                section.endLine,
              )

              if (content.length < minContentLength) {
                continue
              }

              hashed.push({
                hash: computeContentHash(content),
                info: {
                  sectionId: section.id,
                  documentPath: section.documentPath,
                  heading: section.heading,
                  startLine: section.startLine,
                  endLine: section.endLine,
                  tokenCount: section.tokenCount,
                },
              })
            }

            return hashed
          }),
      ),
      { concurrency: 10 },
    )

    // Map: hash -> list of sections with that hash. Effect.all returns the
    // per-file results in input order, so merging here is deterministic.
    const hashGroups = new Map<string, DuplicateSectionInfo[]>()
    for (const hashed of hashedPerFile) {
      for (const { hash, info } of hashed) {
        const existing = hashGroups.get(hash)
        if (existing) {
          existing.push(info)
        } else {
          hashGroups.set(hash, [info])
        }
      }
    }

    // Convert to DuplicateGroup format
    const groups: DuplicateGroup[] = []
    let duplicatePairs = 0
    const sectionsInDuplicates = new Set<string>()

    for (const members of hashGroups.values()) {
      const primary = members[0]
      // A hash shared by fewer than two sections is not a duplicate.
      if (primary === undefined || members.length < 2) {
        continue
      }

      const duplicates = members.slice(1)
      groups.push({
        primary,
        duplicates,
        method: 'exact',
        similarity: 1.0,
      })

      // Track stats
      duplicatePairs += duplicates.length
      for (const m of members) {
        sectionsInDuplicates.add(m.sectionId)
      }
    }

    // Sort by number of duplicates (descending)
    groups.sort((a, b) => b.duplicates.length - a.duplicates.length)

    return {
      groups,
      sectionsAnalyzed: filteredSections.length,
      duplicatePairs,
      sectionsWithDuplicates: sectionsInDuplicates.size,
    }
  })
|
|
311
|
+
|
|
312
|
+
/**
 * Simple glob-like path matching.
 *
 * Supported wildcards:
 * - `**` matches any sequence of characters, including `/`;
 * - `*` matches any sequence of characters within one path segment;
 * - `?` matches exactly one character within a path segment.
 *
 * All other regex metacharacters in the pattern are matched literally.
 *
 * The pattern is anchored at the start of the path only, so it acts as a
 * prefix match: `docs/*.md` also accepts `docs/a.md.bak`.
 *
 * The translation is done in a single pass. Translating `**` first and `*`
 * afterwards would re-process the `*` in the already generated `.*`, and
 * `**` would stop crossing path segments.
 *
 * @param filePath - Path to test.
 * @param pattern - Glob-like pattern.
 * @returns `true` when the start of `filePath` matches `pattern`.
 */
const matchPathPattern = (filePath: string, pattern: string): boolean => {
  const regexPattern = pattern.replace(
    /\*\*|\*|\?|[.+^${}()|[\]\\]/g,
    (token) => {
      switch (token) {
        case '**':
          return '.*' // ** matches anything, across segments
        case '*':
          return '[^/]*' // * matches within a segment
        case '?':
          return '[^/]' // ? matches a single character within a segment
        default:
          return '\\' + token // Escape regex special chars
      }
    },
  )
  const regex = new RegExp(`^${regexPattern}`)
  return regex.test(filePath)
}
|
|
324
|
+
|
|
325
|
+
// ============================================================================
|
|
326
|
+
// Search Result Collapsing
|
|
327
|
+
// ============================================================================
|
|
328
|
+
|
|
329
|
+
/**
 * Collapse duplicate search results.
 *
 * Results are processed in their given order. The first result seen from
 * each duplicate group is kept and later results from the same group are
 * dropped. Results that belong to no group pass through unchanged with a
 * duplicate count of 0.
 *
 * The reported duplicate locations are the other members of the group
 * relative to the result that was kept. If the kept result is not the
 * group's primary, the primary is listed as a location and the kept result
 * itself is not.
 *
 * @param results - Search results with sectionId property
 * @param duplicateGroups - Pre-computed duplicate groups
 * @param options - Collapse options; `maxLocations` defaults to 3 and
 *   locations are only produced when `showLocations` is truthy
 * @returns Collapsed results with duplicate counts
 */
export const collapseDuplicates = <
  T extends { readonly sectionId: string; readonly documentPath: string },
>(
  results: readonly T[],
  duplicateGroups: readonly DuplicateGroup[],
  options: CollapseOptions = {},
): readonly CollapsedResult<T>[] => {
  const maxLocations = options.maxLocations ?? 3

  // sectionId -> primary sectionId of its group
  const primaryMap = new Map<string, string>()
  // primary sectionId -> every member of the group (primary first)
  const membersByPrimary = new Map<string, readonly DuplicateSectionInfo[]>()

  for (const group of duplicateGroups) {
    const groupPrimaryId = group.primary.sectionId
    membersByPrimary.set(groupPrimaryId, [group.primary, ...group.duplicates])

    // Map the primary to itself and all duplicates to the primary
    primaryMap.set(groupPrimaryId, groupPrimaryId)
    for (const dup of group.duplicates) {
      primaryMap.set(dup.sectionId, groupPrimaryId)
    }
  }

  // Track which groups have already contributed a result
  const seenPrimaries = new Set<string>()
  const collapsedResults: CollapsedResult<T>[] = []

  for (const result of results) {
    // Sections outside every group act as their own primary
    const primaryId = primaryMap.get(result.sectionId) ?? result.sectionId

    if (seenPrimaries.has(primaryId)) {
      // Skip - we've already added this duplicate group
      continue
    }

    seenPrimaries.add(primaryId)

    // Duplicates are all group members other than the result being shown
    const others = (membersByPrimary.get(primaryId) ?? []).filter(
      (member) => member.sectionId !== result.sectionId,
    )
    const duplicateLocations =
      options.showLocations && others.length > 0
        ? others.slice(0, maxLocations).map((d) => ({
            documentPath: d.documentPath,
            heading: d.heading,
          }))
        : undefined

    collapsedResults.push({
      result,
      duplicateCount: others.length,
      duplicateLocations,
    })
  }

  return collapsedResults
}
|
|
395
|
+
|
|
396
|
+
// ============================================================================
|
|
397
|
+
// Detection Entry Point (currently exact, hash-based detection only)
|
|
398
|
+
// ============================================================================
|
|
399
|
+
|
|
400
|
+
/**
 * Main entry point for duplicate detection.
 *
 * At present this simply delegates to `detectExactDuplicates`: only exact
 * duplicates (found via content hashing) are reported. Embedding-based
 * similarity detection for near-duplicates is a possible future addition.
 *
 * @param rootPath - Project root passed through to the exact detector.
 * @param options - Detection options passed through unchanged.
 * @returns The result produced by `detectExactDuplicates`.
 */
export const detectDuplicates = (
  rootPath: string,
  options: DuplicateDetectionOptions = {},
): Effect.Effect<
  DuplicateDetectionResult,
  FileReadError | IndexCorruptedError
> => detectExactDuplicates(rootPath, options)
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Duplicate detection module exports
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
export type {
|
|
6
|
+
CollapsedResult,
|
|
7
|
+
CollapseOptions,
|
|
8
|
+
DuplicateDetectionOptions,
|
|
9
|
+
DuplicateDetectionResult,
|
|
10
|
+
DuplicateGroup,
|
|
11
|
+
DuplicateSectionInfo,
|
|
12
|
+
} from './detector.js'
|
|
13
|
+
|
|
14
|
+
export {
|
|
15
|
+
collapseDuplicates,
|
|
16
|
+
detectDuplicates,
|
|
17
|
+
detectExactDuplicates,
|
|
18
|
+
} from './detector.js'
|