@comfanion/usethis_search 0.1.5 → 0.2.0-dev.0
This diff shows the changes between publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
- package/README.md +98 -7
- package/file-indexer.ts +21 -1
- package/package.json +12 -2
- package/tools/codeindex.ts +135 -16
- package/tools/search.ts +46 -11
- package/vectorizer/bm25-index.ts +155 -0
- package/vectorizer/chunkers/chunker-factory.ts +98 -0
- package/vectorizer/chunkers/code-chunker.ts +325 -0
- package/vectorizer/chunkers/markdown-chunker.ts +177 -0
- package/vectorizer/content-cleaner.ts +136 -0
- package/vectorizer/hybrid-search.ts +97 -0
- package/vectorizer/index.js +395 -16
- package/vectorizer/metadata-extractor.ts +125 -0
- package/vectorizer/query-cache.ts +126 -0
- package/vectorizer/search-metrics.ts +155 -0
- package/vectorizer.yaml +81 -0
package/vectorizer/chunkers/markdown-chunker.ts
@@ -0,0 +1,177 @@
+/**
+ * Markdown Chunker — splits Markdown by heading structure.
+ *
+ * Preserves heading hierarchy ("API > Auth > JWT") in metadata,
+ * merges small sections, and splits oversized ones.
+ */
+
+export interface MarkdownChunkConfig {
+  min_chunk_size: number // merge sections smaller than this (chars)
+  max_chunk_size: number // split sections larger than this (chars)
+  split_by_headings: boolean
+  preserve_heading_hierarchy: boolean
+}
+
+export const DEFAULT_MD_CONFIG: MarkdownChunkConfig = {
+  min_chunk_size: 200,
+  max_chunk_size: 2000,
+  split_by_headings: true,
+  preserve_heading_hierarchy: true,
+}
+
+export interface MarkdownChunk {
+  content: string
+  heading_context: string // "H1 > H2 > H3"
+}
+
+// ── Internal types ──────────────────────────────────────────────────────────
+
+interface Section {
+  level: number // 1-6 for headings, 0 for preamble
+  heading: string
+  body: string
+}
+
+// ── Parsing ─────────────────────────────────────────────────────────────────
+
+/** Parse Markdown into sections keyed by heading. */
+function parseSections(content: string): Section[] {
+  const lines = content.split("\n")
+  const sections: Section[] = []
+  let currentSection: Section = { level: 0, heading: "", body: "" }
+
+  for (const line of lines) {
+    const headingMatch = line.match(/^(#{1,6})\s+(.+)$/)
+    if (headingMatch) {
+      // Push previous section
+      if (currentSection.body.trim() || currentSection.heading) {
+        sections.push(currentSection)
+      }
+      currentSection = {
+        level: headingMatch[1].length,
+        heading: headingMatch[2].trim(),
+        body: "",
+      }
+    } else {
+      currentSection.body += line + "\n"
+    }
+  }
+
+  // Push last section
+  if (currentSection.body.trim() || currentSection.heading) {
+    sections.push(currentSection)
+  }
+
+  return sections
+}
+
+/** Build heading hierarchy path for a section given the heading stack. */
+function buildHeadingContext(stack: { level: number; heading: string }[]): string {
+  return stack.map((h) => h.heading).join(" > ")
+}
+
+// ── Splitting oversized sections ────────────────────────────────────────────
+
+function splitLargeText(text: string, maxSize: number): string[] {
+  if (text.length <= maxSize) return [text]
+
+  const chunks: string[] = []
+  const lines = text.split("\n")
+  let current: string[] = []
+  let currentLen = 0
+
+  for (const line of lines) {
+    if (currentLen + line.length + 1 > maxSize && current.length > 0) {
+      chunks.push(current.join("\n"))
+      current = []
+      currentLen = 0
+    }
+    current.push(line)
+    currentLen += line.length + 1
+  }
+
+  if (current.length > 0) {
+    chunks.push(current.join("\n"))
+  }
+
+  return chunks
+}
+
+// ── Public API ──────────────────────────────────────────────────────────────
+
+/**
+ * Chunk Markdown content into semantic sections.
+ */
+export function chunkMarkdown(
+  content: string,
+  config: MarkdownChunkConfig = DEFAULT_MD_CONFIG,
+): MarkdownChunk[] {
+  if (!config.split_by_headings) {
+    // Fallback: single chunk (caller can use fixed chunker)
+    return [{ content, heading_context: "" }]
+  }
+
+  const sections = parseSections(content)
+  const rawChunks: MarkdownChunk[] = []
+
+  // Heading stack for hierarchy tracking
+  const headingStack: { level: number; heading: string }[] = []
+
+  for (const section of sections) {
+    // Update heading stack
+    if (section.level > 0) {
+      // Pop headings at same or deeper level
+      while (
+        headingStack.length > 0 &&
+        headingStack[headingStack.length - 1].level >= section.level
+      ) {
+        headingStack.pop()
+      }
+      headingStack.push({ level: section.level, heading: section.heading })
+    }
+
+    const headingContext = config.preserve_heading_hierarchy
+      ? buildHeadingContext(headingStack)
+      : section.heading
+
+    const sectionText = section.heading
+      ? `${"#".repeat(section.level)} ${section.heading}\n${section.body}`
+      : section.body
+
+    rawChunks.push({ content: sectionText.trim(), heading_context: headingContext })
+  }
+
+  // Merge small sections with previous
+  const merged: MarkdownChunk[] = []
+  for (const chunk of rawChunks) {
+    if (
+      merged.length > 0 &&
+      chunk.content.length < config.min_chunk_size
+    ) {
+      const prev = merged[merged.length - 1]
+      prev.content += "\n\n" + chunk.content
+      // Keep the deepest heading context
+      if (chunk.heading_context) {
+        prev.heading_context = chunk.heading_context
+      }
+    } else {
+      merged.push({ ...chunk })
+    }
+  }
+
+  // Split oversized sections
+  const result: MarkdownChunk[] = []
+  for (const chunk of merged) {
+    if (chunk.content.length > config.max_chunk_size) {
+      const parts = splitLargeText(chunk.content, config.max_chunk_size)
+      for (const part of parts) {
+        result.push({ content: part, heading_context: chunk.heading_context })
+      }
+    } else {
+      result.push(chunk)
+    }
+  }
+
+  // Filter empties
+  return result.filter((c) => c.content.trim().length > 0)
+}
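
For context, a minimal usage sketch of `chunkMarkdown` (not part of the package diff; the import path is illustrative):

```ts
import { chunkMarkdown } from "@comfanion/usethis_search/vectorizer/chunkers/markdown-chunker"

const doc = [
  "# API",
  "",
  "## Auth",
  "",
  "### JWT",
  "",
  "Tokens are signed with a shared secret.",
].join("\n")

// With the defaults, each section is smaller than min_chunk_size (200 chars),
// so all three merge into one chunk that keeps the deepest heading path.
const chunks = chunkMarkdown(doc)
console.log(chunks.length) // 1
console.log(chunks[0].heading_context) // "API > Auth > JWT"
```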
package/vectorizer/content-cleaner.ts
@@ -0,0 +1,136 @@
+/**
+ * Content Cleaner — removes noise from file content before chunking.
+ *
+ * Strips TOC blocks, breadcrumbs, repeated headers, auto-generated markers,
+ * and optionally imports/comments so the embedding model sees only signal.
+ */
+
+export interface CleaningConfig {
+  remove_toc: boolean
+  remove_frontmatter_metadata: boolean
+  remove_imports: boolean
+  remove_comments: boolean
+}
+
+export const DEFAULT_CLEANING_CONFIG: CleaningConfig = {
+  remove_toc: true,
+  remove_frontmatter_metadata: false,
+  remove_imports: false,
+  remove_comments: false,
+}
+
+// ── Markdown noise ──────────────────────────────────────────────────────────
+
+/** Remove YAML front-matter (---…---) from Markdown. */
+function stripFrontmatter(text: string): string {
+  return text.replace(/^---\n[\s\S]*?\n---\n?/, "")
+}
+
+/**
+ * Remove inline TOC blocks.
+ * Matches patterns like:
+ *   ## Table of Contents
+ *   - [Section](#section)
+ *   …blank line
+ */
+function stripToc(text: string): string {
+  // Pattern: heading containing "table of contents" or "contents" followed by link-list
+  return text.replace(
+    /^#{1,3}\s*(Table of Contents|Contents|TOC)\s*\n([\t ]*[-*]\s*\[.*?\]\(#.*?\)\s*\n?)+/gim,
+    "",
+  )
+}
+
+/** Remove HTML-style TOC markers like <!-- TOC --> … <!-- /TOC --> */
+function stripHtmlTocMarkers(text: string): string {
+  return text.replace(/<!--\s*TOC\s*-->[\s\S]*?<!--\s*\/TOC\s*-->\n?/gi, "")
+}
+
+/** Remove breadcrumb lines, e.g. `Home > Docs > API` at the top. */
+function stripBreadcrumbs(text: string): string {
+  // Matches lines that look like breadcrumbs (word > word > word) at start
+  return text.replace(/^(?:[\w\s]+>\s*){2,}[\w\s]+\n{1,2}/gm, "")
+}
+
+/** Remove auto-generated code markers like `// AUTO-GENERATED` blocks. */
+function stripAutoGenMarkers(text: string): string {
+  return text.replace(
+    /\/[/*]\s*(?:AUTO[- ]?GENERATED|DO NOT (?:EDIT|MODIFY)|GENERATED BY|This file (?:is|was) (?:auto-?)?generated)[^\n]*/gi,
+    "",
+  )
+}
+
+// ── Code noise ──────────────────────────────────────────────────────────────
+
+/** Remove import/require statements (JS/TS/Python/Go). */
+function stripImports(text: string): string {
+  // JS/TS imports
+  let result = text.replace(/^import\s[\s\S]*?from\s+['"][^'"]+['"];?\s*$/gm, "")
+  result = result.replace(/^import\s+['"][^'"]+['"];?\s*$/gm, "")
+  // require
+  result = result.replace(/^(?:const|let|var)\s+.*?=\s*require\s*\(.*?\);?\s*$/gm, "")
+  // Python
+  result = result.replace(/^(?:from\s+\S+\s+)?import\s+.+$/gm, "")
+  // Go
+  result = result.replace(/^import\s*\(\s*\n(?:[\t ]*"[^"]*"\s*\n?)*\s*\)/gm, "")
+  result = result.replace(/^import\s+"[^"]*"\s*$/gm, "")
+  return result
+}
+
+/** Remove single-line and block comments (JS/TS style). */
+function stripComments(text: string): string {
+  // Block comments
+  let result = text.replace(/\/\*[\s\S]*?\*\//g, "")
+  // Single-line // comments (only full-line, not inline URLs etc.)
+  result = result.replace(/^\s*\/\/[^\n]*$/gm, "")
+  // Python/Ruby # comments (full line only)
+  result = result.replace(/^\s*#[^\n!]*$/gm, "")
+  return result
+}
+
+// ── Shared ──────────────────────────────────────────────────────────────────
+
+/** Collapse 3+ consecutive blank lines into 2. */
+function collapseBlankLines(text: string): string {
+  return text.replace(/\n{3,}/g, "\n\n")
+}
+
+// ── Public API ──────────────────────────────────────────────────────────────
+
+/**
+ * Clean file content according to the supplied config.
+ * @param content  Raw file content
+ * @param fileType 'docs' | 'code' | 'config'
+ * @param config   Cleaning options
+ */
+export function cleanContent(
+  content: string,
+  fileType: "docs" | "code" | "config",
+  config: CleaningConfig = DEFAULT_CLEANING_CONFIG,
+): string {
+  let result = content
+
+  if (fileType === "docs") {
+    if (config.remove_frontmatter_metadata) {
+      result = stripFrontmatter(result)
+    }
+    if (config.remove_toc) {
+      result = stripToc(result)
+      result = stripHtmlTocMarkers(result)
+    }
+    result = stripBreadcrumbs(result)
+  }
+
+  if (fileType === "code") {
+    result = stripAutoGenMarkers(result)
+    if (config.remove_imports) {
+      result = stripImports(result)
+    }
+    if (config.remove_comments) {
+      result = stripComments(result)
+    }
+  }
+
+  result = collapseBlankLines(result).trim()
+  return result
+}
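
A short sketch of how `cleanContent` behaves on a docs file with the default config (illustrative; the import path is assumed):

```ts
import { cleanContent } from "@comfanion/usethis_search/vectorizer/content-cleaner"

const readme = [
  "## Table of Contents",
  "- [Install](#install)",
  "- [Usage](#usage)",
  "",
  "## Install",
  "",
  "Run npm install.",
].join("\n")

// remove_toc defaults to true, so the heading plus its link list is stripped;
// breadcrumb removal and blank-line collapsing always run for "docs".
console.log(cleanContent(readme, "docs"))
// ## Install
//
// Run npm install.
```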
package/vectorizer/hybrid-search.ts
@@ -0,0 +1,97 @@
+/**
+ * Hybrid Search — merges vector similarity and BM25 keyword scores.
+ *
+ * Uses Reciprocal Rank Fusion (RRF) or weighted linear combination
+ * to merge results from two search backends.
+ */
+
+// ── Types ───────────────────────────────────────────────────────────────────
+
+export interface HybridSearchConfig {
+  enabled: boolean
+  bm25_weight: number // 0.0–1.0, vector_weight = 1 - bm25_weight
+}
+
+export const DEFAULT_HYBRID_CONFIG: HybridSearchConfig = {
+  enabled: false,
+  bm25_weight: 0.3,
+}
+
+export interface ScoredResult {
+  id: number // index into the results array
+  vectorScore: number // 0–1 (1 = best)
+  bm25Score: number // raw BM25 score (unnormalized)
+  combinedScore: number
+}
+
+// ── Merge logic ─────────────────────────────────────────────────────────────
+
+/**
+ * Normalize BM25 scores to 0–1 range using min-max scaling.
+ */
+function normalizeBM25Scores(scores: Map<number, number>): Map<number, number> {
+  if (scores.size === 0) return scores
+
+  let min = Infinity
+  let max = -Infinity
+  for (const s of scores.values()) {
+    if (s < min) min = s
+    if (s > max) max = s
+  }
+
+  const range = max - min
+  if (range === 0) {
+    // All same score → normalize to 0.5
+    const result = new Map<number, number>()
+    for (const [id] of scores) result.set(id, 0.5)
+    return result
+  }
+
+  const result = new Map<number, number>()
+  for (const [id, score] of scores) {
+    result.set(id, (score - min) / range)
+  }
+  return result
+}
+
+/**
+ * Merge vector and BM25 results using weighted linear combination.
+ *
+ * @param vectorResults Map of chunkIndex → vectorScore (0–1, higher = better)
+ * @param bm25Results Map of chunkIndex → raw BM25 score
+ * @param config Hybrid search config (weights)
+ * @param limit Max results to return
+ */
+export function mergeResults(
+  vectorResults: Map<number, number>,
+  bm25Results: Map<number, number>,
+  config: HybridSearchConfig = DEFAULT_HYBRID_CONFIG,
+  limit: number = 10,
+): ScoredResult[] {
+  const vectorWeight = 1 - config.bm25_weight
+  const bm25Weight = config.bm25_weight
+
+  const normalizedBM25 = normalizeBM25Scores(bm25Results)
+
+  // Collect all unique IDs
+  const allIds = new Set<number>()
+  for (const id of vectorResults.keys()) allIds.add(id)
+  for (const id of normalizedBM25.keys()) allIds.add(id)
+
+  const results: ScoredResult[] = []
+
+  for (const id of allIds) {
+    const vs = vectorResults.get(id) ?? 0
+    const bs = normalizedBM25.get(id) ?? 0
+
+    results.push({
+      id,
+      vectorScore: vs,
+      bm25Score: bm25Results.get(id) ?? 0,
+      combinedScore: vectorWeight * vs + bm25Weight * bs,
+    })
+  }
+
+  results.sort((a, b) => b.combinedScore - a.combinedScore)
+  return results.slice(0, limit)
+}
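
To make the weighted combination concrete, a small sketch (illustrative; the import path is assumed). With bm25_weight = 0.3, the combined score is 0.7 * vectorScore + 0.3 * min-max-normalized BM25 score:

```ts
import { mergeResults } from "@comfanion/usethis_search/vectorizer/hybrid-search"

// Chunk 0 ranks high on vector similarity, chunk 1 on raw keyword score.
const vector = new Map<number, number>([[0, 0.9], [1, 0.2]])
const bm25 = new Map<number, number>([[0, 1.0], [1, 8.0]]) // raw, unnormalized

// Min-max scaling maps BM25 1.0 → 0 and 8.0 → 1, so:
//   chunk 0: 0.7 * 0.9 + 0.3 * 0 ≈ 0.63
//   chunk 1: 0.7 * 0.2 + 0.3 * 1 ≈ 0.44
const ranked = mergeResults(vector, bm25, { enabled: true, bm25_weight: 0.3 })
console.log(ranked.map((r) => [r.id, r.combinedScore.toFixed(2)]))
// [[0, "0.63"], [1, "0.44"]]
```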