@comfanion/usethis_search 0.1.5 → 0.2.0-dev.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,177 @@
1
+ /**
2
+ * Markdown Chunker — splits Markdown by heading structure.
3
+ *
4
+ * Preserves heading hierarchy ("API > Auth > JWT") in metadata,
5
+ * merges small sections, and splits oversized ones.
6
+ */
7
+
8
/** Tunables for heading-based Markdown chunking. */
export interface MarkdownChunkConfig {
  min_chunk_size: number // merge sections smaller than this (chars)
  max_chunk_size: number // split sections larger than this (chars)
  split_by_headings: boolean // false → return the whole document as one chunk
  preserve_heading_hierarchy: boolean // true → full "H1 > H2" path; false → own heading only
}

/** Defaults sized for typical documentation pages; override per call site. */
export const DEFAULT_MD_CONFIG: MarkdownChunkConfig = {
  min_chunk_size: 200,
  max_chunk_size: 2000,
  split_by_headings: true,
  preserve_heading_hierarchy: true,
}

/** One chunk of Markdown plus the heading path that leads to it. */
export interface MarkdownChunk {
  content: string
  heading_context: string // "H1 > H2 > H3"
}
26
+
27
+ // ── Internal types ──────────────────────────────────────────────────────────
28
+
29
+ interface Section {
30
+ level: number // 1-6 for headings, 0 for preamble
31
+ heading: string
32
+ body: string
33
+ }
34
+
35
+ // ── Parsing ─────────────────────────────────────────────────────────────────
36
+
37
+ /** Parse Markdown into sections keyed by heading. */
38
+ function parseSections(content: string): Section[] {
39
+ const lines = content.split("\n")
40
+ const sections: Section[] = []
41
+ let currentSection: Section = { level: 0, heading: "", body: "" }
42
+
43
+ for (const line of lines) {
44
+ const headingMatch = line.match(/^(#{1,6})\s+(.+)$/)
45
+ if (headingMatch) {
46
+ // Push previous section
47
+ if (currentSection.body.trim() || currentSection.heading) {
48
+ sections.push(currentSection)
49
+ }
50
+ currentSection = {
51
+ level: headingMatch[1].length,
52
+ heading: headingMatch[2].trim(),
53
+ body: "",
54
+ }
55
+ } else {
56
+ currentSection.body += line + "\n"
57
+ }
58
+ }
59
+
60
+ // Push last section
61
+ if (currentSection.body.trim() || currentSection.heading) {
62
+ sections.push(currentSection)
63
+ }
64
+
65
+ return sections
66
+ }
67
+
68
+ /** Build heading hierarchy path for a section given the heading stack. */
69
+ function buildHeadingContext(stack: { level: number; heading: string }[]): string {
70
+ return stack.map((h) => h.heading).join(" > ")
71
+ }
72
+
73
+ // ── Splitting oversized sections ────────────────────────────────────────────
74
+
75
+ function splitLargeText(text: string, maxSize: number): string[] {
76
+ if (text.length <= maxSize) return [text]
77
+
78
+ const chunks: string[] = []
79
+ const lines = text.split("\n")
80
+ let current: string[] = []
81
+ let currentLen = 0
82
+
83
+ for (const line of lines) {
84
+ if (currentLen + line.length + 1 > maxSize && current.length > 0) {
85
+ chunks.push(current.join("\n"))
86
+ current = []
87
+ currentLen = 0
88
+ }
89
+ current.push(line)
90
+ currentLen += line.length + 1
91
+ }
92
+
93
+ if (current.length > 0) {
94
+ chunks.push(current.join("\n"))
95
+ }
96
+
97
+ return chunks
98
+ }
99
+
100
+ // ── Public API ──────────────────────────────────────────────────────────────
101
+
102
+ /**
103
+ * Chunk Markdown content into semantic sections.
104
+ */
105
+ export function chunkMarkdown(
106
+ content: string,
107
+ config: MarkdownChunkConfig = DEFAULT_MD_CONFIG,
108
+ ): MarkdownChunk[] {
109
+ if (!config.split_by_headings) {
110
+ // Fallback: single chunk (caller can use fixed chunker)
111
+ return [{ content, heading_context: "" }]
112
+ }
113
+
114
+ const sections = parseSections(content)
115
+ const rawChunks: MarkdownChunk[] = []
116
+
117
+ // Heading stack for hierarchy tracking
118
+ const headingStack: { level: number; heading: string }[] = []
119
+
120
+ for (const section of sections) {
121
+ // Update heading stack
122
+ if (section.level > 0) {
123
+ // Pop headings at same or deeper level
124
+ while (
125
+ headingStack.length > 0 &&
126
+ headingStack[headingStack.length - 1].level >= section.level
127
+ ) {
128
+ headingStack.pop()
129
+ }
130
+ headingStack.push({ level: section.level, heading: section.heading })
131
+ }
132
+
133
+ const headingContext = config.preserve_heading_hierarchy
134
+ ? buildHeadingContext(headingStack)
135
+ : section.heading
136
+
137
+ const sectionText = section.heading
138
+ ? `${"#".repeat(section.level)} ${section.heading}\n${section.body}`
139
+ : section.body
140
+
141
+ rawChunks.push({ content: sectionText.trim(), heading_context: headingContext })
142
+ }
143
+
144
+ // Merge small sections with previous
145
+ const merged: MarkdownChunk[] = []
146
+ for (const chunk of rawChunks) {
147
+ if (
148
+ merged.length > 0 &&
149
+ chunk.content.length < config.min_chunk_size
150
+ ) {
151
+ const prev = merged[merged.length - 1]
152
+ prev.content += "\n\n" + chunk.content
153
+ // Keep the deepest heading context
154
+ if (chunk.heading_context) {
155
+ prev.heading_context = chunk.heading_context
156
+ }
157
+ } else {
158
+ merged.push({ ...chunk })
159
+ }
160
+ }
161
+
162
+ // Split oversized sections
163
+ const result: MarkdownChunk[] = []
164
+ for (const chunk of merged) {
165
+ if (chunk.content.length > config.max_chunk_size) {
166
+ const parts = splitLargeText(chunk.content, config.max_chunk_size)
167
+ for (const part of parts) {
168
+ result.push({ content: part, heading_context: chunk.heading_context })
169
+ }
170
+ } else {
171
+ result.push(chunk)
172
+ }
173
+ }
174
+
175
+ // Filter empties
176
+ return result.filter((c) => c.content.trim().length > 0)
177
+ }
@@ -0,0 +1,136 @@
1
+ /**
2
+ * Content Cleaner — removes noise from file content before chunking.
3
+ *
4
+ * Strips TOC blocks, breadcrumbs, repeated headers, auto-generated markers,
5
+ * and optionally imports/comments so the embedding model sees only signal.
6
+ */
7
+
8
/** Switches for each class of noise the cleaner can remove. */
export interface CleaningConfig {
  remove_toc: boolean // docs: inline TOC lists and <!-- TOC --> blocks
  remove_frontmatter_metadata: boolean // docs: YAML front-matter (--- ... ---)
  remove_imports: boolean // code: import/require lines (JS/TS/Python/Go)
  remove_comments: boolean // code: block and full-line comments
}

/** Conservative defaults: strip TOCs only; keep everything else. */
export const DEFAULT_CLEANING_CONFIG: CleaningConfig = {
  remove_toc: true,
  remove_frontmatter_metadata: false,
  remove_imports: false,
  remove_comments: false,
}
21
+
22
+ // ── Markdown noise ──────────────────────────────────────────────────────────
23
+
24
+ /** Remove YAML front-matter (---…---) from Markdown. */
25
+ function stripFrontmatter(text: string): string {
26
+ return text.replace(/^---\n[\s\S]*?\n---\n?/, "")
27
+ }
28
+
29
+ /**
30
+ * Remove inline TOC blocks.
31
+ * Matches patterns like:
32
+ * ## Table of Contents
33
+ * - [Section](#section)
34
+ * …blank line
35
+ */
36
+ function stripToc(text: string): string {
37
+ // Pattern: heading containing "table of contents" or "contents" followed by link-list
38
+ return text.replace(
39
+ /^#{1,3}\s*(Table of Contents|Contents|TOC)\s*\n([\t ]*[-*]\s*\[.*?\]\(#.*?\)\s*\n?)+/gim,
40
+ "",
41
+ )
42
+ }
43
+
44
+ /** Remove HTML-style TOC markers like <!-- TOC --> … <!-- /TOC --> */
45
+ function stripHtmlTocMarkers(text: string): string {
46
+ return text.replace(/<!--\s*TOC\s*-->[\s\S]*?<!--\s*\/TOC\s*-->\n?/gi, "")
47
+ }
48
+
49
+ /** Remove breadcrumb lines, e.g. `Home > Docs > API` at the top. */
50
+ function stripBreadcrumbs(text: string): string {
51
+ // Matches lines that look like breadcrumbs (word > word > word) at start
52
+ return text.replace(/^(?:[\w\s]+>\s*){2,}[\w\s]+\n{1,2}/gm, "")
53
+ }
54
+
55
+ /** Remove auto-generated code markers like `// AUTO-GENERATED` blocks. */
56
+ function stripAutoGenMarkers(text: string): string {
57
+ return text.replace(
58
+ /\/[/*]\s*(?:AUTO[- ]?GENERATED|DO NOT (?:EDIT|MODIFY)|GENERATED BY|This file (?:is|was) (?:auto-?)?generated)[^\n]*/gi,
59
+ "",
60
+ )
61
+ }
62
+
63
+ // ── Code noise ──────────────────────────────────────────────────────────────
64
+
65
+ /** Remove import/require statements (JS/TS/Python/Go). */
66
+ function stripImports(text: string): string {
67
+ // JS/TS imports
68
+ let result = text.replace(/^import\s[\s\S]*?from\s+['"][^'"]+['"];?\s*$/gm, "")
69
+ result = result.replace(/^import\s+['"][^'"]+['"];?\s*$/gm, "")
70
+ // require
71
+ result = result.replace(/^(?:const|let|var)\s+.*?=\s*require\s*\(.*?\);?\s*$/gm, "")
72
+ // Python
73
+ result = result.replace(/^(?:from\s+\S+\s+)?import\s+.+$/gm, "")
74
+ // Go
75
+ result = result.replace(/^import\s*\(\s*\n(?:[\t ]*"[^"]*"\s*\n?)*\s*\)/gm, "")
76
+ result = result.replace(/^import\s+"[^"]*"\s*$/gm, "")
77
+ return result
78
+ }
79
+
80
+ /** Remove single-line and block comments (JS/TS style). */
81
+ function stripComments(text: string): string {
82
+ // Block comments
83
+ let result = text.replace(/\/\*[\s\S]*?\*\//g, "")
84
+ // Single-line // comments (only full-line, not inline URLs etc.)
85
+ result = result.replace(/^\s*\/\/[^\n]*$/gm, "")
86
+ // Python/Ruby # comments (full line only)
87
+ result = result.replace(/^\s*#[^\n!]*$/gm, "")
88
+ return result
89
+ }
90
+
91
+ // ── Shared ──────────────────────────────────────────────────────────────────
92
+
93
+ /** Collapse 3+ consecutive blank lines into 2. */
94
+ function collapseBlankLines(text: string): string {
95
+ return text.replace(/\n{3,}/g, "\n\n")
96
+ }
97
+
98
+ // ── Public API ──────────────────────────────────────────────────────────────
99
+
100
+ /**
101
+ * Clean file content according to the supplied config.
102
+ * @param content Raw file content
103
+ * @param fileType 'docs' | 'code' | 'config'
104
+ * @param config Cleaning options
105
+ */
106
+ export function cleanContent(
107
+ content: string,
108
+ fileType: "docs" | "code" | "config",
109
+ config: CleaningConfig = DEFAULT_CLEANING_CONFIG,
110
+ ): string {
111
+ let result = content
112
+
113
+ if (fileType === "docs") {
114
+ if (config.remove_frontmatter_metadata) {
115
+ result = stripFrontmatter(result)
116
+ }
117
+ if (config.remove_toc) {
118
+ result = stripToc(result)
119
+ result = stripHtmlTocMarkers(result)
120
+ }
121
+ result = stripBreadcrumbs(result)
122
+ }
123
+
124
+ if (fileType === "code") {
125
+ result = stripAutoGenMarkers(result)
126
+ if (config.remove_imports) {
127
+ result = stripImports(result)
128
+ }
129
+ if (config.remove_comments) {
130
+ result = stripComments(result)
131
+ }
132
+ }
133
+
134
+ result = collapseBlankLines(result).trim()
135
+ return result
136
+ }
@@ -0,0 +1,97 @@
1
+ /**
2
+ * Hybrid Search — merges vector similarity and BM25 keyword scores.
3
+ *
4
+ * Uses Reciprocal Rank Fusion (RRF) or weighted linear combination
5
+ * to merge results from two search backends.
6
+ */
7
+
8
+ // ── Types ───────────────────────────────────────────────────────────────────
9
+
10
/** Settings for blending keyword (BM25) and vector similarity scores. */
export interface HybridSearchConfig {
  enabled: boolean
  bm25_weight: number // 0.0–1.0, vector_weight = 1 - bm25_weight
}

/** Hybrid search is opt-in; 0.3 keyword weight favors semantic matches. */
export const DEFAULT_HYBRID_CONFIG: HybridSearchConfig = {
  enabled: false,
  bm25_weight: 0.3,
}

/** A single merged result with per-backend and combined scores. */
export interface ScoredResult {
  id: number // index into the results array
  vectorScore: number // 0–1 (1 = best)
  bm25Score: number // raw BM25 score (unnormalized)
  combinedScore: number // weighted blend used for final ranking
}
26
+
27
+ // ── Merge logic ─────────────────────────────────────────────────────────────
28
+
29
+ /**
30
+ * Normalize BM25 scores to 0–1 range using min-max scaling.
31
+ */
32
+ function normalizeBM25Scores(scores: Map<number, number>): Map<number, number> {
33
+ if (scores.size === 0) return scores
34
+
35
+ let min = Infinity
36
+ let max = -Infinity
37
+ for (const s of scores.values()) {
38
+ if (s < min) min = s
39
+ if (s > max) max = s
40
+ }
41
+
42
+ const range = max - min
43
+ if (range === 0) {
44
+ // All same score → normalize to 0.5
45
+ const result = new Map<number, number>()
46
+ for (const [id] of scores) result.set(id, 0.5)
47
+ return result
48
+ }
49
+
50
+ const result = new Map<number, number>()
51
+ for (const [id, score] of scores) {
52
+ result.set(id, (score - min) / range)
53
+ }
54
+ return result
55
+ }
56
+
57
+ /**
58
+ * Merge vector and BM25 results using weighted linear combination.
59
+ *
60
+ * @param vectorResults Map of chunkIndex → vectorScore (0–1, higher = better)
61
+ * @param bm25Results Map of chunkIndex → raw BM25 score
62
+ * @param config Hybrid search config (weights)
63
+ * @param limit Max results to return
64
+ */
65
+ export function mergeResults(
66
+ vectorResults: Map<number, number>,
67
+ bm25Results: Map<number, number>,
68
+ config: HybridSearchConfig = DEFAULT_HYBRID_CONFIG,
69
+ limit: number = 10,
70
+ ): ScoredResult[] {
71
+ const vectorWeight = 1 - config.bm25_weight
72
+ const bm25Weight = config.bm25_weight
73
+
74
+ const normalizedBM25 = normalizeBM25Scores(bm25Results)
75
+
76
+ // Collect all unique IDs
77
+ const allIds = new Set<number>()
78
+ for (const id of vectorResults.keys()) allIds.add(id)
79
+ for (const id of normalizedBM25.keys()) allIds.add(id)
80
+
81
+ const results: ScoredResult[] = []
82
+
83
+ for (const id of allIds) {
84
+ const vs = vectorResults.get(id) ?? 0
85
+ const bs = normalizedBM25.get(id) ?? 0
86
+
87
+ results.push({
88
+ id,
89
+ vectorScore: vs,
90
+ bm25Score: bm25Results.get(id) ?? 0,
91
+ combinedScore: vectorWeight * vs + bm25Weight * bs,
92
+ })
93
+ }
94
+
95
+ results.sort((a, b) => b.combinedScore - a.combinedScore)
96
+ return results.slice(0, limit)
97
+ }