@comfanion/usethis_search 0.1.5 → 3.0.0-dev.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,177 @@
+ /**
+  * Markdown Chunker — splits Markdown by heading structure.
+  *
+  * Preserves heading hierarchy ("API > Auth > JWT") in metadata,
+  * merges small sections, and splits oversized ones.
+  */
+
+ export interface MarkdownChunkConfig {
+   min_chunk_size: number // merge sections smaller than this (chars)
+   max_chunk_size: number // split sections larger than this (chars)
+   split_by_headings: boolean
+   preserve_heading_hierarchy: boolean
+ }
+
+ export const DEFAULT_MD_CONFIG: MarkdownChunkConfig = {
+   min_chunk_size: 200,
+   max_chunk_size: 2000,
+   split_by_headings: true,
+   preserve_heading_hierarchy: true,
+ }
+
+ export interface MarkdownChunk {
+   content: string
+   heading_context: string // "H1 > H2 > H3"
+ }
+
+ // ── Internal types ──────────────────────────────────────────────────────────
+
+ interface Section {
+   level: number // 1-6 for headings, 0 for preamble
+   heading: string
+   body: string
+ }
+
+ // ── Parsing ─────────────────────────────────────────────────────────────────
+
+ /** Parse Markdown into an ordered list of heading-delimited sections. */
+ function parseSections(content: string): Section[] {
+   const lines = content.split("\n")
+   const sections: Section[] = []
+   let currentSection: Section = { level: 0, heading: "", body: "" }
+
+   for (const line of lines) {
+     // Note: fenced code blocks are not tracked, so a `#` at the start of a
+     // line inside a fence is also treated as a heading.
+     const headingMatch = line.match(/^(#{1,6})\s+(.+)$/)
+     if (headingMatch) {
+       // Push previous section
+       if (currentSection.body.trim() || currentSection.heading) {
+         sections.push(currentSection)
+       }
+       currentSection = {
+         level: headingMatch[1].length,
+         heading: headingMatch[2].trim(),
+         body: "",
+       }
+     } else {
+       currentSection.body += line + "\n"
+     }
+   }
+
+   // Push last section
+   if (currentSection.body.trim() || currentSection.heading) {
+     sections.push(currentSection)
+   }
+
+   return sections
+ }
+
+ /** Build heading hierarchy path for a section given the heading stack. */
+ function buildHeadingContext(stack: { level: number; heading: string }[]): string {
+   return stack.map((h) => h.heading).join(" > ")
+ }
+
+ // ── Splitting oversized sections ────────────────────────────────────────────
+
+ /** Split text on line boundaries so each piece stays under maxSize.
+  *  A single line longer than maxSize is emitted as its own oversized chunk. */
+ function splitLargeText(text: string, maxSize: number): string[] {
+   if (text.length <= maxSize) return [text]
+
+   const chunks: string[] = []
+   const lines = text.split("\n")
+   let current: string[] = []
+   let currentLen = 0
+
+   for (const line of lines) {
+     if (currentLen + line.length + 1 > maxSize && current.length > 0) {
+       chunks.push(current.join("\n"))
+       current = []
+       currentLen = 0
+     }
+     current.push(line)
+     currentLen += line.length + 1
+   }
+
+   if (current.length > 0) {
+     chunks.push(current.join("\n"))
+   }
+
+   return chunks
+ }
+
+ // ── Public API ──────────────────────────────────────────────────────────────
+
+ /**
+  * Chunk Markdown content into semantic sections.
+  */
+ export function chunkMarkdown(
+   content: string,
+   config: MarkdownChunkConfig = DEFAULT_MD_CONFIG,
+ ): MarkdownChunk[] {
+   if (!config.split_by_headings) {
+     // Fallback: single chunk (caller can use fixed chunker)
+     return [{ content, heading_context: "" }]
+   }
+
+   const sections = parseSections(content)
+   const rawChunks: MarkdownChunk[] = []
+
+   // Heading stack for hierarchy tracking
+   const headingStack: { level: number; heading: string }[] = []
+
+   for (const section of sections) {
+     // Update heading stack
+     if (section.level > 0) {
+       // Pop headings at same or deeper level
+       while (
+         headingStack.length > 0 &&
+         headingStack[headingStack.length - 1].level >= section.level
+       ) {
+         headingStack.pop()
+       }
+       headingStack.push({ level: section.level, heading: section.heading })
+     }
+
+     const headingContext = config.preserve_heading_hierarchy
+       ? buildHeadingContext(headingStack)
+       : section.heading
+
+     const sectionText = section.heading
+       ? `${"#".repeat(section.level)} ${section.heading}\n${section.body}`
+       : section.body
+
+     rawChunks.push({ content: sectionText.trim(), heading_context: headingContext })
+   }
+
+   // Merge small sections with previous
+   const merged: MarkdownChunk[] = []
+   for (const chunk of rawChunks) {
+     if (
+       merged.length > 0 &&
+       chunk.content.length < config.min_chunk_size
+     ) {
+       const prev = merged[merged.length - 1]
+       prev.content += "\n\n" + chunk.content
+       // Keep the deepest heading context
+       if (chunk.heading_context) {
+         prev.heading_context = chunk.heading_context
+       }
+     } else {
+       merged.push({ ...chunk })
+     }
+   }
+
+   // Split oversized sections
+   const result: MarkdownChunk[] = []
+   for (const chunk of merged) {
+     if (chunk.content.length > config.max_chunk_size) {
+       const parts = splitLargeText(chunk.content, config.max_chunk_size)
+       for (const part of parts) {
+         result.push({ content: part, heading_context: chunk.heading_context })
+       }
+     } else {
+       result.push(chunk)
+     }
+   }
+
+   // Filter empties
+   return result.filter((c) => c.content.trim().length > 0)
+ }
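
For orientation, here is a minimal sketch of driving the chunker. The import path and sample document are illustrative, not part of the package (the diff does not show file names):

```ts
import { chunkMarkdown, DEFAULT_MD_CONFIG } from "@comfanion/usethis_search/markdown-chunker" // hypothetical path

const doc = [
  "# API",
  "## Auth",
  "### JWT",
  "Tokens are signed with HS256.",
].join("\n")

// min_chunk_size: 1 disables merging so the tiny sample sections stay separate.
const chunks = chunkMarkdown(doc, { ...DEFAULT_MD_CONFIG, min_chunk_size: 1 })

for (const c of chunks) {
  console.log(c.heading_context)
  // "API", then "API > Auth", then "API > Auth > JWT"
}
```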
@@ -0,0 +1,136 @@
+ /**
+  * Content Cleaner — removes noise from file content before chunking.
+  *
+  * Strips TOC blocks, breadcrumbs, repeated headers, auto-generated markers,
+  * and optionally imports/comments so the embedding model sees only signal.
+  */
+
+ export interface CleaningConfig {
+   remove_toc: boolean
+   remove_frontmatter_metadata: boolean
+   remove_imports: boolean
+   remove_comments: boolean
+ }
+
+ export const DEFAULT_CLEANING_CONFIG: CleaningConfig = {
+   remove_toc: true,
+   remove_frontmatter_metadata: false,
+   remove_imports: false,
+   remove_comments: false,
+ }
+
+ // ── Markdown noise ──────────────────────────────────────────────────────────
23
+
24
+ /** Remove YAML front-matter (---…---) from Markdown. */
25
+ function stripFrontmatter(text: string): string {
26
+ return text.replace(/^---\n[\s\S]*?\n---\n?/, "")
27
+ }
28
+
29
+ /**
30
+ * Remove inline TOC blocks.
31
+ * Matches patterns like:
32
+ * ## Table of Contents
33
+ * - [Section](#section)
34
+ * …blank line
35
+ */
36
+ function stripToc(text: string): string {
37
+ // Pattern: heading containing "table of contents" or "contents" followed by link-list
38
+ return text.replace(
39
+ /^#{1,3}\s*(Table of Contents|Contents|TOC)\s*\n([\t ]*[-*]\s*\[.*?\]\(#.*?\)\s*\n?)+/gim,
40
+ "",
41
+ )
42
+ }
43
+
44
+ /** Remove HTML-style TOC markers like <!-- TOC --> … <!-- /TOC --> */
45
+ function stripHtmlTocMarkers(text: string): string {
46
+ return text.replace(/<!--\s*TOC\s*-->[\s\S]*?<!--\s*\/TOC\s*-->\n?/gi, "")
47
+ }
+
+ /** Remove breadcrumb lines, e.g. `Home > Docs > API` at the top. */
+ function stripBreadcrumbs(text: string): string {
+   // Matches lines that look like breadcrumbs (word > word > word).
+   // Uses [\w ] rather than [\w\s] so the match cannot swallow adjacent
+   // lines (\s also matches newlines).
+   return text.replace(/^(?:[\w ]+>\s*){2,}[\w ]+\n{1,2}/gm, "")
+ }
+
+ /** Remove auto-generated code markers like `// AUTO-GENERATED` blocks. */
+ function stripAutoGenMarkers(text: string): string {
+   return text.replace(
+     /\/[/*]\s*(?:AUTO[- ]?GENERATED|DO NOT (?:EDIT|MODIFY)|GENERATED BY|This file (?:is|was) (?:auto-?)?generated)[^\n]*/gi,
+     "",
+   )
+ }
+
+ // ── Code noise ──────────────────────────────────────────────────────────────
+
+ /** Remove import/require statements (JS/TS/Python/Go). */
+ function stripImports(text: string): string {
+   // JS/TS imports
+   let result = text.replace(/^import\s[\s\S]*?from\s+['"][^'"]+['"];?\s*$/gm, "")
+   result = result.replace(/^import\s+['"][^'"]+['"];?\s*$/gm, "")
+   // require
+   result = result.replace(/^(?:const|let|var)\s+.*?=\s*require\s*\(.*?\);?\s*$/gm, "")
+   // Python
+   result = result.replace(/^(?:from\s+\S+\s+)?import\s+.+$/gm, "")
+   // Go
+   result = result.replace(/^import\s*\(\s*\n(?:[\t ]*"[^"]*"\s*\n?)*\s*\)/gm, "")
+   result = result.replace(/^import\s+"[^"]*"\s*$/gm, "")
+   return result
+ }
+
+ /** Remove single-line and block comments (JS/TS style). */
+ function stripComments(text: string): string {
+   // Block comments
+   let result = text.replace(/\/\*[\s\S]*?\*\//g, "")
+   // Single-line // comments (only full-line, not inline URLs etc.)
+   result = result.replace(/^\s*\/\/[^\n]*$/gm, "")
+   // Python/Ruby # comments (full line only; (?!!) leaves shebang lines intact)
+   result = result.replace(/^\s*#(?!!)[^\n]*$/gm, "")
+   return result
+ }
+
+ // ── Shared ──────────────────────────────────────────────────────────────────
+
+ /** Collapse 3+ consecutive blank lines into 2. */
+ function collapseBlankLines(text: string): string {
+   return text.replace(/\n{3,}/g, "\n\n")
+ }
+
+ // ── Public API ──────────────────────────────────────────────────────────────
+
+ /**
+  * Clean file content according to the supplied config.
+  * @param content Raw file content
+  * @param fileType 'docs' | 'code' | 'config'
+  * @param config Cleaning options
+  */
+ export function cleanContent(
+   content: string,
+   fileType: "docs" | "code" | "config",
+   config: CleaningConfig = DEFAULT_CLEANING_CONFIG,
+ ): string {
+   let result = content
+
+   if (fileType === "docs") {
+     if (config.remove_frontmatter_metadata) {
+       result = stripFrontmatter(result)
+     }
+     if (config.remove_toc) {
+       result = stripToc(result)
+       result = stripHtmlTocMarkers(result)
+     }
+     result = stripBreadcrumbs(result)
+   }
+
+   if (fileType === "code") {
+     result = stripAutoGenMarkers(result)
+     if (config.remove_imports) {
+       result = stripImports(result)
+     }
+     if (config.remove_comments) {
+       result = stripComments(result)
+     }
+   }
+
+   result = collapseBlankLines(result).trim()
+   return result
+ }
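
A quick sketch of the cleaner in action. The sample input is invented and the import path is hypothetical:

```ts
import { cleanContent } from "@comfanion/usethis_search/content-cleaner" // hypothetical path

const raw = [
  "---",
  "title: Guide",
  "---",
  "Home > Docs > API",
  "## Table of Contents",
  "- [Setup](#setup)",
  "",
  "## Setup",
  "Run the installer.",
].join("\n")

const cleaned = cleanContent(raw, "docs", {
  remove_toc: true,
  remove_frontmatter_metadata: true,
  remove_imports: false,
  remove_comments: false,
})
// cleaned is "## Setup\nRun the installer." — the front-matter,
// breadcrumb line, and TOC block are all stripped.
```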
@@ -0,0 +1,95 @@
+ import { GraphDB, Triple } from "./graph-db"
+ import { RegexAnalyzer, Relation as RegexRelation } from "./analyzers/regex-analyzer"
+ import { LSPAnalyzer, Relation as LSPRelation } from "./analyzers/lsp-analyzer"
+
+ export interface ChunkWithId {
+   chunk_id: string
+   content: string
+   start_line?: number
+   end_line?: number
+   heading_context?: string
+ }
+
+ /** Derive a stable, filesystem-safe ID prefix from a file path. */
+ function normalizeFileId(filePath: string): string {
+   const withoutExt = filePath.replace(/\.[^/.]+$/, "")
+   return withoutExt.replace(/[^a-zA-Z0-9]/g, "_")
+ }
+
+ export class GraphBuilder {
+   private lspAnalyzer: LSPAnalyzer
+   private regexAnalyzer: RegexAnalyzer
+
+   constructor(
+     private graphDB: GraphDB,
+     private projectRoot: string
+   ) {
+     this.lspAnalyzer = new LSPAnalyzer()
+     this.regexAnalyzer = new RegexAnalyzer()
+   }
+
+   assignChunkIds(filePath: string, chunks: Omit<ChunkWithId, "chunk_id">[]): ChunkWithId[] {
+     const normalizedPath = normalizeFileId(filePath)
+
+     return chunks.map((chunk, index) => ({
+       ...chunk,
+       chunk_id: `chunk_${normalizedPath}_${index}`,
+     }))
+   }
+
+   async buildEdges(
+     filePath: string,
+     content: string,
+     chunks: ChunkWithId[],
+     fileType: "code" | "docs"
+   ): Promise<void> {
+     let relations: Array<RegexRelation | LSPRelation> = []
+
+     if (fileType === "docs") {
+       relations = this.regexAnalyzer.analyzeMarkdown(filePath, content, chunks)
+     } else if (fileType === "code") {
+       const lspAvailable = await this.lspAnalyzer.isAvailable(filePath)
+
+       if (lspAvailable) {
+         try {
+           relations = await this.lspAnalyzer.analyzeFile(filePath, chunks)
+         } catch (error) {
+           // LSP analysis failed mid-flight; fall back to the regex analyzer.
+           relations = this.regexAnalyzer.analyzeCode(filePath, content, chunks)
+         }
+       } else {
+         relations = this.regexAnalyzer.analyzeCode(filePath, content, chunks)
+       }
+     }
+
+     const triples: Triple[] = relations.map(rel => ({
+       subject: rel.from,
+       predicate: rel.predicate,
+       object: rel.to,
+       weight: rel.weight,
+       source: rel.source,
+       file: filePath,
+       line: rel.line
+     }))
+
+     await this.graphDB.putEdges(triples)
+   }
+
+   /**
+    * Map a file location to a chunk ID. Line-to-chunk boundaries are not
+    * tracked here, so every line coarsely resolves to the file's first chunk.
+    */
+   resolveChunkId(filePath: string, _line: number): string | null {
+     return `chunk_${normalizeFileId(filePath)}_0`
+   }
+
+   async getRelatedChunks(
+     chunkId: string
+   ): Promise<Array<{ chunk_id: string; predicate: string; weight: number; direction: "outgoing" | "incoming" }>> {
+     const [outgoing, incoming] = await Promise.all([
+       this.graphDB.getOutgoing(chunkId),
+       this.graphDB.getIncoming(chunkId)
+     ])
+
+     return [
+       ...outgoing.map(t => ({ chunk_id: t.object, predicate: t.predicate, weight: t.weight, direction: "outgoing" as const })),
+       ...incoming.map(t => ({ chunk_id: t.subject, predicate: t.predicate, weight: t.weight, direction: "incoming" as const }))
+     ]
+   }
+ }
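
A sketch of the builder end-to-end, assuming the analyzers resolve relations to chunk IDs as the code above implies. File names, the DB path, and the single-chunk input are illustrative:

```ts
import { GraphDB } from "./graph-db"
import { GraphBuilder } from "./graph-builder" // hypothetical file name

async function indexFile(filePath: string, content: string) {
  const db = await new GraphDB("./.usethis/graph").init() // hypothetical path
  const builder = new GraphBuilder(db, process.cwd())

  // Chunks would normally come from chunkMarkdown or a code chunker.
  const chunks = builder.assignChunkIds(filePath, [
    { content, heading_context: "" },
  ])

  await builder.buildEdges(filePath, content, chunks, "docs")
  const related = await builder.getRelatedChunks(chunks[0].chunk_id)
  console.log(related)
  await db.close()
}
```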
@@ -0,0 +1,97 @@
+ import levelgraph from "levelgraph"
+ import { Level } from "level"
+
+ export interface Triple {
+   subject: string
+   predicate: string
+   object: string
+   weight: number
+   source: string
+   file: string
+   line?: number
+ }
+
+ export class GraphDB {
+   private db: any // levelgraph instance (no bundled TypeScript typings)
+   private initialized: boolean = false
+
+   constructor(private dbPath: string) {}
+
+   async init(): Promise<this> {
+     const levelDb = new Level(this.dbPath)
+     this.db = levelgraph(levelDb)
+     this.initialized = true
+     return this
+   }
+
+   async putEdges(triples: Triple[]): Promise<void> {
+     if (!this.initialized) {
+       throw new Error("GraphDB not initialized. Call init() first.")
+     }
+     await new Promise<void>((resolve, reject) => {
+       this.db.put(triples, (err: Error | undefined) => {
+         if (err) reject(err)
+         else resolve()
+       })
+     })
+   }
+
+   async getOutgoing(chunkId: string): Promise<Triple[]> {
+     if (!this.initialized) {
+       throw new Error("GraphDB not initialized. Call init() first.")
+     }
+     return new Promise<Triple[]>((resolve, reject) => {
+       this.db.get({ subject: chunkId }, (err: Error | undefined, triples: Triple[]) => {
+         if (err) reject(err)
+         else resolve(triples || [])
+       })
+     })
+   }
+
+   async getIncoming(chunkId: string): Promise<Triple[]> {
+     if (!this.initialized) {
+       throw new Error("GraphDB not initialized. Call init() first.")
+     }
+     return new Promise<Triple[]>((resolve, reject) => {
+       this.db.get({ object: chunkId }, (err: Error | undefined, triples: Triple[]) => {
+         if (err) reject(err)
+         else resolve(triples || [])
+       })
+     })
+   }
+
+   async deleteByFile(filePath: string): Promise<void> {
+     if (!this.initialized) {
+       throw new Error("GraphDB not initialized. Call init() first.")
+     }
+     // Full scan: levelgraph indexes subject/predicate/object, not the extra
+     // `file` property, so all triples are fetched and filtered in memory.
+     const allTriples = await new Promise<Triple[]>((resolve, reject) => {
+       this.db.get({}, (err: Error | undefined, triples: Triple[]) => {
+         if (err) reject(err)
+         else resolve(triples || [])
+       })
+     })
+
+     const toDelete = allTriples.filter(t => t.file === filePath)
+
+     for (const t of toDelete) {
+       await new Promise<void>((resolve, reject) => {
+         this.db.del(t, (err: Error | undefined) => {
+           if (err) reject(err)
+           else resolve()
+         })
+       })
+     }
+   }
+
+   async close(): Promise<void> {
+     if (this.initialized && this.db) {
+       await new Promise<void>((resolve, reject) => {
+         this.db.close((err: Error | undefined) => {
+           if (err) reject(err)
+           else resolve()
+         })
+       })
+       this.initialized = false
+     }
+   }
+ }
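
A minimal round-trip through the store, using only the methods shown above. The DB path and edge values are invented for illustration:

```ts
import { GraphDB, Triple } from "./graph-db"

async function demo() {
  const db = await new GraphDB("./.usethis/graph").init() // hypothetical path

  const edge: Triple = {
    subject: "chunk_src_auth_0",
    predicate: "references", // predicate/source values are illustrative
    object: "chunk_src_jwt_0",
    weight: 1,
    source: "regex",
    file: "src/auth.ts",
  }
  await db.putEdges([edge])

  console.log(await db.getOutgoing("chunk_src_auth_0")) // → [edge]
  console.log(await db.getIncoming("chunk_src_jwt_0"))  // → [edge]

  await db.deleteByFile("src/auth.ts") // removes every triple tagged with that file
  await db.close()
}
```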
@@ -0,0 +1,97 @@
+ /**
+  * Hybrid Search — merges vector similarity and BM25 keyword scores.
+  *
+  * Merges results from two search backends via a weighted linear
+  * combination of normalized scores (a simpler alternative to
+  * Reciprocal Rank Fusion).
+  */
+
+ // ── Types ───────────────────────────────────────────────────────────────────
+
+ export interface HybridSearchConfig {
+   enabled: boolean
+   bm25_weight: number // 0.0–1.0, vector_weight = 1 - bm25_weight
+ }
+
+ export const DEFAULT_HYBRID_CONFIG: HybridSearchConfig = {
+   enabled: false,
+   bm25_weight: 0.3,
+ }
+
+ export interface ScoredResult {
+   id: number // index into the results array
+   vectorScore: number // 0–1 (1 = best)
+   bm25Score: number // raw BM25 score (unnormalized)
+   combinedScore: number
+ }
+
+ // ── Merge logic ─────────────────────────────────────────────────────────────
+
+ /**
+  * Normalize BM25 scores to 0–1 range using min-max scaling.
+  */
+ function normalizeBM25Scores(scores: Map<number, number>): Map<number, number> {
+   if (scores.size === 0) return scores
+
+   let min = Infinity
+   let max = -Infinity
+   for (const s of scores.values()) {
+     if (s < min) min = s
+     if (s > max) max = s
+   }
+
+   const range = max - min
+   if (range === 0) {
+     // All same score → normalize to 0.5
+     const result = new Map<number, number>()
+     for (const [id] of scores) result.set(id, 0.5)
+     return result
+   }
+
+   const result = new Map<number, number>()
+   for (const [id, score] of scores) {
+     result.set(id, (score - min) / range)
+   }
+   return result
+ }
+
+ /**
+  * Merge vector and BM25 results using weighted linear combination.
+  *
+  * @param vectorResults Map of chunkIndex → vectorScore (0–1, higher = better)
+  * @param bm25Results Map of chunkIndex → raw BM25 score
+  * @param config Hybrid search config (weights)
+  * @param limit Max results to return
+  */
+ export function mergeResults(
+   vectorResults: Map<number, number>,
+   bm25Results: Map<number, number>,
+   config: HybridSearchConfig = DEFAULT_HYBRID_CONFIG,
+   limit: number = 10,
+ ): ScoredResult[] {
+   const vectorWeight = 1 - config.bm25_weight
+   const bm25Weight = config.bm25_weight
+
+   const normalizedBM25 = normalizeBM25Scores(bm25Results)
+
+   // Collect all unique IDs
+   const allIds = new Set<number>()
+   for (const id of vectorResults.keys()) allIds.add(id)
+   for (const id of normalizedBM25.keys()) allIds.add(id)
+
+   const results: ScoredResult[] = []
+
+   for (const id of allIds) {
+     const vs = vectorResults.get(id) ?? 0
+     const bs = normalizedBM25.get(id) ?? 0
+
+     results.push({
+       id,
+       vectorScore: vs,
+       bm25Score: bm25Results.get(id) ?? 0,
+       combinedScore: vectorWeight * vs + bm25Weight * bs,
+     })
+   }
+
+   results.sort((a, b) => b.combinedScore - a.combinedScore)
+   return results.slice(0, limit)
+ }
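
A worked example of the merge math, with the default bm25_weight of 0.3 (so vector weight 0.7). The chunk indices and scores are invented; only `mergeResults` and `DEFAULT_HYBRID_CONFIG` come from the file above:

```ts
import { mergeResults, DEFAULT_HYBRID_CONFIG } from "./hybrid-search" // hypothetical file name

// Chunk 0 is a strong vector-only match; chunk 1 scores in both backends.
const vectorResults = new Map([[0, 0.9], [1, 0.4]])
const bm25Results = new Map([[1, 12], [2, 4]])

const ranked = mergeResults(vectorResults, bm25Results, DEFAULT_HYBRID_CONFIG, 10)
// BM25 min-max normalizes to: chunk 1 → 1.0, chunk 2 → 0.0.
// Combined: chunk 0 → 0.7·0.9 = 0.63, chunk 1 → 0.7·0.4 + 0.3·1.0 = 0.58,
// chunk 2 → 0. So ranked ids come back in the order [0, 1, 2].
```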