@comfanion/usethis_search 4.2.0-dev.4 → 4.3.0-dev.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,6 +10,7 @@ export interface MarkdownChunkConfig {
10
10
  max_chunk_size: number // split sections larger than this (chars)
11
11
  split_by_headings: boolean
12
12
  preserve_heading_hierarchy: boolean
13
+ skip_low_priority: boolean // Skip low-priority sections (SQL, aggregates, etc.)
13
14
  }
14
15
 
15
16
  export const DEFAULT_MD_CONFIG: MarkdownChunkConfig = {
@@ -17,6 +18,7 @@ export const DEFAULT_MD_CONFIG: MarkdownChunkConfig = {
17
18
  max_chunk_size: 8000, // Large chunks for docs (SQL schemas, API specs, etc.)
18
19
  split_by_headings: true,
19
20
  preserve_heading_hierarchy: true,
21
+ skip_low_priority: true, // Skip SQL schemas, aggregates, views by default
20
22
  }
21
23
 
22
24
  export interface MarkdownChunk {
@@ -24,6 +26,7 @@ export interface MarkdownChunk {
24
26
  heading_context: string // "H1 > H2 > H3"
25
27
  start_line?: number
26
28
  end_line?: number
29
+ priority?: "high" | "normal" | "low" // Chunk priority for ranking
27
30
  }
28
31
 
29
32
  // ── Internal types ──────────────────────────────────────────────────────────
@@ -34,6 +37,55 @@ interface Section {
34
37
  body: string
35
38
  start_line: number
36
39
  end_line: number
40
+ priority: "high" | "normal" | "low"
41
+ }
42
+
43
+ // ── Priority detection ──────────────────────────────────────────────────────
44
+
45
/**
 * Detect whether a heading marks low-priority content (SQL schemas, DDL,
 * migrations, generated references, exhaustive listings).
 * These sections are often "noise" when searching for business logic.
 *
 * Matching is case-insensitive. Multi-word phrases are matched as substrings;
 * the short token "ddl" is matched only as a standalone word, because as a
 * plain substring it also occurs inside unrelated words such as "middleware".
 *
 * @param heading - Heading text of the section.
 * @returns `true` when the heading matches one of the low-priority patterns.
 */
function isLowPriorityHeading(heading: string): boolean {
  const lower = heading.toLowerCase()

  // "ddl" must not be preceded or followed by another letter, otherwise
  // headings like "Middleware" or "Middle tier" would be classed as noise.
  if (/(^|[^a-z])ddl([^a-z]|$)/.test(lower)) {
    return true
  }

  const lowPriorityPhrases = [
    // SQL-related sections (schemas, migrations)
    "sql schema",
    "database schema",
    "continuous aggregate",
    "materialized view",
    "migration",
    // Generated/auto-generated content
    "auto-generated",
    "generated schema",
    // Large reference tables (often boilerplate)
    "full reference",
    "complete list",
    "all endpoints",
  ]
  if (lowPriorityPhrases.some((phrase) => lower.includes(phrase))) {
    return true
  }

  // An API reference is only low priority when it is also marked as generated.
  return lower.includes("api reference") && lower.includes("generated")
}
78
+
79
/**
 * Classify a section as "high", "normal" or "low" priority.
 *
 * - "low": the heading is recognised by `isLowPriorityHeading`.
 * - "high": the body contains a fenced code marker and is shorter than
 *   2000 characters (short, example-driven sections such as tutorials).
 * - "normal": everything else.
 *
 * @param heading - Heading text of the section.
 * @param body - Section body text.
 * @returns The priority level for the section.
 */
function getSectionPriority(heading: string, body: string): "high" | "normal" | "low" {
  // The heading check comes first, so a low-priority heading is never promoted.
  const demotedByHeading = isLowPriorityHeading(heading)
  if (demotedByHeading) {
    return "low"
  }

  const hasCodeFence = body.includes("```")
  const isShort = body.length < 2000
  return hasCodeFence && isShort ? "high" : "normal"
}
38
90
 
39
91
  // ── Parsing ─────────────────────────────────────────────────────────────────
@@ -42,15 +94,16 @@ interface Section {
42
94
  function parseSections(content: string): Section[] {
43
95
  const lines = content.split("\n")
44
96
  const sections: Section[] = []
45
- let currentSection: Section = { level: 0, heading: "", body: "", start_line: 0, end_line: 0 }
97
+ let currentSection: Section = { level: 0, heading: "", body: "", start_line: 0, end_line: 0, priority: "normal" }
46
98
 
47
99
  for (let i = 0; i < lines.length; i++) {
48
100
  const line = lines[i]
49
101
  const headingMatch = line.match(/^(#{1,6})\s+(.+)$/)
50
102
  if (headingMatch) {
51
- // Push previous section
103
+ // Push previous section (with priority calculated)
52
104
  if (currentSection.body.trim() || currentSection.heading) {
53
105
  currentSection.end_line = i - 1
106
+ currentSection.priority = getSectionPriority(currentSection.heading, currentSection.body)
54
107
  sections.push(currentSection)
55
108
  }
56
109
  currentSection = {
@@ -59,15 +112,17 @@ function parseSections(content: string): Section[] {
59
112
  body: "",
60
113
  start_line: i,
61
114
  end_line: 0,
115
+ priority: "normal", // Will be calculated when section ends
62
116
  }
63
117
  } else {
64
118
  currentSection.body += line + "\n"
65
119
  }
66
120
  }
67
121
 
68
- // Push last section
122
+ // Push last section (with priority calculated)
69
123
  if (currentSection.body.trim() || currentSection.heading) {
70
124
  currentSection.end_line = lines.length - 1
125
+ currentSection.priority = getSectionPriority(currentSection.heading, currentSection.body)
71
126
  sections.push(currentSection)
72
127
  }
73
128
 
@@ -191,12 +246,18 @@ export function chunkMarkdown(
191
246
  heading_context: headingContext,
192
247
  start_line: section.start_line,
193
248
  end_line: section.end_line,
249
+ priority: section.priority,
194
250
  })
195
251
  }
196
252
 
253
+ // Filter low-priority sections if configured
254
+ const filteredChunks = config.skip_low_priority
255
+ ? rawChunks.filter(chunk => chunk.priority !== "low")
256
+ : rawChunks
257
+
197
258
  // Merge small sections with previous
198
259
  const merged: MarkdownChunk[] = []
199
- for (const chunk of rawChunks) {
260
+ for (const chunk of filteredChunks) {
200
261
  if (
201
262
  merged.length > 0 &&
202
263
  chunk.content.length < config.min_chunk_size
@@ -211,6 +272,10 @@ export function chunkMarkdown(
211
272
  if (chunk.heading_context) {
212
273
  prev.heading_context = chunk.heading_context
213
274
  }
275
+ // Keep highest priority (high > normal > low)
276
+ if (chunk.priority === "high" || (chunk.priority === "normal" && prev.priority === "low")) {
277
+ prev.priority = chunk.priority
278
+ }
214
279
  } else {
215
280
  merged.push({ ...chunk })
216
281
  }
@@ -227,6 +292,7 @@ export function chunkMarkdown(
227
292
  heading_context: chunk.heading_context,
228
293
  start_line: part.start_line,
229
294
  end_line: part.end_line,
295
+ priority: chunk.priority, // Inherit priority from parent chunk
230
296
  })
231
297
  }
232
298
  } else {
package/vectorizer.yaml CHANGED
@@ -26,6 +26,7 @@ vectorizer:
26
26
  min_chunk_size: 1000 # Merge small sections (avoid header-only chunks)
27
27
  max_chunk_size: 8000 # Large chunks for docs (SQL schemas, API specs, etc.)
28
28
  preserve_heading_hierarchy: true
29
+ skip_low_priority: true # Skip SQL schemas, continuous aggregates (default: true)
29
30
  code:
30
31
  split_by_functions: true
31
32
  include_function_signature: true