sweet-search 2.6.0 → 2.6.2

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -349,7 +349,11 @@ export class ASTChunker {
349
349
  // chunker uses tree-sitter-cpp instead of tree-sitter-c.
350
350
  const langInfo = resolveLanguage(filePath, content);
351
351
  if (!langInfo || !langInfo.chunker) {
352
- return this.parseGenericFile(filePath, content);
352
+ // Mapped-but-chunkerless languages (e.g. Clojure: no tree-sitter grammar,
353
+ // and paren-delimited forms don't fit the brace/indent/endKeyword parsers)
354
+ // fall back to lossless generic windowing but keep their resolved language
355
+ // id so chunks are tagged 'clojure' rather than generic 'text'.
356
+ return this.parseGenericFile(filePath, content, langInfo?.id || 'text');
353
357
  }
354
358
 
355
359
  // Try tree-sitter WASM first for supported languages
@@ -909,7 +913,7 @@ export class ASTChunker {
909
913
  return subChunks;
910
914
  }
911
915
 
912
- parseGenericFile(filePath, content) {
916
+ parseGenericFile(filePath, content, language = 'text') {
913
917
  const lines = content.split('\n');
914
918
  const chunks = [];
915
919
  const CHUNK_SIZE = 50;
@@ -921,7 +925,7 @@ export class ASTChunker {
921
925
  const chunkContent = lines.slice(start, end).join('\n');
922
926
 
923
927
  if (chunkContent.trim().length > 20) {
924
- chunks.push(this.buildChunk(chunkContent, filePath, 'text', 'code', 'unknown', start, end - 1));
928
+ chunks.push(this.buildChunk(chunkContent, filePath, language, 'code', 'unknown', start, end - 1));
925
929
  }
926
930
 
927
931
  start = end - OVERLAP;
@@ -51,8 +51,29 @@ export const ROUTING_CONFIG = {
51
51
  export const FILE_PATTERNS = {
52
52
  include: [
53
53
  // Source code (all major languages)
54
- '**/*.{js,jsx,ts,tsx,mjs,cjs}', // JavaScript/TypeScript
54
+ '**/*.{js,jsx,ts,tsx,mjs,cjs,cts,mts}', // JavaScript/TypeScript (incl. CommonJS/ESM TS)
55
55
  '**/*.{java,kt,kts,scala,groovy}', // JVM
56
+ '**/*.{clj,cljc,cljs,edn}', // Clojure / ClojureScript / EDN
57
+ '**/*.jl', // Julia
58
+ '**/*.{R,r,Rd,rd,Rmd,rmd}', // R (case-sensitive matcher: list both cases)
59
+ '**/*.{ml,mli,mll,mly}', // OCaml
60
+ '**/*.{res,resi}', // ReScript
61
+ '**/*.{hs,lhs}', // Haskell
62
+ '**/*.{erl,hrl}', // Erlang
63
+ '**/*.{pl,pm,pod}', // Perl
64
+ '**/*.{f,for,f90,f95,f03,f08,F,F90,F95}', // Fortran (case-sensitive)
65
+ '**/*.{cob,cbl}', // COBOL
66
+ '**/*.{asm,s,S}', // Assembly (case-sensitive)
67
+ '**/*.{cr,vala,hx,pas,nix,vim}', // Crystal / Vala / Haxe / Pascal / Nix / Vim
68
+ '**/*.{elm,sol,tla,rdl,el,ejs}', // Elm / Solidity / TLA+ / SystemRDL / Emacs Lisp / EJS
69
+ '**/*.{ql,qll}', // CodeQL
70
+ '**/*.{zeek,bro}', // Zeek
71
+ '**/*.{tcl,tk}', // Tcl
72
+ '**/*.astro', // Astro (SFC)
73
+ // GPU shaders
74
+ '**/*.{glsl,vert,frag,comp,geom,tesc,tese}', // GLSL
75
+ '**/*.{hlsl,hlsli}', // HLSL
76
+ '**/*.{metal,wgsl,shader,cg,cginc}', // Metal / WGSL / ShaderLab / Cg
56
77
  '**/*.{py,pyi}', // Python
57
78
  '**/*.go', // Go
58
79
  '**/*.rs', // Rust
@@ -71,7 +92,10 @@ export const FILE_PATTERNS = {
71
92
  '**/*.{yaml,yml}', // YAML
72
93
  '**/*.toml', // TOML
73
94
  '**/*.{xml,xsl,xsd,wsdl,pom,csproj}', // XML
74
- '**/*.{md,mdx,mdc,rst,txt}', // Documentation + Cursor rules
95
+ '**/*.{tf,tfvars,hcl}', // Terraform / HCL
96
+ '**/*.{ini,cfg}', // INI / config
97
+ '**/*.properties', // Java properties
98
+ '**/*.{md,mdx,mdc,rst,txt,markdown}', // Documentation + Cursor rules
75
99
  '**/*.{html,htm,xhtml,vue,svelte}', // Web markup/SFC
76
100
  '**/*.{css,scss,sass,less}', // Stylesheets
77
101
  '**/*.svg', // SVG
@@ -81,6 +105,15 @@ export const FILE_PATTERNS = {
81
105
  '**/*.dockerfile', // Dockerfile alt extension
82
106
  '**/Makefile', // Makefile
83
107
  '**/*.mk', // Makefile includes
108
+ '**/*.cmake', // CMake modules
109
+ '**/*.gradle', // Gradle (Groovy DSL)
110
+ '**/*.ninja', // Ninja
111
+ '**/*.{bzl,star}', // Bazel / Starlark
112
+ '**/BUILD', '**/BUILD.bazel', // Bazel BUILD
113
+ '**/WORKSPACE', '**/WORKSPACE.bazel', // Bazel WORKSPACE
114
+ '**/meson.build', // Meson
115
+ '**/Earthfile', // Earthly
116
+ '**/justfile', '**/Justfile', // Just
84
117
  // Project markers
85
118
  '**/CLAUDE.md',
86
119
  '**/AGENTS.md',
@@ -42,6 +42,15 @@ export const EXTENSION_MAP = {
42
42
  // Scala
43
43
  '.scala': 'scala',
44
44
 
45
+ // Clojure / ClojureScript / EDN
46
+ // No tree-sitter-clojure grammar ships in the bundle and the brace-based
47
+ // regex chunker would fragment paren-delimited Lisp forms (and drop sub-30-char
48
+ // defns via MIN_CONTENT_LENGTH), so there is intentionally no LANGUAGES.clojure
49
+ // entry — getLanguageByExtension returns the chunker-less fallback and
50
+ // ast-chunker routes these through parseGenericFile (lossless 50-line windows),
51
+ // tagged with this 'clojure' language id rather than generic 'text'.
52
+ '.clj': 'clojure', '.cljc': 'clojure', '.cljs': 'clojure', '.edn': 'clojure',
53
+
45
54
  // Dart
46
55
  '.dart': 'dart',
47
56
 
@@ -110,8 +119,72 @@ export const EXTENSION_MAP = {
110
119
  // F# / VB (.NET additional)
111
120
  // '.vb' omitted — no LANGUAGES.vb entry exists
112
121
 
122
+ // Julia
123
+ '.jl': 'julia',
124
+
125
+ // R — the lookup lowercases the key, so '.r'/'.rd'/'.rmd' also resolve the
126
+ // canonical uppercase on-disk forms (.R/.Rd/.Rmd). Keep keys lowercase only.
127
+ '.r': 'r', '.rd': 'r', '.rmd': 'r',
128
+
129
+ // OCaml (tree-sitter-ocaml ships in the bundle but is unwired → generic chunking)
130
+ '.ml': 'ocaml', '.mli': 'ocaml', '.mll': 'ocaml', '.mly': 'ocaml',
131
+
132
+ // TypeScript modules (CommonJS / ESM) — reuse the wired typescript grammar.
133
+ // Map to 'typescript' not 'tsx': .cts/.mts cannot contain JSX.
134
+ '.cts': 'typescript', '.mts': 'typescript',
135
+
136
+ // Other bundled-grammar languages (unwired → lossless generic chunking)
137
+ '.elm': 'elm',
138
+ '.sol': 'solidity',
139
+ '.res': 'rescript', '.resi': 'rescript',
140
+ '.ql': 'ql', '.qll': 'ql',
141
+ '.tla': 'tlaplus',
142
+ '.rdl': 'systemrdl',
143
+ '.el': 'elisp',
144
+ '.ejs': 'embedded_template',
145
+
146
+ // Functional / systems / scripting
147
+ '.hs': 'haskell', '.lhs': 'haskell',
148
+ '.erl': 'erlang', '.hrl': 'erlang',
149
+ '.cr': 'crystal',
150
+ '.vala': 'vala',
151
+ '.hx': 'haxe',
152
+ '.pas': 'pascal',
153
+ '.nix': 'nix',
154
+ '.pl': 'perl', '.pm': 'perl', '.pod': 'perl',
155
+ '.vim': 'vim',
156
+ '.tcl': 'tcl', '.tk': 'tcl',
157
+ '.zeek': 'zeek', '.bro': 'zeek',
158
+
159
+ // Scientific / legacy (lowercase keys cover the uppercase .F/.F90/.S forms)
160
+ '.f': 'fortran', '.for': 'fortran', '.f90': 'fortran', '.f95': 'fortran',
161
+ '.f03': 'fortran', '.f08': 'fortran',
162
+ '.cob': 'cobol', '.cbl': 'cobol',
163
+ '.asm': 'assembly', '.s': 'assembly',
164
+
165
+ // GPU shaders
166
+ '.glsl': 'glsl', '.vert': 'glsl', '.frag': 'glsl', '.comp': 'glsl',
167
+ '.geom': 'glsl', '.tesc': 'glsl', '.tese': 'glsl',
168
+ '.hlsl': 'hlsl', '.hlsli': 'hlsl',
169
+ '.metal': 'metal',
170
+ '.wgsl': 'wgsl',
171
+ '.shader': 'shaderlab',
172
+ '.cg': 'cg', '.cginc': 'cg',
173
+
174
+ // Web framework SFC — reuse the html SFC path (like .vue/.svelte)
175
+ '.astro': 'html',
176
+
177
+ // Infra / build / config DSLs
178
+ '.tf': 'hcl', '.tfvars': 'hcl', '.hcl': 'hcl',
179
+ '.ini': 'ini', '.cfg': 'ini',
180
+ '.properties': 'properties',
181
+ '.cmake': 'cmake',
182
+ '.gradle': 'gradle',
183
+ '.ninja': 'ninja',
184
+ '.bzl': 'starlark', '.star': 'starlark',
185
+
113
186
  // Document formats (dispatched to DocumentChunker in ast-chunker.js)
114
- '.md': 'markdown', '.mdx': 'markdown',
187
+ '.md': 'markdown', '.mdx': 'markdown', '.markdown': 'markdown',
115
188
  '.rst': 'rst',
116
189
  '.txt': 'plaintext',
117
190
  };
@@ -121,4 +194,11 @@ export const FILENAME_MAP = {
121
194
  Dockerfile: 'dockerfile',
122
195
  Makefile: 'makefile',
123
196
  GNUmakefile: 'makefile',
197
+ // Extensionless build / project files (mirrors the Dockerfile/Makefile pattern)
198
+ BUILD: 'starlark',
199
+ WORKSPACE: 'starlark',
200
+ 'meson.build': 'meson',
201
+ Earthfile: 'earthfile',
202
+ justfile: 'just',
203
+ Justfile: 'just',
124
204
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "sweet-search",
3
- "version": "2.6.0",
3
+ "version": "2.6.2",
4
4
  "description": "Sweet Search - SOTA Hybrid Code Search Engine with WASM CatBoost Query Router, Semantic/Lexical/Structural Search, and Multilingual Support",
5
5
  "type": "module",
6
6
  "main": "core/search/sweet-search.js",
@@ -167,13 +167,13 @@
167
167
  "vitest": "^4.0.16"
168
168
  },
169
169
  "optionalDependencies": {
170
- "@sweet-search/native-darwin-arm64": "2.6.0",
171
- "@sweet-search/native-darwin-x64": "2.6.0",
172
- "@sweet-search/native-linux-arm64-gnu": "2.6.0",
173
- "@sweet-search/native-linux-arm64-gnu-cuda": "2.6.0",
174
- "@sweet-search/native-linux-x64-gnu": "2.6.0",
175
- "@sweet-search/native-linux-x64-gnu-cuda": "2.6.0",
176
- "@sweet-search/bg-priority": "2.6.0"
170
+ "@sweet-search/native-darwin-arm64": "2.6.2",
171
+ "@sweet-search/native-darwin-x64": "2.6.2",
172
+ "@sweet-search/native-linux-arm64-gnu": "2.6.2",
173
+ "@sweet-search/native-linux-arm64-gnu-cuda": "2.6.2",
174
+ "@sweet-search/native-linux-x64-gnu": "2.6.2",
175
+ "@sweet-search/native-linux-x64-gnu-cuda": "2.6.2",
176
+ "@sweet-search/bg-priority": "2.6.2"
177
177
  },
178
178
  "engines": {
179
179
  "node": ">=18.0.0"