sweet-search 2.6.0 → 2.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -349,7 +349,11 @@ export class ASTChunker {
|
|
|
349
349
|
// chunker uses tree-sitter-cpp instead of tree-sitter-c.
|
|
350
350
|
const langInfo = resolveLanguage(filePath, content);
|
|
351
351
|
if (!langInfo || !langInfo.chunker) {
|
|
352
|
-
|
|
352
|
+
// Mapped-but-chunkerless languages (e.g. Clojure: no tree-sitter grammar,
|
|
353
|
+
// and paren-delimited forms don't fit the brace/indent/endKeyword parsers)
|
|
354
|
+
// fall back to lossless generic windowing but keep their resolved language
|
|
355
|
+
// id so chunks are tagged 'clojure' rather than generic 'text'.
|
|
356
|
+
return this.parseGenericFile(filePath, content, langInfo?.id || 'text');
|
|
353
357
|
}
|
|
354
358
|
|
|
355
359
|
// Try tree-sitter WASM first for supported languages
|
|
@@ -909,7 +913,7 @@ export class ASTChunker {
|
|
|
909
913
|
return subChunks;
|
|
910
914
|
}
|
|
911
915
|
|
|
912
|
-
parseGenericFile(filePath, content) {
|
|
916
|
+
parseGenericFile(filePath, content, language = 'text') {
|
|
913
917
|
const lines = content.split('\n');
|
|
914
918
|
const chunks = [];
|
|
915
919
|
const CHUNK_SIZE = 50;
|
|
@@ -921,7 +925,7 @@ export class ASTChunker {
|
|
|
921
925
|
const chunkContent = lines.slice(start, end).join('\n');
|
|
922
926
|
|
|
923
927
|
if (chunkContent.trim().length > 20) {
|
|
924
|
-
chunks.push(this.buildChunk(chunkContent, filePath, 'text', 'code', 'unknown', start, end - 1));
|
|
928
|
+
chunks.push(this.buildChunk(chunkContent, filePath, language, 'code', 'unknown', start, end - 1));
|
|
925
929
|
}
|
|
926
930
|
|
|
927
931
|
start = end - OVERLAP;
|
|
@@ -51,8 +51,29 @@ export const ROUTING_CONFIG = {
|
|
|
51
51
|
export const FILE_PATTERNS = {
|
|
52
52
|
include: [
|
|
53
53
|
// Source code (all major languages)
|
|
54
|
-
'**/*.{js,jsx,ts,tsx,mjs,cjs}',
|
|
54
|
+
'**/*.{js,jsx,ts,tsx,mjs,cjs,cts,mts}', // JavaScript/TypeScript (incl. CommonJS/ESM TS)
|
|
55
55
|
'**/*.{java,kt,kts,scala,groovy}', // JVM
|
|
56
|
+
'**/*.{clj,cljc,cljs,edn}', // Clojure / ClojureScript / EDN
|
|
57
|
+
'**/*.jl', // Julia
|
|
58
|
+
'**/*.{R,r,Rd,rd,Rmd,rmd}', // R (case-sensitive matcher: list both cases)
|
|
59
|
+
'**/*.{ml,mli,mll,mly}', // OCaml
|
|
60
|
+
'**/*.{res,resi}', // ReScript
|
|
61
|
+
'**/*.{hs,lhs}', // Haskell
|
|
62
|
+
'**/*.{erl,hrl}', // Erlang
|
|
63
|
+
'**/*.{pl,pm,pod}', // Perl
|
|
64
|
+
'**/*.{f,for,f90,f95,f03,f08,F,F90,F95}', // Fortran (case-sensitive)
|
|
65
|
+
'**/*.{cob,cbl}', // COBOL
|
|
66
|
+
'**/*.{asm,s,S}', // Assembly (case-sensitive)
|
|
67
|
+
'**/*.{cr,vala,hx,pas,nix,vim}', // Crystal / Vala / Haxe / Pascal / Nix / Vim
|
|
68
|
+
'**/*.{elm,sol,tla,rdl,el,ejs}', // Elm / Solidity / TLA+ / SystemRDL / Emacs Lisp / EJS
|
|
69
|
+
'**/*.{ql,qll}', // CodeQL
|
|
70
|
+
'**/*.{zeek,bro}', // Zeek
|
|
71
|
+
'**/*.{tcl,tk}', // Tcl
|
|
72
|
+
'**/*.astro', // Astro (SFC)
|
|
73
|
+
// GPU shaders
|
|
74
|
+
'**/*.{glsl,vert,frag,comp,geom,tesc,tese}', // GLSL
|
|
75
|
+
'**/*.{hlsl,hlsli}', // HLSL
|
|
76
|
+
'**/*.{metal,wgsl,shader,cg,cginc}', // Metal / WGSL / ShaderLab / Cg
|
|
56
77
|
'**/*.{py,pyi}', // Python
|
|
57
78
|
'**/*.go', // Go
|
|
58
79
|
'**/*.rs', // Rust
|
|
@@ -71,7 +92,10 @@ export const FILE_PATTERNS = {
|
|
|
71
92
|
'**/*.{yaml,yml}', // YAML
|
|
72
93
|
'**/*.toml', // TOML
|
|
73
94
|
'**/*.{xml,xsl,xsd,wsdl,pom,csproj}', // XML
|
|
74
|
-
'**/*.{md,mdx,mdc,rst,txt}', // Documentation + Cursor rules
|
|
95
|
+
'**/*.{tf,tfvars,hcl}', // Terraform / HCL
|
|
96
|
+
'**/*.{ini,cfg}', // INI / config
|
|
97
|
+
'**/*.properties', // Java properties
|
|
98
|
+
'**/*.{md,mdx,mdc,rst,txt,markdown}', // Documentation + Cursor rules
|
|
75
99
|
'**/*.{html,htm,xhtml,vue,svelte}', // Web markup/SFC
|
|
76
100
|
'**/*.{css,scss,sass,less}', // Stylesheets
|
|
77
101
|
'**/*.svg', // SVG
|
|
@@ -81,6 +105,15 @@ export const FILE_PATTERNS = {
|
|
|
81
105
|
'**/*.dockerfile', // Dockerfile alt extension
|
|
82
106
|
'**/Makefile', // Makefile
|
|
83
107
|
'**/*.mk', // Makefile includes
|
|
108
|
+
'**/*.cmake', // CMake modules
|
|
109
|
+
'**/*.gradle', // Gradle (Groovy DSL)
|
|
110
|
+
'**/*.ninja', // Ninja
|
|
111
|
+
'**/*.{bzl,star}', // Bazel / Starlark
|
|
112
|
+
'**/BUILD', '**/BUILD.bazel', // Bazel BUILD
|
|
113
|
+
'**/WORKSPACE', '**/WORKSPACE.bazel', // Bazel WORKSPACE
|
|
114
|
+
'**/meson.build', // Meson
|
|
115
|
+
'**/Earthfile', // Earthly
|
|
116
|
+
'**/justfile', '**/Justfile', // Just
|
|
84
117
|
// Project markers
|
|
85
118
|
'**/CLAUDE.md',
|
|
86
119
|
'**/AGENTS.md',
|
|
@@ -42,6 +42,15 @@ export const EXTENSION_MAP = {
|
|
|
42
42
|
// Scala
|
|
43
43
|
'.scala': 'scala',
|
|
44
44
|
|
|
45
|
+
// Clojure / ClojureScript / EDN
|
|
46
|
+
// No tree-sitter-clojure grammar ships in the bundle and the brace-based
|
|
47
|
+
// regex chunker would fragment paren-delimited Lisp forms (and drop sub-30-char
|
|
48
|
+
// defns via MIN_CONTENT_LENGTH), so there is intentionally no LANGUAGES.clojure
|
|
49
|
+
// entry — getLanguageByExtension returns the chunker-less fallback and
|
|
50
|
+
// ast-chunker routes these through parseGenericFile (lossless 50-line windows),
|
|
51
|
+
// tagged with this 'clojure' language id rather than generic 'text'.
|
|
52
|
+
'.clj': 'clojure', '.cljc': 'clojure', '.cljs': 'clojure', '.edn': 'clojure',
|
|
53
|
+
|
|
45
54
|
// Dart
|
|
46
55
|
'.dart': 'dart',
|
|
47
56
|
|
|
@@ -110,8 +119,72 @@ export const EXTENSION_MAP = {
|
|
|
110
119
|
// F# / VB (.NET additional)
|
|
111
120
|
// '.vb' omitted — no LANGUAGES.vb entry exists
|
|
112
121
|
|
|
122
|
+
// Julia
|
|
123
|
+
'.jl': 'julia',
|
|
124
|
+
|
|
125
|
+
// R — the lookup lowercases the key, so '.r'/'.rd'/'.rmd' also resolve the
|
|
126
|
+
// canonical uppercase on-disk forms (.R/.Rd/.Rmd). Keep keys lowercase only.
|
|
127
|
+
'.r': 'r', '.rd': 'r', '.rmd': 'r',
|
|
128
|
+
|
|
129
|
+
// OCaml (tree-sitter-ocaml ships in the bundle but is unwired → generic chunking)
|
|
130
|
+
'.ml': 'ocaml', '.mli': 'ocaml', '.mll': 'ocaml', '.mly': 'ocaml',
|
|
131
|
+
|
|
132
|
+
// TypeScript modules (CommonJS / ESM) — reuse the wired typescript grammar.
|
|
133
|
+
// Map to 'typescript' not 'tsx': .cts/.mts cannot contain JSX.
|
|
134
|
+
'.cts': 'typescript', '.mts': 'typescript',
|
|
135
|
+
|
|
136
|
+
// Other bundled-grammar languages (unwired → lossless generic chunking)
|
|
137
|
+
'.elm': 'elm',
|
|
138
|
+
'.sol': 'solidity',
|
|
139
|
+
'.res': 'rescript', '.resi': 'rescript',
|
|
140
|
+
'.ql': 'ql', '.qll': 'ql',
|
|
141
|
+
'.tla': 'tlaplus',
|
|
142
|
+
'.rdl': 'systemrdl',
|
|
143
|
+
'.el': 'elisp',
|
|
144
|
+
'.ejs': 'embedded_template',
|
|
145
|
+
|
|
146
|
+
// Functional / systems / scripting
|
|
147
|
+
'.hs': 'haskell', '.lhs': 'haskell',
|
|
148
|
+
'.erl': 'erlang', '.hrl': 'erlang',
|
|
149
|
+
'.cr': 'crystal',
|
|
150
|
+
'.vala': 'vala',
|
|
151
|
+
'.hx': 'haxe',
|
|
152
|
+
'.pas': 'pascal',
|
|
153
|
+
'.nix': 'nix',
|
|
154
|
+
'.pl': 'perl', '.pm': 'perl', '.pod': 'perl',
|
|
155
|
+
'.vim': 'vim',
|
|
156
|
+
'.tcl': 'tcl', '.tk': 'tcl',
|
|
157
|
+
'.zeek': 'zeek', '.bro': 'zeek',
|
|
158
|
+
|
|
159
|
+
// Scientific / legacy (lowercase keys cover the uppercase .F/.F90/.S forms)
|
|
160
|
+
'.f': 'fortran', '.for': 'fortran', '.f90': 'fortran', '.f95': 'fortran',
|
|
161
|
+
'.f03': 'fortran', '.f08': 'fortran',
|
|
162
|
+
'.cob': 'cobol', '.cbl': 'cobol',
|
|
163
|
+
'.asm': 'assembly', '.s': 'assembly',
|
|
164
|
+
|
|
165
|
+
// GPU shaders
|
|
166
|
+
'.glsl': 'glsl', '.vert': 'glsl', '.frag': 'glsl', '.comp': 'glsl',
|
|
167
|
+
'.geom': 'glsl', '.tesc': 'glsl', '.tese': 'glsl',
|
|
168
|
+
'.hlsl': 'hlsl', '.hlsli': 'hlsl',
|
|
169
|
+
'.metal': 'metal',
|
|
170
|
+
'.wgsl': 'wgsl',
|
|
171
|
+
'.shader': 'shaderlab',
|
|
172
|
+
'.cg': 'cg', '.cginc': 'cg',
|
|
173
|
+
|
|
174
|
+
// Web framework SFC — reuse the html SFC path (like .vue/.svelte)
|
|
175
|
+
'.astro': 'html',
|
|
176
|
+
|
|
177
|
+
// Infra / build / config DSLs
|
|
178
|
+
'.tf': 'hcl', '.tfvars': 'hcl', '.hcl': 'hcl',
|
|
179
|
+
'.ini': 'ini', '.cfg': 'ini',
|
|
180
|
+
'.properties': 'properties',
|
|
181
|
+
'.cmake': 'cmake',
|
|
182
|
+
'.gradle': 'gradle',
|
|
183
|
+
'.ninja': 'ninja',
|
|
184
|
+
'.bzl': 'starlark', '.star': 'starlark',
|
|
185
|
+
|
|
113
186
|
// Document formats (dispatched to DocumentChunker in ast-chunker.js)
|
|
114
|
-
'.md': 'markdown', '.mdx': 'markdown',
|
|
187
|
+
'.md': 'markdown', '.mdx': 'markdown', '.markdown': 'markdown',
|
|
115
188
|
'.rst': 'rst',
|
|
116
189
|
'.txt': 'plaintext',
|
|
117
190
|
};
|
|
@@ -121,4 +194,11 @@ export const FILENAME_MAP = {
|
|
|
121
194
|
Dockerfile: 'dockerfile',
|
|
122
195
|
Makefile: 'makefile',
|
|
123
196
|
GNUmakefile: 'makefile',
|
|
197
|
+
// Extensionless build / project files (mirrors the Dockerfile/Makefile pattern)
|
|
198
|
+
BUILD: 'starlark',
|
|
199
|
+
WORKSPACE: 'starlark',
|
|
200
|
+
'meson.build': 'meson',
|
|
201
|
+
Earthfile: 'earthfile',
|
|
202
|
+
justfile: 'just',
|
|
203
|
+
Justfile: 'just',
|
|
124
204
|
};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "sweet-search",
|
|
3
|
-
"version": "2.6.
|
|
3
|
+
"version": "2.6.2",
|
|
4
4
|
"description": "Sweet Search - SOTA Hybrid Code Search Engine with WASM CatBoost Query Router, Semantic/Lexical/Structural Search, and Multilingual Support",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "core/search/sweet-search.js",
|
|
@@ -167,13 +167,13 @@
|
|
|
167
167
|
"vitest": "^4.0.16"
|
|
168
168
|
},
|
|
169
169
|
"optionalDependencies": {
|
|
170
|
-
"@sweet-search/native-darwin-arm64": "2.6.
|
|
171
|
-
"@sweet-search/native-darwin-x64": "2.6.
|
|
172
|
-
"@sweet-search/native-linux-arm64-gnu": "2.6.
|
|
173
|
-
"@sweet-search/native-linux-arm64-gnu-cuda": "2.6.
|
|
174
|
-
"@sweet-search/native-linux-x64-gnu": "2.6.
|
|
175
|
-
"@sweet-search/native-linux-x64-gnu-cuda": "2.6.
|
|
176
|
-
"@sweet-search/bg-priority": "2.6.
|
|
170
|
+
"@sweet-search/native-darwin-arm64": "2.6.2",
|
|
171
|
+
"@sweet-search/native-darwin-x64": "2.6.2",
|
|
172
|
+
"@sweet-search/native-linux-arm64-gnu": "2.6.2",
|
|
173
|
+
"@sweet-search/native-linux-arm64-gnu-cuda": "2.6.2",
|
|
174
|
+
"@sweet-search/native-linux-x64-gnu": "2.6.2",
|
|
175
|
+
"@sweet-search/native-linux-x64-gnu-cuda": "2.6.2",
|
|
176
|
+
"@sweet-search/bg-priority": "2.6.2"
|
|
177
177
|
},
|
|
178
178
|
"engines": {
|
|
179
179
|
"node": ">=18.0.0"
|