grepmax 0.17.8 → 0.17.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -553,8 +553,78 @@ class TreeSitterChunker {
|
|
|
553
553
|
referencedSymbols.push(name);
|
|
554
554
|
}
|
|
555
555
|
};
|
|
556
|
+
// Node types that represent a bare identifier in the supported grammars:
// a single name such as `ErrorCodes`, as opposed to a qualified
// expression such as `a.ErrorCodes`.
const LEAF_ID_TYPES = new Set([
    "identifier",
    "type_identifier",
    "constant", // Ruby
    "name", // PHP
    "simple_identifier", // Kotlin, Swift
    "property_identifier",
    "field_identifier", // Go
]);
/**
 * Tells whether `n` is a leaf identifier node.
 *
 * @param {object|null|undefined} n - Syntax node exposing `type` and,
 *   optionally, `namedChildren`.
 * @returns {boolean} true when `n` is truthy, its type is listed in
 *   LEAF_ID_TYPES, and it has no named children.
 */
const isLeafId = (n) => {
    if (!n || !LEAF_ID_TYPES.has(n.type))
        return false;
    const kids = n.namedChildren;
    // A missing children list (or a missing length) counts as "no children".
    const count = kids == null ? undefined : kids.length;
    return count == null || count === 0;
};
|
|
573
|
+
/**
 * Returns the first named child of `n`.
 *
 * @param {object} n - Syntax node; `namedChildren` may be absent.
 * @returns {object|null} The first entry of `n.namedChildren`, or null
 *   when the list is missing, empty, or its first entry is nullish.
 */
const firstNamed = (n) => {
    const kids = n.namedChildren;
    if (kids == null)
        return null;
    const head = kids[0];
    return head == null ? null : head;
};
|
|
574
|
+
/**
 * Extracts the type name from an instantiation node (`new Foo`, `Foo{}`,
 * `new Foo()`).
 *
 * The node's `type` field is used when `childForFieldName` is available
 * and yields a node; otherwise the first named child is used. A leaf
 * identifier contributes its text directly; anything else is delegated
 * to simpleRefName (per the original note, qualified names reduce to
 * their rightmost segment there).
 *
 * @param {object} n - Instantiation syntax node.
 * @returns {string|null} The resolved name, or null when no candidate
 *   child exists (or whatever simpleRefName returns for non-leaf nodes).
 */
const instantiatedTypeName = (n) => {
    let typed = null;
    if (n.childForFieldName)
        typed = n.childForFieldName("type");
    // No explicit `type` field: fall back to the first named child.
    if (typed == null)
        typed = firstNamed(n);
    if (!typed)
        return null;
    return isLeafId(typed) ? typed.text : simpleRefName(typed);
};
|
|
587
|
+
/**
 * Finds the first `type_identifier` among the named children of `n`,
 * looking one level into `user_type` wrappers. Used for Java
 * `instanceof_expression` and Kotlin/Swift `check_expression` (`x is T`).
 *
 * @param {object} n - Type-test syntax node; `namedChildren` may be absent.
 * @returns {string|null} Text of the first matching identifier. A direct
 *   `type_identifier` child returns its text as-is; one nested in a
 *   `user_type` is returned only when its text is truthy. Null when
 *   nothing matches.
 */
const firstTypeIdent = (n) => {
    const isTypeIdent = (node) => node.type === "type_identifier";
    const rawKids = n.namedChildren;
    const kids = rawKids == null ? [] : rawKids;
    for (const kid of kids) {
        if (isTypeIdent(kid))
            return kid.text;
        if (kid.type !== "user_type")
            continue;
        const rawInner = kid.namedChildren;
        const inner = (rawInner == null ? [] : rawInner).find(isTypeIdent);
        // An empty-text identifier inside a wrapper is skipped; keep scanning.
        if (inner != null && inner.text)
            return inner.text;
    }
    return null;
};
|
|
603
|
+
// Node types for member or scope access, one per grammar. Only the head
// (the object or scope operand) is captured, and only when it is a
// Capitalized leaf identifier: `ErrorCodes.VALIDATION` and
// `ErrorCodes::NOT_FOUND` produce an edge to `ErrorCodes`, whereas
// `this.x`, `req.body`, and lowercase locals produce none.
const MEMBER_ACCESS_TYPES = new Set([
    "member_expression", // TS/JS
    "attribute", // Python
    "selector_expression", // Go
    "field_access", // Java
    "member_access_expression", // C#
    "scoped_identifier", // Rust
    "scope_resolution", // Ruby
    "navigation_expression", // Kotlin, Swift
    "field_expression", // Scala
    "class_constant_access_expression", // PHP
]);
|
|
619
|
+
// Node types that construct a value of a named type; the type is named by
// the node's `type` field or, failing that, its first named child.
const INSTANTIATION_TYPES = new Set([
    "object_creation_expression", // Java, C#, PHP
    "composite_literal", // Go
    "struct_expression", // Rust
    "instance_expression", // Scala
]);
|
|
556
626
|
const extractRefs = (n) => {
|
|
557
|
-
var _a, _b, _c, _d, _e, _f, _g, _h;
|
|
627
|
+
var _a, _b, _c, _d, _e, _f, _g, _h, _j;
|
|
558
628
|
// Handle JS/TS (call_expression), Python (call), Lua (function_call)
|
|
559
629
|
if (n.type === "call_expression" ||
|
|
560
630
|
n.type === "call" ||
|
|
@@ -617,19 +687,26 @@ class TreeSitterChunker {
|
|
|
617
687
|
}
|
|
618
688
|
}
|
|
619
689
|
}
|
|
620
|
-
// Identifier-as-value references
|
|
621
|
-
//
|
|
690
|
+
// Identifier-as-value references: edges the call-expression capture
|
|
691
|
+
// above misses. These feed the graph-walk consumers (PPR,
|
|
622
692
|
// `gmax dead <ClassName>`, audit) that need class/enum references,
|
|
623
|
-
// not just method-call names.
|
|
693
|
+
// not just method-call names. Node types are grammar-specific but
|
|
694
|
+
// each falls into one of three shapes — instantiation, type-test, or
|
|
695
|
+
// member/scope access — handled uniformly across the 14 grammars.
|
|
696
|
+
// Shape 1 — instantiation: `new ClassName(...)`, `ClassName{...}`.
|
|
624
697
|
if (n.type === "new_expression") {
|
|
625
|
-
//
|
|
698
|
+
// TS/JS — constructor may be qualified (`ns.ClassName`).
|
|
626
699
|
const ctor = n.childForFieldName
|
|
627
700
|
? n.childForFieldName("constructor")
|
|
628
701
|
: null;
|
|
629
702
|
addRef(simpleRefName(ctor));
|
|
630
703
|
}
|
|
631
|
-
else if (n.type
|
|
632
|
-
|
|
704
|
+
else if (INSTANTIATION_TYPES.has(n.type)) {
|
|
705
|
+
addRef(instantiatedTypeName(n));
|
|
706
|
+
}
|
|
707
|
+
// Shape 2 — type-test: `x instanceof T`, `x is T`.
|
|
708
|
+
if (n.type === "binary_expression") {
|
|
709
|
+
// TS/JS, PHP — instanceof is a binary operator.
|
|
633
710
|
const op = n.childForFieldName
|
|
634
711
|
? n.childForFieldName("operator")
|
|
635
712
|
: null;
|
|
@@ -640,19 +717,27 @@ class TreeSitterChunker {
|
|
|
640
717
|
addRef(simpleRefName(right));
|
|
641
718
|
}
|
|
642
719
|
}
|
|
643
|
-
else if (n.type === "
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
720
|
+
else if (n.type === "instanceof_expression" || // Java
|
|
721
|
+
n.type === "check_expression" // Kotlin, Swift (`x is T`)
|
|
722
|
+
) {
|
|
723
|
+
addRef(firstTypeIdent(n));
|
|
724
|
+
}
|
|
725
|
+
else if (n.type === "is_pattern_expression") {
|
|
726
|
+
// C# — `x is BeyondError`: the type sits in a constant_pattern.
|
|
727
|
+
const cp = ((_h = n.namedChildren) !== null && _h !== void 0 ? _h : []).find((c) => c.type === "constant_pattern");
|
|
728
|
+
const id = cp ? firstNamed(cp) : null;
|
|
729
|
+
if (isLeafId(id) && id && /^[A-Z]/.test(id.text))
|
|
730
|
+
addRef(id.text);
|
|
731
|
+
}
|
|
732
|
+
// Shape 3 — member / scope access: `ErrorCodes.MEMBER`,
|
|
733
|
+
// `ErrorCodes::MEMBER`. Capitalized head only (skip `this.x`).
|
|
734
|
+
if (MEMBER_ACCESS_TYPES.has(n.type)) {
|
|
735
|
+
const head = firstNamed(n);
|
|
736
|
+
if (head && isLeafId(head) && /^[A-Z]/.test(head.text)) {
|
|
737
|
+
addRef(head.text);
|
|
653
738
|
}
|
|
654
739
|
}
|
|
655
|
-
for (const child of (
|
|
740
|
+
for (const child of (_j = n.namedChildren) !== null && _j !== void 0 ? _j : []) {
|
|
656
741
|
extractRefs(child);
|
|
657
742
|
}
|
|
658
743
|
};
|
|
@@ -10,12 +10,40 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
|
|
|
10
10
|
};
|
|
11
11
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
12
12
|
exports.Searcher = void 0;
|
|
13
|
+
exports.asSymbolQuery = asSymbolQuery;
|
|
13
14
|
exports.buildWhereClause = buildWhereClause;
|
|
14
15
|
const config_1 = require("../../config");
|
|
15
16
|
const filter_builder_1 = require("../utils/filter-builder");
|
|
16
17
|
const pool_1 = require("../workers/pool");
|
|
17
18
|
const intent_1 = require("./intent");
|
|
18
19
|
const pagerank_1 = require("./pagerank");
|
|
20
|
+
/**
 * Normalizes a defined_symbols / referenced_symbols column value into a
 * plain array of strings. The value may be a regular array or an object
 * exposing `toArray()` (per the original note, a LanceDB Arrow proxy).
 *
 * @param {*} val - Raw column value.
 * @returns {string[]} The string entries of the value; an empty array
 *   when the value is falsy, has no usable `toArray`, `toArray()` throws,
 *   or its result is not an array.
 */
function readSymbolArray(val) {
    const onlyStrings = (items) => items.filter((item) => typeof item === "string");
    if (Array.isArray(val))
        return onlyStrings(val);
    if (!val || typeof val.toArray !== "function")
        return [];
    try {
        const converted = val.toArray();
        return Array.isArray(converted) ? onlyStrings(converted) : [];
    }
    catch (_err) {
        // Best-effort read: a failing conversion is treated as "no symbols".
        return [];
    }
}
|
|
39
|
+
/**
 * Classifies a query as a symbol lookup.
 *
 * A query that is a single bare identifier ("BeyondError", "requireAuth",
 * "map") is almost always a symbol lookup — the user wants the chunk that
 * *defines* that symbol. Drives the symbol-definition promotion in search().
 *
 * @param {string} query - Raw user query; surrounding whitespace is ignored.
 * @returns {string|null} The trimmed identifier when the whole query is one
 *   ASCII identifier (letters, digits, `_`, `$`, not starting with a digit);
 *   null for natural-language queries, empty input, or non-string input.
 */
function asSymbolQuery(query) {
    // A non-string can never be a bare identifier; report "not a symbol
    // query" instead of throwing on `.trim()`.
    if (typeof query !== "string")
        return null;
    const q = query.trim();
    return /^[A-Za-z_$][A-Za-z0-9_$]*$/.test(q) ? q : null;
}
|
|
19
47
|
function buildWhereClause(pathPrefix, filters, searchIntent) {
|
|
20
48
|
var _a;
|
|
21
49
|
const parts = [];
|
|
@@ -339,7 +367,7 @@ class Searcher {
|
|
|
339
367
|
}
|
|
340
368
|
search(query, top_k, _search_options, _filters, pathPrefix, intent, signal) {
|
|
341
369
|
return __awaiter(this, void 0, void 0, function* () {
|
|
342
|
-
var _a, _b, _c, _d, _e, _f, _g, _h, _j, _k, _l;
|
|
370
|
+
var _a, _b, _c, _d, _e, _f, _g, _h, _j, _k, _l, _m;
|
|
343
371
|
const finalLimit = top_k !== null && top_k !== void 0 ? top_k : 10;
|
|
344
372
|
// ColBERT rerank is opt-in as of v0.17.1. On the 97-case eval it
|
|
345
373
|
// regresses MRR@10 by ~3% and doubles query latency; sweep across
|
|
@@ -348,6 +376,8 @@ class Searcher {
|
|
|
348
376
|
let doRerank = (_a = _search_options === null || _search_options === void 0 ? void 0 : _search_options.rerank) !== null && _a !== void 0 ? _a : false;
|
|
349
377
|
const explain = (_b = _search_options === null || _search_options === void 0 ? void 0 : _search_options.explain) !== null && _b !== void 0 ? _b : false;
|
|
350
378
|
const searchIntent = intent || (0, intent_1.detectIntent)(query);
|
|
379
|
+
// Bare-identifier queries get symbol-definition promotion (see below).
|
|
380
|
+
const symbolQuery = asSymbolQuery(query);
|
|
351
381
|
const pool = (0, pool_1.getWorkerPool)();
|
|
352
382
|
if (signal === null || signal === void 0 ? void 0 : signal.aborted) {
|
|
353
383
|
const err = new Error("Aborted");
|
|
@@ -372,7 +402,7 @@ class Searcher {
|
|
|
372
402
|
try {
|
|
373
403
|
table = yield this.db.ensureTable();
|
|
374
404
|
}
|
|
375
|
-
catch (
|
|
405
|
+
catch (_o) {
|
|
376
406
|
return { data: [] };
|
|
377
407
|
}
|
|
378
408
|
// Ensure FTS index exists (lazy init, retry periodically on failure)
|
|
@@ -396,11 +426,14 @@ class Searcher {
|
|
|
396
426
|
// it in the lightweight path only when the flag is on so we don't bloat the
|
|
397
427
|
// default query path.
|
|
398
428
|
const pagerankEnabled = process.env.GMAX_PAGERANK === "1" && !!pathPrefix;
|
|
429
|
+
// Symbol-definition promotion needs defined_symbols on every candidate, not
|
|
430
|
+
// just the final display set — load it for bare-symbol queries too.
|
|
431
|
+
const needDefinedSymbols = pagerankEnabled || symbolQuery !== null;
|
|
399
432
|
const LIGHTWEIGHT_COLUMNS = [
|
|
400
433
|
"id", "path", "hash", "chunk_index", "start_line", "end_line",
|
|
401
434
|
"is_anchor", "chunk_type", "role", "complexity", "is_exported",
|
|
402
435
|
"content", "parent_symbol", "referenced_symbols", "pooled_colbert_48d",
|
|
403
|
-
...(
|
|
436
|
+
...(needDefinedSymbols ? ["defined_symbols"] : []),
|
|
404
437
|
];
|
|
405
438
|
// _distance is auto-added by vectorSearch, _score by FTS — include each
|
|
406
439
|
// in the respective query to suppress LanceDB deprecation warnings
|
|
@@ -438,7 +471,7 @@ class Searcher {
|
|
|
438
471
|
this.ftsAvailable = true;
|
|
439
472
|
console.warn("[Searcher] Rebuilt FTS index with position support — retry search");
|
|
440
473
|
}
|
|
441
|
-
catch (
|
|
474
|
+
catch (_p) { }
|
|
442
475
|
}
|
|
443
476
|
else {
|
|
444
477
|
console.warn(`[Searcher] FTS search failed (will retry later): ${msg}`);
|
|
@@ -552,6 +585,30 @@ class Searcher {
|
|
|
552
585
|
return { data: [] };
|
|
553
586
|
}
|
|
554
587
|
const rerankCandidates = stage2Candidates.slice(0, RERANK_TOP);
|
|
588
|
+
// Symbol-definition promotion (1/2): membership. For a bare-symbol query,
|
|
589
|
+
// ensure the chunk(s) that actually DEFINE the symbol reach the rerank set
|
|
590
|
+
// even when the cosine / RERANK_TOP cuts would drop them — e.g. ErrorCodes
|
|
591
|
+
// sits at pooled-cosine rank 24 (> RERANK_TOP=20) and resolveActor at fusion
|
|
592
|
+
// rank 91 (> the stage-2 cut). Pulled from the top-200 fusion pool, bounded
|
|
593
|
+
// so the rerank batch stays small. Must run before Phase B so the injected
|
|
594
|
+
// chunks get their colbert data fetched for reranking. The score boost in
|
|
595
|
+
// (2/2) below then lets them win dedup over their own method-child chunks.
|
|
596
|
+
if (symbolQuery && rerankCandidates.length > 0) {
|
|
597
|
+
const present = new Set(rerankCandidates.map((d) => d.id).filter(Boolean));
|
|
598
|
+
const MAX_INJECT = 5;
|
|
599
|
+
let injected = 0;
|
|
600
|
+
for (const d of topCandidates) {
|
|
601
|
+
if (injected >= MAX_INJECT)
|
|
602
|
+
break;
|
|
603
|
+
if (!d.id || present.has(d.id))
|
|
604
|
+
continue;
|
|
605
|
+
if (readSymbolArray(d.defined_symbols).includes(symbolQuery)) {
|
|
606
|
+
rerankCandidates.push(d);
|
|
607
|
+
present.add(d.id);
|
|
608
|
+
injected++;
|
|
609
|
+
}
|
|
610
|
+
}
|
|
611
|
+
}
|
|
555
612
|
// Phase B: Lazy-load colbert data only for the ~20 rerank candidates
|
|
556
613
|
if (doRerank && rerankCandidates.length > 0) {
|
|
557
614
|
const rerankIds = rerankCandidates
|
|
@@ -599,13 +656,25 @@ class Searcher {
|
|
|
599
656
|
// Small tie-breaker so later items don't all share 0
|
|
600
657
|
return fusedScore || 1 / (idx + 1);
|
|
601
658
|
});
|
|
659
|
+
// Symbol-definition promotion (2/2): score. Multiplicatively boost any
|
|
660
|
+
// candidate that defines the queried symbol so the definition chunk outranks
|
|
661
|
+
// its own method-child chunks (e.g. the `BeyondError` class chunk vs its
|
|
662
|
+
// constructor/toJSON, which otherwise score higher on the literal and evict
|
|
663
|
+
// the parent in overlap dedup). Multiplicative keeps it scale-invariant
|
|
664
|
+
// across the rerank-on (ColBERT maxsim) and rerank-off (fusion) score ranges.
|
|
665
|
+
const envDefBoost = Number.parseFloat((_k = process.env.GMAX_DEF_BOOST) !== null && _k !== void 0 ? _k : "");
|
|
666
|
+
const DEF_MATCH_BOOST = Number.isFinite(envDefBoost) && envDefBoost >= 1 ? envDefBoost : 5;
|
|
602
667
|
const scored = rerankCandidates.map((doc, idx) => {
|
|
603
668
|
var _a, _b;
|
|
604
669
|
const base = (_a = scores === null || scores === void 0 ? void 0 : scores[idx]) !== null && _a !== void 0 ? _a : 0;
|
|
605
670
|
const key = doc.id || `${doc.path}:${doc.chunk_index}`;
|
|
606
671
|
const fusedScore = (_b = candidateScores.get(key)) !== null && _b !== void 0 ? _b : 0;
|
|
607
672
|
const blended = base + FUSED_WEIGHT * fusedScore;
|
|
608
|
-
|
|
673
|
+
let boosted = this.applyStructureBoost(doc, blended, searchIntent);
|
|
674
|
+
if (symbolQuery &&
|
|
675
|
+
readSymbolArray(doc.defined_symbols).includes(symbolQuery)) {
|
|
676
|
+
boosted *= DEF_MATCH_BOOST;
|
|
677
|
+
}
|
|
609
678
|
return {
|
|
610
679
|
record: doc,
|
|
611
680
|
score: boosted,
|
|
@@ -622,7 +691,7 @@ class Searcher {
|
|
|
622
691
|
try {
|
|
623
692
|
const { scores: prScores, max: prMax } = yield (0, pagerank_1.loadOrComputePageRank)(this.db, pathPrefix);
|
|
624
693
|
if (prMax > 0) {
|
|
625
|
-
const envWeight = Number.parseFloat((
|
|
694
|
+
const envWeight = Number.parseFloat((_l = process.env.GMAX_PR_WEIGHT) !== null && _l !== void 0 ? _l : "");
|
|
626
695
|
const PR_WEIGHT = Number.isFinite(envWeight) && envWeight >= 0 ? envWeight : 0.05;
|
|
627
696
|
for (const item of scored) {
|
|
628
697
|
const raw = item.record.defined_symbols;
|
|
@@ -637,7 +706,7 @@ class Searcher {
|
|
|
637
706
|
defs = arr.filter((v) => typeof v === "string");
|
|
638
707
|
}
|
|
639
708
|
}
|
|
640
|
-
catch (
|
|
709
|
+
catch (_q) { }
|
|
641
710
|
}
|
|
642
711
|
const norm = (0, pagerank_1.pageRankBoostForSymbols)(defs, prScores, prMax);
|
|
643
712
|
item.score += PR_WEIGHT * norm;
|
|
@@ -655,7 +724,7 @@ class Searcher {
|
|
|
655
724
|
// Item 10: Per-file diversification
|
|
656
725
|
const seenFiles = new Map();
|
|
657
726
|
const diversified = [];
|
|
658
|
-
const envMaxPerFile = Number.parseInt((
|
|
727
|
+
const envMaxPerFile = Number.parseInt((_m = process.env.GMAX_MAX_PER_FILE) !== null && _m !== void 0 ? _m : "", 10);
|
|
659
728
|
const MAX_PER_FILE = Number.isFinite(envMaxPerFile) && envMaxPerFile > 0 ? envMaxPerFile : 3;
|
|
660
729
|
for (const item of uniqueScored) {
|
|
661
730
|
const path = item.record.path || "";
|
package/package.json
CHANGED