searchsocket 0.3.3 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +57 -39
- package/dist/cli.js +947 -1378
- package/dist/client.cjs +45 -0
- package/dist/client.d.cts +3 -2
- package/dist/client.d.ts +3 -2
- package/dist/client.js +45 -1
- package/dist/index.cjs +909 -1286
- package/dist/index.d.cts +73 -33
- package/dist/index.d.ts +73 -33
- package/dist/index.js +906 -1281
- package/dist/plugin-B_npJSux.d.cts +36 -0
- package/dist/plugin-M-aW0ev6.d.ts +36 -0
- package/dist/scroll.cjs +185 -0
- package/dist/scroll.d.cts +42 -0
- package/dist/scroll.d.ts +42 -0
- package/dist/scroll.js +183 -0
- package/dist/sveltekit.cjs +997 -1204
- package/dist/sveltekit.d.cts +3 -43
- package/dist/sveltekit.d.ts +3 -43
- package/dist/sveltekit.js +995 -1202
- package/dist/{types-BrG6XTUU.d.cts → types-Dk43uz25.d.cts} +50 -109
- package/dist/{types-BrG6XTUU.d.ts → types-Dk43uz25.d.ts} +50 -109
- package/package.json +10 -3
package/dist/index.js
CHANGED
|
@@ -3,12 +3,12 @@ import path from 'path';
|
|
|
3
3
|
import { createJiti } from 'jiti';
|
|
4
4
|
import { z } from 'zod';
|
|
5
5
|
import { execSync, spawn } from 'child_process';
|
|
6
|
-
import pLimit2 from 'p-limit';
|
|
7
6
|
import { createHash } from 'crypto';
|
|
8
7
|
import { load } from 'cheerio';
|
|
9
8
|
import matter from 'gray-matter';
|
|
10
|
-
import fs4 from 'fs/promises';
|
|
11
9
|
import fg from 'fast-glob';
|
|
10
|
+
import pLimit from 'p-limit';
|
|
11
|
+
import fs3 from 'fs/promises';
|
|
12
12
|
import net from 'net';
|
|
13
13
|
import { gunzipSync } from 'zlib';
|
|
14
14
|
import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
|
|
@@ -2759,12 +2759,12 @@ var require_ChildNode = __commonJS({
|
|
|
2759
2759
|
"node_modules/.pnpm/@mixmark-io+domino@2.2.0/node_modules/@mixmark-io/domino/lib/ChildNode.js"(exports$1, module) {
|
|
2760
2760
|
var Node2 = require_Node();
|
|
2761
2761
|
var LinkedList = require_LinkedList();
|
|
2762
|
-
var createDocumentFragmentFromArguments = function(
|
|
2763
|
-
var docFrag =
|
|
2762
|
+
var createDocumentFragmentFromArguments = function(document2, args) {
|
|
2763
|
+
var docFrag = document2.createDocumentFragment();
|
|
2764
2764
|
for (var i = 0; i < args.length; i++) {
|
|
2765
2765
|
var argItem = args[i];
|
|
2766
2766
|
var isNode = argItem instanceof Node2;
|
|
2767
|
-
docFrag.appendChild(isNode ? argItem :
|
|
2767
|
+
docFrag.appendChild(isNode ? argItem : document2.createTextNode(String(argItem)));
|
|
2768
2768
|
}
|
|
2769
2769
|
return docFrag;
|
|
2770
2770
|
};
|
|
@@ -2922,7 +2922,7 @@ var require_NamedNodeMap = __commonJS({
|
|
|
2922
2922
|
// node_modules/.pnpm/@mixmark-io+domino@2.2.0/node_modules/@mixmark-io/domino/lib/Element.js
|
|
2923
2923
|
var require_Element = __commonJS({
|
|
2924
2924
|
"node_modules/.pnpm/@mixmark-io+domino@2.2.0/node_modules/@mixmark-io/domino/lib/Element.js"(exports$1, module) {
|
|
2925
|
-
module.exports =
|
|
2925
|
+
module.exports = Element2;
|
|
2926
2926
|
var xml = require_xmlnames();
|
|
2927
2927
|
var utils = require_utils();
|
|
2928
2928
|
var NAMESPACE = utils.NAMESPACE;
|
|
@@ -2939,7 +2939,7 @@ var require_Element = __commonJS({
|
|
|
2939
2939
|
var NonDocumentTypeChildNode = require_NonDocumentTypeChildNode();
|
|
2940
2940
|
var NamedNodeMap = require_NamedNodeMap();
|
|
2941
2941
|
var uppercaseCache = /* @__PURE__ */ Object.create(null);
|
|
2942
|
-
function
|
|
2942
|
+
function Element2(doc, localName, namespaceURI, prefix) {
|
|
2943
2943
|
ContainerNode.call(this);
|
|
2944
2944
|
this.nodeType = Node2.ELEMENT_NODE;
|
|
2945
2945
|
this.ownerDocument = doc;
|
|
@@ -2959,7 +2959,7 @@ var require_Element = __commonJS({
|
|
|
2959
2959
|
recursiveGetText(node.childNodes[i], a);
|
|
2960
2960
|
}
|
|
2961
2961
|
}
|
|
2962
|
-
|
|
2962
|
+
Element2.prototype = Object.create(ContainerNode.prototype, {
|
|
2963
2963
|
isHTML: { get: function isHTML() {
|
|
2964
2964
|
return this.namespaceURI === NAMESPACE.HTML && this.ownerDocument.isHTML;
|
|
2965
2965
|
} },
|
|
@@ -3029,7 +3029,7 @@ var require_Element = __commonJS({
|
|
|
3029
3029
|
return NodeUtils.serializeOne(this, { nodeType: 0 });
|
|
3030
3030
|
},
|
|
3031
3031
|
set: function(v) {
|
|
3032
|
-
var
|
|
3032
|
+
var document2 = this.ownerDocument;
|
|
3033
3033
|
var parent = this.parentNode;
|
|
3034
3034
|
if (parent === null) {
|
|
3035
3035
|
return;
|
|
@@ -3040,8 +3040,8 @@ var require_Element = __commonJS({
|
|
|
3040
3040
|
if (parent.nodeType === Node2.DOCUMENT_FRAGMENT_NODE) {
|
|
3041
3041
|
parent = parent.ownerDocument.createElement("body");
|
|
3042
3042
|
}
|
|
3043
|
-
var parser =
|
|
3044
|
-
|
|
3043
|
+
var parser = document2.implementation.mozHTMLParser(
|
|
3044
|
+
document2._address,
|
|
3045
3045
|
parent
|
|
3046
3046
|
);
|
|
3047
3047
|
parser.parse(v === null ? "" : String(v), true);
|
|
@@ -3100,7 +3100,7 @@ var require_Element = __commonJS({
|
|
|
3100
3100
|
default:
|
|
3101
3101
|
utils.SyntaxError();
|
|
3102
3102
|
}
|
|
3103
|
-
if (!(context instanceof
|
|
3103
|
+
if (!(context instanceof Element2) || context.ownerDocument.isHTML && context.localName === "html" && context.namespaceURI === NAMESPACE.HTML) {
|
|
3104
3104
|
context = context.ownerDocument.createElementNS(NAMESPACE.HTML, "body");
|
|
3105
3105
|
}
|
|
3106
3106
|
var parser = this.ownerDocument.implementation.mozHTMLParser(
|
|
@@ -3708,10 +3708,10 @@ var require_Element = __commonJS({
|
|
|
3708
3708
|
return nodes.item ? nodes : new NodeList(nodes);
|
|
3709
3709
|
} }
|
|
3710
3710
|
});
|
|
3711
|
-
Object.defineProperties(
|
|
3712
|
-
Object.defineProperties(
|
|
3711
|
+
Object.defineProperties(Element2.prototype, ChildNode);
|
|
3712
|
+
Object.defineProperties(Element2.prototype, NonDocumentTypeChildNode);
|
|
3713
3713
|
attributes.registerChangeHandler(
|
|
3714
|
-
|
|
3714
|
+
Element2,
|
|
3715
3715
|
"id",
|
|
3716
3716
|
function(element, lname, oldval, newval) {
|
|
3717
3717
|
if (element.rooted) {
|
|
@@ -3725,7 +3725,7 @@ var require_Element = __commonJS({
|
|
|
3725
3725
|
}
|
|
3726
3726
|
);
|
|
3727
3727
|
attributes.registerChangeHandler(
|
|
3728
|
-
|
|
3728
|
+
Element2,
|
|
3729
3729
|
"class",
|
|
3730
3730
|
function(element, lname, oldval, newval) {
|
|
3731
3731
|
if (element._classList) {
|
|
@@ -3824,7 +3824,7 @@ var require_Element = __commonJS({
|
|
|
3824
3824
|
}
|
|
3825
3825
|
}
|
|
3826
3826
|
});
|
|
3827
|
-
|
|
3827
|
+
Element2._Attr = Attr;
|
|
3828
3828
|
function AttributesArray(elt) {
|
|
3829
3829
|
NamedNodeMap.call(this, elt);
|
|
3830
3830
|
for (var name in elt._attrsByQName) {
|
|
@@ -4226,7 +4226,7 @@ var require_DocumentFragment = __commonJS({
|
|
|
4226
4226
|
var Node2 = require_Node();
|
|
4227
4227
|
var NodeList = require_NodeList();
|
|
4228
4228
|
var ContainerNode = require_ContainerNode();
|
|
4229
|
-
var
|
|
4229
|
+
var Element2 = require_Element();
|
|
4230
4230
|
var select = require_select();
|
|
4231
4231
|
var utils = require_utils();
|
|
4232
4232
|
function DocumentFragment(doc) {
|
|
@@ -4244,9 +4244,9 @@ var require_DocumentFragment = __commonJS({
|
|
|
4244
4244
|
}
|
|
4245
4245
|
},
|
|
4246
4246
|
// Copy the text content getter/setter from Element
|
|
4247
|
-
textContent: Object.getOwnPropertyDescriptor(
|
|
4247
|
+
textContent: Object.getOwnPropertyDescriptor(Element2.prototype, "textContent"),
|
|
4248
4248
|
// Copy the text content getter/setter from Element
|
|
4249
|
-
innerText: Object.getOwnPropertyDescriptor(
|
|
4249
|
+
innerText: Object.getOwnPropertyDescriptor(Element2.prototype, "innerText"),
|
|
4250
4250
|
querySelector: { value: function(selector) {
|
|
4251
4251
|
var nodes = this.querySelectorAll(selector);
|
|
4252
4252
|
return nodes.length ? nodes[0] : null;
|
|
@@ -4254,8 +4254,8 @@ var require_DocumentFragment = __commonJS({
|
|
|
4254
4254
|
querySelectorAll: { value: function(selector) {
|
|
4255
4255
|
var context = Object.create(this);
|
|
4256
4256
|
context.isHTML = true;
|
|
4257
|
-
context.getElementsByTagName =
|
|
4258
|
-
context.nextElement = Object.getOwnPropertyDescriptor(
|
|
4257
|
+
context.getElementsByTagName = Element2.prototype.getElementsByTagName;
|
|
4258
|
+
context.nextElement = Object.getOwnPropertyDescriptor(Element2.prototype, "firstElementChild").get;
|
|
4259
4259
|
var nodes = select(selector, context);
|
|
4260
4260
|
return nodes.item ? nodes : new NodeList(nodes);
|
|
4261
4261
|
} },
|
|
@@ -4337,7 +4337,7 @@ var require_ProcessingInstruction = __commonJS({
|
|
|
4337
4337
|
// node_modules/.pnpm/@mixmark-io+domino@2.2.0/node_modules/@mixmark-io/domino/lib/NodeFilter.js
|
|
4338
4338
|
var require_NodeFilter = __commonJS({
|
|
4339
4339
|
"node_modules/.pnpm/@mixmark-io+domino@2.2.0/node_modules/@mixmark-io/domino/lib/NodeFilter.js"(exports$1, module) {
|
|
4340
|
-
var
|
|
4340
|
+
var NodeFilter2 = {
|
|
4341
4341
|
// Constants for acceptNode()
|
|
4342
4342
|
FILTER_ACCEPT: 1,
|
|
4343
4343
|
FILTER_REJECT: 2,
|
|
@@ -4362,7 +4362,7 @@ var require_NodeFilter = __commonJS({
|
|
|
4362
4362
|
SHOW_NOTATION: 2048
|
|
4363
4363
|
// historical
|
|
4364
4364
|
};
|
|
4365
|
-
module.exports =
|
|
4365
|
+
module.exports = NodeFilter2.constructor = NodeFilter2.prototype = NodeFilter2;
|
|
4366
4366
|
}
|
|
4367
4367
|
});
|
|
4368
4368
|
|
|
@@ -4437,7 +4437,7 @@ var require_TreeWalker = __commonJS({
|
|
|
4437
4437
|
"node_modules/.pnpm/@mixmark-io+domino@2.2.0/node_modules/@mixmark-io/domino/lib/TreeWalker.js"(exports$1, module) {
|
|
4438
4438
|
module.exports = TreeWalker;
|
|
4439
4439
|
var Node2 = require_Node();
|
|
4440
|
-
var
|
|
4440
|
+
var NodeFilter2 = require_NodeFilter();
|
|
4441
4441
|
var NodeTraversal = require_NodeTraversal();
|
|
4442
4442
|
var utils = require_utils();
|
|
4443
4443
|
var mapChild = {
|
|
@@ -4457,11 +4457,11 @@ var require_TreeWalker = __commonJS({
|
|
|
4457
4457
|
node = tw._currentNode[mapChild[type]];
|
|
4458
4458
|
while (node !== null) {
|
|
4459
4459
|
result = tw._internalFilter(node);
|
|
4460
|
-
if (result ===
|
|
4460
|
+
if (result === NodeFilter2.FILTER_ACCEPT) {
|
|
4461
4461
|
tw._currentNode = node;
|
|
4462
4462
|
return node;
|
|
4463
4463
|
}
|
|
4464
|
-
if (result ===
|
|
4464
|
+
if (result === NodeFilter2.FILTER_SKIP) {
|
|
4465
4465
|
child = node[mapChild[type]];
|
|
4466
4466
|
if (child !== null) {
|
|
4467
4467
|
node = child;
|
|
@@ -4495,12 +4495,12 @@ var require_TreeWalker = __commonJS({
|
|
|
4495
4495
|
while (sibling !== null) {
|
|
4496
4496
|
node = sibling;
|
|
4497
4497
|
result = tw._internalFilter(node);
|
|
4498
|
-
if (result ===
|
|
4498
|
+
if (result === NodeFilter2.FILTER_ACCEPT) {
|
|
4499
4499
|
tw._currentNode = node;
|
|
4500
4500
|
return node;
|
|
4501
4501
|
}
|
|
4502
4502
|
sibling = node[mapChild[type]];
|
|
4503
|
-
if (result ===
|
|
4503
|
+
if (result === NodeFilter2.FILTER_REJECT || sibling === null) {
|
|
4504
4504
|
sibling = node[mapSibling[type]];
|
|
4505
4505
|
}
|
|
4506
4506
|
}
|
|
@@ -4508,7 +4508,7 @@ var require_TreeWalker = __commonJS({
|
|
|
4508
4508
|
if (node === null || node === tw.root) {
|
|
4509
4509
|
return null;
|
|
4510
4510
|
}
|
|
4511
|
-
if (tw._internalFilter(node) ===
|
|
4511
|
+
if (tw._internalFilter(node) === NodeFilter2.FILTER_ACCEPT) {
|
|
4512
4512
|
return null;
|
|
4513
4513
|
}
|
|
4514
4514
|
}
|
|
@@ -4556,11 +4556,11 @@ var require_TreeWalker = __commonJS({
|
|
|
4556
4556
|
utils.InvalidStateError();
|
|
4557
4557
|
}
|
|
4558
4558
|
if (!(1 << node.nodeType - 1 & this._whatToShow)) {
|
|
4559
|
-
return
|
|
4559
|
+
return NodeFilter2.FILTER_SKIP;
|
|
4560
4560
|
}
|
|
4561
4561
|
filter = this._filter;
|
|
4562
4562
|
if (filter === null) {
|
|
4563
|
-
result =
|
|
4563
|
+
result = NodeFilter2.FILTER_ACCEPT;
|
|
4564
4564
|
} else {
|
|
4565
4565
|
this._active = true;
|
|
4566
4566
|
try {
|
|
@@ -4589,7 +4589,7 @@ var require_TreeWalker = __commonJS({
|
|
|
4589
4589
|
if (node === null) {
|
|
4590
4590
|
return null;
|
|
4591
4591
|
}
|
|
4592
|
-
if (this._internalFilter(node) ===
|
|
4592
|
+
if (this._internalFilter(node) === NodeFilter2.FILTER_ACCEPT) {
|
|
4593
4593
|
this._currentNode = node;
|
|
4594
4594
|
return node;
|
|
4595
4595
|
}
|
|
@@ -4642,17 +4642,17 @@ var require_TreeWalker = __commonJS({
|
|
|
4642
4642
|
for (previousSibling = node.previousSibling; previousSibling; previousSibling = node.previousSibling) {
|
|
4643
4643
|
node = previousSibling;
|
|
4644
4644
|
result = this._internalFilter(node);
|
|
4645
|
-
if (result ===
|
|
4645
|
+
if (result === NodeFilter2.FILTER_REJECT) {
|
|
4646
4646
|
continue;
|
|
4647
4647
|
}
|
|
4648
4648
|
for (lastChild = node.lastChild; lastChild; lastChild = node.lastChild) {
|
|
4649
4649
|
node = lastChild;
|
|
4650
4650
|
result = this._internalFilter(node);
|
|
4651
|
-
if (result ===
|
|
4651
|
+
if (result === NodeFilter2.FILTER_REJECT) {
|
|
4652
4652
|
break;
|
|
4653
4653
|
}
|
|
4654
4654
|
}
|
|
4655
|
-
if (result ===
|
|
4655
|
+
if (result === NodeFilter2.FILTER_ACCEPT) {
|
|
4656
4656
|
this._currentNode = node;
|
|
4657
4657
|
return node;
|
|
4658
4658
|
}
|
|
@@ -4661,7 +4661,7 @@ var require_TreeWalker = __commonJS({
|
|
|
4661
4661
|
return null;
|
|
4662
4662
|
}
|
|
4663
4663
|
node = node.parentNode;
|
|
4664
|
-
if (this._internalFilter(node) ===
|
|
4664
|
+
if (this._internalFilter(node) === NodeFilter2.FILTER_ACCEPT) {
|
|
4665
4665
|
this._currentNode = node;
|
|
4666
4666
|
return node;
|
|
4667
4667
|
}
|
|
@@ -4678,26 +4678,26 @@ var require_TreeWalker = __commonJS({
|
|
|
4678
4678
|
nextNode: { value: function nextNode() {
|
|
4679
4679
|
var node, result, firstChild, nextSibling;
|
|
4680
4680
|
node = this._currentNode;
|
|
4681
|
-
result =
|
|
4681
|
+
result = NodeFilter2.FILTER_ACCEPT;
|
|
4682
4682
|
CHILDREN:
|
|
4683
4683
|
while (true) {
|
|
4684
4684
|
for (firstChild = node.firstChild; firstChild; firstChild = node.firstChild) {
|
|
4685
4685
|
node = firstChild;
|
|
4686
4686
|
result = this._internalFilter(node);
|
|
4687
|
-
if (result ===
|
|
4687
|
+
if (result === NodeFilter2.FILTER_ACCEPT) {
|
|
4688
4688
|
this._currentNode = node;
|
|
4689
4689
|
return node;
|
|
4690
|
-
} else if (result ===
|
|
4690
|
+
} else if (result === NodeFilter2.FILTER_REJECT) {
|
|
4691
4691
|
break;
|
|
4692
4692
|
}
|
|
4693
4693
|
}
|
|
4694
4694
|
for (nextSibling = NodeTraversal.nextSkippingChildren(node, this.root); nextSibling; nextSibling = NodeTraversal.nextSkippingChildren(node, this.root)) {
|
|
4695
4695
|
node = nextSibling;
|
|
4696
4696
|
result = this._internalFilter(node);
|
|
4697
|
-
if (result ===
|
|
4697
|
+
if (result === NodeFilter2.FILTER_ACCEPT) {
|
|
4698
4698
|
this._currentNode = node;
|
|
4699
4699
|
return node;
|
|
4700
|
-
} else if (result ===
|
|
4700
|
+
} else if (result === NodeFilter2.FILTER_SKIP) {
|
|
4701
4701
|
continue CHILDREN;
|
|
4702
4702
|
}
|
|
4703
4703
|
}
|
|
@@ -4716,7 +4716,7 @@ var require_TreeWalker = __commonJS({
|
|
|
4716
4716
|
var require_NodeIterator = __commonJS({
|
|
4717
4717
|
"node_modules/.pnpm/@mixmark-io+domino@2.2.0/node_modules/@mixmark-io/domino/lib/NodeIterator.js"(exports$1, module) {
|
|
4718
4718
|
module.exports = NodeIterator;
|
|
4719
|
-
var
|
|
4719
|
+
var NodeFilter2 = require_NodeFilter();
|
|
4720
4720
|
var NodeTraversal = require_NodeTraversal();
|
|
4721
4721
|
var utils = require_utils();
|
|
4722
4722
|
function move(node, stayWithin, directionIsNext) {
|
|
@@ -4751,7 +4751,7 @@ var require_NodeIterator = __commonJS({
|
|
|
4751
4751
|
}
|
|
4752
4752
|
}
|
|
4753
4753
|
var result = ni._internalFilter(node);
|
|
4754
|
-
if (result ===
|
|
4754
|
+
if (result === NodeFilter2.FILTER_ACCEPT) {
|
|
4755
4755
|
break;
|
|
4756
4756
|
}
|
|
4757
4757
|
}
|
|
@@ -4799,11 +4799,11 @@ var require_NodeIterator = __commonJS({
|
|
|
4799
4799
|
utils.InvalidStateError();
|
|
4800
4800
|
}
|
|
4801
4801
|
if (!(1 << node.nodeType - 1 & this._whatToShow)) {
|
|
4802
|
-
return
|
|
4802
|
+
return NodeFilter2.FILTER_SKIP;
|
|
4803
4803
|
}
|
|
4804
4804
|
filter = this._filter;
|
|
4805
4805
|
if (filter === null) {
|
|
4806
|
-
result =
|
|
4806
|
+
result = NodeFilter2.FILTER_ACCEPT;
|
|
4807
4807
|
} else {
|
|
4808
4808
|
this._active = true;
|
|
4809
4809
|
try {
|
|
@@ -5013,32 +5013,32 @@ var require_URL = __commonJS({
|
|
|
5013
5013
|
else
|
|
5014
5014
|
return basepath.substring(0, lastslash + 1) + refpath;
|
|
5015
5015
|
}
|
|
5016
|
-
function remove_dot_segments(
|
|
5017
|
-
if (!
|
|
5016
|
+
function remove_dot_segments(path13) {
|
|
5017
|
+
if (!path13) return path13;
|
|
5018
5018
|
var output = "";
|
|
5019
|
-
while (
|
|
5020
|
-
if (
|
|
5021
|
-
|
|
5019
|
+
while (path13.length > 0) {
|
|
5020
|
+
if (path13 === "." || path13 === "..") {
|
|
5021
|
+
path13 = "";
|
|
5022
5022
|
break;
|
|
5023
5023
|
}
|
|
5024
|
-
var twochars =
|
|
5025
|
-
var threechars =
|
|
5026
|
-
var fourchars =
|
|
5024
|
+
var twochars = path13.substring(0, 2);
|
|
5025
|
+
var threechars = path13.substring(0, 3);
|
|
5026
|
+
var fourchars = path13.substring(0, 4);
|
|
5027
5027
|
if (threechars === "../") {
|
|
5028
|
-
|
|
5028
|
+
path13 = path13.substring(3);
|
|
5029
5029
|
} else if (twochars === "./") {
|
|
5030
|
-
|
|
5030
|
+
path13 = path13.substring(2);
|
|
5031
5031
|
} else if (threechars === "/./") {
|
|
5032
|
-
|
|
5033
|
-
} else if (twochars === "/." &&
|
|
5034
|
-
|
|
5035
|
-
} else if (fourchars === "/../" || threechars === "/.." &&
|
|
5036
|
-
|
|
5032
|
+
path13 = "/" + path13.substring(3);
|
|
5033
|
+
} else if (twochars === "/." && path13.length === 2) {
|
|
5034
|
+
path13 = "/";
|
|
5035
|
+
} else if (fourchars === "/../" || threechars === "/.." && path13.length === 3) {
|
|
5036
|
+
path13 = "/" + path13.substring(4);
|
|
5037
5037
|
output = output.replace(/\/?[^\/]*$/, "");
|
|
5038
5038
|
} else {
|
|
5039
|
-
var segment =
|
|
5039
|
+
var segment = path13.match(/(\/?([^\/]*))/)[0];
|
|
5040
5040
|
output += segment;
|
|
5041
|
-
|
|
5041
|
+
path13 = path13.substring(segment.length);
|
|
5042
5042
|
}
|
|
5043
5043
|
}
|
|
5044
5044
|
return output;
|
|
@@ -5603,9 +5603,9 @@ var require_defineElement = __commonJS({
|
|
|
5603
5603
|
});
|
|
5604
5604
|
return c;
|
|
5605
5605
|
};
|
|
5606
|
-
function EventHandlerBuilder(body,
|
|
5606
|
+
function EventHandlerBuilder(body, document2, form, element) {
|
|
5607
5607
|
this.body = body;
|
|
5608
|
-
this.document =
|
|
5608
|
+
this.document = document2;
|
|
5609
5609
|
this.form = form;
|
|
5610
5610
|
this.element = element;
|
|
5611
5611
|
}
|
|
@@ -5639,7 +5639,7 @@ var require_defineElement = __commonJS({
|
|
|
5639
5639
|
var require_htmlelts = __commonJS({
|
|
5640
5640
|
"node_modules/.pnpm/@mixmark-io+domino@2.2.0/node_modules/@mixmark-io/domino/lib/htmlelts.js"(exports$1) {
|
|
5641
5641
|
var Node2 = require_Node();
|
|
5642
|
-
var
|
|
5642
|
+
var Element2 = require_Element();
|
|
5643
5643
|
var CSSStyleDeclaration = require_CSSStyleDeclaration();
|
|
5644
5644
|
var utils = require_utils();
|
|
5645
5645
|
var URLUtils = require_URLUtils();
|
|
@@ -5707,10 +5707,10 @@ var require_htmlelts = __commonJS({
|
|
|
5707
5707
|
this._form = null;
|
|
5708
5708
|
};
|
|
5709
5709
|
var HTMLElement = exports$1.HTMLElement = define({
|
|
5710
|
-
superclass:
|
|
5710
|
+
superclass: Element2,
|
|
5711
5711
|
name: "HTMLElement",
|
|
5712
5712
|
ctor: function HTMLElement2(doc, localName, prefix) {
|
|
5713
|
-
|
|
5713
|
+
Element2.call(this, doc, localName, utils.NAMESPACE.HTML, prefix);
|
|
5714
5714
|
},
|
|
5715
5715
|
props: {
|
|
5716
5716
|
dangerouslySetInnerHTML: {
|
|
@@ -7192,7 +7192,7 @@ var require_htmlelts = __commonJS({
|
|
|
7192
7192
|
// node_modules/.pnpm/@mixmark-io+domino@2.2.0/node_modules/@mixmark-io/domino/lib/svg.js
|
|
7193
7193
|
var require_svg = __commonJS({
|
|
7194
7194
|
"node_modules/.pnpm/@mixmark-io+domino@2.2.0/node_modules/@mixmark-io/domino/lib/svg.js"(exports$1) {
|
|
7195
|
-
var
|
|
7195
|
+
var Element2 = require_Element();
|
|
7196
7196
|
var defineElement = require_defineElement();
|
|
7197
7197
|
var utils = require_utils();
|
|
7198
7198
|
var CSSStyleDeclaration = require_CSSStyleDeclaration();
|
|
@@ -7206,10 +7206,10 @@ var require_svg = __commonJS({
|
|
|
7206
7206
|
return defineElement(spec, SVGElement, svgElements, svgNameToImpl);
|
|
7207
7207
|
}
|
|
7208
7208
|
var SVGElement = define({
|
|
7209
|
-
superclass:
|
|
7209
|
+
superclass: Element2,
|
|
7210
7210
|
name: "SVGElement",
|
|
7211
7211
|
ctor: function SVGElement2(doc, localName, prefix) {
|
|
7212
|
-
|
|
7212
|
+
Element2.call(this, doc, localName, utils.NAMESPACE.SVG, prefix);
|
|
7213
7213
|
},
|
|
7214
7214
|
props: {
|
|
7215
7215
|
style: { get: function() {
|
|
@@ -7344,7 +7344,7 @@ var require_Document = __commonJS({
|
|
|
7344
7344
|
var Node2 = require_Node();
|
|
7345
7345
|
var NodeList = require_NodeList();
|
|
7346
7346
|
var ContainerNode = require_ContainerNode();
|
|
7347
|
-
var
|
|
7347
|
+
var Element2 = require_Element();
|
|
7348
7348
|
var Text = require_Text();
|
|
7349
7349
|
var Comment = require_Comment();
|
|
7350
7350
|
var Event = require_Event();
|
|
@@ -7353,7 +7353,7 @@ var require_Document = __commonJS({
|
|
|
7353
7353
|
var DOMImplementation = require_DOMImplementation();
|
|
7354
7354
|
var TreeWalker = require_TreeWalker();
|
|
7355
7355
|
var NodeIterator = require_NodeIterator();
|
|
7356
|
-
var
|
|
7356
|
+
var NodeFilter2 = require_NodeFilter();
|
|
7357
7357
|
var URL2 = require_URL();
|
|
7358
7358
|
var select = require_select();
|
|
7359
7359
|
var events = require_events();
|
|
@@ -7492,13 +7492,13 @@ var require_Document = __commonJS({
|
|
|
7492
7492
|
if (this.isHTML) {
|
|
7493
7493
|
localName = utils.toASCIILowerCase(localName);
|
|
7494
7494
|
}
|
|
7495
|
-
return new
|
|
7495
|
+
return new Element2._Attr(null, localName, null, null, "");
|
|
7496
7496
|
} },
|
|
7497
7497
|
createAttributeNS: { value: function(namespace, qualifiedName) {
|
|
7498
7498
|
namespace = namespace === null || namespace === void 0 || namespace === "" ? null : String(namespace);
|
|
7499
7499
|
qualifiedName = String(qualifiedName);
|
|
7500
7500
|
var ve = validateAndExtract(namespace, qualifiedName);
|
|
7501
|
-
return new
|
|
7501
|
+
return new Element2._Attr(null, ve.localName, ve.prefix, ve.namespace, "");
|
|
7502
7502
|
} },
|
|
7503
7503
|
createElement: { value: function(localName) {
|
|
7504
7504
|
localName = String(localName);
|
|
@@ -7510,7 +7510,7 @@ var require_Document = __commonJS({
|
|
|
7510
7510
|
} else if (this.contentType === "application/xhtml+xml") {
|
|
7511
7511
|
return html.createElement(this, localName, null);
|
|
7512
7512
|
} else {
|
|
7513
|
-
return new
|
|
7513
|
+
return new Element2(this, localName, null, null);
|
|
7514
7514
|
}
|
|
7515
7515
|
}, writable: isApiWritable },
|
|
7516
7516
|
createElementNS: { value: function(namespace, qualifiedName) {
|
|
@@ -7527,7 +7527,7 @@ var require_Document = __commonJS({
|
|
|
7527
7527
|
} else if (namespace === NAMESPACE.SVG) {
|
|
7528
7528
|
return svg.createElement(this, localName, prefix);
|
|
7529
7529
|
}
|
|
7530
|
-
return new
|
|
7530
|
+
return new Element2(this, localName, namespace, prefix);
|
|
7531
7531
|
} },
|
|
7532
7532
|
createEvent: { value: function createEvent(interfaceName) {
|
|
7533
7533
|
interfaceName = interfaceName.toLowerCase();
|
|
@@ -7549,7 +7549,7 @@ var require_Document = __commonJS({
|
|
|
7549
7549
|
if (!(root3 instanceof Node2)) {
|
|
7550
7550
|
throw new TypeError("root not a node");
|
|
7551
7551
|
}
|
|
7552
|
-
whatToShow = whatToShow === void 0 ?
|
|
7552
|
+
whatToShow = whatToShow === void 0 ? NodeFilter2.SHOW_ALL : +whatToShow;
|
|
7553
7553
|
filter = filter === void 0 ? null : filter;
|
|
7554
7554
|
return new TreeWalker(root3, whatToShow, filter);
|
|
7555
7555
|
} },
|
|
@@ -7561,7 +7561,7 @@ var require_Document = __commonJS({
|
|
|
7561
7561
|
if (!(root3 instanceof Node2)) {
|
|
7562
7562
|
throw new TypeError("root not a node");
|
|
7563
7563
|
}
|
|
7564
|
-
whatToShow = whatToShow === void 0 ?
|
|
7564
|
+
whatToShow = whatToShow === void 0 ? NodeFilter2.SHOW_ALL : +whatToShow;
|
|
7565
7565
|
filter = filter === void 0 ? null : filter;
|
|
7566
7566
|
return new NodeIterator(root3, whatToShow, filter);
|
|
7567
7567
|
} },
|
|
@@ -7622,10 +7622,10 @@ var require_Document = __commonJS({
|
|
|
7622
7622
|
return this.byId[id] instanceof MultiId;
|
|
7623
7623
|
} },
|
|
7624
7624
|
// Just copy this method from the Element prototype
|
|
7625
|
-
getElementsByName: { value:
|
|
7626
|
-
getElementsByTagName: { value:
|
|
7627
|
-
getElementsByTagNameNS: { value:
|
|
7628
|
-
getElementsByClassName: { value:
|
|
7625
|
+
getElementsByName: { value: Element2.prototype.getElementsByName },
|
|
7626
|
+
getElementsByTagName: { value: Element2.prototype.getElementsByTagName },
|
|
7627
|
+
getElementsByTagNameNS: { value: Element2.prototype.getElementsByTagNameNS },
|
|
7628
|
+
getElementsByClassName: { value: Element2.prototype.getElementsByClassName },
|
|
7629
7629
|
adoptNode: { value: function adoptNode(node) {
|
|
7630
7630
|
if (node.nodeType === Node2.DOCUMENT_NODE) utils.NotSupportedError();
|
|
7631
7631
|
if (node.nodeType === Node2.ATTRIBUTE_NODE) {
|
|
@@ -16451,8 +16451,8 @@ var require_Window = __commonJS({
|
|
|
16451
16451
|
var Location = require_Location();
|
|
16452
16452
|
var utils = require_utils();
|
|
16453
16453
|
module.exports = Window;
|
|
16454
|
-
function Window(
|
|
16455
|
-
this.document =
|
|
16454
|
+
function Window(document2) {
|
|
16455
|
+
this.document = document2 || new DOMImplementation(null).createHTMLDocument("");
|
|
16456
16456
|
this.document._scripting_enabled = true;
|
|
16457
16457
|
this.document.defaultView = this;
|
|
16458
16458
|
this.location = new Location(this, this.document._address || "about:blank");
|
|
@@ -16582,11 +16582,11 @@ var require_lib = __commonJS({
|
|
|
16582
16582
|
};
|
|
16583
16583
|
};
|
|
16584
16584
|
exports$1.createWindow = function(html, address) {
|
|
16585
|
-
var
|
|
16585
|
+
var document2 = exports$1.createDocument(html);
|
|
16586
16586
|
if (address !== void 0) {
|
|
16587
|
-
|
|
16587
|
+
document2._address = address;
|
|
16588
16588
|
}
|
|
16589
|
-
return new impl.Window(
|
|
16589
|
+
return new impl.Window(document2);
|
|
16590
16590
|
};
|
|
16591
16591
|
exports$1.impl = impl;
|
|
16592
16592
|
}
|
|
@@ -16602,6 +16602,8 @@ var searchSocketConfigSchema = z.object({
|
|
|
16602
16602
|
envVar: z.string().min(1).optional(),
|
|
16603
16603
|
sanitize: z.boolean().optional()
|
|
16604
16604
|
}).optional(),
|
|
16605
|
+
exclude: z.array(z.string()).optional(),
|
|
16606
|
+
respectRobotsTxt: z.boolean().optional(),
|
|
16605
16607
|
source: z.object({
|
|
16606
16608
|
mode: z.enum(["static-output", "crawl", "content-files", "build"]).optional(),
|
|
16607
16609
|
staticOutputDir: z.string().min(1).optional(),
|
|
@@ -16649,29 +16651,18 @@ var searchSocketConfigSchema = z.object({
|
|
|
16649
16651
|
prependTitle: z.boolean().optional(),
|
|
16650
16652
|
pageSummaryChunk: z.boolean().optional()
|
|
16651
16653
|
}).optional(),
|
|
16652
|
-
|
|
16653
|
-
|
|
16654
|
-
|
|
16655
|
-
|
|
16656
|
-
|
|
16657
|
-
batchSize: z.number().int().positive().optional(),
|
|
16658
|
-
concurrency: z.number().int().positive().optional(),
|
|
16659
|
-
pricePer1kTokens: z.number().positive().optional()
|
|
16654
|
+
upstash: z.object({
|
|
16655
|
+
url: z.string().url().optional(),
|
|
16656
|
+
token: z.string().min(1).optional(),
|
|
16657
|
+
urlEnv: z.string().min(1).optional(),
|
|
16658
|
+
tokenEnv: z.string().min(1).optional()
|
|
16660
16659
|
}).optional(),
|
|
16661
|
-
|
|
16662
|
-
|
|
16663
|
-
|
|
16664
|
-
|
|
16665
|
-
|
|
16666
|
-
|
|
16667
|
-
authTokenEnv: z.string().optional(),
|
|
16668
|
-
localPath: z.string().optional()
|
|
16669
|
-
}).optional()
|
|
16670
|
-
}).optional(),
|
|
16671
|
-
rerank: z.object({
|
|
16672
|
-
enabled: z.boolean().optional(),
|
|
16673
|
-
topN: z.number().int().positive().optional(),
|
|
16674
|
-
model: z.string().optional()
|
|
16660
|
+
search: z.object({
|
|
16661
|
+
semanticWeight: z.number().min(0).max(1).optional(),
|
|
16662
|
+
inputEnrichment: z.boolean().optional(),
|
|
16663
|
+
reranking: z.boolean().optional(),
|
|
16664
|
+
dualSearch: z.boolean().optional(),
|
|
16665
|
+
pageSearchWeight: z.number().min(0).max(1).optional()
|
|
16675
16666
|
}).optional(),
|
|
16676
16667
|
ranking: z.object({
|
|
16677
16668
|
enableIncomingLinkBoost: z.boolean().optional(),
|
|
@@ -16681,11 +16672,12 @@ var searchSocketConfigSchema = z.object({
|
|
|
16681
16672
|
aggregationDecay: z.number().min(0).max(1).optional(),
|
|
16682
16673
|
minChunkScoreRatio: z.number().min(0).max(1).optional(),
|
|
16683
16674
|
minScore: z.number().min(0).max(1).optional(),
|
|
16675
|
+
scoreGapThreshold: z.number().min(0).max(1).optional(),
|
|
16684
16676
|
weights: z.object({
|
|
16685
16677
|
incomingLinks: z.number().optional(),
|
|
16686
16678
|
depth: z.number().optional(),
|
|
16687
|
-
|
|
16688
|
-
|
|
16679
|
+
aggregation: z.number().optional(),
|
|
16680
|
+
titleMatch: z.number().optional()
|
|
16689
16681
|
}).optional()
|
|
16690
16682
|
}).optional(),
|
|
16691
16683
|
api: z.object({
|
|
@@ -16707,8 +16699,7 @@ var searchSocketConfigSchema = z.object({
|
|
|
16707
16699
|
}).optional()
|
|
16708
16700
|
}).optional(),
|
|
16709
16701
|
state: z.object({
|
|
16710
|
-
dir: z.string().optional()
|
|
16711
|
-
writeMirror: z.boolean().optional()
|
|
16702
|
+
dir: z.string().optional()
|
|
16712
16703
|
}).optional()
|
|
16713
16704
|
});
|
|
16714
16705
|
|
|
@@ -16732,6 +16723,8 @@ function createDefaultConfig(projectId) {
|
|
|
16732
16723
|
envVar: "SEARCHSOCKET_SCOPE",
|
|
16733
16724
|
sanitize: true
|
|
16734
16725
|
},
|
|
16726
|
+
exclude: [],
|
|
16727
|
+
respectRobotsTxt: true,
|
|
16735
16728
|
source: {
|
|
16736
16729
|
mode: "static-output",
|
|
16737
16730
|
staticOutputDir: "build",
|
|
@@ -16760,24 +16753,16 @@ function createDefaultConfig(projectId) {
|
|
|
16760
16753
|
prependTitle: true,
|
|
16761
16754
|
pageSummaryChunk: true
|
|
16762
16755
|
},
|
|
16763
|
-
|
|
16764
|
-
|
|
16765
|
-
|
|
16766
|
-
apiKeyEnv: "JINA_API_KEY",
|
|
16767
|
-
batchSize: 64,
|
|
16768
|
-
concurrency: 4
|
|
16769
|
-
},
|
|
16770
|
-
vector: {
|
|
16771
|
-
turso: {
|
|
16772
|
-
urlEnv: "TURSO_DATABASE_URL",
|
|
16773
|
-
authTokenEnv: "TURSO_AUTH_TOKEN",
|
|
16774
|
-
localPath: ".searchsocket/vectors.db"
|
|
16775
|
-
}
|
|
16756
|
+
upstash: {
|
|
16757
|
+
urlEnv: "UPSTASH_SEARCH_REST_URL",
|
|
16758
|
+
tokenEnv: "UPSTASH_SEARCH_REST_TOKEN"
|
|
16776
16759
|
},
|
|
16777
|
-
|
|
16778
|
-
|
|
16779
|
-
|
|
16780
|
-
|
|
16760
|
+
search: {
|
|
16761
|
+
semanticWeight: 0.75,
|
|
16762
|
+
inputEnrichment: true,
|
|
16763
|
+
reranking: true,
|
|
16764
|
+
dualSearch: true,
|
|
16765
|
+
pageSearchWeight: 0.3
|
|
16781
16766
|
},
|
|
16782
16767
|
ranking: {
|
|
16783
16768
|
enableIncomingLinkBoost: true,
|
|
@@ -16786,12 +16771,13 @@ function createDefaultConfig(projectId) {
|
|
|
16786
16771
|
aggregationCap: 5,
|
|
16787
16772
|
aggregationDecay: 0.5,
|
|
16788
16773
|
minChunkScoreRatio: 0.5,
|
|
16789
|
-
minScore: 0,
|
|
16774
|
+
minScore: 0.3,
|
|
16775
|
+
scoreGapThreshold: 0.4,
|
|
16790
16776
|
weights: {
|
|
16791
16777
|
incomingLinks: 0.05,
|
|
16792
16778
|
depth: 0.03,
|
|
16793
|
-
|
|
16794
|
-
|
|
16779
|
+
aggregation: 0.1,
|
|
16780
|
+
titleMatch: 0.15
|
|
16795
16781
|
}
|
|
16796
16782
|
},
|
|
16797
16783
|
api: {
|
|
@@ -16809,8 +16795,7 @@ function createDefaultConfig(projectId) {
|
|
|
16809
16795
|
}
|
|
16810
16796
|
},
|
|
16811
16797
|
state: {
|
|
16812
|
-
dir: ".searchsocket"
|
|
16813
|
-
writeMirror: false
|
|
16798
|
+
dir: ".searchsocket"
|
|
16814
16799
|
}
|
|
16815
16800
|
};
|
|
16816
16801
|
}
|
|
@@ -16896,6 +16881,8 @@ ${issues}`
|
|
|
16896
16881
|
...defaults.scope,
|
|
16897
16882
|
...parsed.scope
|
|
16898
16883
|
},
|
|
16884
|
+
exclude: parsed.exclude ?? defaults.exclude,
|
|
16885
|
+
respectRobotsTxt: parsed.respectRobotsTxt ?? defaults.respectRobotsTxt,
|
|
16899
16886
|
source: {
|
|
16900
16887
|
...defaults.source,
|
|
16901
16888
|
...parsed.source,
|
|
@@ -16932,21 +16919,13 @@ ${issues}`
|
|
|
16932
16919
|
...defaults.chunking,
|
|
16933
16920
|
...parsed.chunking
|
|
16934
16921
|
},
|
|
16935
|
-
|
|
16936
|
-
...defaults.
|
|
16937
|
-
...parsed.
|
|
16922
|
+
upstash: {
|
|
16923
|
+
...defaults.upstash,
|
|
16924
|
+
...parsed.upstash
|
|
16938
16925
|
},
|
|
16939
|
-
|
|
16940
|
-
...defaults.
|
|
16941
|
-
...parsed.
|
|
16942
|
-
turso: {
|
|
16943
|
-
...defaults.vector.turso,
|
|
16944
|
-
...parsed.vector?.turso
|
|
16945
|
-
}
|
|
16946
|
-
},
|
|
16947
|
-
rerank: {
|
|
16948
|
-
...defaults.rerank,
|
|
16949
|
-
...parsed.rerank
|
|
16926
|
+
search: {
|
|
16927
|
+
...defaults.search,
|
|
16928
|
+
...parsed.search
|
|
16950
16929
|
},
|
|
16951
16930
|
ranking: {
|
|
16952
16931
|
...defaults.ranking,
|
|
@@ -17125,660 +17104,245 @@ function resolveScope(config, override) {
|
|
|
17125
17104
|
scopeId: `${config.project.id}:${scopeName}`
|
|
17126
17105
|
};
|
|
17127
17106
|
}
|
|
17128
|
-
function sleep(ms) {
|
|
17129
|
-
return new Promise((resolve) => {
|
|
17130
|
-
setTimeout(resolve, ms);
|
|
17131
|
-
});
|
|
17132
|
-
}
|
|
17133
|
-
var JinaEmbeddingsProvider = class {
|
|
17134
|
-
apiKey;
|
|
17135
|
-
batchSize;
|
|
17136
|
-
concurrency;
|
|
17137
|
-
defaultTask;
|
|
17138
|
-
constructor(options) {
|
|
17139
|
-
if (!Number.isInteger(options.batchSize) || options.batchSize <= 0) {
|
|
17140
|
-
throw new Error(`Invalid batchSize: ${options.batchSize}. batchSize must be a positive integer.`);
|
|
17141
|
-
}
|
|
17142
|
-
if (!Number.isInteger(options.concurrency) || options.concurrency <= 0) {
|
|
17143
|
-
throw new Error(`Invalid concurrency: ${options.concurrency}. concurrency must be a positive integer.`);
|
|
17144
|
-
}
|
|
17145
|
-
this.apiKey = options.apiKey;
|
|
17146
|
-
this.batchSize = options.batchSize;
|
|
17147
|
-
this.concurrency = options.concurrency;
|
|
17148
|
-
this.defaultTask = options.task ?? "retrieval.passage";
|
|
17149
|
-
}
|
|
17150
|
-
estimateTokens(text) {
|
|
17151
|
-
const normalized = text.trim();
|
|
17152
|
-
if (!normalized) {
|
|
17153
|
-
return 0;
|
|
17154
|
-
}
|
|
17155
|
-
const wordCount = normalized.match(/[A-Za-z0-9_]+/g)?.length ?? 0;
|
|
17156
|
-
const punctuationCount = normalized.match(/[^\s\w]/g)?.length ?? 0;
|
|
17157
|
-
const cjkCount = normalized.match(/[\u3400-\u9fff]/g)?.length ?? 0;
|
|
17158
|
-
const charEstimate = Math.ceil(normalized.length / 4);
|
|
17159
|
-
const lexicalEstimate = Math.ceil(wordCount * 1.25 + punctuationCount * 0.45 + cjkCount * 1.6);
|
|
17160
|
-
return Math.max(1, Math.max(charEstimate, lexicalEstimate));
|
|
17161
|
-
}
|
|
17162
|
-
async embedTexts(texts, modelId, task) {
|
|
17163
|
-
if (texts.length === 0) {
|
|
17164
|
-
return [];
|
|
17165
|
-
}
|
|
17166
|
-
const batches = [];
|
|
17167
|
-
for (let i = 0; i < texts.length; i += this.batchSize) {
|
|
17168
|
-
batches.push({
|
|
17169
|
-
index: i,
|
|
17170
|
-
values: texts.slice(i, i + this.batchSize)
|
|
17171
|
-
});
|
|
17172
|
-
}
|
|
17173
|
-
const outputs = new Array(batches.length);
|
|
17174
|
-
const limit = pLimit2(this.concurrency);
|
|
17175
|
-
await Promise.all(
|
|
17176
|
-
batches.map(
|
|
17177
|
-
(batch, position) => limit(async () => {
|
|
17178
|
-
outputs[position] = await this.embedWithRetry(batch.values, modelId, task ?? this.defaultTask);
|
|
17179
|
-
})
|
|
17180
|
-
)
|
|
17181
|
-
);
|
|
17182
|
-
return outputs.flat();
|
|
17183
|
-
}
|
|
17184
|
-
async embedWithRetry(texts, modelId, task) {
|
|
17185
|
-
const maxAttempts = 5;
|
|
17186
|
-
let attempt = 0;
|
|
17187
|
-
while (attempt < maxAttempts) {
|
|
17188
|
-
attempt += 1;
|
|
17189
|
-
let response;
|
|
17190
|
-
try {
|
|
17191
|
-
response = await fetch("https://api.jina.ai/v1/embeddings", {
|
|
17192
|
-
method: "POST",
|
|
17193
|
-
headers: {
|
|
17194
|
-
"content-type": "application/json",
|
|
17195
|
-
authorization: `Bearer ${this.apiKey}`
|
|
17196
|
-
},
|
|
17197
|
-
body: JSON.stringify({
|
|
17198
|
-
model: modelId,
|
|
17199
|
-
input: texts,
|
|
17200
|
-
task
|
|
17201
|
-
})
|
|
17202
|
-
});
|
|
17203
|
-
} catch (error) {
|
|
17204
|
-
if (attempt >= maxAttempts) {
|
|
17205
|
-
throw error;
|
|
17206
|
-
}
|
|
17207
|
-
await sleep(Math.min(2 ** attempt * 300, 5e3));
|
|
17208
|
-
continue;
|
|
17209
|
-
}
|
|
17210
|
-
if (!response.ok) {
|
|
17211
|
-
const retryable = response.status === 429 || response.status >= 500;
|
|
17212
|
-
if (!retryable || attempt >= maxAttempts) {
|
|
17213
|
-
const errorBody = await response.text();
|
|
17214
|
-
throw new Error(`Jina embeddings failed (${response.status}): ${errorBody}`);
|
|
17215
|
-
}
|
|
17216
|
-
await sleep(Math.min(2 ** attempt * 300, 5e3));
|
|
17217
|
-
continue;
|
|
17218
|
-
}
|
|
17219
|
-
const payload = await response.json();
|
|
17220
|
-
if (!payload.data || !Array.isArray(payload.data)) {
|
|
17221
|
-
throw new Error("Invalid Jina embeddings response format");
|
|
17222
|
-
}
|
|
17223
|
-
return payload.data.map((entry) => entry.embedding);
|
|
17224
|
-
}
|
|
17225
|
-
throw new Error("Unreachable retry state");
|
|
17226
|
-
}
|
|
17227
|
-
};
|
|
17228
|
-
|
|
17229
|
-
// src/embeddings/factory.ts
|
|
17230
|
-
function createEmbeddingsProvider(config) {
|
|
17231
|
-
if (config.embeddings.provider !== "jina") {
|
|
17232
|
-
throw new SearchSocketError(
|
|
17233
|
-
"CONFIG_MISSING",
|
|
17234
|
-
`Unsupported embeddings provider ${config.embeddings.provider}`
|
|
17235
|
-
);
|
|
17236
|
-
}
|
|
17237
|
-
const apiKey = config.embeddings.apiKey ?? process.env[config.embeddings.apiKeyEnv];
|
|
17238
|
-
if (!apiKey) {
|
|
17239
|
-
throw new SearchSocketError(
|
|
17240
|
-
"CONFIG_MISSING",
|
|
17241
|
-
`Missing embeddings API key: provide embeddings.apiKey or set env var ${config.embeddings.apiKeyEnv}`
|
|
17242
|
-
);
|
|
17243
|
-
}
|
|
17244
|
-
return new JinaEmbeddingsProvider({
|
|
17245
|
-
apiKey,
|
|
17246
|
-
batchSize: config.embeddings.batchSize,
|
|
17247
|
-
concurrency: config.embeddings.concurrency
|
|
17248
|
-
});
|
|
17249
|
-
}
|
|
17250
|
-
|
|
17251
|
-
// src/rerank/jina.ts
|
|
17252
|
-
function sleep2(ms) {
|
|
17253
|
-
return new Promise((resolve) => {
|
|
17254
|
-
setTimeout(resolve, ms);
|
|
17255
|
-
});
|
|
17256
|
-
}
|
|
17257
|
-
var JinaReranker = class {
|
|
17258
|
-
apiKey;
|
|
17259
|
-
model;
|
|
17260
|
-
maxRetries;
|
|
17261
|
-
constructor(options) {
|
|
17262
|
-
this.apiKey = options.apiKey;
|
|
17263
|
-
this.model = options.model;
|
|
17264
|
-
this.maxRetries = options.maxRetries ?? 2;
|
|
17265
|
-
}
|
|
17266
|
-
async rerank(query, candidates, topN) {
|
|
17267
|
-
if (candidates.length === 0) {
|
|
17268
|
-
return [];
|
|
17269
|
-
}
|
|
17270
|
-
const body = {
|
|
17271
|
-
model: this.model,
|
|
17272
|
-
query,
|
|
17273
|
-
documents: candidates.map((candidate) => candidate.text),
|
|
17274
|
-
top_n: topN ?? candidates.length,
|
|
17275
|
-
return_documents: false
|
|
17276
|
-
};
|
|
17277
|
-
let attempt = 0;
|
|
17278
|
-
while (attempt <= this.maxRetries) {
|
|
17279
|
-
attempt += 1;
|
|
17280
|
-
let response;
|
|
17281
|
-
try {
|
|
17282
|
-
response = await fetch("https://api.jina.ai/v1/rerank", {
|
|
17283
|
-
method: "POST",
|
|
17284
|
-
headers: {
|
|
17285
|
-
"content-type": "application/json",
|
|
17286
|
-
authorization: `Bearer ${this.apiKey}`
|
|
17287
|
-
},
|
|
17288
|
-
body: JSON.stringify(body)
|
|
17289
|
-
});
|
|
17290
|
-
} catch (error) {
|
|
17291
|
-
if (attempt <= this.maxRetries) {
|
|
17292
|
-
await sleep2(Math.min(300 * 2 ** attempt, 4e3));
|
|
17293
|
-
continue;
|
|
17294
|
-
}
|
|
17295
|
-
throw error;
|
|
17296
|
-
}
|
|
17297
|
-
if (!response.ok) {
|
|
17298
|
-
const retryable = response.status === 429 || response.status >= 500;
|
|
17299
|
-
if (retryable && attempt <= this.maxRetries) {
|
|
17300
|
-
await sleep2(Math.min(300 * 2 ** attempt, 4e3));
|
|
17301
|
-
continue;
|
|
17302
|
-
}
|
|
17303
|
-
const errorBody = await response.text();
|
|
17304
|
-
throw new Error(`Jina rerank failed (${response.status}): ${errorBody}`);
|
|
17305
|
-
}
|
|
17306
|
-
const payload = await response.json();
|
|
17307
|
-
const rawResults = payload.results ?? payload.data ?? [];
|
|
17308
|
-
if (!Array.isArray(rawResults)) {
|
|
17309
|
-
throw new Error("Invalid Jina rerank response format");
|
|
17310
|
-
}
|
|
17311
|
-
return rawResults.flatMap((item) => {
|
|
17312
|
-
const index = item.index;
|
|
17313
|
-
if (typeof index !== "number" || index < 0 || index >= candidates.length) {
|
|
17314
|
-
return [];
|
|
17315
|
-
}
|
|
17316
|
-
const candidate = candidates[index];
|
|
17317
|
-
if (!candidate) {
|
|
17318
|
-
return [];
|
|
17319
|
-
}
|
|
17320
|
-
const score = typeof item.relevance_score === "number" ? item.relevance_score : item.score ?? 0;
|
|
17321
|
-
return [
|
|
17322
|
-
{
|
|
17323
|
-
id: candidate.id,
|
|
17324
|
-
score
|
|
17325
|
-
}
|
|
17326
|
-
];
|
|
17327
|
-
}).sort((a, b) => b.score - a.score);
|
|
17328
|
-
}
|
|
17329
|
-
throw new Error("Jina rerank request failed after retries");
|
|
17330
|
-
}
|
|
17331
|
-
};
|
|
17332
|
-
|
|
17333
|
-
// src/rerank/factory.ts
|
|
17334
|
-
function createReranker(config) {
|
|
17335
|
-
if (!config.rerank.enabled) {
|
|
17336
|
-
return null;
|
|
17337
|
-
}
|
|
17338
|
-
const apiKey = config.embeddings.apiKey ?? process.env[config.embeddings.apiKeyEnv];
|
|
17339
|
-
if (!apiKey) {
|
|
17340
|
-
return null;
|
|
17341
|
-
}
|
|
17342
|
-
return new JinaReranker({
|
|
17343
|
-
apiKey,
|
|
17344
|
-
model: config.rerank.model
|
|
17345
|
-
});
|
|
17346
|
-
}
|
|
17347
17107
|
function ensureStateDirs(cwd, stateDir, scope) {
|
|
17348
17108
|
const statePath = path.resolve(cwd, stateDir);
|
|
17349
|
-
|
|
17350
|
-
|
|
17351
|
-
return { statePath, pagesPath };
|
|
17109
|
+
fs.mkdirSync(statePath, { recursive: true });
|
|
17110
|
+
return { statePath };
|
|
17352
17111
|
}
|
|
17353
17112
|
|
|
17354
|
-
// src/vector/
|
|
17355
|
-
|
|
17113
|
+
// src/vector/upstash.ts
|
|
17114
|
+
function chunkIndexName(scope) {
|
|
17115
|
+
return `${scope.projectId}--${scope.scopeName}`;
|
|
17116
|
+
}
|
|
17117
|
+
function pageIndexName(scope) {
|
|
17118
|
+
return `${scope.projectId}--${scope.scopeName}--pages`;
|
|
17119
|
+
}
|
|
17120
|
+
var UpstashSearchStore = class {
|
|
17356
17121
|
client;
|
|
17357
|
-
dimension;
|
|
17358
|
-
chunksReady = false;
|
|
17359
|
-
registryReady = false;
|
|
17360
|
-
pagesReady = false;
|
|
17361
17122
|
constructor(opts) {
|
|
17362
17123
|
this.client = opts.client;
|
|
17363
|
-
this.dimension = opts.dimension;
|
|
17364
|
-
}
|
|
17365
|
-
async ensureRegistry() {
|
|
17366
|
-
if (this.registryReady) return;
|
|
17367
|
-
await this.client.execute(`
|
|
17368
|
-
CREATE TABLE IF NOT EXISTS registry (
|
|
17369
|
-
scope_key TEXT PRIMARY KEY,
|
|
17370
|
-
project_id TEXT NOT NULL,
|
|
17371
|
-
scope_name TEXT NOT NULL,
|
|
17372
|
-
model_id TEXT NOT NULL,
|
|
17373
|
-
last_indexed_at TEXT NOT NULL,
|
|
17374
|
-
vector_count INTEGER,
|
|
17375
|
-
last_estimate_tokens INTEGER,
|
|
17376
|
-
last_estimate_cost_usd REAL,
|
|
17377
|
-
last_estimate_changed_chunks INTEGER
|
|
17378
|
-
)
|
|
17379
|
-
`);
|
|
17380
|
-
const estimateCols = [
|
|
17381
|
-
{ name: "last_estimate_tokens", def: "INTEGER" },
|
|
17382
|
-
{ name: "last_estimate_cost_usd", def: "REAL" },
|
|
17383
|
-
{ name: "last_estimate_changed_chunks", def: "INTEGER" }
|
|
17384
|
-
];
|
|
17385
|
-
for (const col of estimateCols) {
|
|
17386
|
-
try {
|
|
17387
|
-
await this.client.execute(`ALTER TABLE registry ADD COLUMN ${col.name} ${col.def}`);
|
|
17388
|
-
} catch (error) {
|
|
17389
|
-
if (error instanceof Error && !error.message.includes("duplicate column")) {
|
|
17390
|
-
throw error;
|
|
17391
|
-
}
|
|
17392
|
-
}
|
|
17393
|
-
}
|
|
17394
|
-
this.registryReady = true;
|
|
17395
|
-
}
|
|
17396
|
-
async ensureChunks(dim) {
|
|
17397
|
-
if (this.chunksReady) return;
|
|
17398
|
-
const exists = await this.chunksTableExists();
|
|
17399
|
-
if (exists) {
|
|
17400
|
-
const currentDim = await this.getChunksDimension();
|
|
17401
|
-
if (currentDim !== null && currentDim !== dim) {
|
|
17402
|
-
await this.client.batch([
|
|
17403
|
-
"DROP INDEX IF EXISTS idx",
|
|
17404
|
-
"DROP TABLE IF EXISTS chunks"
|
|
17405
|
-
]);
|
|
17406
|
-
}
|
|
17407
|
-
}
|
|
17408
|
-
await this.client.batch([
|
|
17409
|
-
`CREATE TABLE IF NOT EXISTS chunks (
|
|
17410
|
-
id TEXT PRIMARY KEY,
|
|
17411
|
-
project_id TEXT NOT NULL,
|
|
17412
|
-
scope_name TEXT NOT NULL,
|
|
17413
|
-
url TEXT NOT NULL,
|
|
17414
|
-
path TEXT NOT NULL,
|
|
17415
|
-
title TEXT NOT NULL,
|
|
17416
|
-
section_title TEXT NOT NULL DEFAULT '',
|
|
17417
|
-
heading_path TEXT NOT NULL DEFAULT '[]',
|
|
17418
|
-
snippet TEXT NOT NULL DEFAULT '',
|
|
17419
|
-
chunk_text TEXT NOT NULL DEFAULT '',
|
|
17420
|
-
ordinal INTEGER NOT NULL DEFAULT 0,
|
|
17421
|
-
content_hash TEXT NOT NULL DEFAULT '',
|
|
17422
|
-
model_id TEXT NOT NULL DEFAULT '',
|
|
17423
|
-
depth INTEGER NOT NULL DEFAULT 0,
|
|
17424
|
-
incoming_links INTEGER NOT NULL DEFAULT 0,
|
|
17425
|
-
route_file TEXT NOT NULL DEFAULT '',
|
|
17426
|
-
tags TEXT NOT NULL DEFAULT '[]',
|
|
17427
|
-
description TEXT NOT NULL DEFAULT '',
|
|
17428
|
-
keywords TEXT NOT NULL DEFAULT '[]',
|
|
17429
|
-
embedding F32_BLOB(${dim})
|
|
17430
|
-
)`,
|
|
17431
|
-
`CREATE INDEX IF NOT EXISTS idx ON chunks (libsql_vector_idx(embedding, 'metric=cosine'))`
|
|
17432
|
-
]);
|
|
17433
|
-
this.chunksReady = true;
|
|
17434
|
-
}
|
|
17435
|
-
async ensurePages() {
|
|
17436
|
-
if (this.pagesReady) return;
|
|
17437
|
-
await this.client.execute(`
|
|
17438
|
-
CREATE TABLE IF NOT EXISTS pages (
|
|
17439
|
-
project_id TEXT NOT NULL,
|
|
17440
|
-
scope_name TEXT NOT NULL,
|
|
17441
|
-
url TEXT NOT NULL,
|
|
17442
|
-
title TEXT NOT NULL,
|
|
17443
|
-
markdown TEXT NOT NULL,
|
|
17444
|
-
route_file TEXT NOT NULL DEFAULT '',
|
|
17445
|
-
route_resolution TEXT NOT NULL DEFAULT 'exact',
|
|
17446
|
-
incoming_links INTEGER NOT NULL DEFAULT 0,
|
|
17447
|
-
outgoing_links INTEGER NOT NULL DEFAULT 0,
|
|
17448
|
-
depth INTEGER NOT NULL DEFAULT 0,
|
|
17449
|
-
tags TEXT NOT NULL DEFAULT '[]',
|
|
17450
|
-
indexed_at TEXT NOT NULL,
|
|
17451
|
-
PRIMARY KEY (project_id, scope_name, url)
|
|
17452
|
-
)
|
|
17453
|
-
`);
|
|
17454
|
-
this.pagesReady = true;
|
|
17455
|
-
}
|
|
17456
|
-
async chunksTableExists() {
|
|
17457
|
-
try {
|
|
17458
|
-
await this.client.execute("SELECT 1 FROM chunks LIMIT 0");
|
|
17459
|
-
return true;
|
|
17460
|
-
} catch (error) {
|
|
17461
|
-
if (error instanceof Error && error.message.includes("no such table")) {
|
|
17462
|
-
return false;
|
|
17463
|
-
}
|
|
17464
|
-
throw error;
|
|
17465
|
-
}
|
|
17466
17124
|
}
|
|
17467
|
-
|
|
17468
|
-
|
|
17469
|
-
* Returns null if the table doesn't exist or the dimension can't be parsed.
|
|
17470
|
-
*/
|
|
17471
|
-
async getChunksDimension() {
|
|
17472
|
-
try {
|
|
17473
|
-
const rs = await this.client.execute(
|
|
17474
|
-
"SELECT sql FROM sqlite_master WHERE type='table' AND name='chunks'"
|
|
17475
|
-
);
|
|
17476
|
-
if (rs.rows.length === 0) return null;
|
|
17477
|
-
const sql = rs.rows[0].sql;
|
|
17478
|
-
const match = sql.match(/F32_BLOB\((\d+)\)/i);
|
|
17479
|
-
return match ? parseInt(match[1], 10) : null;
|
|
17480
|
-
} catch {
|
|
17481
|
-
return null;
|
|
17482
|
-
}
|
|
17125
|
+
chunkIndex(scope) {
|
|
17126
|
+
return this.client.index(chunkIndexName(scope));
|
|
17483
17127
|
}
|
|
17484
|
-
|
|
17485
|
-
|
|
17486
|
-
* Used by `clean --remote` for a full reset.
|
|
17487
|
-
*/
|
|
17488
|
-
async dropAllTables() {
|
|
17489
|
-
await this.client.batch([
|
|
17490
|
-
"DROP INDEX IF EXISTS idx",
|
|
17491
|
-
"DROP TABLE IF EXISTS chunks",
|
|
17492
|
-
"DROP TABLE IF EXISTS registry",
|
|
17493
|
-
"DROP TABLE IF EXISTS pages"
|
|
17494
|
-
]);
|
|
17495
|
-
this.chunksReady = false;
|
|
17496
|
-
this.registryReady = false;
|
|
17497
|
-
this.pagesReady = false;
|
|
17128
|
+
pageIndex(scope) {
|
|
17129
|
+
return this.client.index(pageIndexName(scope));
|
|
17498
17130
|
}
|
|
17499
|
-
async
|
|
17500
|
-
if (
|
|
17501
|
-
const
|
|
17502
|
-
await this.ensureChunks(dim);
|
|
17131
|
+
async upsertChunks(chunks, scope) {
|
|
17132
|
+
if (chunks.length === 0) return;
|
|
17133
|
+
const index = this.chunkIndex(scope);
|
|
17503
17134
|
const BATCH_SIZE = 100;
|
|
17504
|
-
for (let i = 0; i <
|
|
17505
|
-
const batch =
|
|
17506
|
-
|
|
17507
|
-
sql: `INSERT OR REPLACE INTO chunks
|
|
17508
|
-
(id, project_id, scope_name, url, path, title, section_title,
|
|
17509
|
-
heading_path, snippet, chunk_text, ordinal, content_hash, model_id, depth,
|
|
17510
|
-
incoming_links, route_file, tags, description, keywords, embedding)
|
|
17511
|
-
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
|
|
17512
|
-
args: [
|
|
17513
|
-
r.id,
|
|
17514
|
-
r.metadata.projectId,
|
|
17515
|
-
r.metadata.scopeName,
|
|
17516
|
-
r.metadata.url,
|
|
17517
|
-
r.metadata.path,
|
|
17518
|
-
r.metadata.title,
|
|
17519
|
-
r.metadata.sectionTitle,
|
|
17520
|
-
JSON.stringify(r.metadata.headingPath),
|
|
17521
|
-
r.metadata.snippet,
|
|
17522
|
-
r.metadata.chunkText,
|
|
17523
|
-
r.metadata.ordinal,
|
|
17524
|
-
r.metadata.contentHash,
|
|
17525
|
-
r.metadata.modelId,
|
|
17526
|
-
r.metadata.depth,
|
|
17527
|
-
r.metadata.incomingLinks,
|
|
17528
|
-
r.metadata.routeFile,
|
|
17529
|
-
JSON.stringify(r.metadata.tags),
|
|
17530
|
-
r.metadata.description ?? "",
|
|
17531
|
-
JSON.stringify(r.metadata.keywords ?? []),
|
|
17532
|
-
JSON.stringify(r.vector)
|
|
17533
|
-
]
|
|
17534
|
-
}));
|
|
17535
|
-
await this.client.batch(stmts);
|
|
17135
|
+
for (let i = 0; i < chunks.length; i += BATCH_SIZE) {
|
|
17136
|
+
const batch = chunks.slice(i, i + BATCH_SIZE);
|
|
17137
|
+
await index.upsert(batch);
|
|
17536
17138
|
}
|
|
17537
17139
|
}
|
|
17538
|
-
async query
|
|
17539
|
-
const
|
|
17540
|
-
await
|
|
17541
|
-
|
|
17542
|
-
|
|
17543
|
-
|
|
17544
|
-
|
|
17545
|
-
|
|
17546
|
-
|
|
17547
|
-
c.description, c.keywords,
|
|
17548
|
-
vector_distance_cos(c.embedding, vector(?)) AS distance
|
|
17549
|
-
FROM vector_top_k('idx', vector(?), ?) AS v
|
|
17550
|
-
JOIN chunks AS c ON c.rowid = v.id`,
|
|
17551
|
-
args: [queryJson, queryJson, opts.topK]
|
|
17140
|
+
async search(query, opts, scope) {
|
|
17141
|
+
const index = this.chunkIndex(scope);
|
|
17142
|
+
const results = await index.search({
|
|
17143
|
+
query,
|
|
17144
|
+
limit: opts.limit,
|
|
17145
|
+
semanticWeight: opts.semanticWeight,
|
|
17146
|
+
inputEnrichment: opts.inputEnrichment,
|
|
17147
|
+
reranking: opts.reranking,
|
|
17148
|
+
filter: opts.filter
|
|
17552
17149
|
});
|
|
17553
|
-
|
|
17554
|
-
|
|
17555
|
-
|
|
17556
|
-
|
|
17557
|
-
|
|
17558
|
-
|
|
17559
|
-
|
|
17560
|
-
|
|
17561
|
-
|
|
17562
|
-
|
|
17563
|
-
|
|
17564
|
-
|
|
17565
|
-
|
|
17566
|
-
|
|
17567
|
-
|
|
17568
|
-
|
|
17150
|
+
return results.map((doc) => ({
|
|
17151
|
+
id: doc.id,
|
|
17152
|
+
score: doc.score,
|
|
17153
|
+
metadata: {
|
|
17154
|
+
projectId: doc.metadata?.projectId ?? "",
|
|
17155
|
+
scopeName: doc.metadata?.scopeName ?? "",
|
|
17156
|
+
url: doc.content.url,
|
|
17157
|
+
path: doc.metadata?.path ?? "",
|
|
17158
|
+
title: doc.content.title,
|
|
17159
|
+
sectionTitle: doc.content.sectionTitle,
|
|
17160
|
+
headingPath: doc.content.headingPath ? doc.content.headingPath.split(" > ").filter(Boolean) : [],
|
|
17161
|
+
snippet: doc.metadata?.snippet ?? "",
|
|
17162
|
+
chunkText: doc.content.text,
|
|
17163
|
+
ordinal: doc.metadata?.ordinal ?? 0,
|
|
17164
|
+
contentHash: doc.metadata?.contentHash ?? "",
|
|
17165
|
+
depth: doc.metadata?.depth ?? 0,
|
|
17166
|
+
incomingLinks: doc.metadata?.incomingLinks ?? 0,
|
|
17167
|
+
routeFile: doc.metadata?.routeFile ?? "",
|
|
17168
|
+
tags: doc.content.tags ? doc.content.tags.split(",").filter(Boolean) : [],
|
|
17169
|
+
description: doc.metadata?.description || void 0,
|
|
17170
|
+
keywords: doc.metadata?.keywords ? doc.metadata.keywords.split(",").filter(Boolean) : void 0
|
|
17569
17171
|
}
|
|
17570
|
-
|
|
17571
|
-
|
|
17572
|
-
|
|
17573
|
-
|
|
17574
|
-
|
|
17575
|
-
|
|
17576
|
-
|
|
17577
|
-
|
|
17578
|
-
|
|
17579
|
-
|
|
17580
|
-
|
|
17581
|
-
|
|
17582
|
-
|
|
17583
|
-
})();
|
|
17584
|
-
hits.push({
|
|
17585
|
-
id: row.id,
|
|
17586
|
-
score,
|
|
17587
|
-
metadata: {
|
|
17588
|
-
projectId,
|
|
17589
|
-
scopeName,
|
|
17590
|
-
url: row.url,
|
|
17591
|
-
path: rowPath,
|
|
17592
|
-
title: row.title,
|
|
17593
|
-
sectionTitle: row.section_title,
|
|
17594
|
-
headingPath: JSON.parse(row.heading_path || "[]"),
|
|
17595
|
-
snippet: row.snippet,
|
|
17596
|
-
chunkText: row.chunk_text || "",
|
|
17597
|
-
ordinal: row.ordinal || 0,
|
|
17598
|
-
contentHash: row.content_hash,
|
|
17599
|
-
modelId: row.model_id,
|
|
17600
|
-
depth: row.depth,
|
|
17601
|
-
incomingLinks: row.incoming_links,
|
|
17602
|
-
routeFile: row.route_file,
|
|
17603
|
-
tags,
|
|
17604
|
-
description,
|
|
17605
|
-
keywords
|
|
17606
|
-
}
|
|
17172
|
+
}));
|
|
17173
|
+
}
|
|
17174
|
+
async searchPages(query, opts, scope) {
|
|
17175
|
+
const index = this.pageIndex(scope);
|
|
17176
|
+
let results;
|
|
17177
|
+
try {
|
|
17178
|
+
results = await index.search({
|
|
17179
|
+
query,
|
|
17180
|
+
limit: opts.limit,
|
|
17181
|
+
semanticWeight: opts.semanticWeight,
|
|
17182
|
+
inputEnrichment: opts.inputEnrichment,
|
|
17183
|
+
reranking: true,
|
|
17184
|
+
filter: opts.filter
|
|
17607
17185
|
});
|
|
17186
|
+
} catch {
|
|
17187
|
+
return [];
|
|
17608
17188
|
}
|
|
17609
|
-
|
|
17610
|
-
|
|
17189
|
+
return results.map((doc) => ({
|
|
17190
|
+
id: doc.id,
|
|
17191
|
+
score: doc.score,
|
|
17192
|
+
title: doc.content.title,
|
|
17193
|
+
url: doc.content.url,
|
|
17194
|
+
description: doc.content.description ?? "",
|
|
17195
|
+
tags: doc.content.tags ? doc.content.tags.split(",").filter(Boolean) : [],
|
|
17196
|
+
depth: doc.metadata?.depth ?? 0,
|
|
17197
|
+
incomingLinks: doc.metadata?.incomingLinks ?? 0,
|
|
17198
|
+
routeFile: doc.metadata?.routeFile ?? ""
|
|
17199
|
+
}));
|
|
17611
17200
|
}
|
|
17612
17201
|
async deleteByIds(ids, scope) {
|
|
17613
17202
|
if (ids.length === 0) return;
|
|
17203
|
+
const index = this.chunkIndex(scope);
|
|
17614
17204
|
const BATCH_SIZE = 500;
|
|
17615
17205
|
for (let i = 0; i < ids.length; i += BATCH_SIZE) {
|
|
17616
17206
|
const batch = ids.slice(i, i + BATCH_SIZE);
|
|
17617
|
-
|
|
17618
|
-
await this.client.execute({
|
|
17619
|
-
sql: `DELETE FROM chunks WHERE project_id = ? AND scope_name = ? AND id IN (${placeholders})`,
|
|
17620
|
-
args: [scope.projectId, scope.scopeName, ...batch]
|
|
17621
|
-
});
|
|
17207
|
+
await index.delete(batch);
|
|
17622
17208
|
}
|
|
17623
17209
|
}
|
|
17624
17210
|
async deleteScope(scope) {
|
|
17625
|
-
await this.ensureRegistry();
|
|
17626
17211
|
try {
|
|
17627
|
-
|
|
17628
|
-
|
|
17629
|
-
|
|
17630
|
-
});
|
|
17631
|
-
} catch (error) {
|
|
17632
|
-
if (error instanceof Error && !error.message.includes("no such table")) {
|
|
17633
|
-
throw error;
|
|
17634
|
-
}
|
|
17212
|
+
const chunkIdx = this.chunkIndex(scope);
|
|
17213
|
+
await chunkIdx.deleteIndex();
|
|
17214
|
+
} catch {
|
|
17635
17215
|
}
|
|
17636
17216
|
try {
|
|
17637
|
-
|
|
17638
|
-
|
|
17639
|
-
|
|
17640
|
-
});
|
|
17641
|
-
} catch (error) {
|
|
17642
|
-
if (error instanceof Error && !error.message.includes("no such table")) {
|
|
17643
|
-
throw error;
|
|
17644
|
-
}
|
|
17217
|
+
const pageIdx = this.pageIndex(scope);
|
|
17218
|
+
await pageIdx.deleteIndex();
|
|
17219
|
+
} catch {
|
|
17645
17220
|
}
|
|
17646
|
-
await this.client.execute({
|
|
17647
|
-
sql: `DELETE FROM registry WHERE project_id = ? AND scope_name = ?`,
|
|
17648
|
-
args: [scope.projectId, scope.scopeName]
|
|
17649
|
-
});
|
|
17650
17221
|
}
|
|
17651
|
-
async listScopes(
|
|
17652
|
-
await this.
|
|
17653
|
-
const
|
|
17654
|
-
|
|
17655
|
-
|
|
17656
|
-
|
|
17657
|
-
|
|
17658
|
-
|
|
17659
|
-
|
|
17660
|
-
|
|
17661
|
-
|
|
17662
|
-
|
|
17663
|
-
|
|
17664
|
-
|
|
17665
|
-
|
|
17666
|
-
|
|
17667
|
-
|
|
17668
|
-
|
|
17669
|
-
|
|
17670
|
-
|
|
17671
|
-
|
|
17672
|
-
|
|
17673
|
-
|
|
17674
|
-
|
|
17675
|
-
|
|
17676
|
-
|
|
17677
|
-
|
|
17678
|
-
|
|
17679
|
-
|
|
17680
|
-
|
|
17681
|
-
|
|
17682
|
-
|
|
17683
|
-
|
|
17684
|
-
|
|
17685
|
-
|
|
17686
|
-
info.lastEstimateCostUSD ?? null,
|
|
17687
|
-
info.lastEstimateChangedChunks ?? null
|
|
17688
|
-
]
|
|
17689
|
-
});
|
|
17222
|
+
async listScopes(projectId) {
|
|
17223
|
+
const allIndexes = await this.client.listIndexes();
|
|
17224
|
+
const prefix = `${projectId}--`;
|
|
17225
|
+
const scopeNames = /* @__PURE__ */ new Set();
|
|
17226
|
+
for (const name of allIndexes) {
|
|
17227
|
+
if (name.startsWith(prefix) && !name.endsWith("--pages")) {
|
|
17228
|
+
const scopeName = name.slice(prefix.length);
|
|
17229
|
+
scopeNames.add(scopeName);
|
|
17230
|
+
}
|
|
17231
|
+
}
|
|
17232
|
+
const scopes = [];
|
|
17233
|
+
for (const scopeName of scopeNames) {
|
|
17234
|
+
const scope = {
|
|
17235
|
+
projectId,
|
|
17236
|
+
scopeName,
|
|
17237
|
+
scopeId: `${projectId}:${scopeName}`
|
|
17238
|
+
};
|
|
17239
|
+
try {
|
|
17240
|
+
const info = await this.chunkIndex(scope).info();
|
|
17241
|
+
scopes.push({
|
|
17242
|
+
projectId,
|
|
17243
|
+
scopeName,
|
|
17244
|
+
lastIndexedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
17245
|
+
documentCount: info.documentCount
|
|
17246
|
+
});
|
|
17247
|
+
} catch {
|
|
17248
|
+
scopes.push({
|
|
17249
|
+
projectId,
|
|
17250
|
+
scopeName,
|
|
17251
|
+
lastIndexedAt: "unknown",
|
|
17252
|
+
documentCount: 0
|
|
17253
|
+
});
|
|
17254
|
+
}
|
|
17255
|
+
}
|
|
17256
|
+
return scopes;
|
|
17690
17257
|
}
|
|
17691
17258
|
async getContentHashes(scope) {
|
|
17692
|
-
const exists = await this.chunksTableExists();
|
|
17693
|
-
if (!exists) return /* @__PURE__ */ new Map();
|
|
17694
|
-
const rs = await this.client.execute({
|
|
17695
|
-
sql: `SELECT id, content_hash FROM chunks WHERE project_id = ? AND scope_name = ?`,
|
|
17696
|
-
args: [scope.projectId, scope.scopeName]
|
|
17697
|
-
});
|
|
17698
17259
|
const map = /* @__PURE__ */ new Map();
|
|
17699
|
-
|
|
17700
|
-
|
|
17260
|
+
const index = this.chunkIndex(scope);
|
|
17261
|
+
let cursor = "0";
|
|
17262
|
+
try {
|
|
17263
|
+
for (; ; ) {
|
|
17264
|
+
const result = await index.range({ cursor, limit: 100 });
|
|
17265
|
+
for (const doc of result.documents) {
|
|
17266
|
+
if (doc.metadata?.contentHash) {
|
|
17267
|
+
map.set(doc.id, doc.metadata.contentHash);
|
|
17268
|
+
}
|
|
17269
|
+
}
|
|
17270
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17271
|
+
cursor = result.nextCursor;
|
|
17272
|
+
}
|
|
17273
|
+
} catch {
|
|
17701
17274
|
}
|
|
17702
17275
|
return map;
|
|
17703
17276
|
}
|
|
17704
17277
|
async upsertPages(pages, scope) {
|
|
17705
17278
|
if (pages.length === 0) return;
|
|
17706
|
-
|
|
17707
|
-
|
|
17708
|
-
if (page.projectId !== scope.projectId || page.scopeName !== scope.scopeName) {
|
|
17709
|
-
throw new Error(
|
|
17710
|
-
`Page scope mismatch: page has ${page.projectId}:${page.scopeName} but scope is ${scope.projectId}:${scope.scopeName}`
|
|
17711
|
-
);
|
|
17712
|
-
}
|
|
17713
|
-
}
|
|
17714
|
-
const BATCH_SIZE = 100;
|
|
17279
|
+
const index = this.pageIndex(scope);
|
|
17280
|
+
const BATCH_SIZE = 50;
|
|
17715
17281
|
for (let i = 0; i < pages.length; i += BATCH_SIZE) {
|
|
17716
17282
|
const batch = pages.slice(i, i + BATCH_SIZE);
|
|
17717
|
-
const
|
|
17718
|
-
|
|
17719
|
-
|
|
17720
|
-
|
|
17721
|
-
|
|
17722
|
-
|
|
17723
|
-
p.
|
|
17724
|
-
p.
|
|
17725
|
-
p.
|
|
17726
|
-
p.
|
|
17727
|
-
|
|
17728
|
-
|
|
17729
|
-
p.
|
|
17730
|
-
p.
|
|
17731
|
-
p.
|
|
17732
|
-
p.
|
|
17733
|
-
|
|
17734
|
-
p.
|
|
17735
|
-
|
|
17283
|
+
const docs = batch.map((p) => ({
|
|
17284
|
+
id: p.url,
|
|
17285
|
+
content: {
|
|
17286
|
+
title: p.title,
|
|
17287
|
+
url: p.url,
|
|
17288
|
+
type: "page",
|
|
17289
|
+
description: p.description ?? "",
|
|
17290
|
+
keywords: (p.keywords ?? []).join(","),
|
|
17291
|
+
summary: p.summary ?? "",
|
|
17292
|
+
tags: p.tags.join(",")
|
|
17293
|
+
},
|
|
17294
|
+
metadata: {
|
|
17295
|
+
markdown: p.markdown,
|
|
17296
|
+
projectId: p.projectId,
|
|
17297
|
+
scopeName: p.scopeName,
|
|
17298
|
+
routeFile: p.routeFile,
|
|
17299
|
+
routeResolution: p.routeResolution,
|
|
17300
|
+
incomingLinks: p.incomingLinks,
|
|
17301
|
+
outgoingLinks: p.outgoingLinks,
|
|
17302
|
+
depth: p.depth,
|
|
17303
|
+
indexedAt: p.indexedAt
|
|
17304
|
+
}
|
|
17736
17305
|
}));
|
|
17737
|
-
await
|
|
17306
|
+
await index.upsert(docs);
|
|
17738
17307
|
}
|
|
17739
17308
|
}
|
|
17740
17309
|
async getPage(url, scope) {
|
|
17741
|
-
|
|
17742
|
-
|
|
17743
|
-
|
|
17744
|
-
|
|
17745
|
-
|
|
17746
|
-
|
|
17747
|
-
|
|
17748
|
-
|
|
17749
|
-
|
|
17750
|
-
|
|
17751
|
-
|
|
17752
|
-
|
|
17753
|
-
|
|
17754
|
-
|
|
17755
|
-
|
|
17756
|
-
|
|
17757
|
-
|
|
17758
|
-
|
|
17759
|
-
|
|
17760
|
-
|
|
17761
|
-
|
|
17310
|
+
const index = this.pageIndex(scope);
|
|
17311
|
+
try {
|
|
17312
|
+
const results = await index.fetch([url]);
|
|
17313
|
+
const doc = results[0];
|
|
17314
|
+
if (!doc) return null;
|
|
17315
|
+
return {
|
|
17316
|
+
url: doc.content.url,
|
|
17317
|
+
title: doc.content.title,
|
|
17318
|
+
markdown: doc.metadata.markdown,
|
|
17319
|
+
projectId: doc.metadata.projectId,
|
|
17320
|
+
scopeName: doc.metadata.scopeName,
|
|
17321
|
+
routeFile: doc.metadata.routeFile,
|
|
17322
|
+
routeResolution: doc.metadata.routeResolution,
|
|
17323
|
+
incomingLinks: doc.metadata.incomingLinks,
|
|
17324
|
+
outgoingLinks: doc.metadata.outgoingLinks,
|
|
17325
|
+
depth: doc.metadata.depth,
|
|
17326
|
+
tags: doc.content.tags ? doc.content.tags.split(",").filter(Boolean) : [],
|
|
17327
|
+
indexedAt: doc.metadata.indexedAt,
|
|
17328
|
+
summary: doc.content.summary || void 0,
|
|
17329
|
+
description: doc.content.description || void 0,
|
|
17330
|
+
keywords: doc.content.keywords ? doc.content.keywords.split(",").filter(Boolean) : void 0
|
|
17331
|
+
};
|
|
17332
|
+
} catch {
|
|
17333
|
+
return null;
|
|
17334
|
+
}
|
|
17762
17335
|
}
|
|
17763
17336
|
async deletePages(scope) {
|
|
17764
|
-
|
|
17765
|
-
|
|
17766
|
-
|
|
17767
|
-
|
|
17768
|
-
}
|
|
17769
|
-
}
|
|
17770
|
-
async getScopeModelId(scope) {
|
|
17771
|
-
await this.ensureRegistry();
|
|
17772
|
-
const rs = await this.client.execute({
|
|
17773
|
-
sql: `SELECT model_id FROM registry WHERE project_id = ? AND scope_name = ?`,
|
|
17774
|
-
args: [scope.projectId, scope.scopeName]
|
|
17775
|
-
});
|
|
17776
|
-
if (rs.rows.length === 0) return null;
|
|
17777
|
-
return rs.rows[0].model_id;
|
|
17337
|
+
try {
|
|
17338
|
+
const index = this.pageIndex(scope);
|
|
17339
|
+
await index.reset();
|
|
17340
|
+
} catch {
|
|
17341
|
+
}
|
|
17778
17342
|
}
|
|
17779
17343
|
async health() {
|
|
17780
17344
|
try {
|
|
17781
|
-
await this.client.
|
|
17345
|
+
await this.client.info();
|
|
17782
17346
|
return { ok: true };
|
|
17783
17347
|
} catch (error) {
|
|
17784
17348
|
return {
|
|
@@ -17787,40 +17351,34 @@ var TursoVectorStore = class {
|
|
|
17787
17351
|
};
|
|
17788
17352
|
}
|
|
17789
17353
|
}
|
|
17354
|
+
async dropAllIndexes(projectId) {
|
|
17355
|
+
const allIndexes = await this.client.listIndexes();
|
|
17356
|
+
const prefix = `${projectId}--`;
|
|
17357
|
+
for (const name of allIndexes) {
|
|
17358
|
+
if (name.startsWith(prefix)) {
|
|
17359
|
+
try {
|
|
17360
|
+
const index = this.client.index(name);
|
|
17361
|
+
await index.deleteIndex();
|
|
17362
|
+
} catch {
|
|
17363
|
+
}
|
|
17364
|
+
}
|
|
17365
|
+
}
|
|
17366
|
+
}
|
|
17790
17367
|
};
|
|
17791
17368
|
|
|
17792
17369
|
// src/vector/factory.ts
|
|
17793
|
-
async function
|
|
17794
|
-
const
|
|
17795
|
-
const
|
|
17796
|
-
if (
|
|
17797
|
-
const { createClient: createClient2 } = await import('@libsql/client/http');
|
|
17798
|
-
const authToken = turso.authToken ?? process.env[turso.authTokenEnv];
|
|
17799
|
-
const client2 = createClient2({
|
|
17800
|
-
url: remoteUrl,
|
|
17801
|
-
authToken
|
|
17802
|
-
});
|
|
17803
|
-
return new TursoVectorStore({
|
|
17804
|
-
client: client2,
|
|
17805
|
-
dimension: config.vector.dimension
|
|
17806
|
-
});
|
|
17807
|
-
}
|
|
17808
|
-
if (isServerless()) {
|
|
17370
|
+
async function createUpstashStore(config) {
|
|
17371
|
+
const url = config.upstash.url ?? process.env[config.upstash.urlEnv];
|
|
17372
|
+
const token = config.upstash.token ?? process.env[config.upstash.tokenEnv];
|
|
17373
|
+
if (!url || !token) {
|
|
17809
17374
|
throw new SearchSocketError(
|
|
17810
17375
|
"VECTOR_BACKEND_UNAVAILABLE",
|
|
17811
|
-
`
|
|
17376
|
+
`Missing Upstash Search credentials. Set ${config.upstash.urlEnv} and ${config.upstash.tokenEnv} environment variables, or pass upstash.url and upstash.token in your config.`
|
|
17812
17377
|
);
|
|
17813
17378
|
}
|
|
17814
|
-
const {
|
|
17815
|
-
const
|
|
17816
|
-
|
|
17817
|
-
const client = createClient({
|
|
17818
|
-
url: `file:${localPath}`
|
|
17819
|
-
});
|
|
17820
|
-
return new TursoVectorStore({
|
|
17821
|
-
client,
|
|
17822
|
-
dimension: config.vector.dimension
|
|
17823
|
-
});
|
|
17379
|
+
const { Search } = await import('@upstash/search');
|
|
17380
|
+
const client = new Search({ url, token });
|
|
17381
|
+
return new UpstashSearchStore({ client });
|
|
17824
17382
|
}
|
|
17825
17383
|
function sha1(input) {
|
|
17826
17384
|
return createHash("sha1").update(input).digest("hex");
|
|
@@ -17839,13 +17397,6 @@ function normalizeUrlPath(rawPath) {
|
|
|
17839
17397
|
}
|
|
17840
17398
|
return out;
|
|
17841
17399
|
}
|
|
17842
|
-
function urlPathToMirrorRelative(urlPath) {
|
|
17843
|
-
const normalized = normalizeUrlPath(urlPath);
|
|
17844
|
-
if (normalized === "/") {
|
|
17845
|
-
return "index.md";
|
|
17846
|
-
}
|
|
17847
|
-
return `${normalized.slice(1)}.md`;
|
|
17848
|
-
}
|
|
17849
17400
|
function staticHtmlFileToUrl(filePath, rootDir) {
|
|
17850
17401
|
const relative = path.relative(rootDir, filePath).replace(/\\/g, "/");
|
|
17851
17402
|
if (relative === "index.html") {
|
|
@@ -18120,7 +17671,7 @@ function buildEmbeddingText(chunk, prependTitle) {
|
|
|
18120
17671
|
|
|
18121
17672
|
${chunk.chunkText}`;
|
|
18122
17673
|
}
|
|
18123
|
-
function
|
|
17674
|
+
function chunkPage(page, config, scope) {
|
|
18124
17675
|
const sections = parseHeadingSections(page.markdown, config.chunking.headingPathDepth);
|
|
18125
17676
|
const rawChunks = sections.flatMap((section) => splitSection(section, config.chunking));
|
|
18126
17677
|
const chunks = [];
|
|
@@ -19037,6 +18588,17 @@ function extractFromHtml(url, html, config) {
|
|
|
19037
18588
|
if ($(`[${config.extract.noindexAttr}]`).length > 0) {
|
|
19038
18589
|
return null;
|
|
19039
18590
|
}
|
|
18591
|
+
const weightRaw = $("meta[name='searchsocket-weight']").attr("content")?.trim();
|
|
18592
|
+
let weight;
|
|
18593
|
+
if (weightRaw !== void 0) {
|
|
18594
|
+
const parsed = Number(weightRaw);
|
|
18595
|
+
if (Number.isFinite(parsed) && parsed >= 0) {
|
|
18596
|
+
weight = parsed;
|
|
18597
|
+
}
|
|
18598
|
+
}
|
|
18599
|
+
if (weight === 0) {
|
|
18600
|
+
return null;
|
|
18601
|
+
}
|
|
19040
18602
|
const description = $("meta[name='description']").attr("content")?.trim() || $("meta[property='og:description']").attr("content")?.trim() || void 0;
|
|
19041
18603
|
const keywordsRaw = $("meta[name='keywords']").attr("content")?.trim();
|
|
19042
18604
|
const keywords = keywordsRaw ? keywordsRaw.split(",").map((k) => k.trim()).filter(Boolean) : void 0;
|
|
@@ -19092,7 +18654,8 @@ function extractFromHtml(url, html, config) {
|
|
|
19092
18654
|
noindex: false,
|
|
19093
18655
|
tags,
|
|
19094
18656
|
description,
|
|
19095
|
-
keywords
|
|
18657
|
+
keywords,
|
|
18658
|
+
weight
|
|
19096
18659
|
};
|
|
19097
18660
|
}
|
|
19098
18661
|
function extractFromMarkdown(url, markdown, title) {
|
|
@@ -19105,6 +18668,14 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
19105
18668
|
if (frontmatter.noindex === true || searchsocketMeta?.noindex === true) {
|
|
19106
18669
|
return null;
|
|
19107
18670
|
}
|
|
18671
|
+
let mdWeight;
|
|
18672
|
+
const rawWeight = searchsocketMeta?.weight ?? frontmatter.searchsocketWeight;
|
|
18673
|
+
if (typeof rawWeight === "number" && Number.isFinite(rawWeight) && rawWeight >= 0) {
|
|
18674
|
+
mdWeight = rawWeight;
|
|
18675
|
+
}
|
|
18676
|
+
if (mdWeight === 0) {
|
|
18677
|
+
return null;
|
|
18678
|
+
}
|
|
19108
18679
|
const content = parsed.content;
|
|
19109
18680
|
const normalized = normalizeMarkdown(content);
|
|
19110
18681
|
if (!normalizeText(normalized)) {
|
|
@@ -19127,56 +18698,10 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
19127
18698
|
noindex: false,
|
|
19128
18699
|
tags: normalizeUrlPath(url).split("/").filter(Boolean).slice(0, 1),
|
|
19129
18700
|
description: fmDescription,
|
|
19130
|
-
keywords: fmKeywords
|
|
18701
|
+
keywords: fmKeywords,
|
|
18702
|
+
weight: mdWeight
|
|
19131
18703
|
};
|
|
19132
18704
|
}
|
|
19133
|
-
function yamlString(value) {
|
|
19134
|
-
return JSON.stringify(value);
|
|
19135
|
-
}
|
|
19136
|
-
function yamlArray(values) {
|
|
19137
|
-
return `[${values.map((v) => JSON.stringify(v)).join(", ")}]`;
|
|
19138
|
-
}
|
|
19139
|
-
function buildMirrorMarkdown(page) {
|
|
19140
|
-
const frontmatterLines = [
|
|
19141
|
-
"---",
|
|
19142
|
-
`url: ${yamlString(page.url)}`,
|
|
19143
|
-
`title: ${yamlString(page.title)}`,
|
|
19144
|
-
`scope: ${yamlString(page.scope)}`,
|
|
19145
|
-
`routeFile: ${yamlString(page.routeFile)}`,
|
|
19146
|
-
`routeResolution: ${yamlString(page.routeResolution)}`,
|
|
19147
|
-
`generatedAt: ${yamlString(page.generatedAt)}`,
|
|
19148
|
-
`incomingLinks: ${page.incomingLinks}`,
|
|
19149
|
-
`outgoingLinks: ${page.outgoingLinks}`,
|
|
19150
|
-
`depth: ${page.depth}`,
|
|
19151
|
-
`tags: ${yamlArray(page.tags)}`,
|
|
19152
|
-
"---",
|
|
19153
|
-
""
|
|
19154
|
-
];
|
|
19155
|
-
return `${frontmatterLines.join("\n")}${normalizeMarkdown(page.markdown)}`;
|
|
19156
|
-
}
|
|
19157
|
-
function stripGeneratedAt(content) {
|
|
19158
|
-
return content.replace(/^generatedAt: .*$/m, "");
|
|
19159
|
-
}
|
|
19160
|
-
async function writeMirrorPage(statePath, scope, page) {
|
|
19161
|
-
const relative = urlPathToMirrorRelative(page.url);
|
|
19162
|
-
const outputPath = path.join(statePath, "pages", scope.scopeName, relative);
|
|
19163
|
-
await fs4.mkdir(path.dirname(outputPath), { recursive: true });
|
|
19164
|
-
const newContent = buildMirrorMarkdown(page);
|
|
19165
|
-
try {
|
|
19166
|
-
const existing = await fs4.readFile(outputPath, "utf8");
|
|
19167
|
-
if (stripGeneratedAt(existing) === stripGeneratedAt(newContent)) {
|
|
19168
|
-
return outputPath;
|
|
19169
|
-
}
|
|
19170
|
-
} catch {
|
|
19171
|
-
}
|
|
19172
|
-
await fs4.writeFile(outputPath, newContent, "utf8");
|
|
19173
|
-
return outputPath;
|
|
19174
|
-
}
|
|
19175
|
-
async function cleanMirrorForScope(statePath, scope) {
|
|
19176
|
-
const target = path.join(statePath, "pages", scope.scopeName);
|
|
19177
|
-
await fs4.rm(target, { recursive: true, force: true });
|
|
19178
|
-
await fs4.mkdir(target, { recursive: true });
|
|
19179
|
-
}
|
|
19180
18705
|
function segmentToRegex(segment) {
|
|
19181
18706
|
if (segment.startsWith("(") && segment.endsWith(")")) {
|
|
19182
18707
|
return { regex: "", score: 0 };
|
|
@@ -19323,6 +18848,38 @@ var Logger = class {
|
|
|
19323
18848
|
`);
|
|
19324
18849
|
}
|
|
19325
18850
|
};
|
|
18851
|
+
|
|
18852
|
+
// src/utils/pattern.ts
|
|
18853
|
+
/**
 * Test a URL path against a single pattern.
 *
 * Supported pattern forms:
 * - exact path: matches only that path;
 * - `<prefix>/*`: matches exactly one extra, non-empty segment below the prefix;
 * - `<prefix>/**`: matches the prefix itself and anything beneath it.
 *
 * One trailing slash is ignored on both the URL and the pattern, except for
 * the root path "/".
 *
 * @param {string} url - URL path to test.
 * @param {string} pattern - Pattern to test against.
 * @returns {boolean} Whether the URL matches the pattern.
 */
function matchUrlPattern(url, pattern) {
  const stripTrailingSlash = (value) => {
    if (value === "/" || !value.endsWith("/")) {
      return value;
    }
    return value.slice(0, -1);
  };
  const target = stripTrailingSlash(url);
  const glob = stripTrailingSlash(pattern);

  if (glob.endsWith("/**")) {
    const base = glob.slice(0, -3);
    // An empty base means the pattern was "/**", which covers every path.
    return base === "" || target === base || target.startsWith(base + "/");
  }

  if (glob.endsWith("/*")) {
    const base = glob.slice(0, -2);
    if (base === "") {
      // Pattern "/*": any single top-level segment, but not the root itself.
      return target !== "/" && !target.slice(1).includes("/");
    }
    if (!target.startsWith(base + "/")) {
      return false;
    }
    const remainder = target.slice(base.length + 1);
    return remainder.length > 0 && !remainder.includes("/");
  }

  return target === glob;
}
|
|
18875
|
+
/**
 * Report whether a URL path matches at least one of the given patterns.
 *
 * @param {string} url - URL path to test.
 * @param {Iterable<string>} patterns - Patterns handed to `matchUrlPattern`.
 * @returns {boolean} `true` on the first matching pattern, otherwise `false`.
 */
function matchUrlPatterns(url, patterns) {
  return [...patterns].some((candidate) => matchUrlPattern(url, candidate));
}
|
|
18881
|
+
|
|
18882
|
+
// src/indexing/sources/build/manifest-parser.ts
|
|
19326
18883
|
function routeIdToFile(routeId) {
|
|
19327
18884
|
if (routeId === "/") {
|
|
19328
18885
|
return "src/routes/+page.svelte";
|
|
@@ -19337,7 +18894,7 @@ async function parseManifest(cwd, outputDir) {
|
|
|
19337
18894
|
const manifestPath = path.resolve(cwd, outputDir, "server", "manifest-full.js");
|
|
19338
18895
|
let content;
|
|
19339
18896
|
try {
|
|
19340
|
-
content = await
|
|
18897
|
+
content = await fs3.readFile(manifestPath, "utf8");
|
|
19341
18898
|
} catch {
|
|
19342
18899
|
throw new SearchSocketError(
|
|
19343
18900
|
"BUILD_MANIFEST_NOT_FOUND",
|
|
@@ -19396,15 +18953,7 @@ function expandDynamicUrl(url, value) {
|
|
|
19396
18953
|
return url.replace(/\[\[?\.\.\.[^\]]+\]?\]|\[\[[^\]]+\]\]|\[[^\]]+\]/g, value);
|
|
19397
18954
|
}
|
|
19398
18955
|
function isExcluded(url, patterns) {
|
|
19399
|
-
|
|
19400
|
-
if (pattern.endsWith("/*")) {
|
|
19401
|
-
const prefix = pattern.slice(0, -1);
|
|
19402
|
-
if (url.startsWith(prefix) || url === prefix.slice(0, -1)) return true;
|
|
19403
|
-
} else if (url === pattern) {
|
|
19404
|
-
return true;
|
|
19405
|
-
}
|
|
19406
|
-
}
|
|
19407
|
-
return false;
|
|
18956
|
+
return matchUrlPatterns(url, patterns);
|
|
19408
18957
|
}
|
|
19409
18958
|
function findFreePort() {
|
|
19410
18959
|
return new Promise((resolve, reject) => {
|
|
@@ -19518,7 +19067,7 @@ async function discoverPages(server, buildConfig, pipelineMaxPages) {
|
|
|
19518
19067
|
const visited = /* @__PURE__ */ new Set();
|
|
19519
19068
|
const pages = [];
|
|
19520
19069
|
const queue = [];
|
|
19521
|
-
const limit =
|
|
19070
|
+
const limit = pLimit(8);
|
|
19522
19071
|
for (const seed of seedUrls) {
|
|
19523
19072
|
const normalized = normalizeUrlPath(seed);
|
|
19524
19073
|
if (!visited.has(normalized) && !isExcluded(normalized, exclude)) {
|
|
@@ -19600,7 +19149,7 @@ async function loadBuildPages(cwd, config, maxPages) {
|
|
|
19600
19149
|
const selected = typeof maxCount === "number" ? expanded.slice(0, maxCount) : expanded;
|
|
19601
19150
|
const server = await startPreviewServer(cwd, { previewTimeout: buildConfig.previewTimeout }, logger);
|
|
19602
19151
|
try {
|
|
19603
|
-
const concurrencyLimit =
|
|
19152
|
+
const concurrencyLimit = pLimit(8);
|
|
19604
19153
|
const results = await Promise.allSettled(
|
|
19605
19154
|
selected.map(
|
|
19606
19155
|
(route) => concurrencyLimit(async () => {
|
|
@@ -19674,7 +19223,7 @@ async function loadContentFilesPages(cwd, config, maxPages) {
|
|
|
19674
19223
|
const selected = typeof limit === "number" ? files.slice(0, limit) : files;
|
|
19675
19224
|
const pages = [];
|
|
19676
19225
|
for (const filePath of selected) {
|
|
19677
|
-
const raw = await
|
|
19226
|
+
const raw = await fs3.readFile(filePath, "utf8");
|
|
19678
19227
|
const markdown = filePath.endsWith(".md") ? raw : normalizeSvelteToMarkdown(raw);
|
|
19679
19228
|
pages.push({
|
|
19680
19229
|
url: filePathToUrl(filePath, baseDir),
|
|
@@ -19769,7 +19318,7 @@ async function loadCrawledPages(config, maxPages) {
|
|
|
19769
19318
|
const routes = await resolveRoutes(config);
|
|
19770
19319
|
const maxCount = typeof maxPages === "number" ? Math.max(0, Math.floor(maxPages)) : void 0;
|
|
19771
19320
|
const selected = typeof maxCount === "number" ? routes.slice(0, maxCount) : routes;
|
|
19772
|
-
const concurrencyLimit =
|
|
19321
|
+
const concurrencyLimit = pLimit(8);
|
|
19773
19322
|
const results = await Promise.allSettled(
|
|
19774
19323
|
selected.map(
|
|
19775
19324
|
(route) => concurrencyLimit(async () => {
|
|
@@ -19810,7 +19359,7 @@ async function loadStaticOutputPages(cwd, config, maxPages) {
|
|
|
19810
19359
|
const selected = typeof limit === "number" ? htmlFiles.slice(0, limit) : htmlFiles;
|
|
19811
19360
|
const pages = [];
|
|
19812
19361
|
for (const filePath of selected) {
|
|
19813
|
-
const html = await
|
|
19362
|
+
const html = await fs3.readFile(filePath, "utf8");
|
|
19814
19363
|
pages.push({
|
|
19815
19364
|
url: staticHtmlFileToUrl(filePath, outputDir),
|
|
19816
19365
|
html,
|
|
@@ -19820,43 +19369,293 @@ async function loadStaticOutputPages(cwd, config, maxPages) {
|
|
|
19820
19369
|
}
|
|
19821
19370
|
return pages;
|
|
19822
19371
|
}
|
|
19823
|
-
|
|
19824
|
-
|
|
19825
|
-
|
|
19826
|
-
|
|
19827
|
-
|
|
19828
|
-
|
|
19829
|
-
|
|
19372
|
+
/**
 * Parse robots.txt content and return the rule set that applies to `userAgent`.
 *
 * Grouping rules:
 * - one or more consecutive `User-agent` lines open a group, and the
 *   `Allow` / `Disallow` lines that follow apply to every agent in it;
 * - a `User-agent` line seen after the group has received a rule line starts
 *   a new group, so rules never leak into agents of an earlier group;
 * - other directives (`Sitemap`, `Crawl-delay`, ...) are ignored and do not
 *   end the current group;
 * - rules for an agent that appears in several groups are merged;
 * - `Allow` / `Disallow` lines with an empty value add no pattern.
 *
 * Text after `#` is treated as a comment. Agent names are compared
 * case-insensitively.
 *
 * @param {string} content - Raw robots.txt text.
 * @param {string} [userAgent="Searchsocket"] - Agent whose rules are wanted.
 * @returns {{ disallow: string[], allow: string[] }} The rules of the named
 *   agent when it has at least one rule, otherwise the `*` rules, otherwise
 *   empty lists.
 */
function parseRobotsTxt(content, userAgent = "Searchsocket") {
  const agentGroups = /* @__PURE__ */ new Map();
  let currentAgents = [];
  // Set once the current group has seen an allow/disallow line; the next
  // user-agent line then opens a fresh group instead of extending this one.
  let groupHasRules = false;
  for (const rawLine of content.split(/\r?\n/)) {
    const line = rawLine.replace(/#.*$/, "").trim();
    if (!line) continue;
    const colonIdx = line.indexOf(":");
    if (colonIdx === -1) continue;
    const directive = line.slice(0, colonIdx).trim().toLowerCase();
    const value = line.slice(colonIdx + 1).trim();
    if (directive === "user-agent") {
      if (groupHasRules) {
        currentAgents = [];
        groupHasRules = false;
      }
      const agentName = value.toLowerCase();
      currentAgents.push(agentName);
      if (!agentGroups.has(agentName)) {
        agentGroups.set(agentName, { disallow: [], allow: [] });
      }
    } else if (directive === "disallow" || directive === "allow") {
      // Even an empty rule line marks the end of the user-agent run.
      groupHasRules = true;
      if (!value) continue;
      for (const agent of currentAgents) {
        agentGroups.get(agent)[directive].push(value);
      }
    }
    // Any other directive is not part of rule matching and is skipped
    // without disturbing the current group.
  }
  const specific = agentGroups.get(userAgent.toLowerCase());
  if (specific && (specific.disallow.length > 0 || specific.allow.length > 0)) {
    return specific;
  }
  return agentGroups.get("*") ?? { disallow: [], allow: [] };
}
|
|
19407
|
+
/**
 * Decide whether a URL path is blocked by a parsed robots.txt rule set.
 *
 * Patterns are plain path prefixes. The longest matching `disallow` prefix is
 * compared with the longest matching `allow` prefix: the path is blocked only
 * when some disallow prefix matches and it is strictly longer than every
 * matching allow prefix, so a tie goes to allow.
 *
 * @param {string} urlPath - URL path to check.
 * @param {{ disallow: string[], allow: string[] }} rules3 - Parsed rule set.
 * @returns {boolean} `true` when the path must not be indexed.
 */
function isBlockedByRobots(urlPath, rules3) {
  const longestMatchLength = (prefixes) => {
    let best = 0;
    for (const prefix of prefixes) {
      if (prefix.length > best && urlPath.startsWith(prefix)) {
        best = prefix.length;
      }
    }
    return best;
  };
  const disallowLength = longestMatchLength(rules3.disallow);
  if (disallowLength === 0) {
    return false;
  }
  return longestMatchLength(rules3.allow) < disallowLength;
}
|
|
19423
|
+
/**
 * Read `robots.txt` from a directory and parse it with the default user agent.
 *
 * @param {string} dir - Directory expected to contain `robots.txt`.
 * @returns {Promise<object | null>} The parsed rule set, or `null` when the
 *   file cannot be read or parsing throws.
 */
async function loadRobotsTxtFromDir(dir) {
  let parsedRules = null;
  try {
    const robotsPath = path.join(dir, "robots.txt");
    const text = await fs3.readFile(robotsPath, "utf8");
    parsedRules = parseRobotsTxt(text);
  } catch {
    parsedRules = null;
  }
  return parsedRules;
}
|
|
19431
|
+
/**
 * Fetch `/robots.txt` relative to a base URL and parse it with the default
 * user agent.
 *
 * @param {string} baseUrl - Site base URL; `/robots.txt` is resolved against it.
 * @returns {Promise<object | null>} The parsed rule set, or `null` when the
 *   URL is invalid, the request fails, the response status is not OK, or
 *   parsing throws.
 */
async function fetchRobotsTxt(baseUrl) {
  try {
    const robotsUrl = new URL("/robots.txt", baseUrl);
    const res = await fetch(robotsUrl.href);
    if (!res.ok) {
      return null;
    }
    const body = await res.text();
    return parseRobotsTxt(body);
  } catch {
    return null;
  }
}
|
|
19442
|
+
|
|
19443
|
+
// src/search/ranking.ts
|
|
19444
|
+
/**
 * Clamp a value to a non-negative finite number.
 *
 * @param {*} value - Candidate number.
 * @returns {number} `value` when it is a finite number >= 0, otherwise 0
 *   (negative, NaN, infinite and non-number inputs all yield 0).
 */
function nonNegativeOrZero(value) {
  return Number.isFinite(value) ? Math.max(0, value) : 0;
}
|
|
19450
|
+
/**
 * Reduce text to a canonical form for title/query comparison: lower-cased,
 * every character other than a-z, 0-9 and whitespace removed, whitespace runs
 * collapsed to one space, and the ends trimmed.
 *
 * @param {string} text - Text to normalize.
 * @returns {string} Normalized text.
 */
function normalizeForTitleMatch(text) {
  const lowered = text.toLowerCase();
  const alphanumericOnly = lowered.replace(/[^a-z0-9\s]/g, "");
  const singleSpaced = alphanumericOnly.replace(/\s+/g, " ");
  return singleSpaced.trim();
}
|
|
19453
|
+
/**
 * Score and order search hits.
 *
 * Each hit starts from its own `score` (negative infinity when that is not a
 * finite number) and may receive:
 * - an incoming-link boost, `log(1 + incomingLinks) * weights.incomingLinks`,
 *   when `ranking.enableIncomingLinkBoost` is set;
 * - a depth boost, `1 / (1 + depth) * weights.depth`, when
 *   `ranking.enableDepthBoost` is set;
 * - a flat `weights.titleMatch` bonus when a query is given, the weight is
 *   positive, and the normalized title contains the normalized query or the
 *   other way round.
 *
 * @param {Array<object>} hits - Hits carrying `score` and `metadata`.
 * @param {object} config - Configuration; only `config.ranking` is read.
 * @param {string} [query] - Query used for the title-match bonus.
 * @returns {Array<{ hit: object, finalScore: number }>} Entries sorted by
 *   `finalScore`, highest first.
 */
function rankHits(hits, config, query) {
  const { ranking } = config;
  const queryKey = query ? normalizeForTitleMatch(query) : "";
  const titleBonus = ranking.weights.titleMatch;

  const scoreHit = (hit) => {
    let total = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
    if (ranking.enableIncomingLinkBoost) {
      const linkBoost = Math.log(1 + nonNegativeOrZero(hit.metadata.incomingLinks));
      total += linkBoost * ranking.weights.incomingLinks;
    }
    if (ranking.enableDepthBoost) {
      const shallowBoost = 1 / (1 + nonNegativeOrZero(hit.metadata.depth));
      total += shallowBoost * ranking.weights.depth;
    }
    if (queryKey && titleBonus > 0) {
      const titleKey = normalizeForTitleMatch(hit.metadata.title);
      const titlesOverlap = titleKey.includes(queryKey) || queryKey.includes(titleKey);
      if (titleKey.length > 0 && titlesOverlap) {
        total += titleBonus;
      }
    }
    return {
      hit,
      finalScore: Number.isFinite(total) ? total : Number.NEGATIVE_INFINITY
    };
  };

  // Two non-finite scores subtract to NaN; treat such pairs as equal.
  const byFinalScoreDesc = (left, right) => {
    const difference = right.finalScore - left.finalScore;
    return Number.isNaN(difference) ? 0 : difference;
  };

  return hits.map(scoreHit).sort(byFinalScoreDesc);
}
|
|
19481
|
+
/**
 * Trim a ranked page list using two configurable heuristics.
 *
 * 1. When `ranking.minScore` is positive and the median `pageScore` of the
 *    list is below it, the whole list is dropped.
 * 2. When `ranking.scoreGapThreshold` is positive, the list is cut just
 *    before the first entry whose score falls below its predecessor by at
 *    least that fraction. Only positive predecessor scores are considered.
 *
 * @param {Array<{ pageScore: number }>} results - Pages, best first.
 * @param {object} config - Configuration; only `config.ranking` is read.
 * @returns {Array<object>} The input array itself when nothing is trimmed,
 *   otherwise a new, shorter (possibly empty) array.
 */
function trimByScoreGap(results, config) {
  if (results.length === 0) {
    return results;
  }
  const { scoreGapThreshold, minScore } = config.ranking;

  if (minScore > 0) {
    const ascending = results.map((entry) => entry.pageScore).sort((a, b) => a - b);
    const middle = Math.floor(ascending.length / 2);
    const hasEvenCount = ascending.length % 2 === 0;
    const median = hasEvenCount ? (ascending[middle - 1] + ascending[middle]) / 2 : ascending[middle];
    if (median < minScore) {
      return [];
    }
  }

  if (scoreGapThreshold > 0 && results.length > 1) {
    const cutIndex = results.findIndex((entry, position) => {
      if (position === 0) {
        return false;
      }
      const previous = results[position - 1].pageScore;
      return previous > 0 && (previous - entry.pageScore) / previous >= scoreGapThreshold;
    });
    if (cutIndex !== -1) {
      return results.slice(0, cutIndex);
    }
  }

  return results;
}
|
|
19505
|
+
/**
 * Look up the weight configured for a URL.
 *
 * Among all patterns in `pageWeights` that match the URL, the one with the
 * longest pattern string wins; on equal length the earlier entry is kept.
 *
 * @param {string} url - URL path to look up.
 * @param {Record<string, number>} pageWeights - Map of URL pattern to weight.
 * @returns {number} The winning weight, or 1 when no pattern matches.
 */
function findPageWeight(url, pageWeights) {
  let winner = { pattern: "", weight: 1 };
  for (const [pattern, weight] of Object.entries(pageWeights)) {
    const isMoreSpecific = pattern.length > winner.pattern.length;
    if (isMoreSpecific && matchUrlPattern(url, pattern)) {
      winner = { pattern, weight };
    }
  }
  return winner.weight;
}
|
|
19516
|
+
/**
 * Collapse ranked chunk hits into one entry per page URL.
 *
 * For each URL the chunks are sorted by `finalScore` (highest first). The
 * page score is the best chunk's score plus a decayed bonus from the other
 * chunks within the first `ranking.aggregationCap` entries:
 * `sum(score_i * aggregationDecay ** i) * weights.aggregation` for i >= 1.
 * The result is then multiplied by the page weight from
 * `ranking.pageWeights`; pages whose weight is exactly 0 are omitted.
 *
 * Note that each group's chunk array is sorted in place and is the array
 * exposed as `matchingChunks`.
 *
 * @param {Array<{ hit: object, finalScore: number }>} ranked - Ranked chunks.
 * @param {object} config - Configuration; only `config.ranking` is read.
 * @returns {Array<object>} Page entries sorted by `pageScore`, highest first.
 */
function aggregateByPage(ranked, config) {
  const chunksByUrl = /* @__PURE__ */ new Map();
  for (const entry of ranked) {
    const pageUrl = entry.hit.metadata.url;
    const bucket = chunksByUrl.get(pageUrl);
    if (bucket) {
      bucket.push(entry);
    } else {
      chunksByUrl.set(pageUrl, [entry]);
    }
  }

  const { aggregationCap, aggregationDecay } = config.ranking;
  // Two non-finite scores subtract to NaN; treat such pairs as equal.
  const descendingBy = (key) => (left, right) => {
    const difference = right[key] - left[key];
    return Number.isNaN(difference) ? 0 : difference;
  };

  const pages = [];
  for (const [url, chunks] of chunksByUrl) {
    chunks.sort(descendingBy("finalScore"));
    const best = chunks[0];
    const maxScore = Number.isFinite(best.finalScore) ? best.finalScore : Number.NEGATIVE_INFINITY;

    // Index 0 is the best chunk itself; only the following ones add a bonus.
    const aggregationBonus = chunks.slice(0, aggregationCap).reduce((sum, chunk, position) => {
      if (position === 0) {
        return sum;
      }
      const chunkScore = Number.isFinite(chunk.finalScore) ? chunk.finalScore : 0;
      return sum + chunkScore * Math.pow(aggregationDecay, position);
    }, 0);

    let pageScore = maxScore + aggregationBonus * config.ranking.weights.aggregation;
    const pageWeight = findPageWeight(url, config.ranking.pageWeights);
    if (pageWeight === 0) {
      continue;
    }
    if (pageWeight !== 1) {
      pageScore *= pageWeight;
    }

    pages.push({
      url,
      title: best.hit.metadata.title,
      routeFile: best.hit.metadata.routeFile,
      pageScore: Number.isFinite(pageScore) ? pageScore : Number.NEGATIVE_INFINITY,
      bestChunk: best,
      matchingChunks: chunks
    });
  }

  return pages.sort(descendingBy("pageScore"));
}
|
|
19559
|
+
/**
 * Combine page-level search hits with ranked chunk hits.
 *
 * With `w = config.search.pageSearchWeight`:
 * - a chunk whose URL also has a page hit gets the blended score
 *   `(1 - w) * chunkScore + w * pageScore` (the chunk score is kept when the
 *   blend is not finite);
 * - a page hit with no chunk is added as a synthetic hit with id
 *   `page:<url>` and score `pageScore * w` (0 when not finite), using the
 *   page description, or else its title, as snippet and chunk text;
 * - all other chunks pass through unchanged.
 *
 * @param {Array<object>} pageHits - Page-level hits.
 * @param {Array<{ hit: object, finalScore: number }>} rankedChunks - Chunks.
 * @param {object} config - Configuration; reads `config.search.pageSearchWeight`.
 * @returns {Array<{ hit: object, finalScore: number }>} `rankedChunks` itself
 *   when there are no page hits, otherwise a new array sorted by
 *   `finalScore`, highest first.
 */
function mergePageAndChunkResults(pageHits, rankedChunks, config) {
  if (pageHits.length === 0) {
    return rankedChunks;
  }
  const pageShare = config.search.pageSearchWeight;
  // For duplicate URLs the later page hit wins.
  const pagesByUrl = new Map(pageHits.map((page) => [page.url, page]));
  const urlsWithChunks = /* @__PURE__ */ new Set();

  const merged = rankedChunks.map((ranked) => {
    const chunkUrl = ranked.hit.metadata.url;
    const page = pagesByUrl.get(chunkUrl);
    if (!page) {
      return ranked;
    }
    urlsWithChunks.add(chunkUrl);
    const blended = (1 - pageShare) * ranked.finalScore + pageShare * page.score;
    return {
      hit: ranked.hit,
      finalScore: Number.isFinite(blended) ? blended : ranked.finalScore
    };
  });

  for (const [url, page] of pagesByUrl) {
    if (urlsWithChunks.has(url)) {
      continue;
    }
    const fallbackText = page.description || page.title;
    const scaledScore = page.score * pageShare;
    merged.push({
      hit: {
        id: `page:${url}`,
        score: page.score,
        metadata: {
          projectId: "",
          scopeName: "",
          url: page.url,
          path: page.url,
          title: page.title,
          sectionTitle: "",
          headingPath: [],
          snippet: fallbackText,
          chunkText: fallbackText,
          ordinal: 0,
          contentHash: "",
          depth: page.depth,
          incomingLinks: page.incomingLinks,
          routeFile: page.routeFile,
          tags: page.tags
        }
      },
      finalScore: Number.isFinite(scaledScore) ? scaledScore : 0
    });
  }

  return merged.sort((left, right) => {
    const difference = right.finalScore - left.finalScore;
    return Number.isNaN(difference) ? 0 : difference;
  });
}
|
|
19614
|
+
|
|
19615
|
+
// src/utils/time.ts
|
|
19616
|
+
/**
 * Return the current time as an ISO 8601 string.
 *
 * @returns {string} The result of `Date.prototype.toISOString()` for "now".
 */
function nowIso() {
  const current = new Date();
  return current.toISOString();
}
|
|
19619
|
+
/**
 * Milliseconds elapsed since a `process.hrtime.bigint()` reading.
 *
 * @param {bigint} start - Earlier value of `process.hrtime.bigint()`.
 * @returns {number} Elapsed time in milliseconds (fractional).
 */
function hrTimeMs(start) {
  const elapsedNs = process.hrtime.bigint() - start;
  return Number(elapsedNs) / 1e6;
}
|
|
19622
|
+
|
|
19623
|
+
// src/indexing/pipeline.ts
|
|
19624
|
+
/**
 * Build a plain-text summary of a page.
 *
 * The summary joins, with blank lines between them: the title, the
 * description (when truthy), the keywords joined by ", " (when the list is
 * non-empty), and the markdown body reduced to plain text (when non-empty).
 * The body has fenced code blocks removed, inline code and link/image syntax
 * unwrapped to their text, heading markers dropped, common markdown
 * punctuation replaced by spaces, and whitespace collapsed.
 *
 * @param {{ title: string, description?: string, keywords?: string[], markdown: string }} page
 *   Page to summarize.
 * @param {number} [maxChars=3500] - Maximum length of the result.
 * @returns {string} The summary; when longer than `maxChars` it is cut to
 *   that length and then trimmed.
 */
function buildPageSummary(page, maxChars = 3500) {
  const sections = [page.title];
  if (page.description) {
    sections.push(page.description);
  }
  if (page.keywords && page.keywords.length > 0) {
    sections.push(page.keywords.join(", "));
  }

  // Applied in order; each pair is [pattern, replacement].
  const markdownCleanup = [
    [/```[\s\S]*?```/g, " "],
    [/`([^`]+)`/g, "$1"],
    [/!?\[([^\]]*)\]\([^)]*\)/g, "$1"],
    [/^#{1,6}\s+/gm, ""],
    [/[>*_|~\-]/g, " "],
    [/\s+/g, " "]
  ];
  let bodyText = page.markdown;
  for (const [pattern, replacement] of markdownCleanup) {
    bodyText = bodyText.replace(pattern, replacement);
  }
  bodyText = bodyText.trim();
  if (bodyText) {
    sections.push(bodyText);
  }

  const summary = sections.join("\n\n");
  return summary.length > maxChars ? summary.slice(0, maxChars).trim() : summary;
}
|
|
19831
|
-
|
|
19832
|
-
// src/indexing/pipeline.ts
|
|
19833
|
-
var EMBEDDING_PRICE_PER_1K_TOKENS_USD = {
|
|
19834
|
-
"jina-embeddings-v3": 2e-5
|
|
19835
|
-
};
|
|
19836
|
-
var DEFAULT_EMBEDDING_PRICE_PER_1K = 2e-5;
|
|
19837
19640
|
var IndexPipeline = class _IndexPipeline {
|
|
19838
19641
|
cwd;
|
|
19839
19642
|
config;
|
|
19840
|
-
|
|
19841
|
-
vectorStore;
|
|
19643
|
+
store;
|
|
19842
19644
|
logger;
|
|
19843
19645
|
constructor(options) {
|
|
19844
19646
|
this.cwd = options.cwd;
|
|
19845
19647
|
this.config = options.config;
|
|
19846
|
-
this.
|
|
19847
|
-
this.vectorStore = options.vectorStore;
|
|
19648
|
+
this.store = options.store;
|
|
19848
19649
|
this.logger = options.logger;
|
|
19849
19650
|
}
|
|
19850
19651
|
static async create(options = {}) {
|
|
19851
19652
|
const cwd = path.resolve(options.cwd ?? process.cwd());
|
|
19852
19653
|
const config = options.config ?? await loadConfig({ cwd, configPath: options.configPath });
|
|
19853
|
-
const
|
|
19854
|
-
const vectorStore = options.vectorStore ?? await createVectorStore(config, cwd);
|
|
19654
|
+
const store = options.store ?? await createUpstashStore(config);
|
|
19855
19655
|
return new _IndexPipeline({
|
|
19856
19656
|
cwd,
|
|
19857
19657
|
config,
|
|
19858
|
-
|
|
19859
|
-
vectorStore,
|
|
19658
|
+
store,
|
|
19860
19659
|
logger: options.logger ?? new Logger()
|
|
19861
19660
|
});
|
|
19862
19661
|
}
|
|
@@ -19876,25 +19675,17 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19876
19675
|
stageTimingsMs[name] = Math.round(hrTimeMs(start));
|
|
19877
19676
|
};
|
|
19878
19677
|
const scope = resolveScope(this.config, options.scopeOverride);
|
|
19879
|
-
|
|
19678
|
+
ensureStateDirs(this.cwd, this.config.state.dir);
|
|
19880
19679
|
const sourceMode = options.sourceOverride ?? this.config.source.mode;
|
|
19881
|
-
this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode},
|
|
19680
|
+
this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, backend: upstash-search)`);
|
|
19882
19681
|
if (options.force) {
|
|
19883
19682
|
this.logger.info("Force mode enabled \u2014 full rebuild");
|
|
19884
|
-
await cleanMirrorForScope(statePath, scope);
|
|
19885
19683
|
}
|
|
19886
19684
|
if (options.dryRun) {
|
|
19887
19685
|
this.logger.info("Dry run \u2014 no writes will be performed");
|
|
19888
19686
|
}
|
|
19889
19687
|
const manifestStart = stageStart();
|
|
19890
|
-
const existingHashes = await this.
|
|
19891
|
-
const existingModelId = await this.vectorStore.getScopeModelId(scope);
|
|
19892
|
-
if (existingModelId && existingModelId !== this.config.embeddings.model && !options.force) {
|
|
19893
|
-
throw new SearchSocketError(
|
|
19894
|
-
"EMBEDDING_MODEL_MISMATCH",
|
|
19895
|
-
`Scope ${scope.scopeName} uses model ${existingModelId}. Re-run with --force to migrate.`
|
|
19896
|
-
);
|
|
19897
|
-
}
|
|
19688
|
+
const existingHashes = options.force ? /* @__PURE__ */ new Map() : await this.store.getContentHashes(scope);
|
|
19898
19689
|
stageEnd("manifest", manifestStart);
|
|
19899
19690
|
this.logger.debug(`Manifest: ${existingHashes.size} existing chunk hashes loaded`);
|
|
19900
19691
|
const sourceStart = stageStart();
|
|
@@ -19911,6 +19702,53 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19911
19702
|
}
|
|
19912
19703
|
stageEnd("source", sourceStart);
|
|
19913
19704
|
this.logger.info(`Loaded ${sourcePages.length} page${sourcePages.length === 1 ? "" : "s"} (${stageTimingsMs["source"]}ms)`);
|
|
19705
|
+
const filterStart = stageStart();
|
|
19706
|
+
let filteredSourcePages = sourcePages;
|
|
19707
|
+
if (this.config.exclude.length > 0) {
|
|
19708
|
+
const beforeExclude = filteredSourcePages.length;
|
|
19709
|
+
filteredSourcePages = filteredSourcePages.filter((p) => {
|
|
19710
|
+
const url = normalizeUrlPath(p.url);
|
|
19711
|
+
if (matchUrlPatterns(url, this.config.exclude)) {
|
|
19712
|
+
this.logger.debug(`Excluding ${url} (matched exclude pattern)`);
|
|
19713
|
+
return false;
|
|
19714
|
+
}
|
|
19715
|
+
return true;
|
|
19716
|
+
});
|
|
19717
|
+
const excludedCount = beforeExclude - filteredSourcePages.length;
|
|
19718
|
+
if (excludedCount > 0) {
|
|
19719
|
+
this.logger.info(`Excluded ${excludedCount} page${excludedCount === 1 ? "" : "s"} by config exclude patterns`);
|
|
19720
|
+
}
|
|
19721
|
+
}
|
|
19722
|
+
if (this.config.respectRobotsTxt) {
|
|
19723
|
+
let robotsRules = null;
|
|
19724
|
+
if (sourceMode === "static-output") {
|
|
19725
|
+
robotsRules = await loadRobotsTxtFromDir(
|
|
19726
|
+
path.resolve(this.cwd, this.config.source.staticOutputDir)
|
|
19727
|
+
);
|
|
19728
|
+
} else if (sourceMode === "build" && this.config.source.build) {
|
|
19729
|
+
robotsRules = await loadRobotsTxtFromDir(
|
|
19730
|
+
path.resolve(this.cwd, this.config.source.build.outputDir)
|
|
19731
|
+
);
|
|
19732
|
+
} else if (sourceMode === "crawl" && this.config.source.crawl) {
|
|
19733
|
+
robotsRules = await fetchRobotsTxt(this.config.source.crawl.baseUrl);
|
|
19734
|
+
}
|
|
19735
|
+
if (robotsRules) {
|
|
19736
|
+
const beforeRobots = filteredSourcePages.length;
|
|
19737
|
+
filteredSourcePages = filteredSourcePages.filter((p) => {
|
|
19738
|
+
const url = normalizeUrlPath(p.url);
|
|
19739
|
+
if (isBlockedByRobots(url, robotsRules)) {
|
|
19740
|
+
this.logger.debug(`Excluding ${url} (blocked by robots.txt)`);
|
|
19741
|
+
return false;
|
|
19742
|
+
}
|
|
19743
|
+
return true;
|
|
19744
|
+
});
|
|
19745
|
+
const robotsExcluded = beforeRobots - filteredSourcePages.length;
|
|
19746
|
+
if (robotsExcluded > 0) {
|
|
19747
|
+
this.logger.info(`Excluded ${robotsExcluded} page${robotsExcluded === 1 ? "" : "s"} by robots.txt`);
|
|
19748
|
+
}
|
|
19749
|
+
}
|
|
19750
|
+
}
|
|
19751
|
+
stageEnd("filter", filterStart);
|
|
19914
19752
|
const routeStart = stageStart();
|
|
19915
19753
|
const routePatterns = await buildRoutePatterns(this.cwd);
|
|
19916
19754
|
stageEnd("route_map", routeStart);
|
|
@@ -19918,7 +19756,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19918
19756
|
const extractStart = stageStart();
|
|
19919
19757
|
this.logger.info("Extracting content...");
|
|
19920
19758
|
const extractedPages = [];
|
|
19921
|
-
for (const sourcePage of
|
|
19759
|
+
for (const sourcePage of filteredSourcePages) {
|
|
19922
19760
|
const extracted = sourcePage.html ? extractFromHtml(sourcePage.url, sourcePage.html, this.config) : extractFromMarkdown(sourcePage.url, sourcePage.markdown ?? "", sourcePage.title);
|
|
19923
19761
|
if (!extracted) {
|
|
19924
19762
|
this.logger.warn(
|
|
@@ -19944,16 +19782,29 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19944
19782
|
seenUrls.add(page.url);
|
|
19945
19783
|
uniquePages.push(page);
|
|
19946
19784
|
}
|
|
19785
|
+
const indexablePages = [];
|
|
19786
|
+
for (const page of uniquePages) {
|
|
19787
|
+
const effectiveWeight = page.weight ?? findPageWeight(page.url, this.config.ranking.pageWeights);
|
|
19788
|
+
if (effectiveWeight === 0) {
|
|
19789
|
+
this.logger.debug(`Excluding ${page.url} (zero weight)`);
|
|
19790
|
+
continue;
|
|
19791
|
+
}
|
|
19792
|
+
indexablePages.push(page);
|
|
19793
|
+
}
|
|
19794
|
+
const zeroWeightCount = uniquePages.length - indexablePages.length;
|
|
19795
|
+
if (zeroWeightCount > 0) {
|
|
19796
|
+
this.logger.info(`Excluded ${zeroWeightCount} page${zeroWeightCount === 1 ? "" : "s"} with zero weight`);
|
|
19797
|
+
}
|
|
19947
19798
|
stageEnd("extract", extractStart);
|
|
19948
|
-
const skippedPages =
|
|
19949
|
-
this.logger.info(`Extracted ${
|
|
19799
|
+
const skippedPages = filteredSourcePages.length - indexablePages.length;
|
|
19800
|
+
this.logger.info(`Extracted ${indexablePages.length} page${indexablePages.length === 1 ? "" : "s"}${skippedPages > 0 ? ` (${skippedPages} skipped)` : ""} (${stageTimingsMs["extract"]}ms)`);
|
|
19950
19801
|
const linkStart = stageStart();
|
|
19951
|
-
const pageSet = new Set(
|
|
19802
|
+
const pageSet = new Set(indexablePages.map((page) => normalizeUrlPath(page.url)));
|
|
19952
19803
|
const incomingLinkCount = /* @__PURE__ */ new Map();
|
|
19953
|
-
for (const page of
|
|
19804
|
+
for (const page of indexablePages) {
|
|
19954
19805
|
incomingLinkCount.set(page.url, incomingLinkCount.get(page.url) ?? 0);
|
|
19955
19806
|
}
|
|
19956
|
-
for (const page of
|
|
19807
|
+
for (const page of indexablePages) {
|
|
19957
19808
|
for (const outgoing of page.outgoingLinks) {
|
|
19958
19809
|
if (!pageSet.has(outgoing)) {
|
|
19959
19810
|
continue;
|
|
@@ -19963,9 +19814,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19963
19814
|
}
|
|
19964
19815
|
stageEnd("links", linkStart);
|
|
19965
19816
|
this.logger.debug(`Link analysis: computed incoming links for ${incomingLinkCount.size} pages (${stageTimingsMs["links"]}ms)`);
|
|
19966
|
-
const
|
|
19967
|
-
this.logger.info("
|
|
19968
|
-
const
|
|
19817
|
+
const pagesStart = stageStart();
|
|
19818
|
+
this.logger.info("Building indexed pages...");
|
|
19819
|
+
const pages = [];
|
|
19969
19820
|
let routeExact = 0;
|
|
19970
19821
|
let routeBestEffort = 0;
|
|
19971
19822
|
const precomputedRoutes = /* @__PURE__ */ new Map();
|
|
@@ -19977,7 +19828,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19977
19828
|
});
|
|
19978
19829
|
}
|
|
19979
19830
|
}
|
|
19980
|
-
for (const page of
|
|
19831
|
+
for (const page of indexablePages) {
|
|
19981
19832
|
const routeMatch = precomputedRoutes.get(normalizeUrlPath(page.url)) ?? mapUrlToRoute(page.url, routePatterns);
|
|
19982
19833
|
if (routeMatch.routeResolution === "best-effort") {
|
|
19983
19834
|
if (this.config.source.strictRouteMapping) {
|
|
@@ -19994,7 +19845,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19994
19845
|
} else {
|
|
19995
19846
|
routeExact += 1;
|
|
19996
19847
|
}
|
|
19997
|
-
const
|
|
19848
|
+
const indexedPage = {
|
|
19998
19849
|
url: page.url,
|
|
19999
19850
|
title: page.title,
|
|
20000
19851
|
scope: scope.scopeName,
|
|
@@ -20009,35 +19860,38 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20009
19860
|
description: page.description,
|
|
20010
19861
|
keywords: page.keywords
|
|
20011
19862
|
};
|
|
20012
|
-
|
|
20013
|
-
|
|
20014
|
-
await writeMirrorPage(statePath, scope, mirror);
|
|
20015
|
-
}
|
|
20016
|
-
this.logger.event("markdown_written", { url: page.url });
|
|
19863
|
+
pages.push(indexedPage);
|
|
19864
|
+
this.logger.event("page_indexed", { url: page.url });
|
|
20017
19865
|
}
|
|
20018
19866
|
if (!options.dryRun) {
|
|
20019
|
-
const pageRecords =
|
|
20020
|
-
|
|
20021
|
-
|
|
20022
|
-
|
|
20023
|
-
|
|
20024
|
-
|
|
20025
|
-
|
|
20026
|
-
|
|
20027
|
-
|
|
20028
|
-
|
|
20029
|
-
|
|
20030
|
-
|
|
20031
|
-
|
|
20032
|
-
|
|
20033
|
-
|
|
20034
|
-
|
|
19867
|
+
const pageRecords = pages.map((p) => {
|
|
19868
|
+
const summary = buildPageSummary(p);
|
|
19869
|
+
return {
|
|
19870
|
+
url: p.url,
|
|
19871
|
+
title: p.title,
|
|
19872
|
+
markdown: p.markdown,
|
|
19873
|
+
projectId: scope.projectId,
|
|
19874
|
+
scopeName: scope.scopeName,
|
|
19875
|
+
routeFile: p.routeFile,
|
|
19876
|
+
routeResolution: p.routeResolution,
|
|
19877
|
+
incomingLinks: p.incomingLinks,
|
|
19878
|
+
outgoingLinks: p.outgoingLinks,
|
|
19879
|
+
depth: p.depth,
|
|
19880
|
+
tags: p.tags,
|
|
19881
|
+
indexedAt: p.generatedAt,
|
|
19882
|
+
summary,
|
|
19883
|
+
description: p.description,
|
|
19884
|
+
keywords: p.keywords
|
|
19885
|
+
};
|
|
19886
|
+
});
|
|
19887
|
+
await this.store.deletePages(scope);
|
|
19888
|
+
await this.store.upsertPages(pageRecords, scope);
|
|
20035
19889
|
}
|
|
20036
|
-
stageEnd("
|
|
20037
|
-
this.logger.info(`
|
|
19890
|
+
stageEnd("pages", pagesStart);
|
|
19891
|
+
this.logger.info(`Indexed ${pages.length} page${pages.length === 1 ? "" : "s"} (${routeExact} exact, ${routeBestEffort} best-effort) (${stageTimingsMs["pages"]}ms)`);
|
|
20038
19892
|
const chunkStart = stageStart();
|
|
20039
19893
|
this.logger.info("Chunking pages...");
|
|
20040
|
-
let chunks =
|
|
19894
|
+
let chunks = pages.flatMap((page) => chunkPage(page, this.config, scope));
|
|
20041
19895
|
const maxChunks = typeof options.maxChunks === "number" ? Math.max(0, Math.floor(options.maxChunks)) : void 0;
|
|
20042
19896
|
if (typeof maxChunks === "number") {
|
|
20043
19897
|
chunks = chunks.slice(0, maxChunks);
|
|
@@ -20069,259 +19923,90 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20069
19923
|
});
|
|
20070
19924
|
const deletes = [...existingHashes.keys()].filter((chunkKey) => !currentChunkMap.has(chunkKey));
|
|
20071
19925
|
this.logger.info(`Changes detected: ${changedChunks.length} changed, ${deletes.length} deleted, ${chunks.length - changedChunks.length} unchanged`);
|
|
20072
|
-
const
|
|
20073
|
-
|
|
20074
|
-
for (const chunk of changedChunks) {
|
|
20075
|
-
chunkTokenEstimates.set(chunk.chunkKey, this.embeddings.estimateTokens(buildEmbeddingText(chunk, this.config.chunking.prependTitle)));
|
|
20076
|
-
}
|
|
20077
|
-
const estimatedTokens = changedChunks.reduce(
|
|
20078
|
-
(sum, chunk) => sum + (chunkTokenEstimates.get(chunk.chunkKey) ?? 0),
|
|
20079
|
-
0
|
|
20080
|
-
);
|
|
20081
|
-
const pricePer1k = this.config.embeddings.pricePer1kTokens ?? EMBEDDING_PRICE_PER_1K_TOKENS_USD[this.config.embeddings.model] ?? DEFAULT_EMBEDDING_PRICE_PER_1K;
|
|
20082
|
-
const estimatedCostUSD = estimatedTokens / 1e3 * pricePer1k;
|
|
20083
|
-
let newEmbeddings = 0;
|
|
20084
|
-
const vectorsByChunk = /* @__PURE__ */ new Map();
|
|
19926
|
+
const upsertStart = stageStart();
|
|
19927
|
+
let documentsUpserted = 0;
|
|
20085
19928
|
if (!options.dryRun && changedChunks.length > 0) {
|
|
20086
|
-
this.logger.info(`
|
|
20087
|
-
const
|
|
20088
|
-
|
|
20089
|
-
|
|
20090
|
-
|
|
20091
|
-
|
|
20092
|
-
|
|
20093
|
-
|
|
20094
|
-
|
|
20095
|
-
|
|
20096
|
-
);
|
|
20097
|
-
|
|
20098
|
-
for (let i = 0; i < changedChunks.length; i += 1) {
|
|
20099
|
-
const chunk = changedChunks[i];
|
|
20100
|
-
const embedding = embeddings[i];
|
|
20101
|
-
if (!chunk || !embedding || embedding.length === 0 || embedding.some((value) => !Number.isFinite(value))) {
|
|
20102
|
-
throw new SearchSocketError(
|
|
20103
|
-
"VECTOR_BACKEND_UNAVAILABLE",
|
|
20104
|
-
`Embedding provider returned an invalid vector for chunk index ${i}.`
|
|
20105
|
-
);
|
|
20106
|
-
}
|
|
20107
|
-
vectorsByChunk.set(chunk.chunkKey, embedding);
|
|
20108
|
-
newEmbeddings += 1;
|
|
20109
|
-
this.logger.event("embedded_new", { chunkKey: chunk.chunkKey });
|
|
20110
|
-
}
|
|
20111
|
-
}
|
|
20112
|
-
stageEnd("embedding", embedStart);
|
|
20113
|
-
if (changedChunks.length > 0) {
|
|
20114
|
-
this.logger.info(`Embedded ${newEmbeddings} chunk${newEmbeddings === 1 ? "" : "s"} (${stageTimingsMs["embedding"]}ms)`);
|
|
20115
|
-
} else {
|
|
20116
|
-
this.logger.info("No chunks to embed \u2014 all up to date");
|
|
20117
|
-
}
|
|
20118
|
-
const syncStart = stageStart();
|
|
20119
|
-
if (!options.dryRun) {
|
|
20120
|
-
this.logger.info("Syncing vectors...");
|
|
20121
|
-
const upserts = [];
|
|
20122
|
-
for (const chunk of changedChunks) {
|
|
20123
|
-
const vector = vectorsByChunk.get(chunk.chunkKey);
|
|
20124
|
-
if (!vector) {
|
|
20125
|
-
continue;
|
|
20126
|
-
}
|
|
20127
|
-
upserts.push({
|
|
19929
|
+
this.logger.info(`Upserting ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} to Upstash Search...`);
|
|
19930
|
+
const UPSTASH_CONTENT_LIMIT = 4096;
|
|
19931
|
+
const docs = changedChunks.map((chunk) => {
|
|
19932
|
+
const title = chunk.title;
|
|
19933
|
+
const sectionTitle = chunk.sectionTitle ?? "";
|
|
19934
|
+
const url = chunk.url;
|
|
19935
|
+
const tags = chunk.tags.join(",");
|
|
19936
|
+
const headingPath = chunk.headingPath.join(" > ");
|
|
19937
|
+
const otherFieldsLen = title.length + sectionTitle.length + url.length + tags.length + headingPath.length;
|
|
19938
|
+
const textBudget = Math.max(500, UPSTASH_CONTENT_LIMIT - otherFieldsLen - 50);
|
|
19939
|
+
const text = buildEmbeddingText(chunk, this.config.chunking.prependTitle).slice(0, textBudget);
|
|
19940
|
+
return {
|
|
20128
19941
|
id: chunk.chunkKey,
|
|
20129
|
-
|
|
19942
|
+
content: { title, sectionTitle, text, url, tags, headingPath },
|
|
20130
19943
|
metadata: {
|
|
20131
19944
|
projectId: scope.projectId,
|
|
20132
19945
|
scopeName: scope.scopeName,
|
|
20133
|
-
url: chunk.url,
|
|
20134
19946
|
path: chunk.path,
|
|
20135
|
-
title: chunk.title,
|
|
20136
|
-
sectionTitle: chunk.sectionTitle ?? "",
|
|
20137
|
-
headingPath: chunk.headingPath,
|
|
20138
19947
|
snippet: chunk.snippet,
|
|
20139
|
-
chunkText: chunk.chunkText.slice(0, 4e3),
|
|
20140
19948
|
ordinal: chunk.ordinal,
|
|
20141
19949
|
contentHash: chunk.contentHash,
|
|
20142
|
-
modelId: this.config.embeddings.model,
|
|
20143
19950
|
depth: chunk.depth,
|
|
20144
19951
|
incomingLinks: chunk.incomingLinks,
|
|
20145
19952
|
routeFile: chunk.routeFile,
|
|
20146
|
-
|
|
20147
|
-
|
|
20148
|
-
keywords: chunk.keywords
|
|
19953
|
+
description: chunk.description ?? "",
|
|
19954
|
+
keywords: (chunk.keywords ?? []).join(",")
|
|
20149
19955
|
}
|
|
20150
|
-
}
|
|
20151
|
-
}
|
|
20152
|
-
if (upserts.length > 0) {
|
|
20153
|
-
await this.vectorStore.upsert(upserts, scope);
|
|
20154
|
-
this.logger.event("upserted", { count: upserts.length });
|
|
20155
|
-
}
|
|
20156
|
-
if (deletes.length > 0) {
|
|
20157
|
-
await this.vectorStore.deleteByIds(deletes, scope);
|
|
20158
|
-
this.logger.event("deleted", { count: deletes.length });
|
|
20159
|
-
}
|
|
20160
|
-
}
|
|
20161
|
-
stageEnd("sync", syncStart);
|
|
20162
|
-
this.logger.debug(`Sync complete (${stageTimingsMs["sync"]}ms)`);
|
|
20163
|
-
const finalizeStart = stageStart();
|
|
20164
|
-
if (!options.dryRun) {
|
|
20165
|
-
const scopeInfo = {
|
|
20166
|
-
projectId: scope.projectId,
|
|
20167
|
-
scopeName: scope.scopeName,
|
|
20168
|
-
modelId: this.config.embeddings.model,
|
|
20169
|
-
lastIndexedAt: nowIso(),
|
|
20170
|
-
vectorCount: chunks.length,
|
|
20171
|
-
lastEstimateTokens: estimatedTokens,
|
|
20172
|
-
lastEstimateCostUSD: Number(estimatedCostUSD.toFixed(8)),
|
|
20173
|
-
lastEstimateChangedChunks: changedChunks.length
|
|
20174
|
-
};
|
|
20175
|
-
await this.vectorStore.recordScope(scopeInfo);
|
|
20176
|
-
this.logger.event("registry_updated", {
|
|
20177
|
-
scope: scope.scopeName,
|
|
20178
|
-
vectorCount: chunks.length
|
|
19956
|
+
};
|
|
20179
19957
|
});
|
|
19958
|
+
await this.store.upsertChunks(docs, scope);
|
|
19959
|
+
documentsUpserted = docs.length;
|
|
19960
|
+
this.logger.event("upserted", { count: docs.length });
|
|
19961
|
+
}
|
|
19962
|
+
if (!options.dryRun && deletes.length > 0) {
|
|
19963
|
+
await this.store.deleteByIds(deletes, scope);
|
|
19964
|
+
this.logger.event("deleted", { count: deletes.length });
|
|
19965
|
+
}
|
|
19966
|
+
stageEnd("upsert", upsertStart);
|
|
19967
|
+
if (changedChunks.length > 0) {
|
|
19968
|
+
this.logger.info(`Upserted ${documentsUpserted} document${documentsUpserted === 1 ? "" : "s"} (${stageTimingsMs["upsert"]}ms)`);
|
|
19969
|
+
} else {
|
|
19970
|
+
this.logger.info("No chunks to upsert \u2014 all up to date");
|
|
20180
19971
|
}
|
|
20181
|
-
stageEnd("finalize", finalizeStart);
|
|
20182
19972
|
this.logger.info("Done.");
|
|
20183
19973
|
return {
|
|
20184
|
-
pagesProcessed:
|
|
19974
|
+
pagesProcessed: pages.length,
|
|
20185
19975
|
chunksTotal: chunks.length,
|
|
20186
19976
|
chunksChanged: changedChunks.length,
|
|
20187
|
-
|
|
19977
|
+
documentsUpserted,
|
|
20188
19978
|
deletes: deletes.length,
|
|
20189
|
-
estimatedTokens,
|
|
20190
|
-
estimatedCostUSD: Number(estimatedCostUSD.toFixed(8)),
|
|
20191
19979
|
routeExact,
|
|
20192
19980
|
routeBestEffort,
|
|
20193
19981
|
stageTimingsMs
|
|
20194
19982
|
};
|
|
20195
19983
|
}
|
|
20196
19984
|
};
|
|
20197
|
-
|
|
20198
|
-
// src/search/ranking.ts
|
|
20199
|
-
function nonNegativeOrZero(value) {
|
|
20200
|
-
if (!Number.isFinite(value)) {
|
|
20201
|
-
return 0;
|
|
20202
|
-
}
|
|
20203
|
-
return Math.max(0, value);
|
|
20204
|
-
}
|
|
20205
|
-
function rankHits(hits, config) {
|
|
20206
|
-
return hits.map((hit) => {
|
|
20207
|
-
let score = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
|
|
20208
|
-
if (config.ranking.enableIncomingLinkBoost) {
|
|
20209
|
-
const incomingBoost = Math.log(1 + nonNegativeOrZero(hit.metadata.incomingLinks));
|
|
20210
|
-
score += incomingBoost * config.ranking.weights.incomingLinks;
|
|
20211
|
-
}
|
|
20212
|
-
if (config.ranking.enableDepthBoost) {
|
|
20213
|
-
const depthBoost = 1 / (1 + nonNegativeOrZero(hit.metadata.depth));
|
|
20214
|
-
score += depthBoost * config.ranking.weights.depth;
|
|
20215
|
-
}
|
|
20216
|
-
return {
|
|
20217
|
-
hit,
|
|
20218
|
-
finalScore: Number.isFinite(score) ? score : Number.NEGATIVE_INFINITY
|
|
20219
|
-
};
|
|
20220
|
-
}).sort((a, b) => {
|
|
20221
|
-
const delta = b.finalScore - a.finalScore;
|
|
20222
|
-
return Number.isNaN(delta) ? 0 : delta;
|
|
20223
|
-
});
|
|
20224
|
-
}
|
|
20225
|
-
function findPageWeight(url, pageWeights) {
|
|
20226
|
-
const norm = (p) => p !== "/" && p.endsWith("/") ? p.slice(0, -1) : p;
|
|
20227
|
-
const normalizedUrl = norm(url);
|
|
20228
|
-
for (const [pattern, weight] of Object.entries(pageWeights)) {
|
|
20229
|
-
if (norm(pattern) === normalizedUrl) {
|
|
20230
|
-
return weight;
|
|
20231
|
-
}
|
|
20232
|
-
}
|
|
20233
|
-
let bestPrefix = "";
|
|
20234
|
-
let bestWeight = 1;
|
|
20235
|
-
for (const [pattern, weight] of Object.entries(pageWeights)) {
|
|
20236
|
-
const normalizedPattern = norm(pattern);
|
|
20237
|
-
if (normalizedPattern === "/") continue;
|
|
20238
|
-
const prefix = `${normalizedPattern}/`;
|
|
20239
|
-
if (normalizedUrl.startsWith(prefix) && prefix.length > bestPrefix.length) {
|
|
20240
|
-
bestPrefix = prefix;
|
|
20241
|
-
bestWeight = weight;
|
|
20242
|
-
}
|
|
20243
|
-
}
|
|
20244
|
-
return bestWeight;
|
|
20245
|
-
}
|
|
20246
|
-
function aggregateByPage(ranked, config) {
|
|
20247
|
-
const groups = /* @__PURE__ */ new Map();
|
|
20248
|
-
for (const hit of ranked) {
|
|
20249
|
-
const url = hit.hit.metadata.url;
|
|
20250
|
-
const group = groups.get(url);
|
|
20251
|
-
if (group) group.push(hit);
|
|
20252
|
-
else groups.set(url, [hit]);
|
|
20253
|
-
}
|
|
20254
|
-
const { aggregationCap, aggregationDecay } = config.ranking;
|
|
20255
|
-
const pages = [];
|
|
20256
|
-
for (const [url, chunks] of groups) {
|
|
20257
|
-
chunks.sort((a, b) => {
|
|
20258
|
-
const delta = b.finalScore - a.finalScore;
|
|
20259
|
-
return Number.isNaN(delta) ? 0 : delta;
|
|
20260
|
-
});
|
|
20261
|
-
const best = chunks[0];
|
|
20262
|
-
const maxScore = Number.isFinite(best.finalScore) ? best.finalScore : Number.NEGATIVE_INFINITY;
|
|
20263
|
-
const topChunks = chunks.slice(0, aggregationCap);
|
|
20264
|
-
let aggregationBonus = 0;
|
|
20265
|
-
for (let i = 1; i < topChunks.length; i++) {
|
|
20266
|
-
const chunkScore = Number.isFinite(topChunks[i].finalScore) ? topChunks[i].finalScore : 0;
|
|
20267
|
-
aggregationBonus += chunkScore * Math.pow(aggregationDecay, i);
|
|
20268
|
-
}
|
|
20269
|
-
let pageScore = maxScore + aggregationBonus * config.ranking.weights.aggregation;
|
|
20270
|
-
const pageWeight = findPageWeight(url, config.ranking.pageWeights);
|
|
20271
|
-
if (pageWeight === 0) continue;
|
|
20272
|
-
if (pageWeight !== 1) {
|
|
20273
|
-
pageScore *= pageWeight;
|
|
20274
|
-
}
|
|
20275
|
-
pages.push({
|
|
20276
|
-
url,
|
|
20277
|
-
title: best.hit.metadata.title,
|
|
20278
|
-
routeFile: best.hit.metadata.routeFile,
|
|
20279
|
-
pageScore: Number.isFinite(pageScore) ? pageScore : Number.NEGATIVE_INFINITY,
|
|
20280
|
-
bestChunk: best,
|
|
20281
|
-
matchingChunks: chunks
|
|
20282
|
-
});
|
|
20283
|
-
}
|
|
20284
|
-
return pages.sort((a, b) => {
|
|
20285
|
-
const delta = b.pageScore - a.pageScore;
|
|
20286
|
-
return Number.isNaN(delta) ? 0 : delta;
|
|
20287
|
-
});
|
|
20288
|
-
}
|
|
20289
|
-
|
|
20290
|
-
// src/search/engine.ts
|
|
20291
19985
|
var requestSchema = z.object({
|
|
20292
19986
|
q: z.string().trim().min(1),
|
|
20293
19987
|
topK: z.number().int().positive().max(100).optional(),
|
|
20294
19988
|
scope: z.string().optional(),
|
|
20295
19989
|
pathPrefix: z.string().optional(),
|
|
20296
19990
|
tags: z.array(z.string()).optional(),
|
|
20297
|
-
rerank: z.boolean().optional(),
|
|
20298
19991
|
groupBy: z.enum(["page", "chunk"]).optional()
|
|
20299
19992
|
});
|
|
20300
19993
|
var SearchEngine = class _SearchEngine {
|
|
20301
19994
|
cwd;
|
|
20302
19995
|
config;
|
|
20303
|
-
|
|
20304
|
-
vectorStore;
|
|
20305
|
-
reranker;
|
|
19996
|
+
store;
|
|
20306
19997
|
constructor(options) {
|
|
20307
19998
|
this.cwd = options.cwd;
|
|
20308
19999
|
this.config = options.config;
|
|
20309
|
-
this.
|
|
20310
|
-
this.vectorStore = options.vectorStore;
|
|
20311
|
-
this.reranker = options.reranker;
|
|
20000
|
+
this.store = options.store;
|
|
20312
20001
|
}
|
|
20313
20002
|
static async create(options = {}) {
|
|
20314
20003
|
const cwd = path.resolve(options.cwd ?? process.cwd());
|
|
20315
20004
|
const config = options.config ?? await loadConfig({ cwd, configPath: options.configPath });
|
|
20316
|
-
const
|
|
20317
|
-
const vectorStore = options.vectorStore ?? await createVectorStore(config, cwd);
|
|
20318
|
-
const reranker = options.reranker === void 0 ? createReranker(config) : options.reranker;
|
|
20005
|
+
const store = options.store ?? await createUpstashStore(config);
|
|
20319
20006
|
return new _SearchEngine({
|
|
20320
20007
|
cwd,
|
|
20321
20008
|
config,
|
|
20322
|
-
|
|
20323
|
-
vectorStore,
|
|
20324
|
-
reranker
|
|
20009
|
+
store
|
|
20325
20010
|
});
|
|
20326
20011
|
}
|
|
20327
20012
|
getConfig() {
|
|
@@ -20335,99 +20020,130 @@ var SearchEngine = class _SearchEngine {
|
|
|
20335
20020
|
const input = parsed.data;
|
|
20336
20021
|
const totalStart = process.hrtime.bigint();
|
|
20337
20022
|
const resolvedScope = resolveScope(this.config, input.scope);
|
|
20338
|
-
await this.assertModelCompatibility(resolvedScope);
|
|
20339
20023
|
const topK = input.topK ?? 10;
|
|
20340
|
-
const wantsRerank = Boolean(input.rerank);
|
|
20341
20024
|
const groupByPage = (input.groupBy ?? "page") === "page";
|
|
20342
20025
|
const candidateK = groupByPage ? Math.max(topK * 10, 50) : Math.max(50, topK);
|
|
20343
|
-
const
|
|
20344
|
-
|
|
20345
|
-
|
|
20346
|
-
|
|
20347
|
-
throw new SearchSocketError("VECTOR_BACKEND_UNAVAILABLE", "Unable to create query embedding.");
|
|
20026
|
+
const filterParts = [];
|
|
20027
|
+
if (input.pathPrefix) {
|
|
20028
|
+
const prefix = input.pathPrefix.startsWith("/") ? input.pathPrefix : `/${input.pathPrefix}`;
|
|
20029
|
+
filterParts.push(`url GLOB '${prefix}*'`);
|
|
20348
20030
|
}
|
|
20349
|
-
|
|
20350
|
-
|
|
20351
|
-
|
|
20352
|
-
|
|
20353
|
-
{
|
|
20354
|
-
topK: candidateK,
|
|
20355
|
-
pathPrefix: input.pathPrefix,
|
|
20356
|
-
tags: input.tags
|
|
20357
|
-
},
|
|
20358
|
-
resolvedScope
|
|
20359
|
-
);
|
|
20360
|
-
const vectorMs = hrTimeMs(vectorStart);
|
|
20361
|
-
const ranked = rankHits(hits, this.config);
|
|
20362
|
-
let usedRerank = false;
|
|
20363
|
-
let rerankMs = 0;
|
|
20364
|
-
let ordered = ranked;
|
|
20365
|
-
if (wantsRerank) {
|
|
20366
|
-
const rerankStart = process.hrtime.bigint();
|
|
20367
|
-
ordered = await this.rerankHits(input.q, ranked, topK);
|
|
20368
|
-
rerankMs = hrTimeMs(rerankStart);
|
|
20369
|
-
usedRerank = true;
|
|
20031
|
+
if (input.tags && input.tags.length > 0) {
|
|
20032
|
+
for (const tag of input.tags) {
|
|
20033
|
+
filterParts.push(`tags GLOB '*${tag}*'`);
|
|
20034
|
+
}
|
|
20370
20035
|
}
|
|
20371
|
-
|
|
20372
|
-
const
|
|
20036
|
+
const filter = filterParts.length > 0 ? filterParts.join(" AND ") : void 0;
|
|
20037
|
+
const useDualSearch = this.config.search.dualSearch && groupByPage;
|
|
20038
|
+
const searchStart = process.hrtime.bigint();
|
|
20039
|
+
let ranked;
|
|
20040
|
+
if (useDualSearch) {
|
|
20041
|
+
const chunkLimit = Math.max(topK * 10, 100);
|
|
20042
|
+
const pageLimit = 20;
|
|
20043
|
+
const [pageHits, chunkHits] = await Promise.all([
|
|
20044
|
+
this.store.searchPages(
|
|
20045
|
+
input.q,
|
|
20046
|
+
{
|
|
20047
|
+
limit: pageLimit,
|
|
20048
|
+
semanticWeight: this.config.search.semanticWeight,
|
|
20049
|
+
inputEnrichment: this.config.search.inputEnrichment,
|
|
20050
|
+
filter
|
|
20051
|
+
},
|
|
20052
|
+
resolvedScope
|
|
20053
|
+
),
|
|
20054
|
+
this.store.search(
|
|
20055
|
+
input.q,
|
|
20056
|
+
{
|
|
20057
|
+
limit: chunkLimit,
|
|
20058
|
+
semanticWeight: this.config.search.semanticWeight,
|
|
20059
|
+
inputEnrichment: this.config.search.inputEnrichment,
|
|
20060
|
+
reranking: false,
|
|
20061
|
+
filter
|
|
20062
|
+
},
|
|
20063
|
+
resolvedScope
|
|
20064
|
+
)
|
|
20065
|
+
]);
|
|
20066
|
+
const rankedChunks = rankHits(chunkHits, this.config, input.q);
|
|
20067
|
+
ranked = mergePageAndChunkResults(pageHits, rankedChunks, this.config);
|
|
20068
|
+
} else {
|
|
20069
|
+
const hits = await this.store.search(
|
|
20070
|
+
input.q,
|
|
20071
|
+
{
|
|
20072
|
+
limit: candidateK,
|
|
20073
|
+
semanticWeight: this.config.search.semanticWeight,
|
|
20074
|
+
inputEnrichment: this.config.search.inputEnrichment,
|
|
20075
|
+
reranking: this.config.search.reranking,
|
|
20076
|
+
filter
|
|
20077
|
+
},
|
|
20078
|
+
resolvedScope
|
|
20079
|
+
);
|
|
20080
|
+
ranked = rankHits(hits, this.config, input.q);
|
|
20081
|
+
}
|
|
20082
|
+
const searchMs = hrTimeMs(searchStart);
|
|
20083
|
+
const results = this.buildResults(ranked, topK, groupByPage, input.q);
|
|
20084
|
+
return {
|
|
20085
|
+
q: input.q,
|
|
20086
|
+
scope: resolvedScope.scopeName,
|
|
20087
|
+
results,
|
|
20088
|
+
meta: {
|
|
20089
|
+
timingsMs: {
|
|
20090
|
+
search: Math.round(searchMs),
|
|
20091
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
20092
|
+
}
|
|
20093
|
+
}
|
|
20094
|
+
};
|
|
20095
|
+
}
|
|
20096
|
+
ensureSnippet(hit) {
|
|
20097
|
+
const snippet = hit.hit.metadata.snippet;
|
|
20098
|
+
if (snippet && snippet.length >= 30) return snippet;
|
|
20099
|
+
const chunkText = hit.hit.metadata.chunkText;
|
|
20100
|
+
if (chunkText) return toSnippet(chunkText);
|
|
20101
|
+
return snippet || "";
|
|
20102
|
+
}
|
|
20103
|
+
buildResults(ordered, topK, groupByPage, _query) {
|
|
20373
20104
|
if (groupByPage) {
|
|
20374
20105
|
let pages = aggregateByPage(ordered, this.config);
|
|
20375
|
-
|
|
20376
|
-
pages = pages.filter((p) => p.pageScore >= minScore);
|
|
20377
|
-
}
|
|
20106
|
+
pages = trimByScoreGap(pages, this.config);
|
|
20378
20107
|
const minRatio = this.config.ranking.minChunkScoreRatio;
|
|
20379
|
-
|
|
20108
|
+
return pages.slice(0, topK).map((page) => {
|
|
20380
20109
|
const bestScore = page.bestChunk.finalScore;
|
|
20381
|
-
const
|
|
20382
|
-
const meaningful = page.matchingChunks.filter((c) => c.finalScore >=
|
|
20110
|
+
const minChunkScore = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
|
|
20111
|
+
const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minChunkScore).slice(0, 5);
|
|
20383
20112
|
return {
|
|
20384
20113
|
url: page.url,
|
|
20385
20114
|
title: page.title,
|
|
20386
20115
|
sectionTitle: page.bestChunk.hit.metadata.sectionTitle || void 0,
|
|
20387
|
-
snippet: page.bestChunk
|
|
20116
|
+
snippet: this.ensureSnippet(page.bestChunk),
|
|
20388
20117
|
score: Number(page.pageScore.toFixed(6)),
|
|
20389
20118
|
routeFile: page.routeFile,
|
|
20390
20119
|
chunks: meaningful.length > 1 ? meaningful.map((c) => ({
|
|
20391
20120
|
sectionTitle: c.hit.metadata.sectionTitle || void 0,
|
|
20392
|
-
snippet: c
|
|
20121
|
+
snippet: this.ensureSnippet(c),
|
|
20393
20122
|
headingPath: c.hit.metadata.headingPath,
|
|
20394
20123
|
score: Number(c.finalScore.toFixed(6))
|
|
20395
20124
|
})) : void 0
|
|
20396
20125
|
};
|
|
20397
20126
|
});
|
|
20398
20127
|
} else {
|
|
20128
|
+
let filtered = ordered;
|
|
20129
|
+
const minScore = this.config.ranking.minScore;
|
|
20399
20130
|
if (minScore > 0) {
|
|
20400
|
-
|
|
20131
|
+
filtered = ordered.filter((entry) => entry.finalScore >= minScore);
|
|
20401
20132
|
}
|
|
20402
|
-
|
|
20133
|
+
return filtered.slice(0, topK).map(({ hit, finalScore }) => ({
|
|
20403
20134
|
url: hit.metadata.url,
|
|
20404
20135
|
title: hit.metadata.title,
|
|
20405
20136
|
sectionTitle: hit.metadata.sectionTitle || void 0,
|
|
20406
|
-
snippet: hit
|
|
20137
|
+
snippet: this.ensureSnippet({ hit, finalScore }),
|
|
20407
20138
|
score: Number(finalScore.toFixed(6)),
|
|
20408
20139
|
routeFile: hit.metadata.routeFile
|
|
20409
20140
|
}));
|
|
20410
20141
|
}
|
|
20411
|
-
return {
|
|
20412
|
-
q: input.q,
|
|
20413
|
-
scope: resolvedScope.scopeName,
|
|
20414
|
-
results,
|
|
20415
|
-
meta: {
|
|
20416
|
-
timingsMs: {
|
|
20417
|
-
embed: Math.round(embedMs),
|
|
20418
|
-
vector: Math.round(vectorMs),
|
|
20419
|
-
rerank: Math.round(rerankMs),
|
|
20420
|
-
total: Math.round(hrTimeMs(totalStart))
|
|
20421
|
-
},
|
|
20422
|
-
usedRerank,
|
|
20423
|
-
modelId: this.config.embeddings.model
|
|
20424
|
-
}
|
|
20425
|
-
};
|
|
20426
20142
|
}
|
|
20427
20143
|
async getPage(pathOrUrl, scope) {
|
|
20428
20144
|
const resolvedScope = resolveScope(this.config, scope);
|
|
20429
20145
|
const urlPath = this.resolveInputPath(pathOrUrl);
|
|
20430
|
-
const page = await this.
|
|
20146
|
+
const page = await this.store.getPage(urlPath, resolvedScope);
|
|
20431
20147
|
if (!page) {
|
|
20432
20148
|
throw new SearchSocketError("INVALID_REQUEST", `Indexed page not found for ${urlPath}`, 404);
|
|
20433
20149
|
}
|
|
@@ -20448,7 +20164,7 @@ var SearchEngine = class _SearchEngine {
|
|
|
20448
20164
|
};
|
|
20449
20165
|
}
|
|
20450
20166
|
async health() {
|
|
20451
|
-
return this.
|
|
20167
|
+
return this.store.health();
|
|
20452
20168
|
}
|
|
20453
20169
|
resolveInputPath(pathOrUrl) {
|
|
20454
20170
|
try {
|
|
@@ -20460,90 +20176,6 @@ var SearchEngine = class _SearchEngine {
|
|
|
20460
20176
|
const withoutQueryOrHash = pathOrUrl.split(/[?#]/)[0] ?? pathOrUrl;
|
|
20461
20177
|
return normalizeUrlPath(withoutQueryOrHash);
|
|
20462
20178
|
}
|
|
20463
|
-
async assertModelCompatibility(scope) {
|
|
20464
|
-
const modelId = await this.vectorStore.getScopeModelId(scope);
|
|
20465
|
-
if (modelId && modelId !== this.config.embeddings.model) {
|
|
20466
|
-
throw new SearchSocketError(
|
|
20467
|
-
"EMBEDDING_MODEL_MISMATCH",
|
|
20468
|
-
`Scope ${scope.scopeName} was indexed with ${modelId}. Current config uses ${this.config.embeddings.model}. Re-index with --force.`
|
|
20469
|
-
);
|
|
20470
|
-
}
|
|
20471
|
-
}
|
|
20472
|
-
async rerankHits(query, ranked, topK) {
|
|
20473
|
-
if (!this.config.rerank.enabled) {
|
|
20474
|
-
throw new SearchSocketError(
|
|
20475
|
-
"INVALID_REQUEST",
|
|
20476
|
-
"rerank=true requested but rerank.enabled is not set to true.",
|
|
20477
|
-
400
|
|
20478
|
-
);
|
|
20479
|
-
}
|
|
20480
|
-
if (!this.reranker) {
|
|
20481
|
-
throw new SearchSocketError(
|
|
20482
|
-
"CONFIG_MISSING",
|
|
20483
|
-
`rerank=true requested but ${this.config.embeddings.apiKeyEnv} is not set.`,
|
|
20484
|
-
400
|
|
20485
|
-
);
|
|
20486
|
-
}
|
|
20487
|
-
const pageGroups = /* @__PURE__ */ new Map();
|
|
20488
|
-
for (const entry of ranked) {
|
|
20489
|
-
const url = entry.hit.metadata.url;
|
|
20490
|
-
const group = pageGroups.get(url);
|
|
20491
|
-
if (group) group.push(entry);
|
|
20492
|
-
else pageGroups.set(url, [entry]);
|
|
20493
|
-
}
|
|
20494
|
-
const MAX_CHUNKS_PER_PAGE = 5;
|
|
20495
|
-
const MIN_CHUNKS_PER_PAGE = 1;
|
|
20496
|
-
const MIN_CHUNK_SCORE_RATIO = 0.5;
|
|
20497
|
-
const MAX_DOC_CHARS = 2e3;
|
|
20498
|
-
const pageCandidates = [];
|
|
20499
|
-
for (const [url, chunks] of pageGroups) {
|
|
20500
|
-
const byScore = [...chunks].sort((a, b) => b.finalScore - a.finalScore);
|
|
20501
|
-
const bestScore = byScore[0].finalScore;
|
|
20502
|
-
const scoreFloor = Number.isFinite(bestScore) ? bestScore * MIN_CHUNK_SCORE_RATIO : Number.NEGATIVE_INFINITY;
|
|
20503
|
-
const selected = byScore.filter(
|
|
20504
|
-
(c, i) => i < MIN_CHUNKS_PER_PAGE || c.finalScore >= scoreFloor
|
|
20505
|
-
).slice(0, MAX_CHUNKS_PER_PAGE);
|
|
20506
|
-
selected.sort((a, b) => (a.hit.metadata.ordinal ?? 0) - (b.hit.metadata.ordinal ?? 0));
|
|
20507
|
-
const first = selected[0].hit.metadata;
|
|
20508
|
-
const parts = [first.title];
|
|
20509
|
-
if (first.description) {
|
|
20510
|
-
parts.push(first.description);
|
|
20511
|
-
}
|
|
20512
|
-
if (first.keywords && first.keywords.length > 0) {
|
|
20513
|
-
parts.push(first.keywords.join(", "));
|
|
20514
|
-
}
|
|
20515
|
-
const body = selected.map((c) => c.hit.metadata.chunkText || c.hit.metadata.snippet).join("\n\n");
|
|
20516
|
-
parts.push(body);
|
|
20517
|
-
let text = parts.join("\n\n");
|
|
20518
|
-
if (text.length > MAX_DOC_CHARS) {
|
|
20519
|
-
text = text.slice(0, MAX_DOC_CHARS);
|
|
20520
|
-
}
|
|
20521
|
-
pageCandidates.push({ id: url, text });
|
|
20522
|
-
}
|
|
20523
|
-
const maxCandidates = Math.max(topK, this.config.rerank.topN);
|
|
20524
|
-
const cappedCandidates = pageCandidates.slice(0, maxCandidates);
|
|
20525
|
-
const reranked = await this.reranker.rerank(
|
|
20526
|
-
query,
|
|
20527
|
-
cappedCandidates,
|
|
20528
|
-
maxCandidates
|
|
20529
|
-
);
|
|
20530
|
-
const scoreByUrl = new Map(reranked.map((e) => [e.id, e.score]));
|
|
20531
|
-
return ranked.map((entry) => {
|
|
20532
|
-
const pageScore = scoreByUrl.get(entry.hit.metadata.url);
|
|
20533
|
-
const base = Number.isFinite(entry.finalScore) ? entry.finalScore : Number.NEGATIVE_INFINITY;
|
|
20534
|
-
if (pageScore === void 0 || !Number.isFinite(pageScore)) {
|
|
20535
|
-
return { ...entry, finalScore: base };
|
|
20536
|
-
}
|
|
20537
|
-
const combined = pageScore * this.config.ranking.weights.rerank + base * 1e-3;
|
|
20538
|
-
return {
|
|
20539
|
-
...entry,
|
|
20540
|
-
finalScore: Number.isFinite(combined) ? combined : base
|
|
20541
|
-
};
|
|
20542
|
-
}).sort((a, b) => {
|
|
20543
|
-
const delta = b.finalScore - a.finalScore;
|
|
20544
|
-
return Number.isNaN(delta) ? 0 : delta;
|
|
20545
|
-
});
|
|
20546
|
-
}
|
|
20547
20179
|
};
|
|
20548
20180
|
function createServer(engine) {
|
|
20549
20181
|
const server = new McpServer({
|
|
@@ -20553,7 +20185,7 @@ function createServer(engine) {
|
|
|
20553
20185
|
server.registerTool(
|
|
20554
20186
|
"search",
|
|
20555
20187
|
{
|
|
20556
|
-
description: "Semantic site search. Returns url/title/snippet/score/routeFile for each match. Supports optional scope, pathPrefix, tags, and
|
|
20188
|
+
description: "Semantic site search powered by Upstash Search. Returns url/title/snippet/score/routeFile for each match. Supports optional scope, pathPrefix, tags, topK, and groupBy.",
|
|
20557
20189
|
inputSchema: {
|
|
20558
20190
|
query: z.string().min(1),
|
|
20559
20191
|
scope: z.string().optional(),
|
|
@@ -20855,7 +20487,8 @@ function searchsocketHandle(options = {}) {
|
|
|
20855
20487
|
throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
|
|
20856
20488
|
}
|
|
20857
20489
|
const engine = await getEngine();
|
|
20858
|
-
const
|
|
20490
|
+
const searchRequest = body;
|
|
20491
|
+
const result = await engine.search(searchRequest);
|
|
20859
20492
|
return withCors(
|
|
20860
20493
|
new Response(JSON.stringify(result), {
|
|
20861
20494
|
status: 200,
|
|
@@ -20940,13 +20573,6 @@ function searchsocketVitePlugin(options = {}) {
|
|
|
20940
20573
|
let running = false;
|
|
20941
20574
|
return {
|
|
20942
20575
|
name: "searchsocket:auto-index",
|
|
20943
|
-
config() {
|
|
20944
|
-
return {
|
|
20945
|
-
ssr: {
|
|
20946
|
-
external: ["@libsql/client", "libsql"]
|
|
20947
|
-
}
|
|
20948
|
-
};
|
|
20949
|
-
},
|
|
20950
20576
|
async closeBundle() {
|
|
20951
20577
|
if (executed || running) {
|
|
20952
20578
|
return;
|
|
@@ -20968,15 +20594,14 @@ function searchsocketVitePlugin(options = {}) {
|
|
|
20968
20594
|
});
|
|
20969
20595
|
const stats = await pipeline.run({
|
|
20970
20596
|
changedOnly: options.changedOnly ?? true,
|
|
20971
|
-
force: options.force ?? false,
|
|
20597
|
+
force: (options.force ?? false) || /^(1|true|yes)$/i.test(process.env.SEARCHSOCKET_FORCE_REINDEX ?? ""),
|
|
20972
20598
|
dryRun: options.dryRun ?? false,
|
|
20973
20599
|
scopeOverride: options.scope,
|
|
20974
20600
|
verbose: options.verbose
|
|
20975
20601
|
});
|
|
20976
20602
|
logger3.info(
|
|
20977
|
-
`[searchsocket] indexed pages=${stats.pagesProcessed} chunks=${stats.chunksTotal} changed=${stats.chunksChanged}
|
|
20603
|
+
`[searchsocket] indexed pages=${stats.pagesProcessed} chunks=${stats.chunksTotal} changed=${stats.chunksChanged} upserted=${stats.documentsUpserted}`
|
|
20978
20604
|
);
|
|
20979
|
-
logger3.info("[searchsocket] markdown mirror written under .searchsocket/pages/<scope> (safe to commit for content workflows).");
|
|
20980
20605
|
executed = true;
|
|
20981
20606
|
} finally {
|
|
20982
20607
|
running = false;
|
|
@@ -21027,6 +20652,6 @@ function createSearchClient(options = {}) {
|
|
|
21027
20652
|
*)
|
|
21028
20653
|
*/
|
|
21029
20654
|
|
|
21030
|
-
export { IndexPipeline,
|
|
20655
|
+
export { IndexPipeline, SearchEngine, UpstashSearchStore, createSearchClient, createUpstashStore, isServerless, loadConfig, mergeConfig, mergeConfigServerless, resolveScope, runMcpServer, searchsocketHandle, searchsocketVitePlugin };
|
|
21031
20656
|
//# sourceMappingURL=index.js.map
|
|
21032
20657
|
//# sourceMappingURL=index.js.map
|