searchsocket 0.3.3 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +57 -39
- package/dist/cli.js +947 -1378
- package/dist/client.cjs +45 -0
- package/dist/client.d.cts +3 -2
- package/dist/client.d.ts +3 -2
- package/dist/client.js +45 -1
- package/dist/index.cjs +909 -1286
- package/dist/index.d.cts +73 -33
- package/dist/index.d.ts +73 -33
- package/dist/index.js +906 -1281
- package/dist/plugin-B_npJSux.d.cts +36 -0
- package/dist/plugin-M-aW0ev6.d.ts +36 -0
- package/dist/scroll.cjs +185 -0
- package/dist/scroll.d.cts +42 -0
- package/dist/scroll.d.ts +42 -0
- package/dist/scroll.js +183 -0
- package/dist/sveltekit.cjs +997 -1204
- package/dist/sveltekit.d.cts +3 -43
- package/dist/sveltekit.d.ts +3 -43
- package/dist/sveltekit.js +995 -1202
- package/dist/{types-BrG6XTUU.d.cts → types-Dk43uz25.d.cts} +50 -109
- package/dist/{types-BrG6XTUU.d.ts → types-Dk43uz25.d.ts} +50 -109
- package/package.json +10 -3
package/dist/index.cjs
CHANGED
|
@@ -5,12 +5,12 @@ var path = require('path');
|
|
|
5
5
|
var jiti = require('jiti');
|
|
6
6
|
var zod = require('zod');
|
|
7
7
|
var child_process = require('child_process');
|
|
8
|
-
var pLimit2 = require('p-limit');
|
|
9
8
|
var crypto = require('crypto');
|
|
10
9
|
var cheerio = require('cheerio');
|
|
11
10
|
var matter = require('gray-matter');
|
|
12
|
-
var fs4 = require('fs/promises');
|
|
13
11
|
var fg = require('fast-glob');
|
|
12
|
+
var pLimit = require('p-limit');
|
|
13
|
+
var fs3 = require('fs/promises');
|
|
14
14
|
var net = require('net');
|
|
15
15
|
var zlib = require('zlib');
|
|
16
16
|
var mcp_js = require('@modelcontextprotocol/sdk/server/mcp.js');
|
|
@@ -22,10 +22,10 @@ function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
|
|
|
22
22
|
|
|
23
23
|
var fs__default = /*#__PURE__*/_interopDefault(fs);
|
|
24
24
|
var path__default = /*#__PURE__*/_interopDefault(path);
|
|
25
|
-
var pLimit2__default = /*#__PURE__*/_interopDefault(pLimit2);
|
|
26
25
|
var matter__default = /*#__PURE__*/_interopDefault(matter);
|
|
27
|
-
var fs4__default = /*#__PURE__*/_interopDefault(fs4);
|
|
28
26
|
var fg__default = /*#__PURE__*/_interopDefault(fg);
|
|
27
|
+
var pLimit__default = /*#__PURE__*/_interopDefault(pLimit);
|
|
28
|
+
var fs3__default = /*#__PURE__*/_interopDefault(fs3);
|
|
29
29
|
var net__default = /*#__PURE__*/_interopDefault(net);
|
|
30
30
|
|
|
31
31
|
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
@@ -2771,12 +2771,12 @@ var require_ChildNode = __commonJS({
|
|
|
2771
2771
|
"node_modules/.pnpm/@mixmark-io+domino@2.2.0/node_modules/@mixmark-io/domino/lib/ChildNode.js"(exports$1, module) {
|
|
2772
2772
|
var Node2 = require_Node();
|
|
2773
2773
|
var LinkedList = require_LinkedList();
|
|
2774
|
-
var createDocumentFragmentFromArguments = function(
|
|
2775
|
-
var docFrag =
|
|
2774
|
+
var createDocumentFragmentFromArguments = function(document2, args) {
|
|
2775
|
+
var docFrag = document2.createDocumentFragment();
|
|
2776
2776
|
for (var i = 0; i < args.length; i++) {
|
|
2777
2777
|
var argItem = args[i];
|
|
2778
2778
|
var isNode = argItem instanceof Node2;
|
|
2779
|
-
docFrag.appendChild(isNode ? argItem :
|
|
2779
|
+
docFrag.appendChild(isNode ? argItem : document2.createTextNode(String(argItem)));
|
|
2780
2780
|
}
|
|
2781
2781
|
return docFrag;
|
|
2782
2782
|
};
|
|
@@ -2934,7 +2934,7 @@ var require_NamedNodeMap = __commonJS({
|
|
|
2934
2934
|
// node_modules/.pnpm/@mixmark-io+domino@2.2.0/node_modules/@mixmark-io/domino/lib/Element.js
|
|
2935
2935
|
var require_Element = __commonJS({
|
|
2936
2936
|
"node_modules/.pnpm/@mixmark-io+domino@2.2.0/node_modules/@mixmark-io/domino/lib/Element.js"(exports$1, module) {
|
|
2937
|
-
module.exports =
|
|
2937
|
+
module.exports = Element2;
|
|
2938
2938
|
var xml = require_xmlnames();
|
|
2939
2939
|
var utils = require_utils();
|
|
2940
2940
|
var NAMESPACE = utils.NAMESPACE;
|
|
@@ -2951,7 +2951,7 @@ var require_Element = __commonJS({
|
|
|
2951
2951
|
var NonDocumentTypeChildNode = require_NonDocumentTypeChildNode();
|
|
2952
2952
|
var NamedNodeMap = require_NamedNodeMap();
|
|
2953
2953
|
var uppercaseCache = /* @__PURE__ */ Object.create(null);
|
|
2954
|
-
function
|
|
2954
|
+
function Element2(doc, localName, namespaceURI, prefix) {
|
|
2955
2955
|
ContainerNode.call(this);
|
|
2956
2956
|
this.nodeType = Node2.ELEMENT_NODE;
|
|
2957
2957
|
this.ownerDocument = doc;
|
|
@@ -2971,7 +2971,7 @@ var require_Element = __commonJS({
|
|
|
2971
2971
|
recursiveGetText(node.childNodes[i], a);
|
|
2972
2972
|
}
|
|
2973
2973
|
}
|
|
2974
|
-
|
|
2974
|
+
Element2.prototype = Object.create(ContainerNode.prototype, {
|
|
2975
2975
|
isHTML: { get: function isHTML() {
|
|
2976
2976
|
return this.namespaceURI === NAMESPACE.HTML && this.ownerDocument.isHTML;
|
|
2977
2977
|
} },
|
|
@@ -3041,7 +3041,7 @@ var require_Element = __commonJS({
|
|
|
3041
3041
|
return NodeUtils.serializeOne(this, { nodeType: 0 });
|
|
3042
3042
|
},
|
|
3043
3043
|
set: function(v) {
|
|
3044
|
-
var
|
|
3044
|
+
var document2 = this.ownerDocument;
|
|
3045
3045
|
var parent = this.parentNode;
|
|
3046
3046
|
if (parent === null) {
|
|
3047
3047
|
return;
|
|
@@ -3052,8 +3052,8 @@ var require_Element = __commonJS({
|
|
|
3052
3052
|
if (parent.nodeType === Node2.DOCUMENT_FRAGMENT_NODE) {
|
|
3053
3053
|
parent = parent.ownerDocument.createElement("body");
|
|
3054
3054
|
}
|
|
3055
|
-
var parser =
|
|
3056
|
-
|
|
3055
|
+
var parser = document2.implementation.mozHTMLParser(
|
|
3056
|
+
document2._address,
|
|
3057
3057
|
parent
|
|
3058
3058
|
);
|
|
3059
3059
|
parser.parse(v === null ? "" : String(v), true);
|
|
@@ -3112,7 +3112,7 @@ var require_Element = __commonJS({
|
|
|
3112
3112
|
default:
|
|
3113
3113
|
utils.SyntaxError();
|
|
3114
3114
|
}
|
|
3115
|
-
if (!(context instanceof
|
|
3115
|
+
if (!(context instanceof Element2) || context.ownerDocument.isHTML && context.localName === "html" && context.namespaceURI === NAMESPACE.HTML) {
|
|
3116
3116
|
context = context.ownerDocument.createElementNS(NAMESPACE.HTML, "body");
|
|
3117
3117
|
}
|
|
3118
3118
|
var parser = this.ownerDocument.implementation.mozHTMLParser(
|
|
@@ -3720,10 +3720,10 @@ var require_Element = __commonJS({
|
|
|
3720
3720
|
return nodes.item ? nodes : new NodeList(nodes);
|
|
3721
3721
|
} }
|
|
3722
3722
|
});
|
|
3723
|
-
Object.defineProperties(
|
|
3724
|
-
Object.defineProperties(
|
|
3723
|
+
Object.defineProperties(Element2.prototype, ChildNode);
|
|
3724
|
+
Object.defineProperties(Element2.prototype, NonDocumentTypeChildNode);
|
|
3725
3725
|
attributes.registerChangeHandler(
|
|
3726
|
-
|
|
3726
|
+
Element2,
|
|
3727
3727
|
"id",
|
|
3728
3728
|
function(element, lname, oldval, newval) {
|
|
3729
3729
|
if (element.rooted) {
|
|
@@ -3737,7 +3737,7 @@ var require_Element = __commonJS({
|
|
|
3737
3737
|
}
|
|
3738
3738
|
);
|
|
3739
3739
|
attributes.registerChangeHandler(
|
|
3740
|
-
|
|
3740
|
+
Element2,
|
|
3741
3741
|
"class",
|
|
3742
3742
|
function(element, lname, oldval, newval) {
|
|
3743
3743
|
if (element._classList) {
|
|
@@ -3836,7 +3836,7 @@ var require_Element = __commonJS({
|
|
|
3836
3836
|
}
|
|
3837
3837
|
}
|
|
3838
3838
|
});
|
|
3839
|
-
|
|
3839
|
+
Element2._Attr = Attr;
|
|
3840
3840
|
function AttributesArray(elt) {
|
|
3841
3841
|
NamedNodeMap.call(this, elt);
|
|
3842
3842
|
for (var name in elt._attrsByQName) {
|
|
@@ -4238,7 +4238,7 @@ var require_DocumentFragment = __commonJS({
|
|
|
4238
4238
|
var Node2 = require_Node();
|
|
4239
4239
|
var NodeList = require_NodeList();
|
|
4240
4240
|
var ContainerNode = require_ContainerNode();
|
|
4241
|
-
var
|
|
4241
|
+
var Element2 = require_Element();
|
|
4242
4242
|
var select = require_select();
|
|
4243
4243
|
var utils = require_utils();
|
|
4244
4244
|
function DocumentFragment(doc) {
|
|
@@ -4256,9 +4256,9 @@ var require_DocumentFragment = __commonJS({
|
|
|
4256
4256
|
}
|
|
4257
4257
|
},
|
|
4258
4258
|
// Copy the text content getter/setter from Element
|
|
4259
|
-
textContent: Object.getOwnPropertyDescriptor(
|
|
4259
|
+
textContent: Object.getOwnPropertyDescriptor(Element2.prototype, "textContent"),
|
|
4260
4260
|
// Copy the text content getter/setter from Element
|
|
4261
|
-
innerText: Object.getOwnPropertyDescriptor(
|
|
4261
|
+
innerText: Object.getOwnPropertyDescriptor(Element2.prototype, "innerText"),
|
|
4262
4262
|
querySelector: { value: function(selector) {
|
|
4263
4263
|
var nodes = this.querySelectorAll(selector);
|
|
4264
4264
|
return nodes.length ? nodes[0] : null;
|
|
@@ -4266,8 +4266,8 @@ var require_DocumentFragment = __commonJS({
|
|
|
4266
4266
|
querySelectorAll: { value: function(selector) {
|
|
4267
4267
|
var context = Object.create(this);
|
|
4268
4268
|
context.isHTML = true;
|
|
4269
|
-
context.getElementsByTagName =
|
|
4270
|
-
context.nextElement = Object.getOwnPropertyDescriptor(
|
|
4269
|
+
context.getElementsByTagName = Element2.prototype.getElementsByTagName;
|
|
4270
|
+
context.nextElement = Object.getOwnPropertyDescriptor(Element2.prototype, "firstElementChild").get;
|
|
4271
4271
|
var nodes = select(selector, context);
|
|
4272
4272
|
return nodes.item ? nodes : new NodeList(nodes);
|
|
4273
4273
|
} },
|
|
@@ -4349,7 +4349,7 @@ var require_ProcessingInstruction = __commonJS({
|
|
|
4349
4349
|
// node_modules/.pnpm/@mixmark-io+domino@2.2.0/node_modules/@mixmark-io/domino/lib/NodeFilter.js
|
|
4350
4350
|
var require_NodeFilter = __commonJS({
|
|
4351
4351
|
"node_modules/.pnpm/@mixmark-io+domino@2.2.0/node_modules/@mixmark-io/domino/lib/NodeFilter.js"(exports$1, module) {
|
|
4352
|
-
var
|
|
4352
|
+
var NodeFilter2 = {
|
|
4353
4353
|
// Constants for acceptNode()
|
|
4354
4354
|
FILTER_ACCEPT: 1,
|
|
4355
4355
|
FILTER_REJECT: 2,
|
|
@@ -4374,7 +4374,7 @@ var require_NodeFilter = __commonJS({
|
|
|
4374
4374
|
SHOW_NOTATION: 2048
|
|
4375
4375
|
// historical
|
|
4376
4376
|
};
|
|
4377
|
-
module.exports =
|
|
4377
|
+
module.exports = NodeFilter2.constructor = NodeFilter2.prototype = NodeFilter2;
|
|
4378
4378
|
}
|
|
4379
4379
|
});
|
|
4380
4380
|
|
|
@@ -4449,7 +4449,7 @@ var require_TreeWalker = __commonJS({
|
|
|
4449
4449
|
"node_modules/.pnpm/@mixmark-io+domino@2.2.0/node_modules/@mixmark-io/domino/lib/TreeWalker.js"(exports$1, module) {
|
|
4450
4450
|
module.exports = TreeWalker;
|
|
4451
4451
|
var Node2 = require_Node();
|
|
4452
|
-
var
|
|
4452
|
+
var NodeFilter2 = require_NodeFilter();
|
|
4453
4453
|
var NodeTraversal = require_NodeTraversal();
|
|
4454
4454
|
var utils = require_utils();
|
|
4455
4455
|
var mapChild = {
|
|
@@ -4469,11 +4469,11 @@ var require_TreeWalker = __commonJS({
|
|
|
4469
4469
|
node = tw._currentNode[mapChild[type]];
|
|
4470
4470
|
while (node !== null) {
|
|
4471
4471
|
result = tw._internalFilter(node);
|
|
4472
|
-
if (result ===
|
|
4472
|
+
if (result === NodeFilter2.FILTER_ACCEPT) {
|
|
4473
4473
|
tw._currentNode = node;
|
|
4474
4474
|
return node;
|
|
4475
4475
|
}
|
|
4476
|
-
if (result ===
|
|
4476
|
+
if (result === NodeFilter2.FILTER_SKIP) {
|
|
4477
4477
|
child = node[mapChild[type]];
|
|
4478
4478
|
if (child !== null) {
|
|
4479
4479
|
node = child;
|
|
@@ -4507,12 +4507,12 @@ var require_TreeWalker = __commonJS({
|
|
|
4507
4507
|
while (sibling !== null) {
|
|
4508
4508
|
node = sibling;
|
|
4509
4509
|
result = tw._internalFilter(node);
|
|
4510
|
-
if (result ===
|
|
4510
|
+
if (result === NodeFilter2.FILTER_ACCEPT) {
|
|
4511
4511
|
tw._currentNode = node;
|
|
4512
4512
|
return node;
|
|
4513
4513
|
}
|
|
4514
4514
|
sibling = node[mapChild[type]];
|
|
4515
|
-
if (result ===
|
|
4515
|
+
if (result === NodeFilter2.FILTER_REJECT || sibling === null) {
|
|
4516
4516
|
sibling = node[mapSibling[type]];
|
|
4517
4517
|
}
|
|
4518
4518
|
}
|
|
@@ -4520,7 +4520,7 @@ var require_TreeWalker = __commonJS({
|
|
|
4520
4520
|
if (node === null || node === tw.root) {
|
|
4521
4521
|
return null;
|
|
4522
4522
|
}
|
|
4523
|
-
if (tw._internalFilter(node) ===
|
|
4523
|
+
if (tw._internalFilter(node) === NodeFilter2.FILTER_ACCEPT) {
|
|
4524
4524
|
return null;
|
|
4525
4525
|
}
|
|
4526
4526
|
}
|
|
@@ -4568,11 +4568,11 @@ var require_TreeWalker = __commonJS({
|
|
|
4568
4568
|
utils.InvalidStateError();
|
|
4569
4569
|
}
|
|
4570
4570
|
if (!(1 << node.nodeType - 1 & this._whatToShow)) {
|
|
4571
|
-
return
|
|
4571
|
+
return NodeFilter2.FILTER_SKIP;
|
|
4572
4572
|
}
|
|
4573
4573
|
filter = this._filter;
|
|
4574
4574
|
if (filter === null) {
|
|
4575
|
-
result =
|
|
4575
|
+
result = NodeFilter2.FILTER_ACCEPT;
|
|
4576
4576
|
} else {
|
|
4577
4577
|
this._active = true;
|
|
4578
4578
|
try {
|
|
@@ -4601,7 +4601,7 @@ var require_TreeWalker = __commonJS({
|
|
|
4601
4601
|
if (node === null) {
|
|
4602
4602
|
return null;
|
|
4603
4603
|
}
|
|
4604
|
-
if (this._internalFilter(node) ===
|
|
4604
|
+
if (this._internalFilter(node) === NodeFilter2.FILTER_ACCEPT) {
|
|
4605
4605
|
this._currentNode = node;
|
|
4606
4606
|
return node;
|
|
4607
4607
|
}
|
|
@@ -4654,17 +4654,17 @@ var require_TreeWalker = __commonJS({
|
|
|
4654
4654
|
for (previousSibling = node.previousSibling; previousSibling; previousSibling = node.previousSibling) {
|
|
4655
4655
|
node = previousSibling;
|
|
4656
4656
|
result = this._internalFilter(node);
|
|
4657
|
-
if (result ===
|
|
4657
|
+
if (result === NodeFilter2.FILTER_REJECT) {
|
|
4658
4658
|
continue;
|
|
4659
4659
|
}
|
|
4660
4660
|
for (lastChild = node.lastChild; lastChild; lastChild = node.lastChild) {
|
|
4661
4661
|
node = lastChild;
|
|
4662
4662
|
result = this._internalFilter(node);
|
|
4663
|
-
if (result ===
|
|
4663
|
+
if (result === NodeFilter2.FILTER_REJECT) {
|
|
4664
4664
|
break;
|
|
4665
4665
|
}
|
|
4666
4666
|
}
|
|
4667
|
-
if (result ===
|
|
4667
|
+
if (result === NodeFilter2.FILTER_ACCEPT) {
|
|
4668
4668
|
this._currentNode = node;
|
|
4669
4669
|
return node;
|
|
4670
4670
|
}
|
|
@@ -4673,7 +4673,7 @@ var require_TreeWalker = __commonJS({
|
|
|
4673
4673
|
return null;
|
|
4674
4674
|
}
|
|
4675
4675
|
node = node.parentNode;
|
|
4676
|
-
if (this._internalFilter(node) ===
|
|
4676
|
+
if (this._internalFilter(node) === NodeFilter2.FILTER_ACCEPT) {
|
|
4677
4677
|
this._currentNode = node;
|
|
4678
4678
|
return node;
|
|
4679
4679
|
}
|
|
@@ -4690,26 +4690,26 @@ var require_TreeWalker = __commonJS({
|
|
|
4690
4690
|
nextNode: { value: function nextNode() {
|
|
4691
4691
|
var node, result, firstChild, nextSibling;
|
|
4692
4692
|
node = this._currentNode;
|
|
4693
|
-
result =
|
|
4693
|
+
result = NodeFilter2.FILTER_ACCEPT;
|
|
4694
4694
|
CHILDREN:
|
|
4695
4695
|
while (true) {
|
|
4696
4696
|
for (firstChild = node.firstChild; firstChild; firstChild = node.firstChild) {
|
|
4697
4697
|
node = firstChild;
|
|
4698
4698
|
result = this._internalFilter(node);
|
|
4699
|
-
if (result ===
|
|
4699
|
+
if (result === NodeFilter2.FILTER_ACCEPT) {
|
|
4700
4700
|
this._currentNode = node;
|
|
4701
4701
|
return node;
|
|
4702
|
-
} else if (result ===
|
|
4702
|
+
} else if (result === NodeFilter2.FILTER_REJECT) {
|
|
4703
4703
|
break;
|
|
4704
4704
|
}
|
|
4705
4705
|
}
|
|
4706
4706
|
for (nextSibling = NodeTraversal.nextSkippingChildren(node, this.root); nextSibling; nextSibling = NodeTraversal.nextSkippingChildren(node, this.root)) {
|
|
4707
4707
|
node = nextSibling;
|
|
4708
4708
|
result = this._internalFilter(node);
|
|
4709
|
-
if (result ===
|
|
4709
|
+
if (result === NodeFilter2.FILTER_ACCEPT) {
|
|
4710
4710
|
this._currentNode = node;
|
|
4711
4711
|
return node;
|
|
4712
|
-
} else if (result ===
|
|
4712
|
+
} else if (result === NodeFilter2.FILTER_SKIP) {
|
|
4713
4713
|
continue CHILDREN;
|
|
4714
4714
|
}
|
|
4715
4715
|
}
|
|
@@ -4728,7 +4728,7 @@ var require_TreeWalker = __commonJS({
|
|
|
4728
4728
|
var require_NodeIterator = __commonJS({
|
|
4729
4729
|
"node_modules/.pnpm/@mixmark-io+domino@2.2.0/node_modules/@mixmark-io/domino/lib/NodeIterator.js"(exports$1, module) {
|
|
4730
4730
|
module.exports = NodeIterator;
|
|
4731
|
-
var
|
|
4731
|
+
var NodeFilter2 = require_NodeFilter();
|
|
4732
4732
|
var NodeTraversal = require_NodeTraversal();
|
|
4733
4733
|
var utils = require_utils();
|
|
4734
4734
|
function move(node, stayWithin, directionIsNext) {
|
|
@@ -4763,7 +4763,7 @@ var require_NodeIterator = __commonJS({
|
|
|
4763
4763
|
}
|
|
4764
4764
|
}
|
|
4765
4765
|
var result = ni._internalFilter(node);
|
|
4766
|
-
if (result ===
|
|
4766
|
+
if (result === NodeFilter2.FILTER_ACCEPT) {
|
|
4767
4767
|
break;
|
|
4768
4768
|
}
|
|
4769
4769
|
}
|
|
@@ -4811,11 +4811,11 @@ var require_NodeIterator = __commonJS({
|
|
|
4811
4811
|
utils.InvalidStateError();
|
|
4812
4812
|
}
|
|
4813
4813
|
if (!(1 << node.nodeType - 1 & this._whatToShow)) {
|
|
4814
|
-
return
|
|
4814
|
+
return NodeFilter2.FILTER_SKIP;
|
|
4815
4815
|
}
|
|
4816
4816
|
filter = this._filter;
|
|
4817
4817
|
if (filter === null) {
|
|
4818
|
-
result =
|
|
4818
|
+
result = NodeFilter2.FILTER_ACCEPT;
|
|
4819
4819
|
} else {
|
|
4820
4820
|
this._active = true;
|
|
4821
4821
|
try {
|
|
@@ -5025,32 +5025,32 @@ var require_URL = __commonJS({
|
|
|
5025
5025
|
else
|
|
5026
5026
|
return basepath.substring(0, lastslash + 1) + refpath;
|
|
5027
5027
|
}
|
|
5028
|
-
function remove_dot_segments(
|
|
5029
|
-
if (!
|
|
5028
|
+
function remove_dot_segments(path13) {
|
|
5029
|
+
if (!path13) return path13;
|
|
5030
5030
|
var output = "";
|
|
5031
|
-
while (
|
|
5032
|
-
if (
|
|
5033
|
-
|
|
5031
|
+
while (path13.length > 0) {
|
|
5032
|
+
if (path13 === "." || path13 === "..") {
|
|
5033
|
+
path13 = "";
|
|
5034
5034
|
break;
|
|
5035
5035
|
}
|
|
5036
|
-
var twochars =
|
|
5037
|
-
var threechars =
|
|
5038
|
-
var fourchars =
|
|
5036
|
+
var twochars = path13.substring(0, 2);
|
|
5037
|
+
var threechars = path13.substring(0, 3);
|
|
5038
|
+
var fourchars = path13.substring(0, 4);
|
|
5039
5039
|
if (threechars === "../") {
|
|
5040
|
-
|
|
5040
|
+
path13 = path13.substring(3);
|
|
5041
5041
|
} else if (twochars === "./") {
|
|
5042
|
-
|
|
5042
|
+
path13 = path13.substring(2);
|
|
5043
5043
|
} else if (threechars === "/./") {
|
|
5044
|
-
|
|
5045
|
-
} else if (twochars === "/." &&
|
|
5046
|
-
|
|
5047
|
-
} else if (fourchars === "/../" || threechars === "/.." &&
|
|
5048
|
-
|
|
5044
|
+
path13 = "/" + path13.substring(3);
|
|
5045
|
+
} else if (twochars === "/." && path13.length === 2) {
|
|
5046
|
+
path13 = "/";
|
|
5047
|
+
} else if (fourchars === "/../" || threechars === "/.." && path13.length === 3) {
|
|
5048
|
+
path13 = "/" + path13.substring(4);
|
|
5049
5049
|
output = output.replace(/\/?[^\/]*$/, "");
|
|
5050
5050
|
} else {
|
|
5051
|
-
var segment =
|
|
5051
|
+
var segment = path13.match(/(\/?([^\/]*))/)[0];
|
|
5052
5052
|
output += segment;
|
|
5053
|
-
|
|
5053
|
+
path13 = path13.substring(segment.length);
|
|
5054
5054
|
}
|
|
5055
5055
|
}
|
|
5056
5056
|
return output;
|
|
@@ -5615,9 +5615,9 @@ var require_defineElement = __commonJS({
|
|
|
5615
5615
|
});
|
|
5616
5616
|
return c;
|
|
5617
5617
|
};
|
|
5618
|
-
function EventHandlerBuilder(body,
|
|
5618
|
+
function EventHandlerBuilder(body, document2, form, element) {
|
|
5619
5619
|
this.body = body;
|
|
5620
|
-
this.document =
|
|
5620
|
+
this.document = document2;
|
|
5621
5621
|
this.form = form;
|
|
5622
5622
|
this.element = element;
|
|
5623
5623
|
}
|
|
@@ -5651,7 +5651,7 @@ var require_defineElement = __commonJS({
|
|
|
5651
5651
|
var require_htmlelts = __commonJS({
|
|
5652
5652
|
"node_modules/.pnpm/@mixmark-io+domino@2.2.0/node_modules/@mixmark-io/domino/lib/htmlelts.js"(exports$1) {
|
|
5653
5653
|
var Node2 = require_Node();
|
|
5654
|
-
var
|
|
5654
|
+
var Element2 = require_Element();
|
|
5655
5655
|
var CSSStyleDeclaration = require_CSSStyleDeclaration();
|
|
5656
5656
|
var utils = require_utils();
|
|
5657
5657
|
var URLUtils = require_URLUtils();
|
|
@@ -5719,10 +5719,10 @@ var require_htmlelts = __commonJS({
|
|
|
5719
5719
|
this._form = null;
|
|
5720
5720
|
};
|
|
5721
5721
|
var HTMLElement = exports$1.HTMLElement = define({
|
|
5722
|
-
superclass:
|
|
5722
|
+
superclass: Element2,
|
|
5723
5723
|
name: "HTMLElement",
|
|
5724
5724
|
ctor: function HTMLElement2(doc, localName, prefix) {
|
|
5725
|
-
|
|
5725
|
+
Element2.call(this, doc, localName, utils.NAMESPACE.HTML, prefix);
|
|
5726
5726
|
},
|
|
5727
5727
|
props: {
|
|
5728
5728
|
dangerouslySetInnerHTML: {
|
|
@@ -7204,7 +7204,7 @@ var require_htmlelts = __commonJS({
|
|
|
7204
7204
|
// node_modules/.pnpm/@mixmark-io+domino@2.2.0/node_modules/@mixmark-io/domino/lib/svg.js
|
|
7205
7205
|
var require_svg = __commonJS({
|
|
7206
7206
|
"node_modules/.pnpm/@mixmark-io+domino@2.2.0/node_modules/@mixmark-io/domino/lib/svg.js"(exports$1) {
|
|
7207
|
-
var
|
|
7207
|
+
var Element2 = require_Element();
|
|
7208
7208
|
var defineElement = require_defineElement();
|
|
7209
7209
|
var utils = require_utils();
|
|
7210
7210
|
var CSSStyleDeclaration = require_CSSStyleDeclaration();
|
|
@@ -7218,10 +7218,10 @@ var require_svg = __commonJS({
|
|
|
7218
7218
|
return defineElement(spec, SVGElement, svgElements, svgNameToImpl);
|
|
7219
7219
|
}
|
|
7220
7220
|
var SVGElement = define({
|
|
7221
|
-
superclass:
|
|
7221
|
+
superclass: Element2,
|
|
7222
7222
|
name: "SVGElement",
|
|
7223
7223
|
ctor: function SVGElement2(doc, localName, prefix) {
|
|
7224
|
-
|
|
7224
|
+
Element2.call(this, doc, localName, utils.NAMESPACE.SVG, prefix);
|
|
7225
7225
|
},
|
|
7226
7226
|
props: {
|
|
7227
7227
|
style: { get: function() {
|
|
@@ -7356,7 +7356,7 @@ var require_Document = __commonJS({
|
|
|
7356
7356
|
var Node2 = require_Node();
|
|
7357
7357
|
var NodeList = require_NodeList();
|
|
7358
7358
|
var ContainerNode = require_ContainerNode();
|
|
7359
|
-
var
|
|
7359
|
+
var Element2 = require_Element();
|
|
7360
7360
|
var Text = require_Text();
|
|
7361
7361
|
var Comment = require_Comment();
|
|
7362
7362
|
var Event = require_Event();
|
|
@@ -7365,7 +7365,7 @@ var require_Document = __commonJS({
|
|
|
7365
7365
|
var DOMImplementation = require_DOMImplementation();
|
|
7366
7366
|
var TreeWalker = require_TreeWalker();
|
|
7367
7367
|
var NodeIterator = require_NodeIterator();
|
|
7368
|
-
var
|
|
7368
|
+
var NodeFilter2 = require_NodeFilter();
|
|
7369
7369
|
var URL2 = require_URL();
|
|
7370
7370
|
var select = require_select();
|
|
7371
7371
|
var events = require_events();
|
|
@@ -7504,13 +7504,13 @@ var require_Document = __commonJS({
|
|
|
7504
7504
|
if (this.isHTML) {
|
|
7505
7505
|
localName = utils.toASCIILowerCase(localName);
|
|
7506
7506
|
}
|
|
7507
|
-
return new
|
|
7507
|
+
return new Element2._Attr(null, localName, null, null, "");
|
|
7508
7508
|
} },
|
|
7509
7509
|
createAttributeNS: { value: function(namespace, qualifiedName) {
|
|
7510
7510
|
namespace = namespace === null || namespace === void 0 || namespace === "" ? null : String(namespace);
|
|
7511
7511
|
qualifiedName = String(qualifiedName);
|
|
7512
7512
|
var ve = validateAndExtract(namespace, qualifiedName);
|
|
7513
|
-
return new
|
|
7513
|
+
return new Element2._Attr(null, ve.localName, ve.prefix, ve.namespace, "");
|
|
7514
7514
|
} },
|
|
7515
7515
|
createElement: { value: function(localName) {
|
|
7516
7516
|
localName = String(localName);
|
|
@@ -7522,7 +7522,7 @@ var require_Document = __commonJS({
|
|
|
7522
7522
|
} else if (this.contentType === "application/xhtml+xml") {
|
|
7523
7523
|
return html.createElement(this, localName, null);
|
|
7524
7524
|
} else {
|
|
7525
|
-
return new
|
|
7525
|
+
return new Element2(this, localName, null, null);
|
|
7526
7526
|
}
|
|
7527
7527
|
}, writable: isApiWritable },
|
|
7528
7528
|
createElementNS: { value: function(namespace, qualifiedName) {
|
|
@@ -7539,7 +7539,7 @@ var require_Document = __commonJS({
|
|
|
7539
7539
|
} else if (namespace === NAMESPACE.SVG) {
|
|
7540
7540
|
return svg.createElement(this, localName, prefix);
|
|
7541
7541
|
}
|
|
7542
|
-
return new
|
|
7542
|
+
return new Element2(this, localName, namespace, prefix);
|
|
7543
7543
|
} },
|
|
7544
7544
|
createEvent: { value: function createEvent(interfaceName) {
|
|
7545
7545
|
interfaceName = interfaceName.toLowerCase();
|
|
@@ -7561,7 +7561,7 @@ var require_Document = __commonJS({
|
|
|
7561
7561
|
if (!(root3 instanceof Node2)) {
|
|
7562
7562
|
throw new TypeError("root not a node");
|
|
7563
7563
|
}
|
|
7564
|
-
whatToShow = whatToShow === void 0 ?
|
|
7564
|
+
whatToShow = whatToShow === void 0 ? NodeFilter2.SHOW_ALL : +whatToShow;
|
|
7565
7565
|
filter = filter === void 0 ? null : filter;
|
|
7566
7566
|
return new TreeWalker(root3, whatToShow, filter);
|
|
7567
7567
|
} },
|
|
@@ -7573,7 +7573,7 @@ var require_Document = __commonJS({
|
|
|
7573
7573
|
if (!(root3 instanceof Node2)) {
|
|
7574
7574
|
throw new TypeError("root not a node");
|
|
7575
7575
|
}
|
|
7576
|
-
whatToShow = whatToShow === void 0 ?
|
|
7576
|
+
whatToShow = whatToShow === void 0 ? NodeFilter2.SHOW_ALL : +whatToShow;
|
|
7577
7577
|
filter = filter === void 0 ? null : filter;
|
|
7578
7578
|
return new NodeIterator(root3, whatToShow, filter);
|
|
7579
7579
|
} },
|
|
@@ -7634,10 +7634,10 @@ var require_Document = __commonJS({
|
|
|
7634
7634
|
return this.byId[id] instanceof MultiId;
|
|
7635
7635
|
} },
|
|
7636
7636
|
// Just copy this method from the Element prototype
|
|
7637
|
-
getElementsByName: { value:
|
|
7638
|
-
getElementsByTagName: { value:
|
|
7639
|
-
getElementsByTagNameNS: { value:
|
|
7640
|
-
getElementsByClassName: { value:
|
|
7637
|
+
getElementsByName: { value: Element2.prototype.getElementsByName },
|
|
7638
|
+
getElementsByTagName: { value: Element2.prototype.getElementsByTagName },
|
|
7639
|
+
getElementsByTagNameNS: { value: Element2.prototype.getElementsByTagNameNS },
|
|
7640
|
+
getElementsByClassName: { value: Element2.prototype.getElementsByClassName },
|
|
7641
7641
|
adoptNode: { value: function adoptNode(node) {
|
|
7642
7642
|
if (node.nodeType === Node2.DOCUMENT_NODE) utils.NotSupportedError();
|
|
7643
7643
|
if (node.nodeType === Node2.ATTRIBUTE_NODE) {
|
|
@@ -16463,8 +16463,8 @@ var require_Window = __commonJS({
|
|
|
16463
16463
|
var Location = require_Location();
|
|
16464
16464
|
var utils = require_utils();
|
|
16465
16465
|
module.exports = Window;
|
|
16466
|
-
function Window(
|
|
16467
|
-
this.document =
|
|
16466
|
+
function Window(document2) {
|
|
16467
|
+
this.document = document2 || new DOMImplementation(null).createHTMLDocument("");
|
|
16468
16468
|
this.document._scripting_enabled = true;
|
|
16469
16469
|
this.document.defaultView = this;
|
|
16470
16470
|
this.location = new Location(this, this.document._address || "about:blank");
|
|
@@ -16594,11 +16594,11 @@ var require_lib = __commonJS({
|
|
|
16594
16594
|
};
|
|
16595
16595
|
};
|
|
16596
16596
|
exports$1.createWindow = function(html, address) {
|
|
16597
|
-
var
|
|
16597
|
+
var document2 = exports$1.createDocument(html);
|
|
16598
16598
|
if (address !== void 0) {
|
|
16599
|
-
|
|
16599
|
+
document2._address = address;
|
|
16600
16600
|
}
|
|
16601
|
-
return new impl.Window(
|
|
16601
|
+
return new impl.Window(document2);
|
|
16602
16602
|
};
|
|
16603
16603
|
exports$1.impl = impl;
|
|
16604
16604
|
}
|
|
@@ -16614,6 +16614,8 @@ var searchSocketConfigSchema = zod.z.object({
|
|
|
16614
16614
|
envVar: zod.z.string().min(1).optional(),
|
|
16615
16615
|
sanitize: zod.z.boolean().optional()
|
|
16616
16616
|
}).optional(),
|
|
16617
|
+
exclude: zod.z.array(zod.z.string()).optional(),
|
|
16618
|
+
respectRobotsTxt: zod.z.boolean().optional(),
|
|
16617
16619
|
source: zod.z.object({
|
|
16618
16620
|
mode: zod.z.enum(["static-output", "crawl", "content-files", "build"]).optional(),
|
|
16619
16621
|
staticOutputDir: zod.z.string().min(1).optional(),
|
|
@@ -16661,29 +16663,18 @@ var searchSocketConfigSchema = zod.z.object({
|
|
|
16661
16663
|
prependTitle: zod.z.boolean().optional(),
|
|
16662
16664
|
pageSummaryChunk: zod.z.boolean().optional()
|
|
16663
16665
|
}).optional(),
|
|
16664
|
-
|
|
16665
|
-
|
|
16666
|
-
|
|
16667
|
-
|
|
16668
|
-
|
|
16669
|
-
batchSize: zod.z.number().int().positive().optional(),
|
|
16670
|
-
concurrency: zod.z.number().int().positive().optional(),
|
|
16671
|
-
pricePer1kTokens: zod.z.number().positive().optional()
|
|
16666
|
+
upstash: zod.z.object({
|
|
16667
|
+
url: zod.z.string().url().optional(),
|
|
16668
|
+
token: zod.z.string().min(1).optional(),
|
|
16669
|
+
urlEnv: zod.z.string().min(1).optional(),
|
|
16670
|
+
tokenEnv: zod.z.string().min(1).optional()
|
|
16672
16671
|
}).optional(),
|
|
16673
|
-
|
|
16674
|
-
|
|
16675
|
-
|
|
16676
|
-
|
|
16677
|
-
|
|
16678
|
-
|
|
16679
|
-
authTokenEnv: zod.z.string().optional(),
|
|
16680
|
-
localPath: zod.z.string().optional()
|
|
16681
|
-
}).optional()
|
|
16682
|
-
}).optional(),
|
|
16683
|
-
rerank: zod.z.object({
|
|
16684
|
-
enabled: zod.z.boolean().optional(),
|
|
16685
|
-
topN: zod.z.number().int().positive().optional(),
|
|
16686
|
-
model: zod.z.string().optional()
|
|
16672
|
+
search: zod.z.object({
|
|
16673
|
+
semanticWeight: zod.z.number().min(0).max(1).optional(),
|
|
16674
|
+
inputEnrichment: zod.z.boolean().optional(),
|
|
16675
|
+
reranking: zod.z.boolean().optional(),
|
|
16676
|
+
dualSearch: zod.z.boolean().optional(),
|
|
16677
|
+
pageSearchWeight: zod.z.number().min(0).max(1).optional()
|
|
16687
16678
|
}).optional(),
|
|
16688
16679
|
ranking: zod.z.object({
|
|
16689
16680
|
enableIncomingLinkBoost: zod.z.boolean().optional(),
|
|
@@ -16693,11 +16684,12 @@ var searchSocketConfigSchema = zod.z.object({
|
|
|
16693
16684
|
aggregationDecay: zod.z.number().min(0).max(1).optional(),
|
|
16694
16685
|
minChunkScoreRatio: zod.z.number().min(0).max(1).optional(),
|
|
16695
16686
|
minScore: zod.z.number().min(0).max(1).optional(),
|
|
16687
|
+
scoreGapThreshold: zod.z.number().min(0).max(1).optional(),
|
|
16696
16688
|
weights: zod.z.object({
|
|
16697
16689
|
incomingLinks: zod.z.number().optional(),
|
|
16698
16690
|
depth: zod.z.number().optional(),
|
|
16699
|
-
|
|
16700
|
-
|
|
16691
|
+
aggregation: zod.z.number().optional(),
|
|
16692
|
+
titleMatch: zod.z.number().optional()
|
|
16701
16693
|
}).optional()
|
|
16702
16694
|
}).optional(),
|
|
16703
16695
|
api: zod.z.object({
|
|
@@ -16719,8 +16711,7 @@ var searchSocketConfigSchema = zod.z.object({
|
|
|
16719
16711
|
}).optional()
|
|
16720
16712
|
}).optional(),
|
|
16721
16713
|
state: zod.z.object({
|
|
16722
|
-
dir: zod.z.string().optional()
|
|
16723
|
-
writeMirror: zod.z.boolean().optional()
|
|
16714
|
+
dir: zod.z.string().optional()
|
|
16724
16715
|
}).optional()
|
|
16725
16716
|
});
|
|
16726
16717
|
|
|
@@ -16744,6 +16735,8 @@ function createDefaultConfig(projectId) {
|
|
|
16744
16735
|
envVar: "SEARCHSOCKET_SCOPE",
|
|
16745
16736
|
sanitize: true
|
|
16746
16737
|
},
|
|
16738
|
+
exclude: [],
|
|
16739
|
+
respectRobotsTxt: true,
|
|
16747
16740
|
source: {
|
|
16748
16741
|
mode: "static-output",
|
|
16749
16742
|
staticOutputDir: "build",
|
|
@@ -16772,24 +16765,16 @@ function createDefaultConfig(projectId) {
|
|
|
16772
16765
|
prependTitle: true,
|
|
16773
16766
|
pageSummaryChunk: true
|
|
16774
16767
|
},
|
|
16775
|
-
|
|
16776
|
-
|
|
16777
|
-
|
|
16778
|
-
apiKeyEnv: "JINA_API_KEY",
|
|
16779
|
-
batchSize: 64,
|
|
16780
|
-
concurrency: 4
|
|
16781
|
-
},
|
|
16782
|
-
vector: {
|
|
16783
|
-
turso: {
|
|
16784
|
-
urlEnv: "TURSO_DATABASE_URL",
|
|
16785
|
-
authTokenEnv: "TURSO_AUTH_TOKEN",
|
|
16786
|
-
localPath: ".searchsocket/vectors.db"
|
|
16787
|
-
}
|
|
16768
|
+
upstash: {
|
|
16769
|
+
urlEnv: "UPSTASH_SEARCH_REST_URL",
|
|
16770
|
+
tokenEnv: "UPSTASH_SEARCH_REST_TOKEN"
|
|
16788
16771
|
},
|
|
16789
|
-
|
|
16790
|
-
|
|
16791
|
-
|
|
16792
|
-
|
|
16772
|
+
search: {
|
|
16773
|
+
semanticWeight: 0.75,
|
|
16774
|
+
inputEnrichment: true,
|
|
16775
|
+
reranking: true,
|
|
16776
|
+
dualSearch: true,
|
|
16777
|
+
pageSearchWeight: 0.3
|
|
16793
16778
|
},
|
|
16794
16779
|
ranking: {
|
|
16795
16780
|
enableIncomingLinkBoost: true,
|
|
@@ -16798,12 +16783,13 @@ function createDefaultConfig(projectId) {
|
|
|
16798
16783
|
aggregationCap: 5,
|
|
16799
16784
|
aggregationDecay: 0.5,
|
|
16800
16785
|
minChunkScoreRatio: 0.5,
|
|
16801
|
-
minScore: 0,
|
|
16786
|
+
minScore: 0.3,
|
|
16787
|
+
scoreGapThreshold: 0.4,
|
|
16802
16788
|
weights: {
|
|
16803
16789
|
incomingLinks: 0.05,
|
|
16804
16790
|
depth: 0.03,
|
|
16805
|
-
|
|
16806
|
-
|
|
16791
|
+
aggregation: 0.1,
|
|
16792
|
+
titleMatch: 0.15
|
|
16807
16793
|
}
|
|
16808
16794
|
},
|
|
16809
16795
|
api: {
|
|
@@ -16821,8 +16807,7 @@ function createDefaultConfig(projectId) {
|
|
|
16821
16807
|
}
|
|
16822
16808
|
},
|
|
16823
16809
|
state: {
|
|
16824
|
-
dir: ".searchsocket"
|
|
16825
|
-
writeMirror: false
|
|
16810
|
+
dir: ".searchsocket"
|
|
16826
16811
|
}
|
|
16827
16812
|
};
|
|
16828
16813
|
}
|
|
@@ -16908,6 +16893,8 @@ ${issues}`
|
|
|
16908
16893
|
...defaults.scope,
|
|
16909
16894
|
...parsed.scope
|
|
16910
16895
|
},
|
|
16896
|
+
exclude: parsed.exclude ?? defaults.exclude,
|
|
16897
|
+
respectRobotsTxt: parsed.respectRobotsTxt ?? defaults.respectRobotsTxt,
|
|
16911
16898
|
source: {
|
|
16912
16899
|
...defaults.source,
|
|
16913
16900
|
...parsed.source,
|
|
@@ -16944,21 +16931,13 @@ ${issues}`
|
|
|
16944
16931
|
...defaults.chunking,
|
|
16945
16932
|
...parsed.chunking
|
|
16946
16933
|
},
|
|
16947
|
-
|
|
16948
|
-
...defaults.
|
|
16949
|
-
...parsed.
|
|
16934
|
+
upstash: {
|
|
16935
|
+
...defaults.upstash,
|
|
16936
|
+
...parsed.upstash
|
|
16950
16937
|
},
|
|
16951
|
-
|
|
16952
|
-
...defaults.
|
|
16953
|
-
...parsed.
|
|
16954
|
-
turso: {
|
|
16955
|
-
...defaults.vector.turso,
|
|
16956
|
-
...parsed.vector?.turso
|
|
16957
|
-
}
|
|
16958
|
-
},
|
|
16959
|
-
rerank: {
|
|
16960
|
-
...defaults.rerank,
|
|
16961
|
-
...parsed.rerank
|
|
16938
|
+
search: {
|
|
16939
|
+
...defaults.search,
|
|
16940
|
+
...parsed.search
|
|
16962
16941
|
},
|
|
16963
16942
|
ranking: {
|
|
16964
16943
|
...defaults.ranking,
|
|
@@ -17137,660 +17116,245 @@ function resolveScope(config, override) {
|
|
|
17137
17116
|
scopeId: `${config.project.id}:${scopeName}`
|
|
17138
17117
|
};
|
|
17139
17118
|
}
|
|
17140
|
-
function sleep(ms) {
|
|
17141
|
-
return new Promise((resolve) => {
|
|
17142
|
-
setTimeout(resolve, ms);
|
|
17143
|
-
});
|
|
17144
|
-
}
|
|
17145
|
-
var JinaEmbeddingsProvider = class {
|
|
17146
|
-
apiKey;
|
|
17147
|
-
batchSize;
|
|
17148
|
-
concurrency;
|
|
17149
|
-
defaultTask;
|
|
17150
|
-
constructor(options) {
|
|
17151
|
-
if (!Number.isInteger(options.batchSize) || options.batchSize <= 0) {
|
|
17152
|
-
throw new Error(`Invalid batchSize: ${options.batchSize}. batchSize must be a positive integer.`);
|
|
17153
|
-
}
|
|
17154
|
-
if (!Number.isInteger(options.concurrency) || options.concurrency <= 0) {
|
|
17155
|
-
throw new Error(`Invalid concurrency: ${options.concurrency}. concurrency must be a positive integer.`);
|
|
17156
|
-
}
|
|
17157
|
-
this.apiKey = options.apiKey;
|
|
17158
|
-
this.batchSize = options.batchSize;
|
|
17159
|
-
this.concurrency = options.concurrency;
|
|
17160
|
-
this.defaultTask = options.task ?? "retrieval.passage";
|
|
17161
|
-
}
|
|
17162
|
-
estimateTokens(text) {
|
|
17163
|
-
const normalized = text.trim();
|
|
17164
|
-
if (!normalized) {
|
|
17165
|
-
return 0;
|
|
17166
|
-
}
|
|
17167
|
-
const wordCount = normalized.match(/[A-Za-z0-9_]+/g)?.length ?? 0;
|
|
17168
|
-
const punctuationCount = normalized.match(/[^\s\w]/g)?.length ?? 0;
|
|
17169
|
-
const cjkCount = normalized.match(/[\u3400-\u9fff]/g)?.length ?? 0;
|
|
17170
|
-
const charEstimate = Math.ceil(normalized.length / 4);
|
|
17171
|
-
const lexicalEstimate = Math.ceil(wordCount * 1.25 + punctuationCount * 0.45 + cjkCount * 1.6);
|
|
17172
|
-
return Math.max(1, Math.max(charEstimate, lexicalEstimate));
|
|
17173
|
-
}
|
|
17174
|
-
async embedTexts(texts, modelId, task) {
|
|
17175
|
-
if (texts.length === 0) {
|
|
17176
|
-
return [];
|
|
17177
|
-
}
|
|
17178
|
-
const batches = [];
|
|
17179
|
-
for (let i = 0; i < texts.length; i += this.batchSize) {
|
|
17180
|
-
batches.push({
|
|
17181
|
-
index: i,
|
|
17182
|
-
values: texts.slice(i, i + this.batchSize)
|
|
17183
|
-
});
|
|
17184
|
-
}
|
|
17185
|
-
const outputs = new Array(batches.length);
|
|
17186
|
-
const limit = pLimit2__default.default(this.concurrency);
|
|
17187
|
-
await Promise.all(
|
|
17188
|
-
batches.map(
|
|
17189
|
-
(batch, position) => limit(async () => {
|
|
17190
|
-
outputs[position] = await this.embedWithRetry(batch.values, modelId, task ?? this.defaultTask);
|
|
17191
|
-
})
|
|
17192
|
-
)
|
|
17193
|
-
);
|
|
17194
|
-
return outputs.flat();
|
|
17195
|
-
}
|
|
17196
|
-
async embedWithRetry(texts, modelId, task) {
|
|
17197
|
-
const maxAttempts = 5;
|
|
17198
|
-
let attempt = 0;
|
|
17199
|
-
while (attempt < maxAttempts) {
|
|
17200
|
-
attempt += 1;
|
|
17201
|
-
let response;
|
|
17202
|
-
try {
|
|
17203
|
-
response = await fetch("https://api.jina.ai/v1/embeddings", {
|
|
17204
|
-
method: "POST",
|
|
17205
|
-
headers: {
|
|
17206
|
-
"content-type": "application/json",
|
|
17207
|
-
authorization: `Bearer ${this.apiKey}`
|
|
17208
|
-
},
|
|
17209
|
-
body: JSON.stringify({
|
|
17210
|
-
model: modelId,
|
|
17211
|
-
input: texts,
|
|
17212
|
-
task
|
|
17213
|
-
})
|
|
17214
|
-
});
|
|
17215
|
-
} catch (error) {
|
|
17216
|
-
if (attempt >= maxAttempts) {
|
|
17217
|
-
throw error;
|
|
17218
|
-
}
|
|
17219
|
-
await sleep(Math.min(2 ** attempt * 300, 5e3));
|
|
17220
|
-
continue;
|
|
17221
|
-
}
|
|
17222
|
-
if (!response.ok) {
|
|
17223
|
-
const retryable = response.status === 429 || response.status >= 500;
|
|
17224
|
-
if (!retryable || attempt >= maxAttempts) {
|
|
17225
|
-
const errorBody = await response.text();
|
|
17226
|
-
throw new Error(`Jina embeddings failed (${response.status}): ${errorBody}`);
|
|
17227
|
-
}
|
|
17228
|
-
await sleep(Math.min(2 ** attempt * 300, 5e3));
|
|
17229
|
-
continue;
|
|
17230
|
-
}
|
|
17231
|
-
const payload = await response.json();
|
|
17232
|
-
if (!payload.data || !Array.isArray(payload.data)) {
|
|
17233
|
-
throw new Error("Invalid Jina embeddings response format");
|
|
17234
|
-
}
|
|
17235
|
-
return payload.data.map((entry) => entry.embedding);
|
|
17236
|
-
}
|
|
17237
|
-
throw new Error("Unreachable retry state");
|
|
17238
|
-
}
|
|
17239
|
-
};
|
|
17240
|
-
|
|
17241
|
-
// src/embeddings/factory.ts
|
|
17242
|
-
function createEmbeddingsProvider(config) {
|
|
17243
|
-
if (config.embeddings.provider !== "jina") {
|
|
17244
|
-
throw new SearchSocketError(
|
|
17245
|
-
"CONFIG_MISSING",
|
|
17246
|
-
`Unsupported embeddings provider ${config.embeddings.provider}`
|
|
17247
|
-
);
|
|
17248
|
-
}
|
|
17249
|
-
const apiKey = config.embeddings.apiKey ?? process.env[config.embeddings.apiKeyEnv];
|
|
17250
|
-
if (!apiKey) {
|
|
17251
|
-
throw new SearchSocketError(
|
|
17252
|
-
"CONFIG_MISSING",
|
|
17253
|
-
`Missing embeddings API key: provide embeddings.apiKey or set env var ${config.embeddings.apiKeyEnv}`
|
|
17254
|
-
);
|
|
17255
|
-
}
|
|
17256
|
-
return new JinaEmbeddingsProvider({
|
|
17257
|
-
apiKey,
|
|
17258
|
-
batchSize: config.embeddings.batchSize,
|
|
17259
|
-
concurrency: config.embeddings.concurrency
|
|
17260
|
-
});
|
|
17261
|
-
}
|
|
17262
|
-
|
|
17263
|
-
// src/rerank/jina.ts
|
|
17264
|
-
function sleep2(ms) {
|
|
17265
|
-
return new Promise((resolve) => {
|
|
17266
|
-
setTimeout(resolve, ms);
|
|
17267
|
-
});
|
|
17268
|
-
}
|
|
17269
|
-
var JinaReranker = class {
|
|
17270
|
-
apiKey;
|
|
17271
|
-
model;
|
|
17272
|
-
maxRetries;
|
|
17273
|
-
constructor(options) {
|
|
17274
|
-
this.apiKey = options.apiKey;
|
|
17275
|
-
this.model = options.model;
|
|
17276
|
-
this.maxRetries = options.maxRetries ?? 2;
|
|
17277
|
-
}
|
|
17278
|
-
async rerank(query, candidates, topN) {
|
|
17279
|
-
if (candidates.length === 0) {
|
|
17280
|
-
return [];
|
|
17281
|
-
}
|
|
17282
|
-
const body = {
|
|
17283
|
-
model: this.model,
|
|
17284
|
-
query,
|
|
17285
|
-
documents: candidates.map((candidate) => candidate.text),
|
|
17286
|
-
top_n: topN ?? candidates.length,
|
|
17287
|
-
return_documents: false
|
|
17288
|
-
};
|
|
17289
|
-
let attempt = 0;
|
|
17290
|
-
while (attempt <= this.maxRetries) {
|
|
17291
|
-
attempt += 1;
|
|
17292
|
-
let response;
|
|
17293
|
-
try {
|
|
17294
|
-
response = await fetch("https://api.jina.ai/v1/rerank", {
|
|
17295
|
-
method: "POST",
|
|
17296
|
-
headers: {
|
|
17297
|
-
"content-type": "application/json",
|
|
17298
|
-
authorization: `Bearer ${this.apiKey}`
|
|
17299
|
-
},
|
|
17300
|
-
body: JSON.stringify(body)
|
|
17301
|
-
});
|
|
17302
|
-
} catch (error) {
|
|
17303
|
-
if (attempt <= this.maxRetries) {
|
|
17304
|
-
await sleep2(Math.min(300 * 2 ** attempt, 4e3));
|
|
17305
|
-
continue;
|
|
17306
|
-
}
|
|
17307
|
-
throw error;
|
|
17308
|
-
}
|
|
17309
|
-
if (!response.ok) {
|
|
17310
|
-
const retryable = response.status === 429 || response.status >= 500;
|
|
17311
|
-
if (retryable && attempt <= this.maxRetries) {
|
|
17312
|
-
await sleep2(Math.min(300 * 2 ** attempt, 4e3));
|
|
17313
|
-
continue;
|
|
17314
|
-
}
|
|
17315
|
-
const errorBody = await response.text();
|
|
17316
|
-
throw new Error(`Jina rerank failed (${response.status}): ${errorBody}`);
|
|
17317
|
-
}
|
|
17318
|
-
const payload = await response.json();
|
|
17319
|
-
const rawResults = payload.results ?? payload.data ?? [];
|
|
17320
|
-
if (!Array.isArray(rawResults)) {
|
|
17321
|
-
throw new Error("Invalid Jina rerank response format");
|
|
17322
|
-
}
|
|
17323
|
-
return rawResults.flatMap((item) => {
|
|
17324
|
-
const index = item.index;
|
|
17325
|
-
if (typeof index !== "number" || index < 0 || index >= candidates.length) {
|
|
17326
|
-
return [];
|
|
17327
|
-
}
|
|
17328
|
-
const candidate = candidates[index];
|
|
17329
|
-
if (!candidate) {
|
|
17330
|
-
return [];
|
|
17331
|
-
}
|
|
17332
|
-
const score = typeof item.relevance_score === "number" ? item.relevance_score : item.score ?? 0;
|
|
17333
|
-
return [
|
|
17334
|
-
{
|
|
17335
|
-
id: candidate.id,
|
|
17336
|
-
score
|
|
17337
|
-
}
|
|
17338
|
-
];
|
|
17339
|
-
}).sort((a, b) => b.score - a.score);
|
|
17340
|
-
}
|
|
17341
|
-
throw new Error("Jina rerank request failed after retries");
|
|
17342
|
-
}
|
|
17343
|
-
};
|
|
17344
|
-
|
|
17345
|
-
// src/rerank/factory.ts
|
|
17346
|
-
function createReranker(config) {
|
|
17347
|
-
if (!config.rerank.enabled) {
|
|
17348
|
-
return null;
|
|
17349
|
-
}
|
|
17350
|
-
const apiKey = config.embeddings.apiKey ?? process.env[config.embeddings.apiKeyEnv];
|
|
17351
|
-
if (!apiKey) {
|
|
17352
|
-
return null;
|
|
17353
|
-
}
|
|
17354
|
-
return new JinaReranker({
|
|
17355
|
-
apiKey,
|
|
17356
|
-
model: config.rerank.model
|
|
17357
|
-
});
|
|
17358
|
-
}
|
|
17359
17119
|
function ensureStateDirs(cwd, stateDir, scope) {
|
|
17360
17120
|
const statePath = path__default.default.resolve(cwd, stateDir);
|
|
17361
|
-
|
|
17362
|
-
|
|
17363
|
-
return { statePath, pagesPath };
|
|
17121
|
+
fs__default.default.mkdirSync(statePath, { recursive: true });
|
|
17122
|
+
return { statePath };
|
|
17364
17123
|
}
|
|
17365
17124
|
|
|
17366
|
-
// src/vector/
|
|
17367
|
-
|
|
17125
|
+
// src/vector/upstash.ts
|
|
17126
|
+
function chunkIndexName(scope) {
|
|
17127
|
+
return `${scope.projectId}--${scope.scopeName}`;
|
|
17128
|
+
}
|
|
17129
|
+
function pageIndexName(scope) {
|
|
17130
|
+
return `${scope.projectId}--${scope.scopeName}--pages`;
|
|
17131
|
+
}
|
|
17132
|
+
var UpstashSearchStore = class {
|
|
17368
17133
|
client;
|
|
17369
|
-
dimension;
|
|
17370
|
-
chunksReady = false;
|
|
17371
|
-
registryReady = false;
|
|
17372
|
-
pagesReady = false;
|
|
17373
17134
|
constructor(opts) {
|
|
17374
17135
|
this.client = opts.client;
|
|
17375
|
-
this.dimension = opts.dimension;
|
|
17376
|
-
}
|
|
17377
|
-
async ensureRegistry() {
|
|
17378
|
-
if (this.registryReady) return;
|
|
17379
|
-
await this.client.execute(`
|
|
17380
|
-
CREATE TABLE IF NOT EXISTS registry (
|
|
17381
|
-
scope_key TEXT PRIMARY KEY,
|
|
17382
|
-
project_id TEXT NOT NULL,
|
|
17383
|
-
scope_name TEXT NOT NULL,
|
|
17384
|
-
model_id TEXT NOT NULL,
|
|
17385
|
-
last_indexed_at TEXT NOT NULL,
|
|
17386
|
-
vector_count INTEGER,
|
|
17387
|
-
last_estimate_tokens INTEGER,
|
|
17388
|
-
last_estimate_cost_usd REAL,
|
|
17389
|
-
last_estimate_changed_chunks INTEGER
|
|
17390
|
-
)
|
|
17391
|
-
`);
|
|
17392
|
-
const estimateCols = [
|
|
17393
|
-
{ name: "last_estimate_tokens", def: "INTEGER" },
|
|
17394
|
-
{ name: "last_estimate_cost_usd", def: "REAL" },
|
|
17395
|
-
{ name: "last_estimate_changed_chunks", def: "INTEGER" }
|
|
17396
|
-
];
|
|
17397
|
-
for (const col of estimateCols) {
|
|
17398
|
-
try {
|
|
17399
|
-
await this.client.execute(`ALTER TABLE registry ADD COLUMN ${col.name} ${col.def}`);
|
|
17400
|
-
} catch (error) {
|
|
17401
|
-
if (error instanceof Error && !error.message.includes("duplicate column")) {
|
|
17402
|
-
throw error;
|
|
17403
|
-
}
|
|
17404
|
-
}
|
|
17405
|
-
}
|
|
17406
|
-
this.registryReady = true;
|
|
17407
|
-
}
|
|
17408
|
-
async ensureChunks(dim) {
|
|
17409
|
-
if (this.chunksReady) return;
|
|
17410
|
-
const exists = await this.chunksTableExists();
|
|
17411
|
-
if (exists) {
|
|
17412
|
-
const currentDim = await this.getChunksDimension();
|
|
17413
|
-
if (currentDim !== null && currentDim !== dim) {
|
|
17414
|
-
await this.client.batch([
|
|
17415
|
-
"DROP INDEX IF EXISTS idx",
|
|
17416
|
-
"DROP TABLE IF EXISTS chunks"
|
|
17417
|
-
]);
|
|
17418
|
-
}
|
|
17419
|
-
}
|
|
17420
|
-
await this.client.batch([
|
|
17421
|
-
`CREATE TABLE IF NOT EXISTS chunks (
|
|
17422
|
-
id TEXT PRIMARY KEY,
|
|
17423
|
-
project_id TEXT NOT NULL,
|
|
17424
|
-
scope_name TEXT NOT NULL,
|
|
17425
|
-
url TEXT NOT NULL,
|
|
17426
|
-
path TEXT NOT NULL,
|
|
17427
|
-
title TEXT NOT NULL,
|
|
17428
|
-
section_title TEXT NOT NULL DEFAULT '',
|
|
17429
|
-
heading_path TEXT NOT NULL DEFAULT '[]',
|
|
17430
|
-
snippet TEXT NOT NULL DEFAULT '',
|
|
17431
|
-
chunk_text TEXT NOT NULL DEFAULT '',
|
|
17432
|
-
ordinal INTEGER NOT NULL DEFAULT 0,
|
|
17433
|
-
content_hash TEXT NOT NULL DEFAULT '',
|
|
17434
|
-
model_id TEXT NOT NULL DEFAULT '',
|
|
17435
|
-
depth INTEGER NOT NULL DEFAULT 0,
|
|
17436
|
-
incoming_links INTEGER NOT NULL DEFAULT 0,
|
|
17437
|
-
route_file TEXT NOT NULL DEFAULT '',
|
|
17438
|
-
tags TEXT NOT NULL DEFAULT '[]',
|
|
17439
|
-
description TEXT NOT NULL DEFAULT '',
|
|
17440
|
-
keywords TEXT NOT NULL DEFAULT '[]',
|
|
17441
|
-
embedding F32_BLOB(${dim})
|
|
17442
|
-
)`,
|
|
17443
|
-
`CREATE INDEX IF NOT EXISTS idx ON chunks (libsql_vector_idx(embedding, 'metric=cosine'))`
|
|
17444
|
-
]);
|
|
17445
|
-
this.chunksReady = true;
|
|
17446
|
-
}
|
|
17447
|
-
async ensurePages() {
|
|
17448
|
-
if (this.pagesReady) return;
|
|
17449
|
-
await this.client.execute(`
|
|
17450
|
-
CREATE TABLE IF NOT EXISTS pages (
|
|
17451
|
-
project_id TEXT NOT NULL,
|
|
17452
|
-
scope_name TEXT NOT NULL,
|
|
17453
|
-
url TEXT NOT NULL,
|
|
17454
|
-
title TEXT NOT NULL,
|
|
17455
|
-
markdown TEXT NOT NULL,
|
|
17456
|
-
route_file TEXT NOT NULL DEFAULT '',
|
|
17457
|
-
route_resolution TEXT NOT NULL DEFAULT 'exact',
|
|
17458
|
-
incoming_links INTEGER NOT NULL DEFAULT 0,
|
|
17459
|
-
outgoing_links INTEGER NOT NULL DEFAULT 0,
|
|
17460
|
-
depth INTEGER NOT NULL DEFAULT 0,
|
|
17461
|
-
tags TEXT NOT NULL DEFAULT '[]',
|
|
17462
|
-
indexed_at TEXT NOT NULL,
|
|
17463
|
-
PRIMARY KEY (project_id, scope_name, url)
|
|
17464
|
-
)
|
|
17465
|
-
`);
|
|
17466
|
-
this.pagesReady = true;
|
|
17467
|
-
}
|
|
17468
|
-
async chunksTableExists() {
|
|
17469
|
-
try {
|
|
17470
|
-
await this.client.execute("SELECT 1 FROM chunks LIMIT 0");
|
|
17471
|
-
return true;
|
|
17472
|
-
} catch (error) {
|
|
17473
|
-
if (error instanceof Error && error.message.includes("no such table")) {
|
|
17474
|
-
return false;
|
|
17475
|
-
}
|
|
17476
|
-
throw error;
|
|
17477
|
-
}
|
|
17478
17136
|
}
|
|
17479
|
-
|
|
17480
|
-
|
|
17481
|
-
* Returns null if the table doesn't exist or the dimension can't be parsed.
|
|
17482
|
-
*/
|
|
17483
|
-
async getChunksDimension() {
|
|
17484
|
-
try {
|
|
17485
|
-
const rs = await this.client.execute(
|
|
17486
|
-
"SELECT sql FROM sqlite_master WHERE type='table' AND name='chunks'"
|
|
17487
|
-
);
|
|
17488
|
-
if (rs.rows.length === 0) return null;
|
|
17489
|
-
const sql = rs.rows[0].sql;
|
|
17490
|
-
const match = sql.match(/F32_BLOB\((\d+)\)/i);
|
|
17491
|
-
return match ? parseInt(match[1], 10) : null;
|
|
17492
|
-
} catch {
|
|
17493
|
-
return null;
|
|
17494
|
-
}
|
|
17137
|
+
chunkIndex(scope) {
|
|
17138
|
+
return this.client.index(chunkIndexName(scope));
|
|
17495
17139
|
}
|
|
17496
|
-
|
|
17497
|
-
|
|
17498
|
-
* Used by `clean --remote` for a full reset.
|
|
17499
|
-
*/
|
|
17500
|
-
async dropAllTables() {
|
|
17501
|
-
await this.client.batch([
|
|
17502
|
-
"DROP INDEX IF EXISTS idx",
|
|
17503
|
-
"DROP TABLE IF EXISTS chunks",
|
|
17504
|
-
"DROP TABLE IF EXISTS registry",
|
|
17505
|
-
"DROP TABLE IF EXISTS pages"
|
|
17506
|
-
]);
|
|
17507
|
-
this.chunksReady = false;
|
|
17508
|
-
this.registryReady = false;
|
|
17509
|
-
this.pagesReady = false;
|
|
17140
|
+
pageIndex(scope) {
|
|
17141
|
+
return this.client.index(pageIndexName(scope));
|
|
17510
17142
|
}
|
|
17511
|
-
async
|
|
17512
|
-
if (
|
|
17513
|
-
const
|
|
17514
|
-
await this.ensureChunks(dim);
|
|
17143
|
+
async upsertChunks(chunks, scope) {
|
|
17144
|
+
if (chunks.length === 0) return;
|
|
17145
|
+
const index = this.chunkIndex(scope);
|
|
17515
17146
|
const BATCH_SIZE = 100;
|
|
17516
|
-
for (let i = 0; i <
|
|
17517
|
-
const batch =
|
|
17518
|
-
|
|
17519
|
-
sql: `INSERT OR REPLACE INTO chunks
|
|
17520
|
-
(id, project_id, scope_name, url, path, title, section_title,
|
|
17521
|
-
heading_path, snippet, chunk_text, ordinal, content_hash, model_id, depth,
|
|
17522
|
-
incoming_links, route_file, tags, description, keywords, embedding)
|
|
17523
|
-
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
|
|
17524
|
-
args: [
|
|
17525
|
-
r.id,
|
|
17526
|
-
r.metadata.projectId,
|
|
17527
|
-
r.metadata.scopeName,
|
|
17528
|
-
r.metadata.url,
|
|
17529
|
-
r.metadata.path,
|
|
17530
|
-
r.metadata.title,
|
|
17531
|
-
r.metadata.sectionTitle,
|
|
17532
|
-
JSON.stringify(r.metadata.headingPath),
|
|
17533
|
-
r.metadata.snippet,
|
|
17534
|
-
r.metadata.chunkText,
|
|
17535
|
-
r.metadata.ordinal,
|
|
17536
|
-
r.metadata.contentHash,
|
|
17537
|
-
r.metadata.modelId,
|
|
17538
|
-
r.metadata.depth,
|
|
17539
|
-
r.metadata.incomingLinks,
|
|
17540
|
-
r.metadata.routeFile,
|
|
17541
|
-
JSON.stringify(r.metadata.tags),
|
|
17542
|
-
r.metadata.description ?? "",
|
|
17543
|
-
JSON.stringify(r.metadata.keywords ?? []),
|
|
17544
|
-
JSON.stringify(r.vector)
|
|
17545
|
-
]
|
|
17546
|
-
}));
|
|
17547
|
-
await this.client.batch(stmts);
|
|
17147
|
+
for (let i = 0; i < chunks.length; i += BATCH_SIZE) {
|
|
17148
|
+
const batch = chunks.slice(i, i + BATCH_SIZE);
|
|
17149
|
+
await index.upsert(batch);
|
|
17548
17150
|
}
|
|
17549
17151
|
}
|
|
17550
|
-
async query
|
|
17551
|
-
const
|
|
17552
|
-
await
|
|
17553
|
-
|
|
17554
|
-
|
|
17555
|
-
|
|
17556
|
-
|
|
17557
|
-
|
|
17558
|
-
|
|
17559
|
-
c.description, c.keywords,
|
|
17560
|
-
vector_distance_cos(c.embedding, vector(?)) AS distance
|
|
17561
|
-
FROM vector_top_k('idx', vector(?), ?) AS v
|
|
17562
|
-
JOIN chunks AS c ON c.rowid = v.id`,
|
|
17563
|
-
args: [queryJson, queryJson, opts.topK]
|
|
17152
|
+
async search(query, opts, scope) {
|
|
17153
|
+
const index = this.chunkIndex(scope);
|
|
17154
|
+
const results = await index.search({
|
|
17155
|
+
query,
|
|
17156
|
+
limit: opts.limit,
|
|
17157
|
+
semanticWeight: opts.semanticWeight,
|
|
17158
|
+
inputEnrichment: opts.inputEnrichment,
|
|
17159
|
+
reranking: opts.reranking,
|
|
17160
|
+
filter: opts.filter
|
|
17564
17161
|
});
|
|
17565
|
-
|
|
17566
|
-
|
|
17567
|
-
|
|
17568
|
-
|
|
17569
|
-
|
|
17570
|
-
|
|
17571
|
-
|
|
17572
|
-
|
|
17573
|
-
|
|
17574
|
-
|
|
17575
|
-
|
|
17576
|
-
|
|
17577
|
-
|
|
17578
|
-
|
|
17579
|
-
|
|
17580
|
-
|
|
17162
|
+
return results.map((doc) => ({
|
|
17163
|
+
id: doc.id,
|
|
17164
|
+
score: doc.score,
|
|
17165
|
+
metadata: {
|
|
17166
|
+
projectId: doc.metadata?.projectId ?? "",
|
|
17167
|
+
scopeName: doc.metadata?.scopeName ?? "",
|
|
17168
|
+
url: doc.content.url,
|
|
17169
|
+
path: doc.metadata?.path ?? "",
|
|
17170
|
+
title: doc.content.title,
|
|
17171
|
+
sectionTitle: doc.content.sectionTitle,
|
|
17172
|
+
headingPath: doc.content.headingPath ? doc.content.headingPath.split(" > ").filter(Boolean) : [],
|
|
17173
|
+
snippet: doc.metadata?.snippet ?? "",
|
|
17174
|
+
chunkText: doc.content.text,
|
|
17175
|
+
ordinal: doc.metadata?.ordinal ?? 0,
|
|
17176
|
+
contentHash: doc.metadata?.contentHash ?? "",
|
|
17177
|
+
depth: doc.metadata?.depth ?? 0,
|
|
17178
|
+
incomingLinks: doc.metadata?.incomingLinks ?? 0,
|
|
17179
|
+
routeFile: doc.metadata?.routeFile ?? "",
|
|
17180
|
+
tags: doc.content.tags ? doc.content.tags.split(",").filter(Boolean) : [],
|
|
17181
|
+
description: doc.metadata?.description || void 0,
|
|
17182
|
+
keywords: doc.metadata?.keywords ? doc.metadata.keywords.split(",").filter(Boolean) : void 0
|
|
17581
17183
|
}
|
|
17582
|
-
|
|
17583
|
-
|
|
17584
|
-
|
|
17585
|
-
|
|
17586
|
-
|
|
17587
|
-
|
|
17588
|
-
|
|
17589
|
-
|
|
17590
|
-
|
|
17591
|
-
|
|
17592
|
-
|
|
17593
|
-
|
|
17594
|
-
|
|
17595
|
-
})();
|
|
17596
|
-
hits.push({
|
|
17597
|
-
id: row.id,
|
|
17598
|
-
score,
|
|
17599
|
-
metadata: {
|
|
17600
|
-
projectId,
|
|
17601
|
-
scopeName,
|
|
17602
|
-
url: row.url,
|
|
17603
|
-
path: rowPath,
|
|
17604
|
-
title: row.title,
|
|
17605
|
-
sectionTitle: row.section_title,
|
|
17606
|
-
headingPath: JSON.parse(row.heading_path || "[]"),
|
|
17607
|
-
snippet: row.snippet,
|
|
17608
|
-
chunkText: row.chunk_text || "",
|
|
17609
|
-
ordinal: row.ordinal || 0,
|
|
17610
|
-
contentHash: row.content_hash,
|
|
17611
|
-
modelId: row.model_id,
|
|
17612
|
-
depth: row.depth,
|
|
17613
|
-
incomingLinks: row.incoming_links,
|
|
17614
|
-
routeFile: row.route_file,
|
|
17615
|
-
tags,
|
|
17616
|
-
description,
|
|
17617
|
-
keywords
|
|
17618
|
-
}
|
|
17184
|
+
}));
|
|
17185
|
+
}
|
|
17186
|
+
async searchPages(query, opts, scope) {
|
|
17187
|
+
const index = this.pageIndex(scope);
|
|
17188
|
+
let results;
|
|
17189
|
+
try {
|
|
17190
|
+
results = await index.search({
|
|
17191
|
+
query,
|
|
17192
|
+
limit: opts.limit,
|
|
17193
|
+
semanticWeight: opts.semanticWeight,
|
|
17194
|
+
inputEnrichment: opts.inputEnrichment,
|
|
17195
|
+
reranking: true,
|
|
17196
|
+
filter: opts.filter
|
|
17619
17197
|
});
|
|
17198
|
+
} catch {
|
|
17199
|
+
return [];
|
|
17620
17200
|
}
|
|
17621
|
-
|
|
17622
|
-
|
|
17201
|
+
return results.map((doc) => ({
|
|
17202
|
+
id: doc.id,
|
|
17203
|
+
score: doc.score,
|
|
17204
|
+
title: doc.content.title,
|
|
17205
|
+
url: doc.content.url,
|
|
17206
|
+
description: doc.content.description ?? "",
|
|
17207
|
+
tags: doc.content.tags ? doc.content.tags.split(",").filter(Boolean) : [],
|
|
17208
|
+
depth: doc.metadata?.depth ?? 0,
|
|
17209
|
+
incomingLinks: doc.metadata?.incomingLinks ?? 0,
|
|
17210
|
+
routeFile: doc.metadata?.routeFile ?? ""
|
|
17211
|
+
}));
|
|
17623
17212
|
}
|
|
17624
17213
|
async deleteByIds(ids, scope) {
|
|
17625
17214
|
if (ids.length === 0) return;
|
|
17215
|
+
const index = this.chunkIndex(scope);
|
|
17626
17216
|
const BATCH_SIZE = 500;
|
|
17627
17217
|
for (let i = 0; i < ids.length; i += BATCH_SIZE) {
|
|
17628
17218
|
const batch = ids.slice(i, i + BATCH_SIZE);
|
|
17629
|
-
|
|
17630
|
-
await this.client.execute({
|
|
17631
|
-
sql: `DELETE FROM chunks WHERE project_id = ? AND scope_name = ? AND id IN (${placeholders})`,
|
|
17632
|
-
args: [scope.projectId, scope.scopeName, ...batch]
|
|
17633
|
-
});
|
|
17219
|
+
await index.delete(batch);
|
|
17634
17220
|
}
|
|
17635
17221
|
}
|
|
17636
17222
|
async deleteScope(scope) {
|
|
17637
|
-
await this.ensureRegistry();
|
|
17638
17223
|
try {
|
|
17639
|
-
|
|
17640
|
-
|
|
17641
|
-
|
|
17642
|
-
});
|
|
17643
|
-
} catch (error) {
|
|
17644
|
-
if (error instanceof Error && !error.message.includes("no such table")) {
|
|
17645
|
-
throw error;
|
|
17646
|
-
}
|
|
17224
|
+
const chunkIdx = this.chunkIndex(scope);
|
|
17225
|
+
await chunkIdx.deleteIndex();
|
|
17226
|
+
} catch {
|
|
17647
17227
|
}
|
|
17648
17228
|
try {
|
|
17649
|
-
|
|
17650
|
-
|
|
17651
|
-
|
|
17652
|
-
});
|
|
17653
|
-
} catch (error) {
|
|
17654
|
-
if (error instanceof Error && !error.message.includes("no such table")) {
|
|
17655
|
-
throw error;
|
|
17656
|
-
}
|
|
17229
|
+
const pageIdx = this.pageIndex(scope);
|
|
17230
|
+
await pageIdx.deleteIndex();
|
|
17231
|
+
} catch {
|
|
17657
17232
|
}
|
|
17658
|
-
await this.client.execute({
|
|
17659
|
-
sql: `DELETE FROM registry WHERE project_id = ? AND scope_name = ?`,
|
|
17660
|
-
args: [scope.projectId, scope.scopeName]
|
|
17661
|
-
});
|
|
17662
17233
|
}
|
|
17663
|
-
async listScopes(
|
|
17664
|
-
await this.
|
|
17665
|
-
const
|
|
17666
|
-
|
|
17667
|
-
|
|
17668
|
-
|
|
17669
|
-
|
|
17670
|
-
|
|
17671
|
-
|
|
17672
|
-
|
|
17673
|
-
|
|
17674
|
-
|
|
17675
|
-
|
|
17676
|
-
|
|
17677
|
-
|
|
17678
|
-
|
|
17679
|
-
|
|
17680
|
-
|
|
17681
|
-
|
|
17682
|
-
|
|
17683
|
-
|
|
17684
|
-
|
|
17685
|
-
|
|
17686
|
-
|
|
17687
|
-
|
|
17688
|
-
|
|
17689
|
-
|
|
17690
|
-
|
|
17691
|
-
|
|
17692
|
-
|
|
17693
|
-
|
|
17694
|
-
|
|
17695
|
-
|
|
17696
|
-
|
|
17697
|
-
|
|
17698
|
-
info.lastEstimateCostUSD ?? null,
|
|
17699
|
-
info.lastEstimateChangedChunks ?? null
|
|
17700
|
-
]
|
|
17701
|
-
});
|
|
17234
|
+
async listScopes(projectId) {
|
|
17235
|
+
const allIndexes = await this.client.listIndexes();
|
|
17236
|
+
const prefix = `${projectId}--`;
|
|
17237
|
+
const scopeNames = /* @__PURE__ */ new Set();
|
|
17238
|
+
for (const name of allIndexes) {
|
|
17239
|
+
if (name.startsWith(prefix) && !name.endsWith("--pages")) {
|
|
17240
|
+
const scopeName = name.slice(prefix.length);
|
|
17241
|
+
scopeNames.add(scopeName);
|
|
17242
|
+
}
|
|
17243
|
+
}
|
|
17244
|
+
const scopes = [];
|
|
17245
|
+
for (const scopeName of scopeNames) {
|
|
17246
|
+
const scope = {
|
|
17247
|
+
projectId,
|
|
17248
|
+
scopeName,
|
|
17249
|
+
scopeId: `${projectId}:${scopeName}`
|
|
17250
|
+
};
|
|
17251
|
+
try {
|
|
17252
|
+
const info = await this.chunkIndex(scope).info();
|
|
17253
|
+
scopes.push({
|
|
17254
|
+
projectId,
|
|
17255
|
+
scopeName,
|
|
17256
|
+
lastIndexedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
17257
|
+
documentCount: info.documentCount
|
|
17258
|
+
});
|
|
17259
|
+
} catch {
|
|
17260
|
+
scopes.push({
|
|
17261
|
+
projectId,
|
|
17262
|
+
scopeName,
|
|
17263
|
+
lastIndexedAt: "unknown",
|
|
17264
|
+
documentCount: 0
|
|
17265
|
+
});
|
|
17266
|
+
}
|
|
17267
|
+
}
|
|
17268
|
+
return scopes;
|
|
17702
17269
|
}
|
|
17703
17270
|
async getContentHashes(scope) {
|
|
17704
|
-
const exists = await this.chunksTableExists();
|
|
17705
|
-
if (!exists) return /* @__PURE__ */ new Map();
|
|
17706
|
-
const rs = await this.client.execute({
|
|
17707
|
-
sql: `SELECT id, content_hash FROM chunks WHERE project_id = ? AND scope_name = ?`,
|
|
17708
|
-
args: [scope.projectId, scope.scopeName]
|
|
17709
|
-
});
|
|
17710
17271
|
const map = /* @__PURE__ */ new Map();
|
|
17711
|
-
|
|
17712
|
-
|
|
17272
|
+
const index = this.chunkIndex(scope);
|
|
17273
|
+
let cursor = "0";
|
|
17274
|
+
try {
|
|
17275
|
+
for (; ; ) {
|
|
17276
|
+
const result = await index.range({ cursor, limit: 100 });
|
|
17277
|
+
for (const doc of result.documents) {
|
|
17278
|
+
if (doc.metadata?.contentHash) {
|
|
17279
|
+
map.set(doc.id, doc.metadata.contentHash);
|
|
17280
|
+
}
|
|
17281
|
+
}
|
|
17282
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17283
|
+
cursor = result.nextCursor;
|
|
17284
|
+
}
|
|
17285
|
+
} catch {
|
|
17713
17286
|
}
|
|
17714
17287
|
return map;
|
|
17715
17288
|
}
|
|
17716
17289
|
async upsertPages(pages, scope) {
|
|
17717
17290
|
if (pages.length === 0) return;
|
|
17718
|
-
|
|
17719
|
-
|
|
17720
|
-
if (page.projectId !== scope.projectId || page.scopeName !== scope.scopeName) {
|
|
17721
|
-
throw new Error(
|
|
17722
|
-
`Page scope mismatch: page has ${page.projectId}:${page.scopeName} but scope is ${scope.projectId}:${scope.scopeName}`
|
|
17723
|
-
);
|
|
17724
|
-
}
|
|
17725
|
-
}
|
|
17726
|
-
const BATCH_SIZE = 100;
|
|
17291
|
+
const index = this.pageIndex(scope);
|
|
17292
|
+
const BATCH_SIZE = 50;
|
|
17727
17293
|
for (let i = 0; i < pages.length; i += BATCH_SIZE) {
|
|
17728
17294
|
const batch = pages.slice(i, i + BATCH_SIZE);
|
|
17729
|
-
const
|
|
17730
|
-
|
|
17731
|
-
|
|
17732
|
-
|
|
17733
|
-
|
|
17734
|
-
|
|
17735
|
-
p.
|
|
17736
|
-
p.
|
|
17737
|
-
p.
|
|
17738
|
-
p.
|
|
17739
|
-
|
|
17740
|
-
|
|
17741
|
-
p.
|
|
17742
|
-
p.
|
|
17743
|
-
p.
|
|
17744
|
-
p.
|
|
17745
|
-
|
|
17746
|
-
p.
|
|
17747
|
-
|
|
17295
|
+
const docs = batch.map((p) => ({
|
|
17296
|
+
id: p.url,
|
|
17297
|
+
content: {
|
|
17298
|
+
title: p.title,
|
|
17299
|
+
url: p.url,
|
|
17300
|
+
type: "page",
|
|
17301
|
+
description: p.description ?? "",
|
|
17302
|
+
keywords: (p.keywords ?? []).join(","),
|
|
17303
|
+
summary: p.summary ?? "",
|
|
17304
|
+
tags: p.tags.join(",")
|
|
17305
|
+
},
|
|
17306
|
+
metadata: {
|
|
17307
|
+
markdown: p.markdown,
|
|
17308
|
+
projectId: p.projectId,
|
|
17309
|
+
scopeName: p.scopeName,
|
|
17310
|
+
routeFile: p.routeFile,
|
|
17311
|
+
routeResolution: p.routeResolution,
|
|
17312
|
+
incomingLinks: p.incomingLinks,
|
|
17313
|
+
outgoingLinks: p.outgoingLinks,
|
|
17314
|
+
depth: p.depth,
|
|
17315
|
+
indexedAt: p.indexedAt
|
|
17316
|
+
}
|
|
17748
17317
|
}));
|
|
17749
|
-
await
|
|
17318
|
+
await index.upsert(docs);
|
|
17750
17319
|
}
|
|
17751
17320
|
}
|
|
17752
17321
|
async getPage(url, scope) {
|
|
17753
|
-
|
|
17754
|
-
|
|
17755
|
-
|
|
17756
|
-
|
|
17757
|
-
|
|
17758
|
-
|
|
17759
|
-
|
|
17760
|
-
|
|
17761
|
-
|
|
17762
|
-
|
|
17763
|
-
|
|
17764
|
-
|
|
17765
|
-
|
|
17766
|
-
|
|
17767
|
-
|
|
17768
|
-
|
|
17769
|
-
|
|
17770
|
-
|
|
17771
|
-
|
|
17772
|
-
|
|
17773
|
-
|
|
17322
|
+
const index = this.pageIndex(scope);
|
|
17323
|
+
try {
|
|
17324
|
+
const results = await index.fetch([url]);
|
|
17325
|
+
const doc = results[0];
|
|
17326
|
+
if (!doc) return null;
|
|
17327
|
+
return {
|
|
17328
|
+
url: doc.content.url,
|
|
17329
|
+
title: doc.content.title,
|
|
17330
|
+
markdown: doc.metadata.markdown,
|
|
17331
|
+
projectId: doc.metadata.projectId,
|
|
17332
|
+
scopeName: doc.metadata.scopeName,
|
|
17333
|
+
routeFile: doc.metadata.routeFile,
|
|
17334
|
+
routeResolution: doc.metadata.routeResolution,
|
|
17335
|
+
incomingLinks: doc.metadata.incomingLinks,
|
|
17336
|
+
outgoingLinks: doc.metadata.outgoingLinks,
|
|
17337
|
+
depth: doc.metadata.depth,
|
|
17338
|
+
tags: doc.content.tags ? doc.content.tags.split(",").filter(Boolean) : [],
|
|
17339
|
+
indexedAt: doc.metadata.indexedAt,
|
|
17340
|
+
summary: doc.content.summary || void 0,
|
|
17341
|
+
description: doc.content.description || void 0,
|
|
17342
|
+
keywords: doc.content.keywords ? doc.content.keywords.split(",").filter(Boolean) : void 0
|
|
17343
|
+
};
|
|
17344
|
+
} catch {
|
|
17345
|
+
return null;
|
|
17346
|
+
}
|
|
17774
17347
|
}
|
|
17775
17348
|
async deletePages(scope) {
|
|
17776
|
-
|
|
17777
|
-
|
|
17778
|
-
|
|
17779
|
-
|
|
17780
|
-
}
|
|
17781
|
-
}
|
|
17782
|
-
async getScopeModelId(scope) {
|
|
17783
|
-
await this.ensureRegistry();
|
|
17784
|
-
const rs = await this.client.execute({
|
|
17785
|
-
sql: `SELECT model_id FROM registry WHERE project_id = ? AND scope_name = ?`,
|
|
17786
|
-
args: [scope.projectId, scope.scopeName]
|
|
17787
|
-
});
|
|
17788
|
-
if (rs.rows.length === 0) return null;
|
|
17789
|
-
return rs.rows[0].model_id;
|
|
17349
|
+
try {
|
|
17350
|
+
const index = this.pageIndex(scope);
|
|
17351
|
+
await index.reset();
|
|
17352
|
+
} catch {
|
|
17353
|
+
}
|
|
17790
17354
|
}
|
|
17791
17355
|
async health() {
|
|
17792
17356
|
try {
|
|
17793
|
-
await this.client.
|
|
17357
|
+
await this.client.info();
|
|
17794
17358
|
return { ok: true };
|
|
17795
17359
|
} catch (error) {
|
|
17796
17360
|
return {
|
|
@@ -17799,40 +17363,34 @@ var TursoVectorStore = class {
|
|
|
17799
17363
|
};
|
|
17800
17364
|
}
|
|
17801
17365
|
}
|
|
17366
|
+
async dropAllIndexes(projectId) {
|
|
17367
|
+
const allIndexes = await this.client.listIndexes();
|
|
17368
|
+
const prefix = `${projectId}--`;
|
|
17369
|
+
for (const name of allIndexes) {
|
|
17370
|
+
if (name.startsWith(prefix)) {
|
|
17371
|
+
try {
|
|
17372
|
+
const index = this.client.index(name);
|
|
17373
|
+
await index.deleteIndex();
|
|
17374
|
+
} catch {
|
|
17375
|
+
}
|
|
17376
|
+
}
|
|
17377
|
+
}
|
|
17378
|
+
}
|
|
17802
17379
|
};
|
|
17803
17380
|
|
|
17804
17381
|
// src/vector/factory.ts
|
|
17805
|
-
async function
|
|
17806
|
-
const
|
|
17807
|
-
const
|
|
17808
|
-
if (
|
|
17809
|
-
const { createClient: createClient2 } = await import('@libsql/client/http');
|
|
17810
|
-
const authToken = turso.authToken ?? process.env[turso.authTokenEnv];
|
|
17811
|
-
const client2 = createClient2({
|
|
17812
|
-
url: remoteUrl,
|
|
17813
|
-
authToken
|
|
17814
|
-
});
|
|
17815
|
-
return new TursoVectorStore({
|
|
17816
|
-
client: client2,
|
|
17817
|
-
dimension: config.vector.dimension
|
|
17818
|
-
});
|
|
17819
|
-
}
|
|
17820
|
-
if (isServerless()) {
|
|
17382
|
+
/**
 * Build an UpstashSearchStore from the `upstash` section of the config.
 *
 * Explicit `upstash.url` / `upstash.token` values take precedence; otherwise
 * the environment variables named by `upstash.urlEnv` / `upstash.tokenEnv`
 * are consulted.
 *
 * @param config Resolved configuration containing an `upstash` section.
 * @returns A store wrapping a freshly created `@upstash/search` client.
 * @throws SearchSocketError ("VECTOR_BACKEND_UNAVAILABLE") when either credential is missing.
 */
async function createUpstashStore(config) {
  const { upstash } = config;
  const url = upstash.url ?? process.env[upstash.urlEnv];
  const token = upstash.token ?? process.env[upstash.tokenEnv];
  const hasCredentials = Boolean(url) && Boolean(token);
  if (!hasCredentials) {
    throw new SearchSocketError(
      "VECTOR_BACKEND_UNAVAILABLE",
      `Missing Upstash Search credentials. Set ${config.upstash.urlEnv} and ${config.upstash.tokenEnv} environment variables, or pass upstash.url and upstash.token in your config.`
    );
  }
  // Loaded lazily, only once credentials are known to be present.
  const upstashSearch = await import('@upstash/search');
  const client = new upstashSearch.Search({ url, token });
  return new UpstashSearchStore({ client });
}
|
|
17837
17395
|
function sha1(input) {
|
|
17838
17396
|
return crypto.createHash("sha1").update(input).digest("hex");
|
|
@@ -17851,13 +17409,6 @@ function normalizeUrlPath(rawPath) {
|
|
|
17851
17409
|
}
|
|
17852
17410
|
return out;
|
|
17853
17411
|
}
|
|
17854
|
-
function urlPathToMirrorRelative(urlPath) {
|
|
17855
|
-
const normalized = normalizeUrlPath(urlPath);
|
|
17856
|
-
if (normalized === "/") {
|
|
17857
|
-
return "index.md";
|
|
17858
|
-
}
|
|
17859
|
-
return `${normalized.slice(1)}.md`;
|
|
17860
|
-
}
|
|
17861
17412
|
function staticHtmlFileToUrl(filePath, rootDir) {
|
|
17862
17413
|
const relative = path__default.default.relative(rootDir, filePath).replace(/\\/g, "/");
|
|
17863
17414
|
if (relative === "index.html") {
|
|
@@ -18132,7 +17683,7 @@ function buildEmbeddingText(chunk, prependTitle) {
|
|
|
18132
17683
|
|
|
18133
17684
|
${chunk.chunkText}`;
|
|
18134
17685
|
}
|
|
18135
|
-
function
|
|
17686
|
+
function chunkPage(page, config, scope) {
|
|
18136
17687
|
const sections = parseHeadingSections(page.markdown, config.chunking.headingPathDepth);
|
|
18137
17688
|
const rawChunks = sections.flatMap((section) => splitSection(section, config.chunking));
|
|
18138
17689
|
const chunks = [];
|
|
@@ -19049,6 +18600,17 @@ function extractFromHtml(url, html, config) {
|
|
|
19049
18600
|
if ($(`[${config.extract.noindexAttr}]`).length > 0) {
|
|
19050
18601
|
return null;
|
|
19051
18602
|
}
|
|
18603
|
+
const weightRaw = $("meta[name='searchsocket-weight']").attr("content")?.trim();
|
|
18604
|
+
let weight;
|
|
18605
|
+
if (weightRaw !== void 0) {
|
|
18606
|
+
const parsed = Number(weightRaw);
|
|
18607
|
+
if (Number.isFinite(parsed) && parsed >= 0) {
|
|
18608
|
+
weight = parsed;
|
|
18609
|
+
}
|
|
18610
|
+
}
|
|
18611
|
+
if (weight === 0) {
|
|
18612
|
+
return null;
|
|
18613
|
+
}
|
|
19052
18614
|
const description = $("meta[name='description']").attr("content")?.trim() || $("meta[property='og:description']").attr("content")?.trim() || void 0;
|
|
19053
18615
|
const keywordsRaw = $("meta[name='keywords']").attr("content")?.trim();
|
|
19054
18616
|
const keywords = keywordsRaw ? keywordsRaw.split(",").map((k) => k.trim()).filter(Boolean) : void 0;
|
|
@@ -19104,7 +18666,8 @@ function extractFromHtml(url, html, config) {
|
|
|
19104
18666
|
noindex: false,
|
|
19105
18667
|
tags,
|
|
19106
18668
|
description,
|
|
19107
|
-
keywords
|
|
18669
|
+
keywords,
|
|
18670
|
+
weight
|
|
19108
18671
|
};
|
|
19109
18672
|
}
|
|
19110
18673
|
function extractFromMarkdown(url, markdown, title) {
|
|
@@ -19117,6 +18680,14 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
19117
18680
|
if (frontmatter.noindex === true || searchsocketMeta?.noindex === true) {
|
|
19118
18681
|
return null;
|
|
19119
18682
|
}
|
|
18683
|
+
let mdWeight;
|
|
18684
|
+
const rawWeight = searchsocketMeta?.weight ?? frontmatter.searchsocketWeight;
|
|
18685
|
+
if (typeof rawWeight === "number" && Number.isFinite(rawWeight) && rawWeight >= 0) {
|
|
18686
|
+
mdWeight = rawWeight;
|
|
18687
|
+
}
|
|
18688
|
+
if (mdWeight === 0) {
|
|
18689
|
+
return null;
|
|
18690
|
+
}
|
|
19120
18691
|
const content = parsed.content;
|
|
19121
18692
|
const normalized = normalizeMarkdown(content);
|
|
19122
18693
|
if (!normalizeText(normalized)) {
|
|
@@ -19139,56 +18710,10 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
19139
18710
|
noindex: false,
|
|
19140
18711
|
tags: normalizeUrlPath(url).split("/").filter(Boolean).slice(0, 1),
|
|
19141
18712
|
description: fmDescription,
|
|
19142
|
-
keywords: fmKeywords
|
|
18713
|
+
keywords: fmKeywords,
|
|
18714
|
+
weight: mdWeight
|
|
19143
18715
|
};
|
|
19144
18716
|
}
|
|
19145
|
-
function yamlString(value) {
|
|
19146
|
-
return JSON.stringify(value);
|
|
19147
|
-
}
|
|
19148
|
-
function yamlArray(values) {
|
|
19149
|
-
return `[${values.map((v) => JSON.stringify(v)).join(", ")}]`;
|
|
19150
|
-
}
|
|
19151
|
-
function buildMirrorMarkdown(page) {
|
|
19152
|
-
const frontmatterLines = [
|
|
19153
|
-
"---",
|
|
19154
|
-
`url: ${yamlString(page.url)}`,
|
|
19155
|
-
`title: ${yamlString(page.title)}`,
|
|
19156
|
-
`scope: ${yamlString(page.scope)}`,
|
|
19157
|
-
`routeFile: ${yamlString(page.routeFile)}`,
|
|
19158
|
-
`routeResolution: ${yamlString(page.routeResolution)}`,
|
|
19159
|
-
`generatedAt: ${yamlString(page.generatedAt)}`,
|
|
19160
|
-
`incomingLinks: ${page.incomingLinks}`,
|
|
19161
|
-
`outgoingLinks: ${page.outgoingLinks}`,
|
|
19162
|
-
`depth: ${page.depth}`,
|
|
19163
|
-
`tags: ${yamlArray(page.tags)}`,
|
|
19164
|
-
"---",
|
|
19165
|
-
""
|
|
19166
|
-
];
|
|
19167
|
-
return `${frontmatterLines.join("\n")}${normalizeMarkdown(page.markdown)}`;
|
|
19168
|
-
}
|
|
19169
|
-
function stripGeneratedAt(content) {
|
|
19170
|
-
return content.replace(/^generatedAt: .*$/m, "");
|
|
19171
|
-
}
|
|
19172
|
-
async function writeMirrorPage(statePath, scope, page) {
|
|
19173
|
-
const relative = urlPathToMirrorRelative(page.url);
|
|
19174
|
-
const outputPath = path__default.default.join(statePath, "pages", scope.scopeName, relative);
|
|
19175
|
-
await fs4__default.default.mkdir(path__default.default.dirname(outputPath), { recursive: true });
|
|
19176
|
-
const newContent = buildMirrorMarkdown(page);
|
|
19177
|
-
try {
|
|
19178
|
-
const existing = await fs4__default.default.readFile(outputPath, "utf8");
|
|
19179
|
-
if (stripGeneratedAt(existing) === stripGeneratedAt(newContent)) {
|
|
19180
|
-
return outputPath;
|
|
19181
|
-
}
|
|
19182
|
-
} catch {
|
|
19183
|
-
}
|
|
19184
|
-
await fs4__default.default.writeFile(outputPath, newContent, "utf8");
|
|
19185
|
-
return outputPath;
|
|
19186
|
-
}
|
|
19187
|
-
async function cleanMirrorForScope(statePath, scope) {
|
|
19188
|
-
const target = path__default.default.join(statePath, "pages", scope.scopeName);
|
|
19189
|
-
await fs4__default.default.rm(target, { recursive: true, force: true });
|
|
19190
|
-
await fs4__default.default.mkdir(target, { recursive: true });
|
|
19191
|
-
}
|
|
19192
18717
|
function segmentToRegex(segment) {
|
|
19193
18718
|
if (segment.startsWith("(") && segment.endsWith(")")) {
|
|
19194
18719
|
return { regex: "", score: 0 };
|
|
@@ -19335,6 +18860,38 @@ var Logger = class {
|
|
|
19335
18860
|
`);
|
|
19336
18861
|
}
|
|
19337
18862
|
};
|
|
18863
|
+
|
|
18864
|
+
// src/utils/pattern.ts
|
|
18865
|
+
/**
 * Test a URL path against a single pattern.
 *
 * Supported pattern forms (a trailing slash on either argument is ignored,
 * except for the root "/"):
 *   - "<prefix>/**"  matches the prefix itself and anything below it
 *   - "<prefix>/*"   matches exactly one path segment below the prefix
 *   - anything else  must equal the URL exactly
 *
 * @param {string} url URL path to test.
 * @param {string} pattern Pattern to test against.
 * @returns {boolean} Whether the URL matches the pattern.
 */
function matchUrlPattern(url, pattern) {
  const stripTrailingSlash = (value) => (value === "/" || !value.endsWith("/") ? value : value.slice(0, -1));
  const target = stripTrailingSlash(url);
  const glob = stripTrailingSlash(pattern);

  if (glob.endsWith("/**")) {
    const base = glob.slice(0, -3);
    // A bare "/**" matches every URL.
    return base === "" || target === base || target.startsWith(base + "/");
  }

  if (glob.endsWith("/*")) {
    const base = glob.slice(0, -2);
    if (base === "") {
      // "/*": any single top-level segment, but not the root itself.
      return target !== "/" && !target.slice(1).includes("/");
    }
    if (!target.startsWith(base + "/")) {
      return false;
    }
    const remainder = target.slice(base.length + 1);
    return remainder.length > 0 && !remainder.includes("/");
  }

  return target === glob;
}
|
|
18887
|
+
/**
 * Report whether `url` matches at least one of `patterns`.
 * Stops at the first matching pattern.
 *
 * @param {string} url URL path to test.
 * @param {Iterable<string>} patterns Patterns understood by matchUrlPattern.
 * @returns {boolean} True when any pattern matches, false otherwise (including for no patterns).
 */
function matchUrlPatterns(url, patterns) {
  let matched = false;
  for (const candidate of patterns) {
    if (matchUrlPattern(url, candidate)) {
      matched = true;
      break;
    }
  }
  return matched;
}
|
|
18893
|
+
|
|
18894
|
+
// src/indexing/sources/build/manifest-parser.ts
|
|
19338
18895
|
function routeIdToFile(routeId) {
|
|
19339
18896
|
if (routeId === "/") {
|
|
19340
18897
|
return "src/routes/+page.svelte";
|
|
@@ -19349,7 +18906,7 @@ async function parseManifest(cwd, outputDir) {
|
|
|
19349
18906
|
const manifestPath = path__default.default.resolve(cwd, outputDir, "server", "manifest-full.js");
|
|
19350
18907
|
let content;
|
|
19351
18908
|
try {
|
|
19352
|
-
content = await
|
|
18909
|
+
content = await fs3__default.default.readFile(manifestPath, "utf8");
|
|
19353
18910
|
} catch {
|
|
19354
18911
|
throw new SearchSocketError(
|
|
19355
18912
|
"BUILD_MANIFEST_NOT_FOUND",
|
|
@@ -19408,15 +18965,7 @@ function expandDynamicUrl(url, value) {
|
|
|
19408
18965
|
return url.replace(/\[\[?\.\.\.[^\]]+\]?\]|\[\[[^\]]+\]\]|\[[^\]]+\]/g, value);
|
|
19409
18966
|
}
|
|
19410
18967
|
// Returns whether `url` matches any of the exclude `patterns`.
// Thin wrapper that delegates to matchUrlPatterns.
function isExcluded(url, patterns) {
  return matchUrlPatterns(url, patterns);
}
|
|
19421
18970
|
function findFreePort() {
|
|
19422
18971
|
return new Promise((resolve, reject) => {
|
|
@@ -19530,7 +19079,7 @@ async function discoverPages(server, buildConfig, pipelineMaxPages) {
|
|
|
19530
19079
|
const visited = /* @__PURE__ */ new Set();
|
|
19531
19080
|
const pages = [];
|
|
19532
19081
|
const queue = [];
|
|
19533
|
-
const limit =
|
|
19082
|
+
const limit = pLimit__default.default(8);
|
|
19534
19083
|
for (const seed of seedUrls) {
|
|
19535
19084
|
const normalized = normalizeUrlPath(seed);
|
|
19536
19085
|
if (!visited.has(normalized) && !isExcluded(normalized, exclude)) {
|
|
@@ -19612,7 +19161,7 @@ async function loadBuildPages(cwd, config, maxPages) {
|
|
|
19612
19161
|
const selected = typeof maxCount === "number" ? expanded.slice(0, maxCount) : expanded;
|
|
19613
19162
|
const server = await startPreviewServer(cwd, { previewTimeout: buildConfig.previewTimeout }, logger);
|
|
19614
19163
|
try {
|
|
19615
|
-
const concurrencyLimit =
|
|
19164
|
+
const concurrencyLimit = pLimit__default.default(8);
|
|
19616
19165
|
const results = await Promise.allSettled(
|
|
19617
19166
|
selected.map(
|
|
19618
19167
|
(route) => concurrencyLimit(async () => {
|
|
@@ -19686,7 +19235,7 @@ async function loadContentFilesPages(cwd, config, maxPages) {
|
|
|
19686
19235
|
const selected = typeof limit === "number" ? files.slice(0, limit) : files;
|
|
19687
19236
|
const pages = [];
|
|
19688
19237
|
for (const filePath of selected) {
|
|
19689
|
-
const raw = await
|
|
19238
|
+
const raw = await fs3__default.default.readFile(filePath, "utf8");
|
|
19690
19239
|
const markdown = filePath.endsWith(".md") ? raw : normalizeSvelteToMarkdown(raw);
|
|
19691
19240
|
pages.push({
|
|
19692
19241
|
url: filePathToUrl(filePath, baseDir),
|
|
@@ -19781,7 +19330,7 @@ async function loadCrawledPages(config, maxPages) {
|
|
|
19781
19330
|
const routes = await resolveRoutes(config);
|
|
19782
19331
|
const maxCount = typeof maxPages === "number" ? Math.max(0, Math.floor(maxPages)) : void 0;
|
|
19783
19332
|
const selected = typeof maxCount === "number" ? routes.slice(0, maxCount) : routes;
|
|
19784
|
-
const concurrencyLimit =
|
|
19333
|
+
const concurrencyLimit = pLimit__default.default(8);
|
|
19785
19334
|
const results = await Promise.allSettled(
|
|
19786
19335
|
selected.map(
|
|
19787
19336
|
(route) => concurrencyLimit(async () => {
|
|
@@ -19822,7 +19371,7 @@ async function loadStaticOutputPages(cwd, config, maxPages) {
|
|
|
19822
19371
|
const selected = typeof limit === "number" ? htmlFiles.slice(0, limit) : htmlFiles;
|
|
19823
19372
|
const pages = [];
|
|
19824
19373
|
for (const filePath of selected) {
|
|
19825
|
-
const html = await
|
|
19374
|
+
const html = await fs3__default.default.readFile(filePath, "utf8");
|
|
19826
19375
|
pages.push({
|
|
19827
19376
|
url: staticHtmlFileToUrl(filePath, outputDir),
|
|
19828
19377
|
html,
|
|
@@ -19832,43 +19381,293 @@ async function loadStaticOutputPages(cwd, config, maxPages) {
|
|
|
19832
19381
|
}
|
|
19833
19382
|
return pages;
|
|
19834
19383
|
}
|
|
19835
|
-
|
|
19836
|
-
|
|
19837
|
-
|
|
19838
|
-
|
|
19839
|
-
|
|
19840
|
-
|
|
19841
|
-
|
|
19384
|
+
/**
 * Parse robots.txt content and return the rule set that applies to `userAgent`.
 *
 * Grouping rules:
 *   - Consecutive `User-agent` lines share the rule lines that follow them.
 *   - A `User-agent` line that appears after at least one `Allow`/`Disallow`
 *     line starts a NEW group, so rules of a later group never leak into the
 *     agents of an earlier one.
 *   - Directives other than `User-agent`, `Allow` and `Disallow` (for example
 *     `Sitemap` or `Crawl-delay`) are ignored and do not interrupt a group.
 *   - `Allow`/`Disallow` lines with an empty value add no rule, but still
 *     count as rule lines for grouping purposes.
 *
 * The rules for the exact (case-insensitive) `userAgent` name are returned when
 * that agent has at least one rule; otherwise the `*` group is used, and an
 * empty rule set is returned when neither exists.
 *
 * @param {string} content Raw robots.txt text.
 * @param {string} [userAgent="Searchsocket"] Agent name to resolve rules for.
 * @returns {{ disallow: string[], allow: string[] }} Path prefixes for the agent.
 */
function parseRobotsTxt(content, userAgent = "Searchsocket") {
  const agentGroups = /* @__PURE__ */ new Map();
  let currentAgents = [];
  // True once the current group has seen an Allow/Disallow line; the next
  // User-agent line then opens a fresh group instead of extending this one.
  let groupHasRules = false;

  for (const rawLine of content.split(/\r?\n/)) {
    const line = rawLine.replace(/#.*$/, "").trim();
    if (!line) continue;
    const colonIdx = line.indexOf(":");
    if (colonIdx === -1) continue;
    const directive = line.slice(0, colonIdx).trim().toLowerCase();
    const value = line.slice(colonIdx + 1).trim();

    if (directive === "user-agent") {
      if (groupHasRules) {
        currentAgents = [];
        groupHasRules = false;
      }
      const agentName = value.toLowerCase();
      currentAgents.push(agentName);
      if (!agentGroups.has(agentName)) {
        agentGroups.set(agentName, { disallow: [], allow: [] });
      }
    } else if (directive === "disallow" || directive === "allow") {
      groupHasRules = true;
      if (value) {
        for (const agent of currentAgents) {
          agentGroups.get(agent)[directive].push(value);
        }
      }
    }
    // Any other directive is ignored without ending the current group.
  }

  const specific = agentGroups.get(userAgent.toLowerCase());
  if (specific && (specific.disallow.length > 0 || specific.allow.length > 0)) {
    return specific;
  }
  return agentGroups.get("*") ?? { disallow: [], allow: [] };
}
|
|
19419
|
+
/**
 * Decide whether `urlPath` is blocked by a parsed robots.txt rule set.
 *
 * Rules are plain path prefixes. The longest matching Disallow prefix is
 * compared with the longest matching Allow prefix; the path is blocked only
 * when the Disallow prefix is strictly longer (a tie favours Allow).
 *
 * @param {string} urlPath URL path to check.
 * @param {{ disallow: string[], allow: string[] }} rules3 Parsed rule set.
 * @returns {boolean} True when the path is blocked.
 */
function isBlockedByRobots(urlPath, rules3) {
  // Length of the longest prefix in `prefixes` that `urlPath` starts with (0 when none).
  const longestMatchLength = (prefixes) => {
    let best = 0;
    for (const prefix of prefixes) {
      if (prefix.length > best && urlPath.startsWith(prefix)) {
        best = prefix.length;
      }
    }
    return best;
  };

  const disallowLength = longestMatchLength(rules3.disallow);
  if (disallowLength === 0) {
    return false;
  }
  return longestMatchLength(rules3.allow) < disallowLength;
}
|
|
19435
|
+
/**
 * Read `<dir>/robots.txt` and parse it with parseRobotsTxt.
 *
 * @param {string} dir Directory expected to contain robots.txt.
 * @returns {Promise<object|null>} Parsed rules, or null when the file cannot be read or parsed.
 */
async function loadRobotsTxtFromDir(dir) {
  let rules = null;
  try {
    const robotsPath = path__default.default.join(dir, "robots.txt");
    const raw = await fs3__default.default.readFile(robotsPath, "utf8");
    rules = parseRobotsTxt(raw);
  } catch {
    // Any failure is treated as "no robots.txt available".
    rules = null;
  }
  return rules;
}
|
|
19443
|
+
/**
 * Fetch `/robots.txt` relative to `baseUrl` and parse it with parseRobotsTxt.
 *
 * @param {string} baseUrl Base URL of the site.
 * @returns {Promise<object|null>} Parsed rules, or null when the URL is invalid,
 *   the request fails, or the response status is not OK.
 */
async function fetchRobotsTxt(baseUrl) {
  try {
    const robotsUrl = new URL("/robots.txt", baseUrl);
    const response = await fetch(robotsUrl.href);
    return response.ok ? parseRobotsTxt(await response.text()) : null;
  } catch {
    return null;
  }
}
|
|
19454
|
+
|
|
19455
|
+
// src/search/ranking.ts
|
|
19456
|
+
/**
 * Clamp a value to a non-negative finite number.
 *
 * @param {*} value Candidate number.
 * @returns {number} `value` when it is a finite number greater than zero, otherwise 0.
 */
function nonNegativeOrZero(value) {
  return Number.isFinite(value) && value > 0 ? value : 0;
}
|
|
19462
|
+
/**
 * Normalize text for title/query comparison: lower-case it, drop every
 * character that is not a-z, 0-9 or whitespace, collapse whitespace runs to a
 * single space, and trim the ends.
 *
 * @param {string} text Text to normalize.
 * @returns {string} Normalized text.
 */
function normalizeForTitleMatch(text) {
  const lowered = text.toLowerCase();
  const alphanumericOnly = lowered.replace(/[^a-z0-9\s]/g, "");
  const singleSpaced = alphanumericOnly.replace(/\s+/g, " ");
  return singleSpaced.trim();
}
|
|
19465
|
+
/**
 * Score each hit and return the hits sorted by descending final score.
 *
 * Starting from the hit's own score (non-finite scores count as -Infinity),
 * the following optional boosts from `config.ranking` are added:
 *   - incoming links: log(1 + incomingLinks) * weights.incomingLinks
 *   - depth:          1 / (1 + depth) * weights.depth
 *   - title match:    weights.titleMatch when the normalized query contains
 *                     the normalized title or vice versa
 *
 * @param {Array} hits Hits with `score` and `metadata`.
 * @param {object} config Configuration providing `ranking`.
 * @param {string} [query] Search query used for the title-match boost.
 * @returns {Array<{hit: object, finalScore: number}>} Ranked hits, best first.
 */
function rankHits(hits, config, query) {
  const normalizedQuery = query ? normalizeForTitleMatch(query) : "";
  const titleMatchWeight = config.ranking.weights.titleMatch;
  const titleBoostActive = Boolean(normalizedQuery) && titleMatchWeight > 0;

  const scoreOf = (hit) => {
    const { ranking } = config;
    let total = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
    if (ranking.enableIncomingLinkBoost) {
      const linkBoost = Math.log(1 + nonNegativeOrZero(hit.metadata.incomingLinks));
      total += linkBoost * ranking.weights.incomingLinks;
    }
    if (ranking.enableDepthBoost) {
      const shallownessBoost = 1 / (1 + nonNegativeOrZero(hit.metadata.depth));
      total += shallownessBoost * ranking.weights.depth;
    }
    if (titleBoostActive) {
      const normalizedTitle = normalizeForTitleMatch(hit.metadata.title);
      const overlaps = normalizedTitle.length > 0 && (normalizedTitle.includes(normalizedQuery) || normalizedQuery.includes(normalizedTitle));
      if (overlaps) {
        total += titleMatchWeight;
      }
    }
    return Number.isFinite(total) ? total : Number.NEGATIVE_INFINITY;
  };

  // NaN deltas (e.g. -Infinity minus -Infinity) are treated as ties.
  const byFinalScoreDesc = (left, right) => {
    const delta = right.finalScore - left.finalScore;
    return Number.isNaN(delta) ? 0 : delta;
  };

  return hits.map((hit) => ({ hit, finalScore: scoreOf(hit) })).sort(byFinalScoreDesc);
}
|
|
19493
|
+
/**
 * Trim a score-ordered result list using two optional cut-offs from
 * `config.ranking`:
 *   - minScore (> 0): when the median pageScore is below it, nothing is returned.
 *   - scoreGapThreshold (> 0): the list is cut before the first result whose
 *     relative drop from its predecessor, (prev - current) / prev, reaches the
 *     threshold; only positive predecessors are considered.
 *
 * @param {Array<{pageScore: number}>} results Results, expected best first.
 * @param {object} config Configuration providing `ranking`.
 * @returns {Array} The same array when nothing is trimmed, otherwise a shorter/empty array.
 */
function trimByScoreGap(results, config) {
  if (results.length === 0) {
    return results;
  }
  const { scoreGapThreshold, minScore } = config.ranking;

  if (minScore > 0) {
    const ascending = results.map((entry) => entry.pageScore).sort((a, b) => a - b);
    const half = Math.floor(ascending.length / 2);
    const hasEvenCount = ascending.length % 2 === 0;
    const median = hasEvenCount ? (ascending[half - 1] + ascending[half]) / 2 : ascending[half];
    if (median < minScore) {
      return [];
    }
  }

  if (scoreGapThreshold > 0) {
    const cutAt = results.findIndex((entry, position) => {
      if (position === 0) {
        return false;
      }
      const previous = results[position - 1].pageScore;
      return previous > 0 && (previous - entry.pageScore) / previous >= scoreGapThreshold;
    });
    if (cutAt !== -1) {
      return results.slice(0, cutAt);
    }
  }

  return results;
}
|
|
19517
|
+
/**
 * Look up the weight configured for `url`.
 *
 * Among all patterns in `pageWeights` that match the URL (per matchUrlPattern),
 * the longest pattern wins; on equal length the earlier entry is kept.
 *
 * @param {string} url URL path of the page.
 * @param {Object<string, number>} pageWeights Map of pattern -> weight.
 * @returns {number} The winning weight, or 1 when no pattern matches.
 */
function findPageWeight(url, pageWeights) {
  const winner = Object.entries(pageWeights).reduce(
    (best, [pattern, weight]) => matchUrlPattern(url, pattern) && pattern.length > best.pattern.length ? { pattern, weight } : best,
    { pattern: "", weight: 1 }
  );
  return winner.weight;
}
|
|
19528
|
+
/**
 * Collapse ranked chunk hits into one result per page URL.
 *
 * For each page the chunks are sorted by descending finalScore. The page score
 * is the best chunk's score plus a decayed bonus from the next chunks (up to
 * `aggregationCap` chunks in total, chunk i contributing
 * score * aggregationDecay^i, scaled by `weights.aggregation`), multiplied by
 * the page weight from `pageWeights`. Pages whose weight is exactly 0 are dropped.
 *
 * @param {Array<{hit: object, finalScore: number}>} ranked Ranked chunk hits.
 * @param {object} config Configuration providing `ranking`.
 * @returns {Array} Page results sorted by descending pageScore.
 */
function aggregateByPage(ranked, config) {
  // NaN deltas are treated as ties by both sorts below.
  const descendingBy = (key) => (left, right) => {
    const delta = right[key] - left[key];
    return Number.isNaN(delta) ? 0 : delta;
  };

  const chunksByUrl = /* @__PURE__ */ new Map();
  for (const entry of ranked) {
    const pageUrl = entry.hit.metadata.url;
    const bucket = chunksByUrl.get(pageUrl);
    if (bucket) {
      bucket.push(entry);
    } else {
      chunksByUrl.set(pageUrl, [entry]);
    }
  }

  const { aggregationCap, aggregationDecay } = config.ranking;
  const pages = [];
  for (const [url, chunks] of chunksByUrl) {
    chunks.sort(descendingBy("finalScore"));
    const best = chunks[0];
    const maxScore = Number.isFinite(best.finalScore) ? best.finalScore : Number.NEGATIVE_INFINITY;

    // The best chunk (position 0) is the base score; only the following ones add a bonus.
    const aggregationBonus = chunks.slice(0, aggregationCap).reduce((sum, chunk, position) => {
      if (position === 0) {
        return sum;
      }
      const chunkScore = Number.isFinite(chunk.finalScore) ? chunk.finalScore : 0;
      return sum + chunkScore * Math.pow(aggregationDecay, position);
    }, 0);

    const pageWeight = findPageWeight(url, config.ranking.pageWeights);
    if (pageWeight === 0) {
      continue;
    }
    const unweighted = maxScore + aggregationBonus * config.ranking.weights.aggregation;
    const pageScore = pageWeight === 1 ? unweighted : unweighted * pageWeight;

    pages.push({
      url,
      title: best.hit.metadata.title,
      routeFile: best.hit.metadata.routeFile,
      pageScore: Number.isFinite(pageScore) ? pageScore : Number.NEGATIVE_INFINITY,
      bestChunk: best,
      matchingChunks: chunks
    });
  }

  return pages.sort(descendingBy("pageScore"));
}
|
|
19571
|
+
/**
 * Blend page-level search hits into the ranked chunk list.
 *
 * With w = config.search.pageSearchWeight:
 *   - a chunk whose URL also appears in `pageHits` gets
 *     (1 - w) * chunkScore + w * pageScore (the chunk score is kept when the
 *     blend is not finite);
 *   - a page hit without any chunk becomes a synthetic chunk scored
 *     pageScore * w (0 when not finite), using the page description (or title)
 *     as snippet and chunk text.
 *
 * @param {Array} pageHits Page-level hits.
 * @param {Array<{hit: object, finalScore: number}>} rankedChunks Ranked chunk hits.
 * @param {object} config Configuration providing `search.pageSearchWeight`.
 * @returns {Array<{hit: object, finalScore: number}>} `rankedChunks` itself when
 *   there are no page hits, otherwise a new list sorted by descending finalScore.
 */
function mergePageAndChunkResults(pageHits, rankedChunks, config) {
  if (pageHits.length === 0) {
    return rankedChunks;
  }
  const pageWeight = config.search.pageSearchWeight;
  // For duplicate URLs the last page hit wins.
  const pageHitByUrl = new Map(pageHits.map((pageHit) => [pageHit.url, pageHit]));
  const coveredUrls = /* @__PURE__ */ new Set();

  const merged = rankedChunks.map((ranked) => {
    const chunkUrl = ranked.hit.metadata.url;
    const pageHit = pageHitByUrl.get(chunkUrl);
    if (!pageHit) {
      return ranked;
    }
    coveredUrls.add(chunkUrl);
    const blended = (1 - pageWeight) * ranked.finalScore + pageWeight * pageHit.score;
    return {
      hit: ranked.hit,
      finalScore: Number.isFinite(blended) ? blended : ranked.finalScore
    };
  });

  for (const [url, pageHit] of pageHitByUrl) {
    if (coveredUrls.has(url)) {
      continue;
    }
    const fallbackText = pageHit.description || pageHit.title;
    const syntheticScore = pageHit.score * pageWeight;
    merged.push({
      hit: {
        id: `page:${url}`,
        score: pageHit.score,
        metadata: {
          projectId: "",
          scopeName: "",
          url: pageHit.url,
          path: pageHit.url,
          title: pageHit.title,
          sectionTitle: "",
          headingPath: [],
          snippet: fallbackText,
          chunkText: fallbackText,
          ordinal: 0,
          contentHash: "",
          depth: pageHit.depth,
          incomingLinks: pageHit.incomingLinks,
          routeFile: pageHit.routeFile,
          tags: pageHit.tags
        }
      },
      finalScore: Number.isFinite(syntheticScore) ? syntheticScore : 0
    });
  }

  // NaN deltas are treated as ties.
  return merged.sort((left, right) => {
    const delta = right.finalScore - left.finalScore;
    return Number.isNaN(delta) ? 0 : delta;
  });
}
|
|
19626
|
+
|
|
19627
|
+
// src/utils/time.ts
|
|
19628
|
+
/**
 * Current time as an ISO 8601 string (as produced by Date#toISOString).
 *
 * @returns {string} ISO timestamp of "now".
 */
function nowIso() {
  const now = /* @__PURE__ */ new Date();
  return now.toISOString();
}
|
|
19631
|
+
/**
 * Milliseconds elapsed since `start`.
 *
 * @param {bigint} start A reading previously taken from process.hrtime.bigint() (nanoseconds).
 * @returns {number} Elapsed time in milliseconds (fractional).
 */
function hrTimeMs(start) {
  const elapsedNs = process.hrtime.bigint() - start;
  return Number(elapsedNs) / 1e6;
}
|
|
19634
|
+
|
|
19635
|
+
// src/indexing/pipeline.ts
|
|
19636
|
+
/**
 * Build a plain-text summary of a page: title, description, keywords and a
 * markdown-stripped version of the body, joined by blank lines and cut to at
 * most `maxChars` characters.
 *
 * @param {{title: string, description?: string, keywords?: string[], markdown: string}} page Page to summarize.
 * @param {number} [maxChars=3500] Maximum length of the summary.
 * @returns {string} The summary text.
 */
function buildPageSummary(page, maxChars = 3500) {
  const { title, description, keywords, markdown } = page;

  // Markdown -> plain text, applied in this exact order.
  const withoutFences = markdown.replace(/```[\s\S]*?```/g, " ");
  const withoutInlineCode = withoutFences.replace(/`([^`]+)`/g, "$1");
  const withoutLinks = withoutInlineCode.replace(/!?\[([^\]]*)\]\([^)]*\)/g, "$1");
  const withoutHeadings = withoutLinks.replace(/^#{1,6}\s+/gm, "");
  const withoutMarkup = withoutHeadings.replace(/[>*_|~\-]/g, " ");
  const plainBody = withoutMarkup.replace(/\s+/g, " ").trim();

  const sections = [title];
  if (description) {
    sections.push(description);
  }
  if (keywords && keywords.length > 0) {
    sections.push(keywords.join(", "));
  }
  if (plainBody) {
    sections.push(plainBody);
  }

  const summary = sections.join("\n\n");
  return summary.length <= maxChars ? summary : summary.slice(0, maxChars).trim();
}
|
|
19843
|
-
|
|
19844
|
-
// src/indexing/pipeline.ts
|
|
19845
|
-
var EMBEDDING_PRICE_PER_1K_TOKENS_USD = {
|
|
19846
|
-
"jina-embeddings-v3": 2e-5
|
|
19847
|
-
};
|
|
19848
|
-
var DEFAULT_EMBEDDING_PRICE_PER_1K = 2e-5;
|
|
19849
19652
|
var IndexPipeline = class _IndexPipeline {
|
|
19850
19653
|
cwd;
|
|
19851
19654
|
config;
|
|
19852
|
-
|
|
19853
|
-
vectorStore;
|
|
19655
|
+
store;
|
|
19854
19656
|
logger;
|
|
19855
19657
|
constructor(options) {
|
|
19856
19658
|
this.cwd = options.cwd;
|
|
19857
19659
|
this.config = options.config;
|
|
19858
|
-
this.
|
|
19859
|
-
this.vectorStore = options.vectorStore;
|
|
19660
|
+
this.store = options.store;
|
|
19860
19661
|
this.logger = options.logger;
|
|
19861
19662
|
}
|
|
19862
19663
|
static async create(options = {}) {
|
|
19863
19664
|
const cwd = path__default.default.resolve(options.cwd ?? process.cwd());
|
|
19864
19665
|
const config = options.config ?? await loadConfig({ cwd, configPath: options.configPath });
|
|
19865
|
-
const
|
|
19866
|
-
const vectorStore = options.vectorStore ?? await createVectorStore(config, cwd);
|
|
19666
|
+
const store = options.store ?? await createUpstashStore(config);
|
|
19867
19667
|
return new _IndexPipeline({
|
|
19868
19668
|
cwd,
|
|
19869
19669
|
config,
|
|
19870
|
-
|
|
19871
|
-
vectorStore,
|
|
19670
|
+
store,
|
|
19872
19671
|
logger: options.logger ?? new Logger()
|
|
19873
19672
|
});
|
|
19874
19673
|
}
|
|
@@ -19888,25 +19687,17 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19888
19687
|
stageTimingsMs[name] = Math.round(hrTimeMs(start));
|
|
19889
19688
|
};
|
|
19890
19689
|
const scope = resolveScope(this.config, options.scopeOverride);
|
|
19891
|
-
|
|
19690
|
+
ensureStateDirs(this.cwd, this.config.state.dir);
|
|
19892
19691
|
const sourceMode = options.sourceOverride ?? this.config.source.mode;
|
|
19893
|
-
this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode},
|
|
19692
|
+
this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, backend: upstash-search)`);
|
|
19894
19693
|
if (options.force) {
|
|
19895
19694
|
this.logger.info("Force mode enabled \u2014 full rebuild");
|
|
19896
|
-
await cleanMirrorForScope(statePath, scope);
|
|
19897
19695
|
}
|
|
19898
19696
|
if (options.dryRun) {
|
|
19899
19697
|
this.logger.info("Dry run \u2014 no writes will be performed");
|
|
19900
19698
|
}
|
|
19901
19699
|
const manifestStart = stageStart();
|
|
19902
|
-
const existingHashes = await this.
|
|
19903
|
-
const existingModelId = await this.vectorStore.getScopeModelId(scope);
|
|
19904
|
-
if (existingModelId && existingModelId !== this.config.embeddings.model && !options.force) {
|
|
19905
|
-
throw new SearchSocketError(
|
|
19906
|
-
"EMBEDDING_MODEL_MISMATCH",
|
|
19907
|
-
`Scope ${scope.scopeName} uses model ${existingModelId}. Re-run with --force to migrate.`
|
|
19908
|
-
);
|
|
19909
|
-
}
|
|
19700
|
+
const existingHashes = options.force ? /* @__PURE__ */ new Map() : await this.store.getContentHashes(scope);
|
|
19910
19701
|
stageEnd("manifest", manifestStart);
|
|
19911
19702
|
this.logger.debug(`Manifest: ${existingHashes.size} existing chunk hashes loaded`);
|
|
19912
19703
|
const sourceStart = stageStart();
|
|
@@ -19923,6 +19714,53 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19923
19714
|
}
|
|
19924
19715
|
stageEnd("source", sourceStart);
|
|
19925
19716
|
this.logger.info(`Loaded ${sourcePages.length} page${sourcePages.length === 1 ? "" : "s"} (${stageTimingsMs["source"]}ms)`);
|
|
19717
|
+
const filterStart = stageStart();
|
|
19718
|
+
let filteredSourcePages = sourcePages;
|
|
19719
|
+
if (this.config.exclude.length > 0) {
|
|
19720
|
+
const beforeExclude = filteredSourcePages.length;
|
|
19721
|
+
filteredSourcePages = filteredSourcePages.filter((p) => {
|
|
19722
|
+
const url = normalizeUrlPath(p.url);
|
|
19723
|
+
if (matchUrlPatterns(url, this.config.exclude)) {
|
|
19724
|
+
this.logger.debug(`Excluding ${url} (matched exclude pattern)`);
|
|
19725
|
+
return false;
|
|
19726
|
+
}
|
|
19727
|
+
return true;
|
|
19728
|
+
});
|
|
19729
|
+
const excludedCount = beforeExclude - filteredSourcePages.length;
|
|
19730
|
+
if (excludedCount > 0) {
|
|
19731
|
+
this.logger.info(`Excluded ${excludedCount} page${excludedCount === 1 ? "" : "s"} by config exclude patterns`);
|
|
19732
|
+
}
|
|
19733
|
+
}
|
|
19734
|
+
if (this.config.respectRobotsTxt) {
|
|
19735
|
+
let robotsRules = null;
|
|
19736
|
+
if (sourceMode === "static-output") {
|
|
19737
|
+
robotsRules = await loadRobotsTxtFromDir(
|
|
19738
|
+
path__default.default.resolve(this.cwd, this.config.source.staticOutputDir)
|
|
19739
|
+
);
|
|
19740
|
+
} else if (sourceMode === "build" && this.config.source.build) {
|
|
19741
|
+
robotsRules = await loadRobotsTxtFromDir(
|
|
19742
|
+
path__default.default.resolve(this.cwd, this.config.source.build.outputDir)
|
|
19743
|
+
);
|
|
19744
|
+
} else if (sourceMode === "crawl" && this.config.source.crawl) {
|
|
19745
|
+
robotsRules = await fetchRobotsTxt(this.config.source.crawl.baseUrl);
|
|
19746
|
+
}
|
|
19747
|
+
if (robotsRules) {
|
|
19748
|
+
const beforeRobots = filteredSourcePages.length;
|
|
19749
|
+
filteredSourcePages = filteredSourcePages.filter((p) => {
|
|
19750
|
+
const url = normalizeUrlPath(p.url);
|
|
19751
|
+
if (isBlockedByRobots(url, robotsRules)) {
|
|
19752
|
+
this.logger.debug(`Excluding ${url} (blocked by robots.txt)`);
|
|
19753
|
+
return false;
|
|
19754
|
+
}
|
|
19755
|
+
return true;
|
|
19756
|
+
});
|
|
19757
|
+
const robotsExcluded = beforeRobots - filteredSourcePages.length;
|
|
19758
|
+
if (robotsExcluded > 0) {
|
|
19759
|
+
this.logger.info(`Excluded ${robotsExcluded} page${robotsExcluded === 1 ? "" : "s"} by robots.txt`);
|
|
19760
|
+
}
|
|
19761
|
+
}
|
|
19762
|
+
}
|
|
19763
|
+
stageEnd("filter", filterStart);
|
|
19926
19764
|
const routeStart = stageStart();
|
|
19927
19765
|
const routePatterns = await buildRoutePatterns(this.cwd);
|
|
19928
19766
|
stageEnd("route_map", routeStart);
|
|
@@ -19930,7 +19768,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19930
19768
|
const extractStart = stageStart();
|
|
19931
19769
|
this.logger.info("Extracting content...");
|
|
19932
19770
|
const extractedPages = [];
|
|
19933
|
-
for (const sourcePage of
|
|
19771
|
+
for (const sourcePage of filteredSourcePages) {
|
|
19934
19772
|
const extracted = sourcePage.html ? extractFromHtml(sourcePage.url, sourcePage.html, this.config) : extractFromMarkdown(sourcePage.url, sourcePage.markdown ?? "", sourcePage.title);
|
|
19935
19773
|
if (!extracted) {
|
|
19936
19774
|
this.logger.warn(
|
|
@@ -19956,16 +19794,29 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19956
19794
|
seenUrls.add(page.url);
|
|
19957
19795
|
uniquePages.push(page);
|
|
19958
19796
|
}
|
|
19797
|
+
const indexablePages = [];
|
|
19798
|
+
for (const page of uniquePages) {
|
|
19799
|
+
const effectiveWeight = page.weight ?? findPageWeight(page.url, this.config.ranking.pageWeights);
|
|
19800
|
+
if (effectiveWeight === 0) {
|
|
19801
|
+
this.logger.debug(`Excluding ${page.url} (zero weight)`);
|
|
19802
|
+
continue;
|
|
19803
|
+
}
|
|
19804
|
+
indexablePages.push(page);
|
|
19805
|
+
}
|
|
19806
|
+
const zeroWeightCount = uniquePages.length - indexablePages.length;
|
|
19807
|
+
if (zeroWeightCount > 0) {
|
|
19808
|
+
this.logger.info(`Excluded ${zeroWeightCount} page${zeroWeightCount === 1 ? "" : "s"} with zero weight`);
|
|
19809
|
+
}
|
|
19959
19810
|
stageEnd("extract", extractStart);
|
|
19960
|
-
const skippedPages =
|
|
19961
|
-
this.logger.info(`Extracted ${
|
|
19811
|
+
const skippedPages = filteredSourcePages.length - indexablePages.length;
|
|
19812
|
+
this.logger.info(`Extracted ${indexablePages.length} page${indexablePages.length === 1 ? "" : "s"}${skippedPages > 0 ? ` (${skippedPages} skipped)` : ""} (${stageTimingsMs["extract"]}ms)`);
|
|
19962
19813
|
const linkStart = stageStart();
|
|
19963
|
-
const pageSet = new Set(
|
|
19814
|
+
const pageSet = new Set(indexablePages.map((page) => normalizeUrlPath(page.url)));
|
|
19964
19815
|
const incomingLinkCount = /* @__PURE__ */ new Map();
|
|
19965
|
-
for (const page of
|
|
19816
|
+
for (const page of indexablePages) {
|
|
19966
19817
|
incomingLinkCount.set(page.url, incomingLinkCount.get(page.url) ?? 0);
|
|
19967
19818
|
}
|
|
19968
|
-
for (const page of
|
|
19819
|
+
for (const page of indexablePages) {
|
|
19969
19820
|
for (const outgoing of page.outgoingLinks) {
|
|
19970
19821
|
if (!pageSet.has(outgoing)) {
|
|
19971
19822
|
continue;
|
|
@@ -19975,9 +19826,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19975
19826
|
}
|
|
19976
19827
|
stageEnd("links", linkStart);
|
|
19977
19828
|
this.logger.debug(`Link analysis: computed incoming links for ${incomingLinkCount.size} pages (${stageTimingsMs["links"]}ms)`);
|
|
19978
|
-
const
|
|
19979
|
-
this.logger.info("
|
|
19980
|
-
const
|
|
19829
|
+
const pagesStart = stageStart();
|
|
19830
|
+
this.logger.info("Building indexed pages...");
|
|
19831
|
+
const pages = [];
|
|
19981
19832
|
let routeExact = 0;
|
|
19982
19833
|
let routeBestEffort = 0;
|
|
19983
19834
|
const precomputedRoutes = /* @__PURE__ */ new Map();
|
|
@@ -19989,7 +19840,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19989
19840
|
});
|
|
19990
19841
|
}
|
|
19991
19842
|
}
|
|
19992
|
-
for (const page of
|
|
19843
|
+
for (const page of indexablePages) {
|
|
19993
19844
|
const routeMatch = precomputedRoutes.get(normalizeUrlPath(page.url)) ?? mapUrlToRoute(page.url, routePatterns);
|
|
19994
19845
|
if (routeMatch.routeResolution === "best-effort") {
|
|
19995
19846
|
if (this.config.source.strictRouteMapping) {
|
|
@@ -20006,7 +19857,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20006
19857
|
} else {
|
|
20007
19858
|
routeExact += 1;
|
|
20008
19859
|
}
|
|
20009
|
-
const
|
|
19860
|
+
const indexedPage = {
|
|
20010
19861
|
url: page.url,
|
|
20011
19862
|
title: page.title,
|
|
20012
19863
|
scope: scope.scopeName,
|
|
@@ -20021,35 +19872,38 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20021
19872
|
description: page.description,
|
|
20022
19873
|
keywords: page.keywords
|
|
20023
19874
|
};
|
|
20024
|
-
|
|
20025
|
-
|
|
20026
|
-
await writeMirrorPage(statePath, scope, mirror);
|
|
20027
|
-
}
|
|
20028
|
-
this.logger.event("markdown_written", { url: page.url });
|
|
19875
|
+
pages.push(indexedPage);
|
|
19876
|
+
this.logger.event("page_indexed", { url: page.url });
|
|
20029
19877
|
}
|
|
20030
19878
|
if (!options.dryRun) {
|
|
20031
|
-
const pageRecords =
|
|
20032
|
-
|
|
20033
|
-
|
|
20034
|
-
|
|
20035
|
-
|
|
20036
|
-
|
|
20037
|
-
|
|
20038
|
-
|
|
20039
|
-
|
|
20040
|
-
|
|
20041
|
-
|
|
20042
|
-
|
|
20043
|
-
|
|
20044
|
-
|
|
20045
|
-
|
|
20046
|
-
|
|
19879
|
+
const pageRecords = pages.map((p) => {
|
|
19880
|
+
const summary = buildPageSummary(p);
|
|
19881
|
+
return {
|
|
19882
|
+
url: p.url,
|
|
19883
|
+
title: p.title,
|
|
19884
|
+
markdown: p.markdown,
|
|
19885
|
+
projectId: scope.projectId,
|
|
19886
|
+
scopeName: scope.scopeName,
|
|
19887
|
+
routeFile: p.routeFile,
|
|
19888
|
+
routeResolution: p.routeResolution,
|
|
19889
|
+
incomingLinks: p.incomingLinks,
|
|
19890
|
+
outgoingLinks: p.outgoingLinks,
|
|
19891
|
+
depth: p.depth,
|
|
19892
|
+
tags: p.tags,
|
|
19893
|
+
indexedAt: p.generatedAt,
|
|
19894
|
+
summary,
|
|
19895
|
+
description: p.description,
|
|
19896
|
+
keywords: p.keywords
|
|
19897
|
+
};
|
|
19898
|
+
});
|
|
19899
|
+
await this.store.deletePages(scope);
|
|
19900
|
+
await this.store.upsertPages(pageRecords, scope);
|
|
20047
19901
|
}
|
|
20048
|
-
stageEnd("
|
|
20049
|
-
this.logger.info(`
|
|
19902
|
+
stageEnd("pages", pagesStart);
|
|
19903
|
+
this.logger.info(`Indexed ${pages.length} page${pages.length === 1 ? "" : "s"} (${routeExact} exact, ${routeBestEffort} best-effort) (${stageTimingsMs["pages"]}ms)`);
|
|
20050
19904
|
const chunkStart = stageStart();
|
|
20051
19905
|
this.logger.info("Chunking pages...");
|
|
20052
|
-
let chunks =
|
|
19906
|
+
let chunks = pages.flatMap((page) => chunkPage(page, this.config, scope));
|
|
20053
19907
|
const maxChunks = typeof options.maxChunks === "number" ? Math.max(0, Math.floor(options.maxChunks)) : void 0;
|
|
20054
19908
|
if (typeof maxChunks === "number") {
|
|
20055
19909
|
chunks = chunks.slice(0, maxChunks);
|
|
@@ -20081,259 +19935,90 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20081
19935
|
});
|
|
20082
19936
|
const deletes = [...existingHashes.keys()].filter((chunkKey) => !currentChunkMap.has(chunkKey));
|
|
20083
19937
|
this.logger.info(`Changes detected: ${changedChunks.length} changed, ${deletes.length} deleted, ${chunks.length - changedChunks.length} unchanged`);
|
|
20084
|
-
const
|
|
20085
|
-
|
|
20086
|
-
for (const chunk of changedChunks) {
|
|
20087
|
-
chunkTokenEstimates.set(chunk.chunkKey, this.embeddings.estimateTokens(buildEmbeddingText(chunk, this.config.chunking.prependTitle)));
|
|
20088
|
-
}
|
|
20089
|
-
const estimatedTokens = changedChunks.reduce(
|
|
20090
|
-
(sum, chunk) => sum + (chunkTokenEstimates.get(chunk.chunkKey) ?? 0),
|
|
20091
|
-
0
|
|
20092
|
-
);
|
|
20093
|
-
const pricePer1k = this.config.embeddings.pricePer1kTokens ?? EMBEDDING_PRICE_PER_1K_TOKENS_USD[this.config.embeddings.model] ?? DEFAULT_EMBEDDING_PRICE_PER_1K;
|
|
20094
|
-
const estimatedCostUSD = estimatedTokens / 1e3 * pricePer1k;
|
|
20095
|
-
let newEmbeddings = 0;
|
|
20096
|
-
const vectorsByChunk = /* @__PURE__ */ new Map();
|
|
19938
|
+
const upsertStart = stageStart();
|
|
19939
|
+
let documentsUpserted = 0;
|
|
20097
19940
|
if (!options.dryRun && changedChunks.length > 0) {
|
|
20098
|
-
this.logger.info(`
|
|
20099
|
-
const
|
|
20100
|
-
|
|
20101
|
-
|
|
20102
|
-
|
|
20103
|
-
|
|
20104
|
-
|
|
20105
|
-
|
|
20106
|
-
|
|
20107
|
-
|
|
20108
|
-
);
|
|
20109
|
-
|
|
20110
|
-
for (let i = 0; i < changedChunks.length; i += 1) {
|
|
20111
|
-
const chunk = changedChunks[i];
|
|
20112
|
-
const embedding = embeddings[i];
|
|
20113
|
-
if (!chunk || !embedding || embedding.length === 0 || embedding.some((value) => !Number.isFinite(value))) {
|
|
20114
|
-
throw new SearchSocketError(
|
|
20115
|
-
"VECTOR_BACKEND_UNAVAILABLE",
|
|
20116
|
-
`Embedding provider returned an invalid vector for chunk index ${i}.`
|
|
20117
|
-
);
|
|
20118
|
-
}
|
|
20119
|
-
vectorsByChunk.set(chunk.chunkKey, embedding);
|
|
20120
|
-
newEmbeddings += 1;
|
|
20121
|
-
this.logger.event("embedded_new", { chunkKey: chunk.chunkKey });
|
|
20122
|
-
}
|
|
20123
|
-
}
|
|
20124
|
-
stageEnd("embedding", embedStart);
|
|
20125
|
-
if (changedChunks.length > 0) {
|
|
20126
|
-
this.logger.info(`Embedded ${newEmbeddings} chunk${newEmbeddings === 1 ? "" : "s"} (${stageTimingsMs["embedding"]}ms)`);
|
|
20127
|
-
} else {
|
|
20128
|
-
this.logger.info("No chunks to embed \u2014 all up to date");
|
|
20129
|
-
}
|
|
20130
|
-
const syncStart = stageStart();
|
|
20131
|
-
if (!options.dryRun) {
|
|
20132
|
-
this.logger.info("Syncing vectors...");
|
|
20133
|
-
const upserts = [];
|
|
20134
|
-
for (const chunk of changedChunks) {
|
|
20135
|
-
const vector = vectorsByChunk.get(chunk.chunkKey);
|
|
20136
|
-
if (!vector) {
|
|
20137
|
-
continue;
|
|
20138
|
-
}
|
|
20139
|
-
upserts.push({
|
|
19941
|
+
this.logger.info(`Upserting ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} to Upstash Search...`);
|
|
19942
|
+
const UPSTASH_CONTENT_LIMIT = 4096;
|
|
19943
|
+
const docs = changedChunks.map((chunk) => {
|
|
19944
|
+
const title = chunk.title;
|
|
19945
|
+
const sectionTitle = chunk.sectionTitle ?? "";
|
|
19946
|
+
const url = chunk.url;
|
|
19947
|
+
const tags = chunk.tags.join(",");
|
|
19948
|
+
const headingPath = chunk.headingPath.join(" > ");
|
|
19949
|
+
const otherFieldsLen = title.length + sectionTitle.length + url.length + tags.length + headingPath.length;
|
|
19950
|
+
const textBudget = Math.max(500, UPSTASH_CONTENT_LIMIT - otherFieldsLen - 50);
|
|
19951
|
+
const text = buildEmbeddingText(chunk, this.config.chunking.prependTitle).slice(0, textBudget);
|
|
19952
|
+
return {
|
|
20140
19953
|
id: chunk.chunkKey,
|
|
20141
|
-
|
|
19954
|
+
content: { title, sectionTitle, text, url, tags, headingPath },
|
|
20142
19955
|
metadata: {
|
|
20143
19956
|
projectId: scope.projectId,
|
|
20144
19957
|
scopeName: scope.scopeName,
|
|
20145
|
-
url: chunk.url,
|
|
20146
19958
|
path: chunk.path,
|
|
20147
|
-
title: chunk.title,
|
|
20148
|
-
sectionTitle: chunk.sectionTitle ?? "",
|
|
20149
|
-
headingPath: chunk.headingPath,
|
|
20150
19959
|
snippet: chunk.snippet,
|
|
20151
|
-
chunkText: chunk.chunkText.slice(0, 4e3),
|
|
20152
19960
|
ordinal: chunk.ordinal,
|
|
20153
19961
|
contentHash: chunk.contentHash,
|
|
20154
|
-
modelId: this.config.embeddings.model,
|
|
20155
19962
|
depth: chunk.depth,
|
|
20156
19963
|
incomingLinks: chunk.incomingLinks,
|
|
20157
19964
|
routeFile: chunk.routeFile,
|
|
20158
|
-
|
|
20159
|
-
|
|
20160
|
-
keywords: chunk.keywords
|
|
19965
|
+
description: chunk.description ?? "",
|
|
19966
|
+
keywords: (chunk.keywords ?? []).join(",")
|
|
20161
19967
|
}
|
|
20162
|
-
}
|
|
20163
|
-
}
|
|
20164
|
-
if (upserts.length > 0) {
|
|
20165
|
-
await this.vectorStore.upsert(upserts, scope);
|
|
20166
|
-
this.logger.event("upserted", { count: upserts.length });
|
|
20167
|
-
}
|
|
20168
|
-
if (deletes.length > 0) {
|
|
20169
|
-
await this.vectorStore.deleteByIds(deletes, scope);
|
|
20170
|
-
this.logger.event("deleted", { count: deletes.length });
|
|
20171
|
-
}
|
|
20172
|
-
}
|
|
20173
|
-
stageEnd("sync", syncStart);
|
|
20174
|
-
this.logger.debug(`Sync complete (${stageTimingsMs["sync"]}ms)`);
|
|
20175
|
-
const finalizeStart = stageStart();
|
|
20176
|
-
if (!options.dryRun) {
|
|
20177
|
-
const scopeInfo = {
|
|
20178
|
-
projectId: scope.projectId,
|
|
20179
|
-
scopeName: scope.scopeName,
|
|
20180
|
-
modelId: this.config.embeddings.model,
|
|
20181
|
-
lastIndexedAt: nowIso(),
|
|
20182
|
-
vectorCount: chunks.length,
|
|
20183
|
-
lastEstimateTokens: estimatedTokens,
|
|
20184
|
-
lastEstimateCostUSD: Number(estimatedCostUSD.toFixed(8)),
|
|
20185
|
-
lastEstimateChangedChunks: changedChunks.length
|
|
20186
|
-
};
|
|
20187
|
-
await this.vectorStore.recordScope(scopeInfo);
|
|
20188
|
-
this.logger.event("registry_updated", {
|
|
20189
|
-
scope: scope.scopeName,
|
|
20190
|
-
vectorCount: chunks.length
|
|
19968
|
+
};
|
|
20191
19969
|
});
|
|
19970
|
+
await this.store.upsertChunks(docs, scope);
|
|
19971
|
+
documentsUpserted = docs.length;
|
|
19972
|
+
this.logger.event("upserted", { count: docs.length });
|
|
19973
|
+
}
|
|
19974
|
+
if (!options.dryRun && deletes.length > 0) {
|
|
19975
|
+
await this.store.deleteByIds(deletes, scope);
|
|
19976
|
+
this.logger.event("deleted", { count: deletes.length });
|
|
19977
|
+
}
|
|
19978
|
+
stageEnd("upsert", upsertStart);
|
|
19979
|
+
if (changedChunks.length > 0) {
|
|
19980
|
+
this.logger.info(`Upserted ${documentsUpserted} document${documentsUpserted === 1 ? "" : "s"} (${stageTimingsMs["upsert"]}ms)`);
|
|
19981
|
+
} else {
|
|
19982
|
+
this.logger.info("No chunks to upsert \u2014 all up to date");
|
|
20192
19983
|
}
|
|
20193
|
-
stageEnd("finalize", finalizeStart);
|
|
20194
19984
|
this.logger.info("Done.");
|
|
20195
19985
|
return {
|
|
20196
|
-
pagesProcessed:
|
|
19986
|
+
pagesProcessed: pages.length,
|
|
20197
19987
|
chunksTotal: chunks.length,
|
|
20198
19988
|
chunksChanged: changedChunks.length,
|
|
20199
|
-
|
|
19989
|
+
documentsUpserted,
|
|
20200
19990
|
deletes: deletes.length,
|
|
20201
|
-
estimatedTokens,
|
|
20202
|
-
estimatedCostUSD: Number(estimatedCostUSD.toFixed(8)),
|
|
20203
19991
|
routeExact,
|
|
20204
19992
|
routeBestEffort,
|
|
20205
19993
|
stageTimingsMs
|
|
20206
19994
|
};
|
|
20207
19995
|
}
|
|
20208
19996
|
};
|
|
20209
|
-
|
|
20210
|
-
// src/search/ranking.ts
|
|
20211
|
-
function nonNegativeOrZero(value) {
|
|
20212
|
-
if (!Number.isFinite(value)) {
|
|
20213
|
-
return 0;
|
|
20214
|
-
}
|
|
20215
|
-
return Math.max(0, value);
|
|
20216
|
-
}
|
|
20217
|
-
function rankHits(hits, config) {
|
|
20218
|
-
return hits.map((hit) => {
|
|
20219
|
-
let score = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
|
|
20220
|
-
if (config.ranking.enableIncomingLinkBoost) {
|
|
20221
|
-
const incomingBoost = Math.log(1 + nonNegativeOrZero(hit.metadata.incomingLinks));
|
|
20222
|
-
score += incomingBoost * config.ranking.weights.incomingLinks;
|
|
20223
|
-
}
|
|
20224
|
-
if (config.ranking.enableDepthBoost) {
|
|
20225
|
-
const depthBoost = 1 / (1 + nonNegativeOrZero(hit.metadata.depth));
|
|
20226
|
-
score += depthBoost * config.ranking.weights.depth;
|
|
20227
|
-
}
|
|
20228
|
-
return {
|
|
20229
|
-
hit,
|
|
20230
|
-
finalScore: Number.isFinite(score) ? score : Number.NEGATIVE_INFINITY
|
|
20231
|
-
};
|
|
20232
|
-
}).sort((a, b) => {
|
|
20233
|
-
const delta = b.finalScore - a.finalScore;
|
|
20234
|
-
return Number.isNaN(delta) ? 0 : delta;
|
|
20235
|
-
});
|
|
20236
|
-
}
|
|
20237
|
-
function findPageWeight(url, pageWeights) {
|
|
20238
|
-
const norm = (p) => p !== "/" && p.endsWith("/") ? p.slice(0, -1) : p;
|
|
20239
|
-
const normalizedUrl = norm(url);
|
|
20240
|
-
for (const [pattern, weight] of Object.entries(pageWeights)) {
|
|
20241
|
-
if (norm(pattern) === normalizedUrl) {
|
|
20242
|
-
return weight;
|
|
20243
|
-
}
|
|
20244
|
-
}
|
|
20245
|
-
let bestPrefix = "";
|
|
20246
|
-
let bestWeight = 1;
|
|
20247
|
-
for (const [pattern, weight] of Object.entries(pageWeights)) {
|
|
20248
|
-
const normalizedPattern = norm(pattern);
|
|
20249
|
-
if (normalizedPattern === "/") continue;
|
|
20250
|
-
const prefix = `${normalizedPattern}/`;
|
|
20251
|
-
if (normalizedUrl.startsWith(prefix) && prefix.length > bestPrefix.length) {
|
|
20252
|
-
bestPrefix = prefix;
|
|
20253
|
-
bestWeight = weight;
|
|
20254
|
-
}
|
|
20255
|
-
}
|
|
20256
|
-
return bestWeight;
|
|
20257
|
-
}
|
|
20258
|
-
function aggregateByPage(ranked, config) {
|
|
20259
|
-
const groups = /* @__PURE__ */ new Map();
|
|
20260
|
-
for (const hit of ranked) {
|
|
20261
|
-
const url = hit.hit.metadata.url;
|
|
20262
|
-
const group = groups.get(url);
|
|
20263
|
-
if (group) group.push(hit);
|
|
20264
|
-
else groups.set(url, [hit]);
|
|
20265
|
-
}
|
|
20266
|
-
const { aggregationCap, aggregationDecay } = config.ranking;
|
|
20267
|
-
const pages = [];
|
|
20268
|
-
for (const [url, chunks] of groups) {
|
|
20269
|
-
chunks.sort((a, b) => {
|
|
20270
|
-
const delta = b.finalScore - a.finalScore;
|
|
20271
|
-
return Number.isNaN(delta) ? 0 : delta;
|
|
20272
|
-
});
|
|
20273
|
-
const best = chunks[0];
|
|
20274
|
-
const maxScore = Number.isFinite(best.finalScore) ? best.finalScore : Number.NEGATIVE_INFINITY;
|
|
20275
|
-
const topChunks = chunks.slice(0, aggregationCap);
|
|
20276
|
-
let aggregationBonus = 0;
|
|
20277
|
-
for (let i = 1; i < topChunks.length; i++) {
|
|
20278
|
-
const chunkScore = Number.isFinite(topChunks[i].finalScore) ? topChunks[i].finalScore : 0;
|
|
20279
|
-
aggregationBonus += chunkScore * Math.pow(aggregationDecay, i);
|
|
20280
|
-
}
|
|
20281
|
-
let pageScore = maxScore + aggregationBonus * config.ranking.weights.aggregation;
|
|
20282
|
-
const pageWeight = findPageWeight(url, config.ranking.pageWeights);
|
|
20283
|
-
if (pageWeight === 0) continue;
|
|
20284
|
-
if (pageWeight !== 1) {
|
|
20285
|
-
pageScore *= pageWeight;
|
|
20286
|
-
}
|
|
20287
|
-
pages.push({
|
|
20288
|
-
url,
|
|
20289
|
-
title: best.hit.metadata.title,
|
|
20290
|
-
routeFile: best.hit.metadata.routeFile,
|
|
20291
|
-
pageScore: Number.isFinite(pageScore) ? pageScore : Number.NEGATIVE_INFINITY,
|
|
20292
|
-
bestChunk: best,
|
|
20293
|
-
matchingChunks: chunks
|
|
20294
|
-
});
|
|
20295
|
-
}
|
|
20296
|
-
return pages.sort((a, b) => {
|
|
20297
|
-
const delta = b.pageScore - a.pageScore;
|
|
20298
|
-
return Number.isNaN(delta) ? 0 : delta;
|
|
20299
|
-
});
|
|
20300
|
-
}
|
|
20301
|
-
|
|
20302
|
-
// src/search/engine.ts
|
|
20303
19997
|
var requestSchema = zod.z.object({
|
|
20304
19998
|
q: zod.z.string().trim().min(1),
|
|
20305
19999
|
topK: zod.z.number().int().positive().max(100).optional(),
|
|
20306
20000
|
scope: zod.z.string().optional(),
|
|
20307
20001
|
pathPrefix: zod.z.string().optional(),
|
|
20308
20002
|
tags: zod.z.array(zod.z.string()).optional(),
|
|
20309
|
-
rerank: zod.z.boolean().optional(),
|
|
20310
20003
|
groupBy: zod.z.enum(["page", "chunk"]).optional()
|
|
20311
20004
|
});
|
|
20312
20005
|
var SearchEngine = class _SearchEngine {
|
|
20313
20006
|
cwd;
|
|
20314
20007
|
config;
|
|
20315
|
-
|
|
20316
|
-
vectorStore;
|
|
20317
|
-
reranker;
|
|
20008
|
+
store;
|
|
20318
20009
|
constructor(options) {
|
|
20319
20010
|
this.cwd = options.cwd;
|
|
20320
20011
|
this.config = options.config;
|
|
20321
|
-
this.
|
|
20322
|
-
this.vectorStore = options.vectorStore;
|
|
20323
|
-
this.reranker = options.reranker;
|
|
20012
|
+
this.store = options.store;
|
|
20324
20013
|
}
|
|
20325
20014
|
static async create(options = {}) {
|
|
20326
20015
|
const cwd = path__default.default.resolve(options.cwd ?? process.cwd());
|
|
20327
20016
|
const config = options.config ?? await loadConfig({ cwd, configPath: options.configPath });
|
|
20328
|
-
const
|
|
20329
|
-
const vectorStore = options.vectorStore ?? await createVectorStore(config, cwd);
|
|
20330
|
-
const reranker = options.reranker === void 0 ? createReranker(config) : options.reranker;
|
|
20017
|
+
const store = options.store ?? await createUpstashStore(config);
|
|
20331
20018
|
return new _SearchEngine({
|
|
20332
20019
|
cwd,
|
|
20333
20020
|
config,
|
|
20334
|
-
|
|
20335
|
-
vectorStore,
|
|
20336
|
-
reranker
|
|
20021
|
+
store
|
|
20337
20022
|
});
|
|
20338
20023
|
}
|
|
20339
20024
|
getConfig() {
|
|
@@ -20347,99 +20032,130 @@ var SearchEngine = class _SearchEngine {
|
|
|
20347
20032
|
const input = parsed.data;
|
|
20348
20033
|
const totalStart = process.hrtime.bigint();
|
|
20349
20034
|
const resolvedScope = resolveScope(this.config, input.scope);
|
|
20350
|
-
await this.assertModelCompatibility(resolvedScope);
|
|
20351
20035
|
const topK = input.topK ?? 10;
|
|
20352
|
-
const wantsRerank = Boolean(input.rerank);
|
|
20353
20036
|
const groupByPage = (input.groupBy ?? "page") === "page";
|
|
20354
20037
|
const candidateK = groupByPage ? Math.max(topK * 10, 50) : Math.max(50, topK);
|
|
20355
|
-
const
|
|
20356
|
-
|
|
20357
|
-
|
|
20358
|
-
|
|
20359
|
-
throw new SearchSocketError("VECTOR_BACKEND_UNAVAILABLE", "Unable to create query embedding.");
|
|
20038
|
+
const filterParts = [];
|
|
20039
|
+
if (input.pathPrefix) {
|
|
20040
|
+
const prefix = input.pathPrefix.startsWith("/") ? input.pathPrefix : `/${input.pathPrefix}`;
|
|
20041
|
+
filterParts.push(`url GLOB '${prefix}*'`);
|
|
20360
20042
|
}
|
|
20361
|
-
|
|
20362
|
-
|
|
20363
|
-
|
|
20364
|
-
|
|
20365
|
-
{
|
|
20366
|
-
topK: candidateK,
|
|
20367
|
-
pathPrefix: input.pathPrefix,
|
|
20368
|
-
tags: input.tags
|
|
20369
|
-
},
|
|
20370
|
-
resolvedScope
|
|
20371
|
-
);
|
|
20372
|
-
const vectorMs = hrTimeMs(vectorStart);
|
|
20373
|
-
const ranked = rankHits(hits, this.config);
|
|
20374
|
-
let usedRerank = false;
|
|
20375
|
-
let rerankMs = 0;
|
|
20376
|
-
let ordered = ranked;
|
|
20377
|
-
if (wantsRerank) {
|
|
20378
|
-
const rerankStart = process.hrtime.bigint();
|
|
20379
|
-
ordered = await this.rerankHits(input.q, ranked, topK);
|
|
20380
|
-
rerankMs = hrTimeMs(rerankStart);
|
|
20381
|
-
usedRerank = true;
|
|
20043
|
+
if (input.tags && input.tags.length > 0) {
|
|
20044
|
+
for (const tag of input.tags) {
|
|
20045
|
+
filterParts.push(`tags GLOB '*${tag}*'`);
|
|
20046
|
+
}
|
|
20382
20047
|
}
|
|
20383
|
-
|
|
20384
|
-
const
|
|
20048
|
+
const filter = filterParts.length > 0 ? filterParts.join(" AND ") : void 0;
|
|
20049
|
+
const useDualSearch = this.config.search.dualSearch && groupByPage;
|
|
20050
|
+
const searchStart = process.hrtime.bigint();
|
|
20051
|
+
let ranked;
|
|
20052
|
+
if (useDualSearch) {
|
|
20053
|
+
const chunkLimit = Math.max(topK * 10, 100);
|
|
20054
|
+
const pageLimit = 20;
|
|
20055
|
+
const [pageHits, chunkHits] = await Promise.all([
|
|
20056
|
+
this.store.searchPages(
|
|
20057
|
+
input.q,
|
|
20058
|
+
{
|
|
20059
|
+
limit: pageLimit,
|
|
20060
|
+
semanticWeight: this.config.search.semanticWeight,
|
|
20061
|
+
inputEnrichment: this.config.search.inputEnrichment,
|
|
20062
|
+
filter
|
|
20063
|
+
},
|
|
20064
|
+
resolvedScope
|
|
20065
|
+
),
|
|
20066
|
+
this.store.search(
|
|
20067
|
+
input.q,
|
|
20068
|
+
{
|
|
20069
|
+
limit: chunkLimit,
|
|
20070
|
+
semanticWeight: this.config.search.semanticWeight,
|
|
20071
|
+
inputEnrichment: this.config.search.inputEnrichment,
|
|
20072
|
+
reranking: false,
|
|
20073
|
+
filter
|
|
20074
|
+
},
|
|
20075
|
+
resolvedScope
|
|
20076
|
+
)
|
|
20077
|
+
]);
|
|
20078
|
+
const rankedChunks = rankHits(chunkHits, this.config, input.q);
|
|
20079
|
+
ranked = mergePageAndChunkResults(pageHits, rankedChunks, this.config);
|
|
20080
|
+
} else {
|
|
20081
|
+
const hits = await this.store.search(
|
|
20082
|
+
input.q,
|
|
20083
|
+
{
|
|
20084
|
+
limit: candidateK,
|
|
20085
|
+
semanticWeight: this.config.search.semanticWeight,
|
|
20086
|
+
inputEnrichment: this.config.search.inputEnrichment,
|
|
20087
|
+
reranking: this.config.search.reranking,
|
|
20088
|
+
filter
|
|
20089
|
+
},
|
|
20090
|
+
resolvedScope
|
|
20091
|
+
);
|
|
20092
|
+
ranked = rankHits(hits, this.config, input.q);
|
|
20093
|
+
}
|
|
20094
|
+
const searchMs = hrTimeMs(searchStart);
|
|
20095
|
+
const results = this.buildResults(ranked, topK, groupByPage, input.q);
|
|
20096
|
+
return {
|
|
20097
|
+
q: input.q,
|
|
20098
|
+
scope: resolvedScope.scopeName,
|
|
20099
|
+
results,
|
|
20100
|
+
meta: {
|
|
20101
|
+
timingsMs: {
|
|
20102
|
+
search: Math.round(searchMs),
|
|
20103
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
20104
|
+
}
|
|
20105
|
+
}
|
|
20106
|
+
};
|
|
20107
|
+
}
|
|
20108
|
+
ensureSnippet(hit) {
|
|
20109
|
+
const snippet = hit.hit.metadata.snippet;
|
|
20110
|
+
if (snippet && snippet.length >= 30) return snippet;
|
|
20111
|
+
const chunkText = hit.hit.metadata.chunkText;
|
|
20112
|
+
if (chunkText) return toSnippet(chunkText);
|
|
20113
|
+
return snippet || "";
|
|
20114
|
+
}
|
|
20115
|
+
buildResults(ordered, topK, groupByPage, _query) {
|
|
20385
20116
|
if (groupByPage) {
|
|
20386
20117
|
let pages = aggregateByPage(ordered, this.config);
|
|
20387
|
-
|
|
20388
|
-
pages = pages.filter((p) => p.pageScore >= minScore);
|
|
20389
|
-
}
|
|
20118
|
+
pages = trimByScoreGap(pages, this.config);
|
|
20390
20119
|
const minRatio = this.config.ranking.minChunkScoreRatio;
|
|
20391
|
-
|
|
20120
|
+
return pages.slice(0, topK).map((page) => {
|
|
20392
20121
|
const bestScore = page.bestChunk.finalScore;
|
|
20393
|
-
const
|
|
20394
|
-
const meaningful = page.matchingChunks.filter((c) => c.finalScore >=
|
|
20122
|
+
const minChunkScore = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
|
|
20123
|
+
const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minChunkScore).slice(0, 5);
|
|
20395
20124
|
return {
|
|
20396
20125
|
url: page.url,
|
|
20397
20126
|
title: page.title,
|
|
20398
20127
|
sectionTitle: page.bestChunk.hit.metadata.sectionTitle || void 0,
|
|
20399
|
-
snippet: page.bestChunk
|
|
20128
|
+
snippet: this.ensureSnippet(page.bestChunk),
|
|
20400
20129
|
score: Number(page.pageScore.toFixed(6)),
|
|
20401
20130
|
routeFile: page.routeFile,
|
|
20402
20131
|
chunks: meaningful.length > 1 ? meaningful.map((c) => ({
|
|
20403
20132
|
sectionTitle: c.hit.metadata.sectionTitle || void 0,
|
|
20404
|
-
snippet: c
|
|
20133
|
+
snippet: this.ensureSnippet(c),
|
|
20405
20134
|
headingPath: c.hit.metadata.headingPath,
|
|
20406
20135
|
score: Number(c.finalScore.toFixed(6))
|
|
20407
20136
|
})) : void 0
|
|
20408
20137
|
};
|
|
20409
20138
|
});
|
|
20410
20139
|
} else {
|
|
20140
|
+
let filtered = ordered;
|
|
20141
|
+
const minScore = this.config.ranking.minScore;
|
|
20411
20142
|
if (minScore > 0) {
|
|
20412
|
-
|
|
20143
|
+
filtered = ordered.filter((entry) => entry.finalScore >= minScore);
|
|
20413
20144
|
}
|
|
20414
|
-
|
|
20145
|
+
return filtered.slice(0, topK).map(({ hit, finalScore }) => ({
|
|
20415
20146
|
url: hit.metadata.url,
|
|
20416
20147
|
title: hit.metadata.title,
|
|
20417
20148
|
sectionTitle: hit.metadata.sectionTitle || void 0,
|
|
20418
|
-
snippet: hit
|
|
20149
|
+
snippet: this.ensureSnippet({ hit, finalScore }),
|
|
20419
20150
|
score: Number(finalScore.toFixed(6)),
|
|
20420
20151
|
routeFile: hit.metadata.routeFile
|
|
20421
20152
|
}));
|
|
20422
20153
|
}
|
|
20423
|
-
return {
|
|
20424
|
-
q: input.q,
|
|
20425
|
-
scope: resolvedScope.scopeName,
|
|
20426
|
-
results,
|
|
20427
|
-
meta: {
|
|
20428
|
-
timingsMs: {
|
|
20429
|
-
embed: Math.round(embedMs),
|
|
20430
|
-
vector: Math.round(vectorMs),
|
|
20431
|
-
rerank: Math.round(rerankMs),
|
|
20432
|
-
total: Math.round(hrTimeMs(totalStart))
|
|
20433
|
-
},
|
|
20434
|
-
usedRerank,
|
|
20435
|
-
modelId: this.config.embeddings.model
|
|
20436
|
-
}
|
|
20437
|
-
};
|
|
20438
20154
|
}
|
|
20439
20155
|
async getPage(pathOrUrl, scope) {
|
|
20440
20156
|
const resolvedScope = resolveScope(this.config, scope);
|
|
20441
20157
|
const urlPath = this.resolveInputPath(pathOrUrl);
|
|
20442
|
-
const page = await this.
|
|
20158
|
+
const page = await this.store.getPage(urlPath, resolvedScope);
|
|
20443
20159
|
if (!page) {
|
|
20444
20160
|
throw new SearchSocketError("INVALID_REQUEST", `Indexed page not found for ${urlPath}`, 404);
|
|
20445
20161
|
}
|
|
@@ -20460,7 +20176,7 @@ var SearchEngine = class _SearchEngine {
|
|
|
20460
20176
|
};
|
|
20461
20177
|
}
|
|
20462
20178
|
async health() {
|
|
20463
|
-
return this.
|
|
20179
|
+
return this.store.health();
|
|
20464
20180
|
}
|
|
20465
20181
|
resolveInputPath(pathOrUrl) {
|
|
20466
20182
|
try {
|
|
@@ -20472,90 +20188,6 @@ var SearchEngine = class _SearchEngine {
|
|
|
20472
20188
|
const withoutQueryOrHash = pathOrUrl.split(/[?#]/)[0] ?? pathOrUrl;
|
|
20473
20189
|
return normalizeUrlPath(withoutQueryOrHash);
|
|
20474
20190
|
}
|
|
20475
|
-
async assertModelCompatibility(scope) {
|
|
20476
|
-
const modelId = await this.vectorStore.getScopeModelId(scope);
|
|
20477
|
-
if (modelId && modelId !== this.config.embeddings.model) {
|
|
20478
|
-
throw new SearchSocketError(
|
|
20479
|
-
"EMBEDDING_MODEL_MISMATCH",
|
|
20480
|
-
`Scope ${scope.scopeName} was indexed with ${modelId}. Current config uses ${this.config.embeddings.model}. Re-index with --force.`
|
|
20481
|
-
);
|
|
20482
|
-
}
|
|
20483
|
-
}
|
|
20484
|
-
async rerankHits(query, ranked, topK) {
|
|
20485
|
-
if (!this.config.rerank.enabled) {
|
|
20486
|
-
throw new SearchSocketError(
|
|
20487
|
-
"INVALID_REQUEST",
|
|
20488
|
-
"rerank=true requested but rerank.enabled is not set to true.",
|
|
20489
|
-
400
|
|
20490
|
-
);
|
|
20491
|
-
}
|
|
20492
|
-
if (!this.reranker) {
|
|
20493
|
-
throw new SearchSocketError(
|
|
20494
|
-
"CONFIG_MISSING",
|
|
20495
|
-
`rerank=true requested but ${this.config.embeddings.apiKeyEnv} is not set.`,
|
|
20496
|
-
400
|
|
20497
|
-
);
|
|
20498
|
-
}
|
|
20499
|
-
const pageGroups = /* @__PURE__ */ new Map();
|
|
20500
|
-
for (const entry of ranked) {
|
|
20501
|
-
const url = entry.hit.metadata.url;
|
|
20502
|
-
const group = pageGroups.get(url);
|
|
20503
|
-
if (group) group.push(entry);
|
|
20504
|
-
else pageGroups.set(url, [entry]);
|
|
20505
|
-
}
|
|
20506
|
-
const MAX_CHUNKS_PER_PAGE = 5;
|
|
20507
|
-
const MIN_CHUNKS_PER_PAGE = 1;
|
|
20508
|
-
const MIN_CHUNK_SCORE_RATIO = 0.5;
|
|
20509
|
-
const MAX_DOC_CHARS = 2e3;
|
|
20510
|
-
const pageCandidates = [];
|
|
20511
|
-
for (const [url, chunks] of pageGroups) {
|
|
20512
|
-
const byScore = [...chunks].sort((a, b) => b.finalScore - a.finalScore);
|
|
20513
|
-
const bestScore = byScore[0].finalScore;
|
|
20514
|
-
const scoreFloor = Number.isFinite(bestScore) ? bestScore * MIN_CHUNK_SCORE_RATIO : Number.NEGATIVE_INFINITY;
|
|
20515
|
-
const selected = byScore.filter(
|
|
20516
|
-
(c, i) => i < MIN_CHUNKS_PER_PAGE || c.finalScore >= scoreFloor
|
|
20517
|
-
).slice(0, MAX_CHUNKS_PER_PAGE);
|
|
20518
|
-
selected.sort((a, b) => (a.hit.metadata.ordinal ?? 0) - (b.hit.metadata.ordinal ?? 0));
|
|
20519
|
-
const first = selected[0].hit.metadata;
|
|
20520
|
-
const parts = [first.title];
|
|
20521
|
-
if (first.description) {
|
|
20522
|
-
parts.push(first.description);
|
|
20523
|
-
}
|
|
20524
|
-
if (first.keywords && first.keywords.length > 0) {
|
|
20525
|
-
parts.push(first.keywords.join(", "));
|
|
20526
|
-
}
|
|
20527
|
-
const body = selected.map((c) => c.hit.metadata.chunkText || c.hit.metadata.snippet).join("\n\n");
|
|
20528
|
-
parts.push(body);
|
|
20529
|
-
let text = parts.join("\n\n");
|
|
20530
|
-
if (text.length > MAX_DOC_CHARS) {
|
|
20531
|
-
text = text.slice(0, MAX_DOC_CHARS);
|
|
20532
|
-
}
|
|
20533
|
-
pageCandidates.push({ id: url, text });
|
|
20534
|
-
}
|
|
20535
|
-
const maxCandidates = Math.max(topK, this.config.rerank.topN);
|
|
20536
|
-
const cappedCandidates = pageCandidates.slice(0, maxCandidates);
|
|
20537
|
-
const reranked = await this.reranker.rerank(
|
|
20538
|
-
query,
|
|
20539
|
-
cappedCandidates,
|
|
20540
|
-
maxCandidates
|
|
20541
|
-
);
|
|
20542
|
-
const scoreByUrl = new Map(reranked.map((e) => [e.id, e.score]));
|
|
20543
|
-
return ranked.map((entry) => {
|
|
20544
|
-
const pageScore = scoreByUrl.get(entry.hit.metadata.url);
|
|
20545
|
-
const base = Number.isFinite(entry.finalScore) ? entry.finalScore : Number.NEGATIVE_INFINITY;
|
|
20546
|
-
if (pageScore === void 0 || !Number.isFinite(pageScore)) {
|
|
20547
|
-
return { ...entry, finalScore: base };
|
|
20548
|
-
}
|
|
20549
|
-
const combined = pageScore * this.config.ranking.weights.rerank + base * 1e-3;
|
|
20550
|
-
return {
|
|
20551
|
-
...entry,
|
|
20552
|
-
finalScore: Number.isFinite(combined) ? combined : base
|
|
20553
|
-
};
|
|
20554
|
-
}).sort((a, b) => {
|
|
20555
|
-
const delta = b.finalScore - a.finalScore;
|
|
20556
|
-
return Number.isNaN(delta) ? 0 : delta;
|
|
20557
|
-
});
|
|
20558
|
-
}
|
|
20559
20191
|
};
|
|
20560
20192
|
function createServer(engine) {
|
|
20561
20193
|
const server = new mcp_js.McpServer({
|
|
@@ -20565,7 +20197,7 @@ function createServer(engine) {
|
|
|
20565
20197
|
server.registerTool(
|
|
20566
20198
|
"search",
|
|
20567
20199
|
{
|
|
20568
|
-
description: "Semantic site search. Returns url/title/snippet/score/routeFile for each match. Supports optional scope, pathPrefix, tags, and
|
|
20200
|
+
description: "Semantic site search powered by Upstash Search. Returns url/title/snippet/score/routeFile for each match. Supports optional scope, pathPrefix, tags, topK, and groupBy.",
|
|
20569
20201
|
inputSchema: {
|
|
20570
20202
|
query: zod.z.string().min(1),
|
|
20571
20203
|
scope: zod.z.string().optional(),
|
|
@@ -20867,7 +20499,8 @@ function searchsocketHandle(options = {}) {
|
|
|
20867
20499
|
throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
|
|
20868
20500
|
}
|
|
20869
20501
|
const engine = await getEngine();
|
|
20870
|
-
const
|
|
20502
|
+
const searchRequest = body;
|
|
20503
|
+
const result = await engine.search(searchRequest);
|
|
20871
20504
|
return withCors(
|
|
20872
20505
|
new Response(JSON.stringify(result), {
|
|
20873
20506
|
status: 200,
|
|
@@ -20952,13 +20585,6 @@ function searchsocketVitePlugin(options = {}) {
|
|
|
20952
20585
|
let running = false;
|
|
20953
20586
|
return {
|
|
20954
20587
|
name: "searchsocket:auto-index",
|
|
20955
|
-
config() {
|
|
20956
|
-
return {
|
|
20957
|
-
ssr: {
|
|
20958
|
-
external: ["@libsql/client", "libsql"]
|
|
20959
|
-
}
|
|
20960
|
-
};
|
|
20961
|
-
},
|
|
20962
20588
|
async closeBundle() {
|
|
20963
20589
|
if (executed || running) {
|
|
20964
20590
|
return;
|
|
@@ -20980,15 +20606,14 @@ function searchsocketVitePlugin(options = {}) {
|
|
|
20980
20606
|
});
|
|
20981
20607
|
const stats = await pipeline.run({
|
|
20982
20608
|
changedOnly: options.changedOnly ?? true,
|
|
20983
|
-
force: options.force ?? false,
|
|
20609
|
+
force: (options.force ?? false) || /^(1|true|yes)$/i.test(process.env.SEARCHSOCKET_FORCE_REINDEX ?? ""),
|
|
20984
20610
|
dryRun: options.dryRun ?? false,
|
|
20985
20611
|
scopeOverride: options.scope,
|
|
20986
20612
|
verbose: options.verbose
|
|
20987
20613
|
});
|
|
20988
20614
|
logger3.info(
|
|
20989
|
-
`[searchsocket] indexed pages=${stats.pagesProcessed} chunks=${stats.chunksTotal} changed=${stats.chunksChanged}
|
|
20615
|
+
`[searchsocket] indexed pages=${stats.pagesProcessed} chunks=${stats.chunksTotal} changed=${stats.chunksChanged} upserted=${stats.documentsUpserted}`
|
|
20990
20616
|
);
|
|
20991
|
-
logger3.info("[searchsocket] markdown mirror written under .searchsocket/pages/<scope> (safe to commit for content workflows).");
|
|
20992
20617
|
executed = true;
|
|
20993
20618
|
} finally {
|
|
20994
20619
|
running = false;
|
|
@@ -21040,12 +20665,10 @@ function createSearchClient(options = {}) {
|
|
|
21040
20665
|
*/
|
|
21041
20666
|
|
|
21042
20667
|
exports.IndexPipeline = IndexPipeline;
|
|
21043
|
-
exports.JinaReranker = JinaReranker;
|
|
21044
20668
|
exports.SearchEngine = SearchEngine;
|
|
21045
|
-
exports.
|
|
21046
|
-
exports.createReranker = createReranker;
|
|
20669
|
+
exports.UpstashSearchStore = UpstashSearchStore;
|
|
21047
20670
|
exports.createSearchClient = createSearchClient;
|
|
21048
|
-
exports.
|
|
20671
|
+
exports.createUpstashStore = createUpstashStore;
|
|
21049
20672
|
exports.isServerless = isServerless;
|
|
21050
20673
|
exports.loadConfig = loadConfig;
|
|
21051
20674
|
exports.mergeConfig = mergeConfig;
|