vectra 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/LocalDocumentIndex.d.ts +5 -2
- package/lib/LocalDocumentIndex.d.ts.map +1 -1
- package/lib/LocalDocumentIndex.js +20 -12
- package/lib/LocalDocumentIndex.js.map +1 -1
- package/lib/OpenAIEmbeddings.d.ts +1 -0
- package/lib/OpenAIEmbeddings.d.ts.map +1 -1
- package/lib/OpenAIEmbeddings.js +3 -1
- package/lib/OpenAIEmbeddings.js.map +1 -1
- package/lib/TextSplitter.d.ts +2 -0
- package/lib/TextSplitter.d.ts.map +1 -1
- package/lib/TextSplitter.js +101 -49
- package/lib/TextSplitter.js.map +1 -1
- package/lib/WebFetcher.d.ts +6 -4
- package/lib/WebFetcher.d.ts.map +1 -1
- package/lib/WebFetcher.js +132 -52
- package/lib/WebFetcher.js.map +1 -1
- package/lib/types.d.ts +8 -1
- package/lib/types.d.ts.map +1 -1
- package/lib/vectra-cli.js +8 -8
- package/lib/vectra-cli.js.map +1 -1
- package/package.json +3 -1
- package/src/LocalDocumentIndex.ts +20 -13
- package/src/OpenAIEmbeddings.ts +4 -1
- package/src/TextSplitter.ts +104 -49
- package/src/WebFetcher.ts +159 -58
- package/src/types.ts +6 -1
- package/src/vectra-cli.ts +8 -8
package/lib/WebFetcher.js
CHANGED
|
@@ -38,6 +38,7 @@ Object.defineProperty(exports, "__esModule", { value: true });
|
|
|
38
38
|
exports.WebFetcher = void 0;
|
|
39
39
|
const axios_1 = __importDefault(require("axios"));
|
|
40
40
|
const cheerio = __importStar(require("cheerio"));
|
|
41
|
+
const turndown_1 = __importDefault(require("turndown"));
|
|
41
42
|
const ALLOWED_CONTENT_TYPES = [
|
|
42
43
|
"text/html",
|
|
43
44
|
"application/json",
|
|
@@ -62,59 +63,11 @@ const DEFAULT_HEADERS = {
|
|
|
62
63
|
class WebFetcher {
|
|
63
64
|
constructor(config) {
|
|
64
65
|
this._config = Object.assign({
|
|
65
|
-
|
|
66
|
+
htmlToMarkdown: true,
|
|
66
67
|
summarizeHtml: false,
|
|
67
68
|
}, config);
|
|
68
69
|
}
|
|
69
70
|
fetch(uri) {
|
|
70
|
-
return __awaiter(this, void 0, void 0, function* () {
|
|
71
|
-
const { data, contentType } = yield this.fetchPage(uri);
|
|
72
|
-
if (contentType === "text/html" && this._config.htmlToText) {
|
|
73
|
-
return this.extractText(data, uri, this._config.summarizeHtml);
|
|
74
|
-
}
|
|
75
|
-
else {
|
|
76
|
-
return data;
|
|
77
|
-
}
|
|
78
|
-
});
|
|
79
|
-
}
|
|
80
|
-
extractText(html, baseUrl, summarize) {
|
|
81
|
-
// Parse all elements including <noscript> tags
|
|
82
|
-
const $ = cheerio.load(html, { scriptingEnabled: true });
|
|
83
|
-
// If we want a summary, just get use the <body/>
|
|
84
|
-
let text = '';
|
|
85
|
-
$(`${summarize ? 'body ' : '*'}:not(style):not(script):not(svg)`).each((i, elem) => {
|
|
86
|
-
var _a, _b;
|
|
87
|
-
// Remove any children to avoid duplicate text
|
|
88
|
-
let content = $(elem).clone().children().remove().end().text().trim();
|
|
89
|
-
const $el = $(elem);
|
|
90
|
-
// Print links in markdown format
|
|
91
|
-
let href = $el.attr("href");
|
|
92
|
-
if (((_a = $el.prop("tagName")) === null || _a === void 0 ? void 0 : _a.toLowerCase()) === "a" && href) {
|
|
93
|
-
if (!href.startsWith("http")) {
|
|
94
|
-
// Try converting to a relevant link
|
|
95
|
-
try {
|
|
96
|
-
href = new URL(href, baseUrl).toString();
|
|
97
|
-
}
|
|
98
|
-
catch (_c) {
|
|
99
|
-
// Leave as is
|
|
100
|
-
}
|
|
101
|
-
}
|
|
102
|
-
// If the link has content, use that as the text
|
|
103
|
-
const altText = (_b = $el.find("img[alt]").attr("alt")) === null || _b === void 0 ? void 0 : _b.trim();
|
|
104
|
-
if (altText) {
|
|
105
|
-
content += ` ${altText}`;
|
|
106
|
-
}
|
|
107
|
-
text += ` [${content}](${href})`;
|
|
108
|
-
}
|
|
109
|
-
// otherwise just print the content
|
|
110
|
-
else if (content !== "") {
|
|
111
|
-
text += ` ${content}`;
|
|
112
|
-
}
|
|
113
|
-
});
|
|
114
|
-
// Remove newlines
|
|
115
|
-
return text.trim().replace(/\n+/g, ' ');
|
|
116
|
-
}
|
|
117
|
-
fetchPage(baseUrl) {
|
|
118
71
|
return __awaiter(this, void 0, void 0, function* () {
|
|
119
72
|
const httpClient = axios_1.default.create({
|
|
120
73
|
validateStatus: () => true,
|
|
@@ -122,11 +75,11 @@ class WebFetcher {
|
|
|
122
75
|
// Clone headers to avoid mutating the original
|
|
123
76
|
const headers = Object.assign({}, DEFAULT_HEADERS, this._config.headers);
|
|
124
77
|
// get hostname from url
|
|
125
|
-
const host = new URL(
|
|
78
|
+
const host = new URL(uri).hostname;
|
|
126
79
|
headers['Host'] = host;
|
|
127
80
|
headers['Alt-Used'] = host;
|
|
128
81
|
// Fetch page and check for errors
|
|
129
|
-
const response = yield httpClient.get(
|
|
82
|
+
const response = yield httpClient.get(uri, Object.assign({ headers }, this._config.requestConfig));
|
|
130
83
|
if (response.status >= 400) {
|
|
131
84
|
throw new Error(`Site returned an HTTP status of ${response.status}`);
|
|
132
85
|
}
|
|
@@ -136,9 +89,136 @@ class WebFetcher {
|
|
|
136
89
|
if (!contentTypeArray[0] || !ALLOWED_CONTENT_TYPES.includes(contentTypeArray[0])) {
|
|
137
90
|
throw new Error(`Site returned an invalid content type of ${contentType}`);
|
|
138
91
|
}
|
|
139
|
-
|
|
92
|
+
// Convert content type to doc type
|
|
93
|
+
const docType = contentTypeArray[0] != 'text/plain' ? contentTypeArray[0].split('/')[1] : undefined;
|
|
94
|
+
if (docType == 'html' && this._config.htmlToMarkdown) {
|
|
95
|
+
const text = this.htmlToMarkdown(response.data, uri);
|
|
96
|
+
return { text, docType: 'md' };
|
|
97
|
+
}
|
|
98
|
+
else {
|
|
99
|
+
const text = response.data;
|
|
100
|
+
return { text, docType };
|
|
101
|
+
}
|
|
140
102
|
});
|
|
141
103
|
}
|
|
104
|
+
htmlToMarkdown(html, baseUrl) {
|
|
105
|
+
var _a;
|
|
106
|
+
// Parse HTML and remove scripts
|
|
107
|
+
const $ = cheerio.load(html, { scriptingEnabled: true });
|
|
108
|
+
// Remove scripts and convert relative links to absolute
|
|
109
|
+
$('script').remove();
|
|
110
|
+
$('a').each((i, elem) => {
|
|
111
|
+
const $el = $(elem);
|
|
112
|
+
const href = $el.attr("href");
|
|
113
|
+
if (href && !href.startsWith("http")) {
|
|
114
|
+
// Try converting to an absolute link
|
|
115
|
+
try {
|
|
116
|
+
$el.attr("href", new URL(href, baseUrl).toString());
|
|
117
|
+
}
|
|
118
|
+
catch (_a) {
|
|
119
|
+
// Leave as is
|
|
120
|
+
}
|
|
121
|
+
}
|
|
122
|
+
});
|
|
123
|
+
// Convert to markdown
|
|
124
|
+
const body = (_a = $('body').html()) !== null && _a !== void 0 ? _a : '';
|
|
125
|
+
const turndownService = new turndown_1.default({
|
|
126
|
+
hr: '\n\n---\n\n',
|
|
127
|
+
});
|
|
128
|
+
convertTables(turndownService);
|
|
129
|
+
const md = turndownService.turndown(body);
|
|
130
|
+
// Remove any overly long header text
|
|
131
|
+
const contentStart = Math.min(md.indexOf('\n'), md.indexOf(' '));
|
|
132
|
+
if (contentStart > 64) {
|
|
133
|
+
return md.slice(contentStart);
|
|
134
|
+
}
|
|
135
|
+
else {
|
|
136
|
+
return md;
|
|
137
|
+
}
|
|
138
|
+
}
|
|
142
139
|
}
|
|
143
140
|
exports.WebFetcher = WebFetcher;
|
|
141
|
+
function convertTables(turndownService) {
|
|
142
|
+
turndownService.addRule('tableCell', {
|
|
143
|
+
filter: ['th', 'td'],
|
|
144
|
+
replacement: function (content, node) {
|
|
145
|
+
return cell(content, node);
|
|
146
|
+
}
|
|
147
|
+
});
|
|
148
|
+
turndownService.addRule('tableRow', {
|
|
149
|
+
filter: 'tr',
|
|
150
|
+
replacement: function (content, node) {
|
|
151
|
+
var borderCells = '';
|
|
152
|
+
var alignMap = { left: ':--', right: '--:', center: ':-:' };
|
|
153
|
+
if (isHeadingRow(node)) {
|
|
154
|
+
for (var i = 0; i < node.childNodes.length; i++) {
|
|
155
|
+
var border = '---';
|
|
156
|
+
var align = (node.childNodes[i].getAttribute('align') || '').toLowerCase();
|
|
157
|
+
if (align)
|
|
158
|
+
border = alignMap[align] || border;
|
|
159
|
+
borderCells += cell(border, node.childNodes[i]);
|
|
160
|
+
}
|
|
161
|
+
}
|
|
162
|
+
return '\n' + content + (borderCells ? '\n' + borderCells : '');
|
|
163
|
+
}
|
|
164
|
+
});
|
|
165
|
+
turndownService.addRule('table', {
|
|
166
|
+
filter: ['table'],
|
|
167
|
+
replacement: function (content, node) {
|
|
168
|
+
// Ensure there are no blank lines
|
|
169
|
+
content = content.replace('\n\n', '\n');
|
|
170
|
+
return '\n\n' + content + '\n\n';
|
|
171
|
+
}
|
|
172
|
+
});
|
|
173
|
+
turndownService.addRule('tableSection', {
|
|
174
|
+
filter: ['thead', 'tbody', 'tfoot'],
|
|
175
|
+
replacement: function (content) {
|
|
176
|
+
return content;
|
|
177
|
+
}
|
|
178
|
+
});
|
|
179
|
+
}
|
|
180
|
+
const indexOf = Array.prototype.indexOf;
|
|
181
|
+
const every = Array.prototype.every;
|
|
182
|
+
// A tr is a heading row if:
|
|
183
|
+
// - the parent is a THEAD
|
|
184
|
+
// - or if its the first child of the TABLE or the first TBODY (possibly
|
|
185
|
+
// following a blank THEAD)
|
|
186
|
+
// - and every cell is a TH
|
|
187
|
+
function isHeadingRow(tr) {
|
|
188
|
+
var parentNode = tr.parentNode;
|
|
189
|
+
return (parentNode.nodeName === 'THEAD' ||
|
|
190
|
+
(parentNode.firstChild === tr &&
|
|
191
|
+
(parentNode.nodeName === 'TABLE' || isFirstTbody(parentNode)) &&
|
|
192
|
+
every.call(tr.childNodes, function (n) { return n.nodeName === 'TH'; })));
|
|
193
|
+
}
|
|
194
|
+
function isFirstTbody(element) {
|
|
195
|
+
var previousSibling = element.previousSibling;
|
|
196
|
+
return (element.nodeName === 'TBODY' && (!previousSibling ||
|
|
197
|
+
(previousSibling.nodeName === 'THEAD' &&
|
|
198
|
+
/^\s*$/i.test(previousSibling.textContent))));
|
|
199
|
+
}
|
|
200
|
+
function cell(content, node) {
|
|
201
|
+
var index = indexOf.call(node.parentNode.childNodes, node);
|
|
202
|
+
var prefix = ' ';
|
|
203
|
+
if (index === 0) {
|
|
204
|
+
prefix = '| ';
|
|
205
|
+
}
|
|
206
|
+
return cleanContent(prefix + content + ' |');
|
|
207
|
+
}
|
|
208
|
+
function cleanContent(content) {
|
|
209
|
+
let output = '';
|
|
210
|
+
const chars = ['\n', '\r', '\t', '\f', '\v', '\u00a0', '\u2028', '\u2029', ' '];
|
|
211
|
+
for (let i = 0; i < content.length; i++) {
|
|
212
|
+
if (chars.includes(content[i])) {
|
|
213
|
+
if (output[output.length - 1] != ' ') {
|
|
214
|
+
output += ' ';
|
|
215
|
+
}
|
|
216
|
+
continue;
|
|
217
|
+
}
|
|
218
|
+
else {
|
|
219
|
+
output += content[i];
|
|
220
|
+
}
|
|
221
|
+
}
|
|
222
|
+
return output;
|
|
223
|
+
}
|
|
144
224
|
//# sourceMappingURL=WebFetcher.js.map
|
package/lib/WebFetcher.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"WebFetcher.js","sourceRoot":"","sources":["../src/WebFetcher.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,kDAAkD;
|
|
1
|
+
{"version":3,"file":"WebFetcher.js","sourceRoot":"","sources":["../src/WebFetcher.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,kDAAkD;AAElD,iDAAmC;AACnC,wDAAwC;AAGxC,MAAM,qBAAqB,GAAG;IAC1B,WAAW;IACX,kBAAkB;IAClB,iBAAiB;IACjB,wBAAwB;IACxB,YAAY;CACf,CAAC;AAGF,MAAM,eAAe,GAAG;IACpB,MAAM,EAAE,uFAAuF;IAC/F,iBAAiB,EAAE,eAAe;IAClC,iBAAiB,EAAE,gBAAgB;IACnC,UAAU,EAAE,4BAA4B;IACxC,UAAU,EAAE,YAAY;IACxB,IAAI,EAAE,4BAA4B;IAClC,OAAO,EAAE,yBAAyB;IAClC,gBAAgB,EAAE,UAAU;IAC5B,gBAAgB,EAAE,UAAU;IAC5B,gBAAgB,EAAE,YAAY;IAC9B,2BAA2B,EAAE,GAAG;IAChC,YAAY,EAAE,gFAAgF;CACjG,CAAC;AASF,MAAa,UAAU;IAGnB,YAAmB,MAAkC;QACjD,IAAI,CAAC,OAAO,GAAG,MAAM,CAAC,MAAM,CAAC;YACzB,cAAc,EAAE,IAAI;YACpB,aAAa,EAAE,KAAK;SACH,EAAE,MAAM,CAAC,CAAC;IACnC,CAAC;IAEY,KAAK,CAAC,GAAW;;YAC1B,MAAM,UAAU,GAAG,eAAK,CAAC,MAAM,CAAC;gBAC5B,cAAc,EAAE,GAAG,EAAE,CAAC,IAAI;aAC7B,CAAC,CAAC;YAEH,+CAA+C;YAC/C,MAAM,OAAO,GAAG,MAAM,CAAC,MAAM,CAAC,EAAE,EAAE,eAAe,EAAE,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,CAAA;YAExE,wBAAwB;YACxB,MAAM,IAAI,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC;YACnC,OAAO,CAAC,MAAM,CAAC,GAAG,IAAI,CAAC;YACvB,OAAO,CAAC,UAAU,CAAC,GAAG,IAAI,CAAC;YAE3B,kCAAkC;YAClC,MAAM,QAAQ,GAAG,MAAM,UAAU,CAAC,GAAG,CAAC,GAAG,kBACrC,OAAO,IACJ,IAAI,CAAC,OAAO,CAAC,aAAa,EAC/B,CAAC;YACH,IAAI,QAAQ,CAAC,MAAM,IAAI,GAAG,EAAE;gBACxB,MAAM,IAAI,KAAK,CAAC,mCAAmC,QAAQ,CAAC,MAAM,EAAE,CAAC,CAAC;aACzE;YAED,+BAA+B;YAC/B,MAAM,WAAW,GAAG,QAAQ,CAAC,OAAO,CAAC,cAAc,CAAC,CAAC;YACrD,MAAM,gBAAgB,GAAG,WAAW,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;YAChD,IAAI,CAAC,gBAAgB,CAAC,CAAC,CAAC,IAAI,CAAC,qBAAqB,CAAC,QAAQ,CAAC,gBAAgB,CAAC,CAAC,CAAC,CAAC,EAAE;gBAC9E,MAAM,IAAI,KAAK,CAAC,4CAA4C,WAAW,EAAE,CAAC,CAAC;aAC9E;YAED,mCAAmC;YACnC,MAAM,OAAO,GAAG,gBAAgB,CAAC,CAAC,CAAC,IAAI,YAAY,CAAC,CAAC,CAAC,gBAAgB,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;YACpG,IAAI,OAAO,IAAI,MAAM,IAAI,IAAI,CAAC,OAAO,CAAC,cAAc,EAAE;gBAClD,MAAM,IAAI,GAAG,IAAI,CAAC,cAAc,CAAC,QAAQ,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;gBACrD,OAAO,EAAC,IAAI,EAAE,OAAO,EAAE,IAAI,EAAC,CAAC;aAChC;
iBAAM;gBACH,MAAM,IAAI,GAAG,QAAQ,CAAC,IAAI,CAAC;gBAC3B,OAAO,EAAC,IAAI,EAAE,OAAO,EAAC,CAAC;aAC1B;QACL,CAAC;KAAA;IAGO,cAAc,CAAC,IAAY,EAAE,OAAe;;QAChD,gCAAgC;QAChC,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,EAAE,EAAE,gBAAgB,EAAE,IAAI,EAAE,CAAC,CAAC;QAEzD,wDAAwD;QACxD,CAAC,CAAC,QAAQ,CAAC,CAAC,MAAM,EAAE,CAAC;QACrB,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,EAAE;YACpB,MAAM,GAAG,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC;YACpB,MAAM,IAAI,GAAG,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YAC9B,IAAI,IAAI,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,EAAE;gBAClC,qCAAqC;gBACrC,IAAI;oBACA,GAAG,CAAC,IAAI,CAAC,MAAM,EAAE,IAAI,GAAG,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC;iBACvD;gBAAC,WAAM;oBACJ,cAAc;iBACjB;aACJ;QACL,CAAC,CAAC,CAAC;QAEH,sBAAsB;QACtB,MAAM,IAAI,GAAG,MAAA,CAAC,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,mCAAI,EAAE,CAAC;QACpC,MAAM,eAAe,GAAG,IAAI,kBAAe,CAAC;YACxC,EAAE,EAAE,aAAa;SACpB,CAAC,CAAC;QACH,aAAa,CAAC,eAAe,CAAC,CAAC;QAC/B,MAAM,EAAE,GAAG,eAAe,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;QAE1C,qCAAqC;QACrC,MAAM,YAAY,GAAG,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE,EAAE,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC;QACjE,IAAI,YAAY,GAAG,EAAE,EAAE;YACnB,OAAO,EAAE,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC;SACjC;aAAM;YACH,OAAO,EAAE,CAAC;SACb;IACL,CAAC;CACJ;AAtFD,gCAsFC;AAED,SAAS,aAAa,CAAC,eAAgC;IACnD,eAAe,CAAC,OAAO,CAAC,WAAW,EAAE;QACjC,MAAM,EAAE,CAAC,IAAI,EAAE,IAAI,CAAC;QACpB,WAAW,EAAE,UAAU,OAAO,EAAE,IAAI;YAChC,OAAO,IAAI,CAAC,OAAO,EAAE,IAAI,CAAC,CAAA;QAC9B,CAAC;KACJ,CAAC,CAAC;IAEH,eAAe,CAAC,OAAO,CAAC,UAAU,EAAE;QAChC,MAAM,EAAE,IAAI;QACZ,WAAW,EAAE,UAAU,OAAO,EAAE,IAAI;YAChC,IAAI,WAAW,GAAG,EAAE,CAAA;YACpB,IAAI,QAAQ,GAAQ,EAAE,IAAI,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,MAAM,EAAE,KAAK,EAAE,CAAA;YAEhE,IAAI,YAAY,CAAC,IAAI,CAAC,EAAE;gBACpB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,UAAU,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE;oBAC7C,IAAI,MAAM,GAAG,KAAK,CAAA;oBAClB,IAAI,KAAK,GAAW,CAChB,IAAI,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,OAAO,CAAC,IAAI,EAAE,CACjD,CAAC,WAAW,EAAE,CAAA;oBAEf,IAAI,KAAK;wBAAE,MAAM,GAAG,QAAQ,CAAC,KAAK,CAAC,IAAI,MAA
M,CAAA;oBAE7C,WAAW,IAAI,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAA;iBAClD;aACJ;YACD,OAAO,IAAI,GAAG,OAAO,GAAG,CAAC,WAAW,CAAC,CAAC,CAAC,IAAI,GAAG,WAAW,CAAC,CAAC,CAAC,EAAE,CAAC,CAAA;QACnE,CAAC;KACJ,CAAC,CAAC;IAEH,eAAe,CAAC,OAAO,CAAC,OAAO,EAAE;QAC7B,MAAM,EAAE,CAAC,OAAO,CAAC;QACjB,WAAW,EAAE,UAAU,OAAO,EAAE,IAAI;YAChC,kCAAkC;YAClC,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,MAAM,EAAE,IAAI,CAAC,CAAA;YACvC,OAAO,MAAM,GAAG,OAAO,GAAG,MAAM,CAAA;QACpC,CAAC;KACJ,CAAC,CAAC;IAEH,eAAe,CAAC,OAAO,CAAC,cAAc,EAAE;QACpC,MAAM,EAAE,CAAC,OAAO,EAAE,OAAO,EAAE,OAAO,CAAC;QACnC,WAAW,EAAE,UAAU,OAAO;YAC1B,OAAO,OAAO,CAAA;QAClB,CAAC;KACJ,CAAC,CAAC;AACP,CAAC;AAED,MAAM,OAAO,GAAG,KAAK,CAAC,SAAS,CAAC,OAAO,CAAA;AACvC,MAAM,KAAK,GAAG,KAAK,CAAC,SAAS,CAAC,KAAK,CAAA;AAEnC,4BAA4B;AAC5B,0BAA0B;AAC1B,wEAAwE;AACxE,6BAA6B;AAC7B,2BAA2B;AAC3B,SAAS,YAAY,CAAC,EAAO;IACzB,IAAI,UAAU,GAAG,EAAE,CAAC,UAAU,CAAA;IAC9B,OAAO,CACH,UAAU,CAAC,QAAQ,KAAK,OAAO;QAC/B,CACI,UAAU,CAAC,UAAU,KAAK,EAAE;YAC5B,CAAC,UAAU,CAAC,QAAQ,KAAK,OAAO,IAAI,YAAY,CAAC,UAAU,CAAC,CAAC;YAC7D,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,UAAU,EAAE,UAAU,CAAC,IAAI,OAAO,CAAC,CAAC,QAAQ,KAAK,IAAI,CAAA,CAAC,CAAC,CAAC,CACzE,CACJ,CAAA;AACL,CAAC;AAED,SAAS,YAAY,CAAC,OAAY;IAC9B,IAAI,eAAe,GAAG,OAAO,CAAC,eAAe,CAAA;IAC7C,OAAO,CACH,OAAO,CAAC,QAAQ,KAAK,OAAO,IAAI,CAC5B,CAAC,eAAe;QAChB,CACI,eAAe,CAAC,QAAQ,KAAK,OAAO;YACpC,QAAQ,CAAC,IAAI,CAAC,eAAe,CAAC,WAAW,CAAC,CAC7C,CACJ,CACJ,CAAA;AACL,CAAC;AAED,SAAS,IAAI,CAAC,OAAe,EAAE,IAAS;IACpC,IAAI,KAAK,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC,UAAU,EAAE,IAAI,CAAC,CAAA;IAC1D,IAAI,MAAM,GAAG,GAAG,CAAA;IAChB,IAAI,KAAK,KAAK,CAAC,EAAE;QACb,MAAM,GAAG,IAAI,CAAA;KAChB;IACD,OAAO,YAAY,CAAC,MAAM,GAAG,OAAO,GAAG,IAAI,CAAC,CAAC;AACjD,CAAC;AAED,SAAS,YAAY,CAAC,OAAe;IACjC,IAAI,MAAM,GAAG,EAAE,CAAC;IAChB,MAAM,KAAK,GAAG,CAAC,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,QAAQ,EAAE,QAAQ,EAAE,QAAQ,EAAE,GAAG,CAAC,CAAC;IAChF,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE;QACrC,IAAI,KAAK,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,EAAE;YAC5B,IAAI,MA
AM,CAAC,MAAM,CAAC,MAAM,GAAG,CAAC,CAAC,IAAI,GAAG,EAAE;gBAClC,MAAM,IAAI,GAAG,CAAC;aACjB;YACD,SAAS;SACZ;aAAM;YACH,MAAM,IAAI,OAAO,CAAC,CAAC,CAAC,CAAC;SACxB;KACJ;IACD,OAAO,MAAM,CAAC;AAClB,CAAC"}
|
package/lib/types.d.ts
CHANGED
|
@@ -2,6 +2,10 @@
|
|
|
2
2
|
* An AI model that can be used to create embeddings.
|
|
3
3
|
*/
|
|
4
4
|
export interface EmbeddingsModel {
|
|
5
|
+
/**
|
|
6
|
+
* Maximum number of tokens
|
|
7
|
+
*/
|
|
8
|
+
readonly maxTokens: number;
|
|
5
9
|
/**
|
|
6
10
|
* Creates embeddings for the given inputs.
|
|
7
11
|
* @param inputs Text inputs to create embeddings for.
|
|
@@ -43,7 +47,10 @@ export interface TextChunk {
|
|
|
43
47
|
endOverlap: number[];
|
|
44
48
|
}
|
|
45
49
|
export interface TextFetcher {
|
|
46
|
-
fetch(uri: string): Promise<
|
|
50
|
+
fetch(uri: string): Promise<{
|
|
51
|
+
text: string;
|
|
52
|
+
docType: string | undefined;
|
|
53
|
+
}>;
|
|
47
54
|
}
|
|
48
55
|
export interface IndexStats {
|
|
49
56
|
version: number;
|
package/lib/types.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAEA;;GAEG;AACH,MAAM,WAAW,eAAe;IAC5B;;;;OAIG;IACH,gBAAgB,CAAC,MAAM,EAAE,MAAM,GAAC,MAAM,EAAE,GAAG,OAAO,CAAC,kBAAkB,CAAC,CAAC;CAC1E;AAED;;;;;;GAMG;AACH,MAAM,MAAM,wBAAwB,GAAG,SAAS,GAAG,OAAO,GAAG,cAAc,CAAC;AAE5E;;GAEG;AACH,MAAM,WAAW,kBAAkB;IAC/B;;OAEG;IACH,MAAM,EAAE,wBAAwB,CAAC;IAEjC;;OAEG;IACH,MAAM,CAAC,EAAE,MAAM,EAAE,EAAE,CAAC;IAEpB;;OAEG;IACH,OAAO,CAAC,EAAE,MAAM,CAAC;CACpB;AAED,MAAM,WAAW,SAAS;IACtB,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,MAAM,EAAE,CAAC;IACjB,QAAQ,EAAE,MAAM,CAAC;IACjB,MAAM,EAAE,MAAM,CAAC;IACf,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,UAAU,EAAE,MAAM,EAAE,CAAC;CACxB;AAED,MAAM,WAAW,WAAW;IACxB,KAAK,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAEA;;GAEG;AACH,MAAM,WAAW,eAAe;IAC5B;;OAEG;IACH,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAE3B;;;;OAIG;IACH,gBAAgB,CAAC,MAAM,EAAE,MAAM,GAAC,MAAM,EAAE,GAAG,OAAO,CAAC,kBAAkB,CAAC,CAAC;CAC1E;AAED;;;;;;GAMG;AACH,MAAM,MAAM,wBAAwB,GAAG,SAAS,GAAG,OAAO,GAAG,cAAc,CAAC;AAE5E;;GAEG;AACH,MAAM,WAAW,kBAAkB;IAC/B;;OAEG;IACH,MAAM,EAAE,wBAAwB,CAAC;IAEjC;;OAEG;IACH,MAAM,CAAC,EAAE,MAAM,EAAE,EAAE,CAAC;IAEpB;;OAEG;IACH,OAAO,CAAC,EAAE,MAAM,CAAC;CACpB;AAED,MAAM,WAAW,SAAS;IACtB,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,MAAM,EAAE,CAAC;IACjB,QAAQ,EAAE,MAAM,CAAC;IACjB,MAAM,EAAE,MAAM,CAAC;IACf,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,UAAU,EAAE,MAAM,EAAE,CAAC;CACxB;AAED,MAAM,WAAW,WAAW;IACxB,KAAK,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,OAAO,EAAE,MAAM,GAAC,SAAS,CAAC;KAAE,CAAC,CAAC;CAC7E;AAED,MAAM,WAAW,UAAU;IACvB,OAAO,EAAE,MAAM,CAAC;IAChB,eAAe,EAAE;QACb,OAAO,CAAC,EAAE,MAAM,EAAE,CAAC;KACtB,CAAC;IACF,KAAK,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,WAAW,SAAS,CAAC,SAAS,GAAG,MAAM,CAAC,MAAM,EAAC,aAAa,CAAC;IAC/D,EAAE,EAAE,MAAM,CAAC;IACX,QAAQ,EAAE,SAAS,CAAC;IACpB,MAAM,EAAE,MAAM,EAAE,CAAC;IACjB,IAAI,EAAE,MAAM,CAAC;IACb,YAAY,CAAC,EAAE,MAAM,CAAC;CACzB;AAED,MAAM,WAAW,cAAc;IAE3B;;OAEG;IACH,KAAK,CAAC,EAAE,MAAM,GAAC,MAAM,GAAC,OAAO,CAAC;IAE9B;;OAEG;IACH,KAAK,CAAC,EAAE,MAAM,GAAC,MAAM,GAAC,OAAO,CAAC;IAE9B;;OAEG;IACH,KAAK,CAAC,EAAE,MAAM,CAAC;IAEf;;OAEG;IACH,MAAM,CAAC,EAAE,MAAM,CAAC;IAEhB;;OAEG;IACH,KAAK,CAAC,EAAE,MAAM,CAAC;IAEf;;OAEG;IACH,MAAM,CAAC,EAAE,MAAM,CAAC;IAEhB;;OAEG;IACH,KAAK,CAAC,EAAE,CAAC,MAAM,GAAC,MAAM,CAAC,EAAE,CAAC;IAE1B;;OAEG;IACH,MAAM,CAAC,EAAE,CAAC,MAAM,GAAC,MAAM,CAAC,EAAE,CAAC;IAE3B;;OAEG;IACH,MAAM,CAAC,EAAE,cAAc,EAAE,CAAC;IAE1B;;OAEG;IACH,KAAK,CAAC,EAAE,cAAc,EAAE,CAAC;IAEzB,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC;CAC1B;AAED,MAAM,MAAM,aAAa,GAAG,MAAM,GAAC,MAAM,GAAC,OAAO,CAAC;AAElD,MAAM,WAAW,WAAW,CAAC,SAAS,GAAG,MAAM,CAAC,MAAM,EAAC,aAAa,CAAC;IACjE,IAAI,EAAE,SAAS,CAAC,SAAS,CAAC,CAAC;IAC3B,KAAK,EAAE,MAAM,CAAC;CACjB;AAED,MA
AM,WAAW,SAAS;IACtB,MAAM,CAAC,MAAM,EAAE,MAAM,EAAE,GAAG,MAAM,CAAC;IACjC,MAAM,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,EAAE,CAAC;CAClC;AAED,MAAM,WAAW,qBAAqB;IAClC,UAAU,EAAE,MAAM,CAAC;IACnB,QAAQ,EAAE,MAAM,CAAC;IACjB,MAAM,EAAE,MAAM,CAAC;IACf,CAAC,GAAG,EAAE,MAAM,GAAG,aAAa,CAAC;CAChC;AAED,MAAM,WAAW,oBAAoB;IACjC,OAAO,EAAE,MAAM,CAAC;IAChB,SAAS,EAAE,MAAM,CAAC;IAClB,MAAM,EAAE,MAAM,CAAC;IACf,eAAe,EAAE;QACb,OAAO,CAAC,EAAE,MAAM,EAAE,CAAC;KACtB,CAAC;CACL;AAED,MAAM,WAAW,mBAAmB;IAChC,IAAI,EAAE,MAAM,CAAC;IACb,UAAU,EAAE,MAAM,CAAC;IACnB,KAAK,EAAE,MAAM,CAAC;CACjB"}
|
package/lib/vectra-cli.js
CHANGED
|
@@ -60,7 +60,7 @@ function run() {
|
|
|
60
60
|
const index = new LocalDocumentIndex_1.LocalDocumentIndex({ folderPath });
|
|
61
61
|
yield index.deleteIndex();
|
|
62
62
|
}))
|
|
63
|
-
.command('add
|
|
63
|
+
.command('add <index>', `adds one or more web pages to an index`, (yargs) => {
|
|
64
64
|
return yargs
|
|
65
65
|
.option('keys', {
|
|
66
66
|
alias: 'k',
|
|
@@ -117,9 +117,9 @@ function run() {
|
|
|
117
117
|
for (const uri of uris) {
|
|
118
118
|
try {
|
|
119
119
|
console.log(internals_1.Colorize.progress(`fetching ${uri}`));
|
|
120
|
-
const
|
|
120
|
+
const { text, docType } = yield fetcher.fetch(uri);
|
|
121
121
|
console.log(internals_1.Colorize.replaceLine(internals_1.Colorize.progress(`indexing ${uri}`)));
|
|
122
|
-
yield index.upsertDocument(uri,
|
|
122
|
+
yield index.upsertDocument(uri, text, docType);
|
|
123
123
|
console.log(internals_1.Colorize.replaceLine(internals_1.Colorize.success(`added ${uri}`)));
|
|
124
124
|
}
|
|
125
125
|
catch (err) {
|
|
@@ -179,25 +179,25 @@ function run() {
|
|
|
179
179
|
.option('document-count', {
|
|
180
180
|
alias: 'dc',
|
|
181
181
|
describe: 'max number of documents to return (defaults to 10)',
|
|
182
|
-
type: '
|
|
182
|
+
type: 'number',
|
|
183
183
|
default: 10
|
|
184
184
|
})
|
|
185
185
|
.option('chunk-count', {
|
|
186
186
|
alias: 'cc',
|
|
187
187
|
describe: 'max number of chunks to return (defaults to 50)',
|
|
188
|
-
type: '
|
|
188
|
+
type: 'number',
|
|
189
189
|
default: 50
|
|
190
190
|
})
|
|
191
191
|
.option('section-count', {
|
|
192
192
|
alias: 'sc',
|
|
193
193
|
describe: 'max number of document sections to render (defaults to 1)',
|
|
194
|
-
type: '
|
|
194
|
+
type: 'number',
|
|
195
195
|
default: 1
|
|
196
196
|
})
|
|
197
197
|
.option('tokens', {
|
|
198
198
|
alias: 't',
|
|
199
199
|
describe: 'max number of tokens to render for each document section (defaults to 2000)',
|
|
200
|
-
type: '
|
|
200
|
+
type: 'number',
|
|
201
201
|
default: 2000
|
|
202
202
|
})
|
|
203
203
|
.option('format', {
|
|
@@ -233,7 +233,7 @@ function run() {
|
|
|
233
233
|
const sections = yield result.renderSections(args.tokens, args.sectionCount);
|
|
234
234
|
for (let i = 0; i < sections.length; i++) {
|
|
235
235
|
const section = sections[i];
|
|
236
|
-
console.log(internals_1.Colorize.title(args.sectionCount
|
|
236
|
+
console.log(internals_1.Colorize.title(args.sectionCount == 1 ? 'Section' : `Section ${i + 1}`));
|
|
237
237
|
console.log(internals_1.Colorize.value('score', section.score));
|
|
238
238
|
console.log(internals_1.Colorize.value('tokens', section.tokenCount));
|
|
239
239
|
console.log(internals_1.Colorize.output(section.text));
|
package/lib/vectra-cli.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"vectra-cli.js","sourceRoot":"","sources":["../src/vectra-cli.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,gDAAkC;AAClC,wDAAgC;AAChC,2CAAwC;AACxC,6DAA0D;AAC1D,6CAA0C;AAC1C,yDAAsD;AACtD,2CAAuC;AAEvC,SAAsB,GAAG;;QACrB,kBAAkB;QAClB,MAAM,IAAI,GAAG,MAAM,IAAA,eAAK,EAAC,IAAA,iBAAO,EAAC,OAAO,CAAC,IAAI,CAAC,CAAC;aAC1C,UAAU,CAAC,QAAQ,CAAC;aACpB,OAAO,CAAC,gBAAgB,EAAE,0BAA0B,EAAE,EAAE,EAAE,CAAO,IAAI,EAAE,EAAE;YACtE,MAAM,UAAU,GAAG,IAAI,CAAC,KAAe,CAAC;YACxC,MAAM,KAAK,GAAG,IAAI,uCAAkB,CAAC,EAAE,UAAU,EAAE,CAAC,CAAC;YACrD,OAAO,CAAC,GAAG,CAAC,oBAAQ,CAAC,MAAM,CAAC,qBAAqB,UAAU,EAAE,CAAC,CAAC,CAAC;YAChE,MAAM,KAAK,CAAC,WAAW,CAAC,EAAE,OAAO,EAAE,CAAC,EAAE,cAAc,EAAE,IAAI,EAAE,CAAC,CAAC;QAClE,CAAC,CAAA,CAAC;aACD,OAAO,CAAC,gBAAgB,EAAE,gCAAgC,EAAE,EAAE,EAAE,CAAO,IAAI,EAAE,EAAE;YAC5E,MAAM,UAAU,GAAG,IAAI,CAAC,KAAe,CAAC;YACxC,OAAO,CAAC,GAAG,CAAC,oBAAQ,CAAC,MAAM,CAAC,qBAAqB,UAAU,EAAE,CAAC,CAAC,CAAC;YAChE,MAAM,KAAK,GAAG,IAAI,uCAAkB,CAAC,EAAE,UAAU,EAAE,CAAC,CAAC;YACrD,MAAM,KAAK,CAAC,WAAW,EAAE,CAAC;QAC9B,CAAC,CAAA,CAAC;aACD,OAAO,CAAC,
|
|
1
|
+
{"version":3,"file":"vectra-cli.js","sourceRoot":"","sources":["../src/vectra-cli.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,gDAAkC;AAClC,wDAAgC;AAChC,2CAAwC;AACxC,6DAA0D;AAC1D,6CAA0C;AAC1C,yDAAsD;AACtD,2CAAuC;AAEvC,SAAsB,GAAG;;QACrB,kBAAkB;QAClB,MAAM,IAAI,GAAG,MAAM,IAAA,eAAK,EAAC,IAAA,iBAAO,EAAC,OAAO,CAAC,IAAI,CAAC,CAAC;aAC1C,UAAU,CAAC,QAAQ,CAAC;aACpB,OAAO,CAAC,gBAAgB,EAAE,0BAA0B,EAAE,EAAE,EAAE,CAAO,IAAI,EAAE,EAAE;YACtE,MAAM,UAAU,GAAG,IAAI,CAAC,KAAe,CAAC;YACxC,MAAM,KAAK,GAAG,IAAI,uCAAkB,CAAC,EAAE,UAAU,EAAE,CAAC,CAAC;YACrD,OAAO,CAAC,GAAG,CAAC,oBAAQ,CAAC,MAAM,CAAC,qBAAqB,UAAU,EAAE,CAAC,CAAC,CAAC;YAChE,MAAM,KAAK,CAAC,WAAW,CAAC,EAAE,OAAO,EAAE,CAAC,EAAE,cAAc,EAAE,IAAI,EAAE,CAAC,CAAC;QAClE,CAAC,CAAA,CAAC;aACD,OAAO,CAAC,gBAAgB,EAAE,gCAAgC,EAAE,EAAE,EAAE,CAAO,IAAI,EAAE,EAAE;YAC5E,MAAM,UAAU,GAAG,IAAI,CAAC,KAAe,CAAC;YACxC,OAAO,CAAC,GAAG,CAAC,oBAAQ,CAAC,MAAM,CAAC,qBAAqB,UAAU,EAAE,CAAC,CAAC,CAAC;YAChE,MAAM,KAAK,GAAG,IAAI,uCAAkB,CAAC,EAAE,UAAU,EAAE,CAAC,CAAC;YACrD,MAAM,KAAK,CAAC,WAAW,EAAE,CAAC;QAC9B,CAAC,CAAA,CAAC;aACD,OAAO,CAAC,aAAa,EAAE,wCAAwC,EAAE,CAAC,KAAK,EAAE,EAAE;YACxE,OAAO,KAAK;iBACP,MAAM,CAAC,MAAM,EAAE;gBACZ,KAAK,EAAE,GAAG;gBACV,QAAQ,EAAE,gFAAgF;gBAC1F,IAAI,EAAE,QAAQ;aACjB,CAAC;iBACD,MAAM,CAAC,KAAK,EAAE;gBACX,KAAK,EAAE,GAAG;gBACV,KAAK,EAAE,IAAI;gBACX,QAAQ,EAAE,sCAAsC;gBAChD,IAAI,EAAE,QAAQ;aACjB,CAAC;iBACD,MAAM,CAAC,MAAM,EAAE;gBACZ,KAAK,EAAE,GAAG;gBACV,QAAQ,EAAE,sDAAsD;gBAChE,IAAI,EAAE,QAAQ;aACjB,CAAC;iBACD,MAAM,CAAC,YAAY,EAAE;gBAClB,KAAK,EAAE,IAAI;gBACX,QAAQ,EAAE,0DAA0D;gBACpE,IAAI,EAAE,QAAQ;gBACd,OAAO,EAAE,GAAG;aACf,CAAC;iBACD,KAAK,CAAC,CAAC,IAAI,EAAE,EAAE;gBACZ,IAAI,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,IAAI,CAAC,GAAG,CAAC,MAAM,GAAG,CAAC,EAAE;oBAChD,OAAO,IAAI,CAAC;iBACf;qBAAM,IAAI,OAAO,IAAI,CAAC,IAAI,IAAI,QAAQ,IAAI,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,EAAE;oBACpE,OAAO,IAAI,CAAC;iBACf;qBAAM;oBACH,MAAM,IAAI,KAAK,CAAC,mJAAmJ,CAAC,CAAC;iBACxK;YACL,CAAC,CAAC;iBACD,YAAY,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC;QAChC,CAAC,EAAE,CAAO,IAAI,
EAAE,EAAE;YACd,OAAO,CAAC,GAAG,CAAC,oBAAQ,CAAC,KAAK,CAAC,2BAA2B,CAAC,CAAC,CAAC;YAEzD,oBAAoB;YACpB,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAc,EAAE,OAAO,CAAC,CAAC,CAAC;YACzE,MAAM,UAAU,GAAG,IAAI,mCAAgB,CAAC,MAAM,CAAC,MAAM,CAAC,EAAE,KAAK,EAAE,wBAAwB,EAAE,EAAE,IAAI,CAAC,CAAC,CAAC;YAElG,mBAAmB;YACnB,MAAM,UAAU,GAAG,IAAI,CAAC,KAAe,CAAC;YACxC,MAAM,KAAK,GAAG,IAAI,uCAAkB,CAAC;gBACjC,UAAU;gBACV,UAAU;gBACV,cAAc,EAAE;oBACZ,SAAS,EAAE,IAAI,CAAC,SAAS;iBAC5B;aACJ,CAAC,CAAC;YAEH,oBAAoB;YACpB,MAAM,IAAI,GAAG,MAAM,WAAW,CAAC,IAAI,CAAC,GAAe,EAAE,IAAI,CAAC,IAAc,EAAE,UAAU,CAAC,CAAC;YAEtF,kBAAkB;YAClB,MAAM,OAAO,GAAG,IAAI,uBAAU,EAAE,CAAC;YACjC,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE;gBACpB,IAAI;oBACA,OAAO,CAAC,GAAG,CAAC,oBAAQ,CAAC,QAAQ,CAAC,YAAY,GAAG,EAAE,CAAC,CAAC,CAAC;oBAClD,MAAM,EAAE,IAAI,EAAE,OAAO,EAAE,GAAI,MAAM,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;oBACpD,OAAO,CAAC,GAAG,CAAC,oBAAQ,CAAC,WAAW,CAAC,oBAAQ,CAAC,QAAQ,CAAC,YAAY,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC;oBACxE,MAAM,KAAK,CAAC,cAAc,CAAC,GAAG,EAAE,IAAI,EAAE,OAAO,CAAC,CAAC;oBAC/C,OAAO,CAAC,GAAG,CAAC,oBAAQ,CAAC,WAAW,CAAC,oBAAQ,CAAC,OAAO,CAAC,SAAS,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC;iBACvE;gBAAC,OAAO,GAAY,EAAE;oBACnB,OAAO,CAAC,GAAG,CAAC,oBAAQ,CAAC,WAAW,CAAC,oBAAQ,CAAC,KAAK,CAAC,iBAAiB,GAAG,KAAM,GAAa,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC,CAAC;iBACxG;aACJ;QACL,CAAC,CAAA,CAAC;aACD,OAAO,CAAC,gBAAgB,EAAE,6CAA6C,EAAE,CAAC,KAAK,EAAE,EAAE;YAChF,OAAO,KAAK;iBACP,MAAM,CAAC,KAAK,EAAE;gBACX,KAAK,EAAE,GAAG;gBACV,KAAK,EAAE,IAAI;gBACX,QAAQ,EAAE,6BAA6B;gBACvC,IAAI,EAAE,QAAQ;aACjB,CAAC;iBACD,MAAM,CAAC,MAAM,EAAE;gBACZ,KAAK,EAAE,GAAG;gBACV,QAAQ,EAAE,yDAAyD;gBACnE,IAAI,EAAE,QAAQ;aACjB,CAAC;iBACD,KAAK,CAAC,CAAC,IAAI,EAAE,EAAE;gBACZ,IAAI,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,IAAI,CAAC,GAAG,CAAC,MAAM,GAAG,CAAC,EAAE;oBAChD,OAAO,IAAI,CAAC;iBACf;qBAAM,IAAI,OAAO,IAAI,CAAC,IAAI,IAAI,QAAQ,IAAI,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,EAAE;oBACpE,OAAO,IAAI,CAAC;iBACf;qBAAM;oBACH,MAAM,IAAI,KAAK,CAAC,mJAAmJ,CAAC,CAAC;iBACxK;YACL,CAAC,CAAC,CAAC;QACX,CAAC,EAAE,C
AAO,IAAI,EAAE,EAAE;YACd,mBAAmB;YACnB,MAAM,UAAU,GAAG,IAAI,CAAC,KAAe,CAAC;YACxC,MAAM,KAAK,GAAG,IAAI,uCAAkB,CAAC,EAAE,UAAU,EAAE,CAAC,CAAC;YAErD,oBAAoB;YACpB,MAAM,IAAI,GAAG,MAAM,WAAW,CAAC,IAAI,CAAC,GAAe,EAAE,IAAI,CAAC,IAAc,EAAE,UAAU,CAAC,CAAC;YAEtF,mBAAmB;YACnB,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE;gBACpB,OAAO,CAAC,GAAG,CAAC,YAAY,GAAG,EAAE,CAAC,CAAC;gBAC/B,MAAM,KAAK,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC;aACnC;QACL,CAAC,CAAA,CAAC;aACD,OAAO,CAAC,eAAe,EAAE,oCAAoC,EAAE,EAAE,EAAE,CAAO,IAAI,EAAE,EAAE;YAC/E,MAAM,UAAU,GAAG,IAAI,CAAC,KAAe,CAAC;YACxC,MAAM,KAAK,GAAG,IAAI,uCAAkB,CAAC,EAAE,UAAU,EAAE,CAAC,CAAC;YACrD,MAAM,KAAK,GAAG,MAAM,KAAK,CAAC,eAAe,EAAE,CAAC;YAC5C,OAAO,CAAC,GAAG,CAAC,oBAAQ,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC,CAAC;YAC3C,OAAO,CAAC,GAAG,CAAC,oBAAQ,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC;QACxC,CAAC,CAAA,CAAC;aACD,OAAO,CAAC,uBAAuB,EAAE,uBAAuB,EAAE,CAAC,KAAK,EAAE,EAAE;YACjE,OAAO,KAAK;iBACP,MAAM,CAAC,MAAM,EAAE;gBACZ,KAAK,EAAE,GAAG;gBACV,QAAQ,EAAE,gFAAgF;aAC7F,CAAC;iBACD,MAAM,CAAC,gBAAgB,EAAE;gBACtB,KAAK,EAAE,IAAI;gBACX,QAAQ,EAAE,oDAAoD;gBAC9D,IAAI,EAAE,QAAQ;gBACd,OAAO,EAAE,EAAE;aACd,CAAC;iBACD,MAAM,CAAC,aAAa,EAAE;gBACnB,KAAK,EAAE,IAAI;gBACX,QAAQ,EAAE,iDAAiD;gBAC3D,IAAI,EAAE,QAAQ;gBACd,OAAO,EAAE,EAAE;aACd,CAAC;iBACD,MAAM,CAAC,eAAe,EAAE;gBACrB,KAAK,EAAE,IAAI;gBACX,QAAQ,EAAE,2DAA2D;gBACrE,IAAI,EAAE,QAAQ;gBACd,OAAO,EAAE,CAAC;aACb,CAAC;iBACD,MAAM,CAAC,QAAQ,EAAE;gBACd,KAAK,EAAE,GAAG;gBACV,QAAQ,EAAE,6EAA6E;gBACvF,IAAI,EAAE,QAAQ;gBACd,OAAO,EAAE,IAAI;aAChB,CAAC;iBACD,MAAM,CAAC,QAAQ,EAAE;gBACd,KAAK,EAAE,GAAG;gBACV,QAAQ,EAAE,wDAAwD;gBAClE,OAAO,EAAE,CAAC,UAAU,EAAE,OAAO,EAAE,QAAQ,CAAC;gBACxC,OAAO,EAAE,UAAU;aACtB,CAAC;iBACD,YAAY,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC;QAChC,CAAC,EAAE,CAAO,IAAI,EAAE,EAAE;YACd,OAAO,CAAC,GAAG,CAAC,oBAAQ,CAAC,KAAK,CAAC,gBAAgB,CAAC,CAAC,CAAC;YAE9C,oBAAoB;YACpB,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAc,EAAE,OAAO,CAAC,CAAC,CAAC;YACzE,MAAM,UAAU,GAAG,IAAI,mCAAgB,CAAC,MAAM,CAAC,MAAM,CAAC,EAAE,KAAK,EAAE,wBAAwB,EAAE,EAAE,IAAI,CAAC,CAAC,CAAC;YAElG,mBAAmB;YACnB,MAAM,UAA
U,GAAG,IAAI,CAAC,KAAe,CAAC;YACxC,MAAM,KAAK,GAAG,IAAI,uCAAkB,CAAC;gBACjC,UAAU;gBACV,UAAU;aACb,CAAC,CAAC;YAEH,cAAc;YACd,MAAM,KAAK,GAAG,IAAI,CAAC,KAAe,CAAC;YACnC,MAAM,OAAO,GAAG,MAAM,KAAK,CAAC,cAAc,CAAC,KAAK,EAAE;gBAC9C,YAAY,EAAE,IAAI,CAAC,aAAa;gBAChC,SAAS,EAAE,IAAI,CAAC,UAAU;aAC7B,CAAC,CAAC;YAEH,iBAAiB;YACjB,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE;gBAC1B,OAAO,CAAC,GAAG,CAAC,oBAAQ,CAAC,MAAM,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC;gBACzC,OAAO,CAAC,GAAG,CAAC,oBAAQ,CAAC,KAAK,CAAC,OAAO,EAAE,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC;gBACnD,OAAO,CAAC,GAAG,CAAC,oBAAQ,CAAC,KAAK,CAAC,QAAQ,EAAE,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC;gBAC5D,IAAI,IAAI,CAAC,MAAM,IAAI,UAAU,EAAE;oBAC3B,MAAM,QAAQ,GAAG,MAAM,MAAM,CAAC,cAAc,CAAC,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,YAAY,CAAC,CAAC;oBAC7E,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,QAAQ,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE;wBACtC,MAAM,OAAO,GAAG,QAAQ,CAAC,CAAC,CAAC,CAAC;wBAC5B,OAAO,CAAC,GAAG,CAAC,oBAAQ,CAAC,KAAK,CAAC,IAAI,CAAC,YAAY,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,WAAW,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;wBACrF,OAAO,CAAC,GAAG,CAAC,oBAAQ,CAAC,KAAK,CAAC,OAAO,EAAE,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC;wBACpD,OAAO,CAAC,GAAG,CAAC,oBAAQ,CAAC,KAAK,CAAC,QAAQ,EAAE,OAAO,CAAC,UAAU,CAAC,CAAC,CAAC;wBAC1D,OAAO,CAAC,GAAG,CAAC,oBAAQ,CAAC,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC;qBAC9C;iBACJ;qBAAM,IAAI,IAAI,CAAC,MAAM,IAAI,QAAQ,EAAE;oBAChC,MAAM,IAAI,GAAG,MAAM,MAAM,CAAC,QAAQ,EAAE,CAAC;oBACrC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE;wBAC3C,MAAM,KAAK,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC;wBAC/B,MAAM,QAAQ,GAAG,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC;wBAC9C,MAAM,MAAM,GAAG,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,MAAM,CAAC;wBAC1C,OAAO,CAAC,GAAG,CAAC,oBAAQ,CAAC,KAAK,CAAC,SAAS,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;wBAC9C,OAAO,CAAC,GAAG,CAAC,oBAAQ,CAAC,KAAK,CAAC,OAAO,EAAE,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC;wBAClD,OAAO,CAAC,GAAG,CAAC,oBAAQ,CAAC,KAAK,CAAC,UAAU,EAAE,QAAQ,CAAC,CAAC,CAAC;wBAClD,OAAO,CAAC,GAAG,CAAC,oBAAQ,CAAC,KAAK,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAC,CAAC;wBAC9C,OAAO,CAAC,GAAG,C
AAC,oBAAQ,CAAC,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,QAAQ,EAAE,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;qBACtE;iBACJ;aACJ;QACL,CAAC,CAAA,CAAC;aACD,IAAI,EAAE;aACN,aAAa,EAAE;aACf,UAAU,EAAE,CAAC;IACtB,CAAC;CAAA;AAzND,kBAyNC;AAGD,SAAe,WAAW,CAAC,KAAe,EAAE,QAAgB,EAAE,OAAe;;QACzE,IAAI,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE;YAC1C,OAAO,KAAK,CAAC;SAChB;aAAM,IAAI,OAAO,QAAQ,IAAI,QAAQ,IAAI,QAAQ,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,EAAE;YAClE,MAAM,IAAI,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;YAClD,OAAO,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;SACxF;aAAM;YACH,MAAM,IAAI,KAAK,CAAC,+CAA+C,OAAO,6EAA6E,CAAC,CAAA;SACvJ;IACL,CAAC;CAAA"}
|
package/package.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"name": "vectra",
|
|
3
3
|
"author": "Steven Ickman",
|
|
4
4
|
"description": "A vector database that uses the local file system for storage.",
|
|
5
|
-
"version": "0.
|
|
5
|
+
"version": "0.3.0",
|
|
6
6
|
"license": "MIT",
|
|
7
7
|
"keywords": [
|
|
8
8
|
"gpt"
|
|
@@ -33,6 +33,7 @@
|
|
|
33
33
|
"gpt-3-encoder": "1.1.4",
|
|
34
34
|
"json-colorizer": "^2.2.2",
|
|
35
35
|
"openai": "^3.2.1",
|
|
36
|
+
"turndown": "^7.1.2",
|
|
36
37
|
"uuid": "^9.0.0",
|
|
37
38
|
"yargs": "^17.7.2"
|
|
38
39
|
},
|
|
@@ -42,6 +43,7 @@
|
|
|
42
43
|
"@types/node": "^14.14.31",
|
|
43
44
|
"@types/mocha": "^8.2.0",
|
|
44
45
|
"@types/assert": "^1.5.3",
|
|
46
|
+
"@types/turndown": "^5.0.1",
|
|
45
47
|
"@types/uuid": "9.0.1",
|
|
46
48
|
"@types/yargs": "17.0.24",
|
|
47
49
|
"mocha": "10.2.0",
|
|
@@ -8,8 +8,6 @@ import { MetadataFilter, EmbeddingsModel, Tokenizer, MetadataTypes, EmbeddingsRe
|
|
|
8
8
|
import { LocalDocumentResult } from './LocalDocumentResult';
|
|
9
9
|
import { LocalDocument } from './LocalDocument';
|
|
10
10
|
|
|
11
|
-
const EMBEDDINGS_BATCH_SIZE = 500;
|
|
12
|
-
|
|
13
11
|
export interface DocumentQueryOptions {
|
|
14
12
|
maxDocuments?: number;
|
|
15
13
|
maxChunks?: number;
|
|
@@ -131,10 +129,13 @@ export class LocalDocumentIndex extends LocalIndex {
|
|
|
131
129
|
* @remarks
|
|
132
130
|
* A new update is started if one is not already in progress. If a document with the same uri
|
|
133
131
|
* already exists, it will be replaced.
|
|
134
|
-
* @param
|
|
132
|
+
* @param uri - Document URI
|
|
133
|
+
* @param text - Document text
|
|
134
|
+
* @param docType - Optional. Document type
|
|
135
|
+
* @param metadata - Optional. Document metadata to index
|
|
135
136
|
* @returns Inserted document
|
|
136
137
|
*/
|
|
137
|
-
public async upsertDocument(uri: string, text: string, metadata?: Record<string, MetadataTypes>): Promise<LocalDocument> {
|
|
138
|
+
public async upsertDocument(uri: string, text: string, docType?: string, metadata?: Record<string, MetadataTypes>): Promise<LocalDocument> {
|
|
138
139
|
// Ensure embeddings configured
|
|
139
140
|
if (!this._embeddings) {
|
|
140
141
|
throw new Error(`Embeddings model not configured.`);
|
|
@@ -150,12 +151,15 @@ export class LocalDocumentIndex extends LocalIndex {
|
|
|
150
151
|
documentId = v4();
|
|
151
152
|
}
|
|
152
153
|
|
|
153
|
-
//
|
|
154
|
-
const config = Object.assign({}, this._chunkingConfig);
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
const
|
|
158
|
-
|
|
154
|
+
// Initialize text splitter settings
|
|
155
|
+
const config = Object.assign({ docType }, this._chunkingConfig);
|
|
156
|
+
if (config.docType == undefined) {
|
|
157
|
+
// Populate docType based on extension
|
|
158
|
+
const pos = uri.lastIndexOf('.');
|
|
159
|
+
if (pos >= 0) {
|
|
160
|
+
const ext = uri.substring(pos + 1).toLowerCase();
|
|
161
|
+
config.docType = ext;
|
|
162
|
+
}
|
|
159
163
|
}
|
|
160
164
|
|
|
161
165
|
// Split text into chunks
|
|
@@ -163,14 +167,17 @@ export class LocalDocumentIndex extends LocalIndex {
|
|
|
163
167
|
const chunks = splitter.split(text);
|
|
164
168
|
|
|
165
169
|
// Break chunks into batches for embedding generation
|
|
170
|
+
let totalTokens = 0;
|
|
166
171
|
const chunkBatches: string[][] = [];
|
|
167
172
|
let currentBatch: string[] = [];
|
|
168
173
|
for (const chunk of chunks) {
|
|
169
|
-
|
|
170
|
-
if (
|
|
174
|
+
totalTokens += chunk.tokens.length;
|
|
175
|
+
if (totalTokens > this._embeddings.maxTokens) {
|
|
171
176
|
chunkBatches.push(currentBatch);
|
|
172
177
|
currentBatch = [];
|
|
178
|
+
totalTokens = chunk.tokens.length;
|
|
173
179
|
}
|
|
180
|
+
currentBatch.push(chunk.text.replace(/\n/g, ' '));
|
|
174
181
|
}
|
|
175
182
|
if (currentBatch.length > 0) {
|
|
176
183
|
chunkBatches.push(currentBatch);
|
|
@@ -257,7 +264,7 @@ export class LocalDocumentIndex extends LocalIndex {
|
|
|
257
264
|
// Generate embeddings for query
|
|
258
265
|
let embeddings: EmbeddingsResponse;
|
|
259
266
|
try {
|
|
260
|
-
embeddings = await this._embeddings.createEmbeddings(query);
|
|
267
|
+
embeddings = await this._embeddings.createEmbeddings(query.replace(/\n/g, ' '));
|
|
261
268
|
} catch (err: unknown) {
|
|
262
269
|
throw new Error(`Error generating embeddings for query: ${(err as any).toString()}`);
|
|
263
270
|
}
|
package/src/OpenAIEmbeddings.ts
CHANGED
|
@@ -83,6 +83,8 @@ export class OpenAIEmbeddings implements EmbeddingsModel {
|
|
|
83
83
|
|
|
84
84
|
private readonly UserAgent = 'AlphaWave';
|
|
85
85
|
|
|
86
|
+
public readonly maxTokens = 8000;
|
|
87
|
+
|
|
86
88
|
/**
|
|
87
89
|
* Options the client was configured with.
|
|
88
90
|
*/
|
|
@@ -121,7 +123,7 @@ export class OpenAIEmbeddings implements EmbeddingsModel {
|
|
|
121
123
|
|
|
122
124
|
// Create client
|
|
123
125
|
this._httpClient = axios.create({
|
|
124
|
-
validateStatus: (status) =>
|
|
126
|
+
validateStatus: (status) => true
|
|
125
127
|
});
|
|
126
128
|
}
|
|
127
129
|
|
|
@@ -142,6 +144,7 @@ export class OpenAIEmbeddings implements EmbeddingsModel {
|
|
|
142
144
|
} else if (response.status == 429) {
|
|
143
145
|
return { status: 'rate_limited', message: `The embeddings API returned a rate limit error.` }
|
|
144
146
|
} else {
|
|
147
|
+
console.log(inputs);
|
|
145
148
|
return { status: 'error', message: `The embeddings API returned an error status of ${response.status}: ${response.statusText}` };
|
|
146
149
|
}
|
|
147
150
|
}
|