vectra 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "name": "vectra",
3
3
  "author": "Steven Ickman",
4
4
  "description": "A vector database that uses the local file system for storage.",
5
- "version": "0.3.0",
5
+ "version": "0.4.0",
6
6
  "license": "MIT",
7
7
  "keywords": [
8
8
  "gpt"
package/src/FileFetcher.ts ADDED
@@ -0,0 +1,31 @@
1
+ import { TextFetcher } from './types';
2
+ import * as fs from 'fs/promises';
3
+ import * as path from 'path';
4
+
5
+ export class FileFetcher implements TextFetcher {
6
+ public async fetch(uri: string, onDocument: (uri: string, text: string, docType?: string | undefined) => Promise<boolean>): Promise<boolean> {
7
+ // Does path exist and is it a directory?
8
+ let isDirectory: boolean;
9
+ try {
10
+ const stat = await fs.stat(uri);
11
+ isDirectory = stat.isDirectory();
12
+ } catch {
13
+ return true;
14
+ }
15
+
16
+ // If directory, read all files and recurse
17
+ if (isDirectory) {
18
+ const files = await fs.readdir(uri);
19
+ for (const file of files) {
20
+ const filePath = path.join(uri, file);
21
+ await this.fetch(filePath, onDocument);
22
+ }
23
+ return true;
24
+ } else {
25
+ // Read file and call onDocument
26
+ const text = await fs.readFile(uri, 'utf8');
27
+ const parts = uri.split('.');
28
+ return await onDocument(uri, text, parts.length > 0 ? parts[parts.length - 1].toLowerCase() : undefined);
29
+ }
30
+ }
31
+ }
package/src/LocalIndex.ts CHANGED
@@ -20,15 +20,19 @@ export interface CreateIndexConfig {
20
20
  */
21
21
  export class LocalIndex {
22
22
  private readonly _folderPath: string;
23
+ private readonly _indexName: string;
24
+
23
25
  private _data?: IndexData;
24
26
  private _update?: IndexData;
25
27
 
26
28
  /**
27
29
  * Creates a new instance of LocalIndex.
28
30
  * @param folderPath - Path to the index folder
31
+ * @param indexName - Optional name of the index file. Defaults to index.json
29
32
  */
30
- public constructor(folderPath: string) {
33
+ public constructor(folderPath: string, indexName?: string) {
31
34
  this._folderPath = folderPath;
35
+ this._indexName = indexName || "index.json";
32
36
  }
33
37
 
34
38
  /**
@@ -38,6 +42,13 @@ export class LocalIndex {
38
42
  return this._folderPath;
39
43
  }
40
44
 
45
+ /**
46
+ * Optional name of the index file.
47
+ */
48
+ public get indexName(): string {
49
+ return this._indexName;
50
+ }
51
+
41
52
  /**
42
53
  * Begins an update to the index.
43
54
  * @remarks
@@ -87,7 +98,8 @@ export class LocalIndex {
87
98
  metadata_config: config.metadata_config ?? {},
88
99
  items: []
89
100
  };
90
- await fs.writeFile(path.join(this._folderPath, 'index.json'), JSON.stringify(this._data));
101
+
102
+ await fs.writeFile(path.join(this._folderPath, this._indexName), JSON.stringify(this._data));
91
103
  } catch (err: unknown) {
92
104
  await this.deleteIndex();
93
105
  throw new Error('Error creating index');
@@ -139,7 +151,7 @@ export class LocalIndex {
139
151
 
140
152
  try {
141
153
  // Save index
142
- await fs.writeFile(path.join(this._folderPath, 'index.json'), JSON.stringify(this._update));
154
+ await fs.writeFile(path.join(this._folderPath, this._indexName), JSON.stringify(this._update));
143
155
  this._data = this._update;
144
156
  this._update = undefined;
145
157
  } catch(err: unknown) {
@@ -194,7 +206,7 @@ export class LocalIndex {
194
206
  */
195
207
  public async isIndexCreated(): Promise<boolean> {
196
208
  try {
197
- await fs.access(path.join(this._folderPath, 'index.json'));
209
+ await fs.access(path.join(this._folderPath, this.indexName));
198
210
  return true;
199
211
  } catch (err: unknown) {
200
212
  return false;
@@ -307,7 +319,7 @@ export class LocalIndex {
307
319
  throw new Error('Index does not exist');
308
320
  }
309
321
 
310
- const data = await fs.readFile(path.join(this._folderPath, 'index.json'));
322
+ const data = await fs.readFile(path.join(this._folderPath, this.indexName));
311
323
  this._data = JSON.parse(data.toString());
312
324
  }
313
325
 
package/src/OpenAIEmbeddings.ts CHANGED
@@ -140,7 +140,8 @@ export class OpenAIEmbeddings implements EmbeddingsModel {
140
140
 
141
141
  // Process response
142
142
  if (response.status < 300) {
143
- return { status: 'success', output: response.data.data.sort((a, b) => a.index - b.index).map((item) => item.embedding) };
143
+ const {data,model,usage} = response.data
144
+ return { status: 'success', output: data.sort((a, b) => a.index - b.index).map((item) => item.embedding), model, usage };
144
145
  } else if (response.status == 429) {
145
146
  return { status: 'rate_limited', message: `The embeddings API returned a rate limit error.` }
146
147
  } else {
@@ -205,4 +206,4 @@ export class OpenAIEmbeddings implements EmbeddingsModel {
205
206
  return response;
206
207
  }
207
208
  }
208
- }
209
+ }
package/src/WebFetcher.ts CHANGED
@@ -3,7 +3,6 @@ import { TextFetcher } from './types';
3
3
  import * as cheerio from 'cheerio';
4
4
  import TurndownService from 'turndown';
5
5
 
6
-
7
6
  const ALLOWED_CONTENT_TYPES = [
8
7
  "text/html",
9
8
  "application/json",
@@ -45,7 +44,7 @@ export class WebFetcher implements TextFetcher {
45
44
  } as WebFetcherConfig, config);
46
45
  }
47
46
 
48
- public async fetch(uri: string): Promise<{ text: string; docType: string|undefined; }> {
47
+ public async fetch(uri: string, onDocument: (uri: string, text: string, docType?: string) => Promise<boolean>): Promise<boolean> {
49
48
  const httpClient = axios.create({
50
49
  validateStatus: () => true,
51
50
  });
@@ -78,10 +77,10 @@ export class WebFetcher implements TextFetcher {
78
77
  const docType = contentTypeArray[0] != 'text/plain' ? contentTypeArray[0].split('/')[1] : undefined;
79
78
  if (docType == 'html' && this._config.htmlToMarkdown) {
80
79
  const text = this.htmlToMarkdown(response.data, uri);
81
- return {text, docType: 'md'};
80
+ return await onDocument(uri, text, 'md');
82
81
  } else {
83
82
  const text = response.data;
84
- return {text, docType};
83
+ return await onDocument(uri, text, docType);
85
84
  }
86
85
  }
87
86
 
package/src/index.ts CHANGED
@@ -1,3 +1,4 @@
1
+ export * from './FileFetcher';
1
2
  export * from './GPT3Tokenizer';
2
3
  export * from './ItemSelector';
3
4
  export * from './LocalIndex';
package/src/types.ts CHANGED
@@ -56,7 +56,7 @@ export interface TextChunk {
56
56
  }
57
57
 
58
58
  export interface TextFetcher {
59
- fetch(uri: string): Promise<{ text: string; docType: string|undefined; }>;
59
+ fetch(uri: string, onDocument: (uri: string, text: string, docType?: string) => Promise<boolean>): Promise<boolean>;
60
60
  }
61
61
 
62
62
  export interface IndexStats {
package/src/vectra-cli.ts CHANGED
@@ -5,6 +5,7 @@ import { LocalDocumentIndex } from "./LocalDocumentIndex";
5
5
  import { WebFetcher } from './WebFetcher';
6
6
  import { OpenAIEmbeddings } from './OpenAIEmbeddings';
7
7
  import { Colorize } from './internals';
8
+ import { FileFetcher } from './FileFetcher';
8
9
 
9
10
  export async function run() {
10
11
  // prettier-ignore
@@ -77,16 +78,20 @@ export async function run() {
77
78
  const uris = await getItemList(args.uri as string[], args.list as string, 'web page');
78
79
 
79
80
  // Fetch web pages
80
- const fetcher = new WebFetcher();
81
- for (const uri of uris) {
81
+ const fileFetcher = new FileFetcher();
82
+ const webFetcher = new WebFetcher();
83
+ for (const path of uris) {
82
84
  try {
83
- console.log(Colorize.progress(`fetching ${uri}`));
84
- const { text, docType } = await fetcher.fetch(uri);
85
- console.log(Colorize.replaceLine(Colorize.progress(`indexing ${uri}`)));
86
- await index.upsertDocument(uri, text, docType);
87
- console.log(Colorize.replaceLine(Colorize.success(`added ${uri}`)));
85
+ console.log(Colorize.progress(`fetching ${path}`));
86
+ const fetcher = path.startsWith('http') ? webFetcher : fileFetcher;
87
+ await fetcher.fetch(path, async (uri, text, docType) => {
88
+ console.log(Colorize.replaceLine(Colorize.progress(`indexing ${uri}`)));
89
+ await index.upsertDocument(uri, text, docType);
90
+ console.log(Colorize.replaceLine(Colorize.success(`added ${uri}`)));
91
+ return true;
92
+ });
88
93
  } catch (err: unknown) {
89
- console.log(Colorize.replaceLine(Colorize.error(`Error adding: ${uri}\n${(err as Error).message}`)));
94
+ console.log(Colorize.replaceLine(Colorize.error(`Error adding: ${path}\n${(err as Error).message}`)));
90
95
  }
91
96
  }
92
97
  })