langchain 0.0.142 → 0.0.144

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. package/callbacks/handlers/llmonitor.cjs +1 -0
  2. package/callbacks/handlers/llmonitor.d.ts +1 -0
  3. package/callbacks/handlers/llmonitor.js +1 -0
  4. package/dist/agents/mrkl/outputParser.cjs +1 -1
  5. package/dist/agents/mrkl/outputParser.js +1 -1
  6. package/dist/base_language/index.cjs +2 -1
  7. package/dist/base_language/index.d.ts +7 -2
  8. package/dist/base_language/index.js +2 -1
  9. package/dist/callbacks/handlers/llmonitor.cjs +223 -0
  10. package/dist/callbacks/handlers/llmonitor.d.ts +35 -0
  11. package/dist/callbacks/handlers/llmonitor.js +215 -0
  12. package/dist/chains/openai_functions/extraction.d.ts +4 -4
  13. package/dist/chains/openai_functions/openapi.d.ts +3 -3
  14. package/dist/chains/openai_functions/structured_output.d.ts +5 -4
  15. package/dist/chains/openai_functions/tagging.d.ts +4 -4
  16. package/dist/chains/openai_moderation.cjs +1 -0
  17. package/dist/chains/openai_moderation.js +1 -0
  18. package/dist/chat_models/base.cjs +4 -3
  19. package/dist/chat_models/base.d.ts +3 -3
  20. package/dist/chat_models/base.js +5 -4
  21. package/dist/chat_models/minimax.d.ts +6 -28
  22. package/dist/chat_models/openai.cjs +1 -0
  23. package/dist/chat_models/openai.d.ts +2 -3
  24. package/dist/chat_models/openai.js +1 -0
  25. package/dist/document_loaders/fs/openai_whisper_audio.cjs +32 -0
  26. package/dist/document_loaders/fs/openai_whisper_audio.d.ts +11 -0
  27. package/dist/document_loaders/fs/openai_whisper_audio.js +28 -0
  28. package/dist/document_loaders/web/github.cjs +210 -24
  29. package/dist/document_loaders/web/github.d.ts +44 -1
  30. package/dist/document_loaders/web/github.js +210 -24
  31. package/dist/document_loaders/web/recursive_url.cjs +13 -0
  32. package/dist/document_loaders/web/recursive_url.js +13 -0
  33. package/dist/embeddings/hf_transformers.cjs +71 -0
  34. package/dist/embeddings/hf_transformers.d.ts +29 -0
  35. package/dist/embeddings/hf_transformers.js +67 -0
  36. package/dist/embeddings/openai.cjs +2 -1
  37. package/dist/embeddings/openai.js +2 -1
  38. package/dist/experimental/chat_models/anthropic_functions.d.ts +2 -5
  39. package/dist/llms/openai-chat.cjs +1 -0
  40. package/dist/llms/openai-chat.js +1 -0
  41. package/dist/llms/openai.cjs +1 -0
  42. package/dist/llms/openai.js +1 -0
  43. package/dist/load/import_constants.cjs +3 -0
  44. package/dist/load/import_constants.js +3 -0
  45. package/dist/prompts/chat.cjs +27 -1
  46. package/dist/prompts/chat.d.ts +3 -2
  47. package/dist/prompts/chat.js +28 -2
  48. package/dist/schema/index.cjs +44 -1
  49. package/dist/schema/index.d.ts +10 -0
  50. package/dist/schema/index.js +41 -0
  51. package/dist/tools/serpapi.cjs +108 -13
  52. package/dist/tools/serpapi.js +108 -13
  53. package/dist/vectorstores/redis.cjs +12 -4
  54. package/dist/vectorstores/redis.d.ts +8 -0
  55. package/dist/vectorstores/redis.js +12 -4
  56. package/dist/vectorstores/tigris.cjs +2 -0
  57. package/dist/vectorstores/tigris.d.ts +2 -3
  58. package/dist/vectorstores/tigris.js +2 -0
  59. package/dist/vectorstores/vectara.cjs +30 -12
  60. package/dist/vectorstores/vectara.d.ts +1 -1
  61. package/dist/vectorstores/vectara.js +30 -12
  62. package/document_loaders/fs/openai_whisper_audio.cjs +1 -0
  63. package/document_loaders/fs/openai_whisper_audio.d.ts +1 -0
  64. package/document_loaders/fs/openai_whisper_audio.js +1 -0
  65. package/embeddings/hf_transformers.cjs +1 -0
  66. package/embeddings/hf_transformers.d.ts +1 -0
  67. package/embeddings/hf_transformers.js +1 -0
  68. package/package.json +36 -6
@@ -29,8 +29,22 @@ export interface GithubFile {
29
29
  * properties specific to the GitHub repository loader.
30
30
  */
31
31
  export interface GithubRepoLoaderParams extends AsyncCallerParams {
32
+ /**
33
+ * The base URL of the GitHub instance.
34
+ * To be used when you are not targeting github.com, e.g. a GitHub Enterprise instance.
35
+ */
36
+ baseUrl?: string;
37
+ /**
38
+ * The API endpoint URL of the GitHub instance.
39
+ * To be used when you are not targeting github.com, e.g. a GitHub Enterprise instance.
40
+ */
41
+ apiUrl?: string;
32
42
  branch?: string;
33
43
  recursive?: boolean;
44
+ /**
45
+ * Set to true to recursively process submodules. Is only effective, when recursive=true.
46
+ */
47
+ processSubmodules?: boolean;
34
48
  unknown?: UnknownHandling;
35
49
  accessToken?: string;
36
50
  ignoreFiles?: (string | RegExp)[];
@@ -52,19 +66,26 @@ export interface GithubRepoLoaderParams extends AsyncCallerParams {
52
66
  * loading files from a GitHub repository.
53
67
  */
54
68
  export declare class GithubRepoLoader extends BaseDocumentLoader implements GithubRepoLoaderParams {
69
+ baseUrl: string;
70
+ apiUrl: string;
55
71
  private readonly owner;
56
72
  private readonly repo;
57
73
  private readonly initialPath;
58
74
  private headers;
59
75
  branch: string;
60
76
  recursive: boolean;
77
+ processSubmodules: boolean;
61
78
  unknown: UnknownHandling;
62
79
  accessToken?: string;
63
80
  ignoreFiles: (string | RegExp)[];
64
81
  ignore?: Ignore;
65
82
  verbose?: boolean;
83
+ maxConcurrency?: number;
84
+ maxRetries?: number;
66
85
  protected caller: AsyncCaller;
67
- constructor(githubUrl: string, { accessToken, branch, recursive, unknown, ignoreFiles, ignorePaths, verbose, maxConcurrency, maxRetries, ...rest }?: GithubRepoLoaderParams);
86
+ ignorePaths?: string[];
87
+ private submoduleInfos;
88
+ constructor(githubUrl: string, { accessToken, baseUrl, apiUrl, branch, recursive, processSubmodules, unknown, ignoreFiles, ignorePaths, verbose, maxConcurrency, maxRetries, ...rest }?: GithubRepoLoaderParams);
68
89
  /**
69
90
  * Extracts the owner, repository, and path from a GitHub URL.
70
91
  * @param url The GitHub URL to extract information from.
@@ -78,6 +99,22 @@ export declare class GithubRepoLoader extends BaseDocumentLoader implements Gith
78
99
  * @returns A promise that resolves to an array of Document instances.
79
100
  */
80
101
  load(): Promise<Document[]>;
102
+ /**
103
+ * Loads the information about Git submodules from the repository, if available.
104
+ */
105
+ private getSubmoduleInfo;
106
+ /**
107
+ * Parses the given content of a .gitmodules file. Furthermore, queries the current SHA ref of all submodules.
108
+ * Returns the submodule information as array.
109
+ * @param gitmodulesContent the content of a .gitmodules file
110
+ */
111
+ private parseGitmodules;
112
+ /**
113
+ * Loads the documents of the given submodule. Uses the same parameters as for the current repository.
114
+ * External submodules, i.e. submodules pointing to another GitHub instance, are ignored.
115
+ * @param submoduleInfo the info about the submodule to be loaded
116
+ */
117
+ private loadSubmodule;
81
118
  /**
82
119
  * Determines whether a file or directory should be ignored based on its
83
120
  * path and type.
@@ -109,6 +146,7 @@ export declare class GithubRepoLoader extends BaseDocumentLoader implements Gith
109
146
  private processDirectory;
110
147
  /**
111
148
  * Fetches the files from a GitHub repository.
149
+ * If the path denotes a single file, the resulting array contains only one element.
112
150
  * @param path The path of the repository to fetch the files from.
113
151
  * @returns A promise that resolves to an array of GithubFile instances.
114
152
  */
@@ -125,4 +163,9 @@ export declare class GithubRepoLoader extends BaseDocumentLoader implements Gith
125
163
  * @returns void
126
164
  */
127
165
  private handleError;
166
+ /**
167
+ * Logs the given message to the console, if parameter 'verbose' is set to true.
168
+ * @param message the message to be logged.
169
+ */
170
+ private log;
128
171
  }
@@ -22,8 +22,20 @@ function isBinaryPath(name) {
22
22
  * loading files from a GitHub repository.
23
23
  */
24
24
  export class GithubRepoLoader extends BaseDocumentLoader {
25
- constructor(githubUrl, { accessToken = getEnvironmentVariable("GITHUB_ACCESS_TOKEN"), branch = "main", recursive = true, unknown = UnknownHandling.Warn, ignoreFiles = [], ignorePaths, verbose = false, maxConcurrency = 2, maxRetries = 2, ...rest } = {}) {
25
+ constructor(githubUrl, { accessToken = getEnvironmentVariable("GITHUB_ACCESS_TOKEN"), baseUrl = "https://github.com", apiUrl = "https://api.github.com", branch = "main", recursive = true, processSubmodules = false, unknown = UnknownHandling.Warn, ignoreFiles = [], ignorePaths, verbose = false, maxConcurrency = 2, maxRetries = 2, ...rest } = {}) {
26
26
  super();
27
+ Object.defineProperty(this, "baseUrl", {
28
+ enumerable: true,
29
+ configurable: true,
30
+ writable: true,
31
+ value: void 0
32
+ });
33
+ Object.defineProperty(this, "apiUrl", {
34
+ enumerable: true,
35
+ configurable: true,
36
+ writable: true,
37
+ value: void 0
38
+ });
27
39
  Object.defineProperty(this, "owner", {
28
40
  enumerable: true,
29
41
  configurable: true,
@@ -60,6 +72,12 @@ export class GithubRepoLoader extends BaseDocumentLoader {
60
72
  writable: true,
61
73
  value: void 0
62
74
  });
75
+ Object.defineProperty(this, "processSubmodules", {
76
+ enumerable: true,
77
+ configurable: true,
78
+ writable: true,
79
+ value: void 0
80
+ });
63
81
  Object.defineProperty(this, "unknown", {
64
82
  enumerable: true,
65
83
  configurable: true,
@@ -90,22 +108,55 @@ export class GithubRepoLoader extends BaseDocumentLoader {
90
108
  writable: true,
91
109
  value: void 0
92
110
  });
111
+ Object.defineProperty(this, "maxConcurrency", {
112
+ enumerable: true,
113
+ configurable: true,
114
+ writable: true,
115
+ value: void 0
116
+ });
117
+ Object.defineProperty(this, "maxRetries", {
118
+ enumerable: true,
119
+ configurable: true,
120
+ writable: true,
121
+ value: void 0
122
+ });
93
123
  Object.defineProperty(this, "caller", {
94
124
  enumerable: true,
95
125
  configurable: true,
96
126
  writable: true,
97
127
  value: void 0
98
128
  });
129
+ Object.defineProperty(this, "ignorePaths", {
130
+ enumerable: true,
131
+ configurable: true,
132
+ writable: true,
133
+ value: void 0
134
+ });
135
+ Object.defineProperty(this, "submoduleInfos", {
136
+ enumerable: true,
137
+ configurable: true,
138
+ writable: true,
139
+ value: void 0
140
+ });
141
+ this.baseUrl = baseUrl;
142
+ this.apiUrl = apiUrl;
99
143
  const { owner, repo, path } = this.extractOwnerAndRepoAndPath(githubUrl);
100
144
  this.owner = owner;
101
145
  this.repo = repo;
102
146
  this.initialPath = path;
103
147
  this.branch = branch;
104
148
  this.recursive = recursive;
149
+ // processing submodules without processing contents of other directories makes no sense
150
+ if (processSubmodules && !recursive) {
151
+ throw new Error(`Input property "recursive" must be true if "processSubmodules" is true.`);
152
+ }
153
+ this.processSubmodules = processSubmodules;
105
154
  this.unknown = unknown;
106
155
  this.accessToken = accessToken;
107
156
  this.ignoreFiles = ignoreFiles;
108
157
  this.verbose = verbose;
158
+ this.maxConcurrency = maxConcurrency;
159
+ this.maxRetries = maxRetries;
109
160
  this.headers = {
110
161
  "User-Agent": "langchain",
111
162
  };
@@ -114,6 +165,7 @@ export class GithubRepoLoader extends BaseDocumentLoader {
114
165
  maxRetries,
115
166
  ...rest,
116
167
  });
168
+ this.ignorePaths = ignorePaths;
117
169
  if (ignorePaths) {
118
170
  this.ignore = ignore.default().add(ignorePaths);
119
171
  }
@@ -130,7 +182,7 @@ export class GithubRepoLoader extends BaseDocumentLoader {
130
182
  * @returns An object containing the owner, repository, and path extracted from the GitHub URL.
131
183
  */
132
184
  extractOwnerAndRepoAndPath(url) {
133
- const match = url.match(/https:\/\/github.com\/([^/]+)\/([^/]+)(\/tree\/[^/]+\/(.+))?/i);
185
+ const match = url.match(new RegExp(`${this.baseUrl}/([^/]+)/([^/]+)(/tree/[^/]+/(.+))?`, "i"));
134
186
  if (!match) {
135
187
  throw new Error("Invalid GitHub URL format.");
136
188
  }
@@ -143,10 +195,127 @@ export class GithubRepoLoader extends BaseDocumentLoader {
143
195
  * @returns A promise that resolves to an array of Document instances.
144
196
  */
145
197
  async load() {
146
- return (await this.processRepo()).map((fileResponse) => new Document({
198
+ this.log(`Loading documents from ${this.baseUrl}/${this.owner}/${this.repo}/${this.initialPath}...`);
199
+ // process repository without submodules
200
+ const documents = (await this.processRepo()).map((fileResponse) => new Document({
147
201
  pageContent: fileResponse.contents,
148
202
  metadata: fileResponse.metadata,
149
203
  }));
204
+ if (this.processSubmodules) {
205
+ // process submodules
206
+ await this.getSubmoduleInfo();
207
+ for (const submoduleInfo of this.submoduleInfos) {
208
+ documents.push(...(await this.loadSubmodule(submoduleInfo)));
209
+ }
210
+ }
211
+ return documents;
212
+ }
213
+ /**
214
+ * Loads the information about Git submodules from the repository, if available.
215
+ */
216
+ async getSubmoduleInfo() {
217
+ this.log("Loading info about submodules...");
218
+ // we have to fetch the files of the root directory to get the download url of the .gitmodules file
219
+ // however, we cannot reuse the files retrieved in processRepo() as initialPath may be != ""
220
+ // so it may be that we end up fetching this file list twice
221
+ const repoFiles = await this.fetchRepoFiles("");
222
+ const gitmodulesFile = repoFiles.filter(({ name }) => name === ".gitmodules")?.[0];
223
+ if (gitmodulesFile) {
224
+ const gitmodulesContent = await this.fetchFileContent({
225
+ download_url: gitmodulesFile.download_url,
226
+ });
227
+ this.submoduleInfos = await this.parseGitmodules(gitmodulesContent);
228
+ }
229
+ else {
230
+ this.submoduleInfos = [];
231
+ }
232
+ this.log(`Found ${this.submoduleInfos.length} submodules:`);
233
+ for (const submoduleInfo of this.submoduleInfos) {
234
+ this.log(JSON.stringify(submoduleInfo));
235
+ }
236
+ }
237
+ /**
238
+ * Parses the given content of a .gitmodules file. Furthermore, queries the current SHA ref of all submodules.
239
+ * Returns the submodule information as array.
240
+ * @param gitmodulesContent the content of a .gitmodules file
241
+ */
242
+ async parseGitmodules(gitmodulesContent) {
243
+ // catches the initial line of submodule entries
244
+ const submodulePattern = /\[submodule "(.*?)"]\n((\s+.*?\s*=\s*.*?\n)*)/g;
245
+ // catches the properties of a submodule
246
+ const keyValuePattern = /\s+(.*?)\s*=\s*(.*?)\s/g;
247
+ const submoduleInfos = [];
248
+ for (const [, name, propertyLines] of gitmodulesContent.matchAll(submodulePattern)) {
249
+ if (!name || !propertyLines) {
250
+ throw new Error("Could not parse submodule entry");
251
+ }
252
+ const submodulePropertyLines = propertyLines.matchAll(keyValuePattern);
253
+ let path;
254
+ let url;
255
+ for (const [, key, value] of submodulePropertyLines) {
256
+ if (!key || !value) {
257
+ throw new Error(`Could not parse key/value pairs for submodule ${name}`);
258
+ }
259
+ switch (key) {
260
+ case "path":
261
+ path = value;
262
+ break;
263
+ case "url":
264
+ url = value;
265
+ if (url.endsWith(".git")) {
266
+ url = url.substring(0, url.length - 4);
267
+ }
268
+ break;
269
+ default:
270
+ // ignoring unused keys
271
+ }
272
+ }
273
+ if (!path || !url) {
274
+ throw new Error(`Missing properties for submodule ${name}`);
275
+ }
276
+ // fetch the current ref of the submodule
277
+ const files = await this.fetchRepoFiles(path);
278
+ const submoduleInfo = {
279
+ name,
280
+ path,
281
+ url,
282
+ ref: files[0].sha,
283
+ };
284
+ submoduleInfos.push(submoduleInfo);
285
+ }
286
+ return submoduleInfos;
287
+ }
288
+ /**
289
+ * Loads the documents of the given submodule. Uses the same parameters as for the current repository.
290
+ * External submodules, i.e. submodules pointing to another GitHub instance, are ignored.
291
+ * @param submoduleInfo the info about the submodule to be loaded
292
+ */
293
+ async loadSubmodule(submoduleInfo) {
294
+ if (!submoduleInfo.url.startsWith(this.baseUrl)) {
295
+ this.log(`Ignoring external submodule ${submoduleInfo.url}.`);
296
+ return [];
297
+ }
298
+ else if (!submoduleInfo.path.startsWith(this.initialPath)) {
299
+ this.log(`Ignoring submodule ${submoduleInfo.url}, as it is not on initial path.`);
300
+ return [];
301
+ }
302
+ else {
303
+ this.log(`Accessing submodule ${submoduleInfo.name} (${submoduleInfo.url})...`);
304
+ return new GithubRepoLoader(submoduleInfo.url, {
305
+ accessToken: this.accessToken,
306
+ apiUrl: this.apiUrl,
307
+ baseUrl: this.baseUrl,
308
+ branch: submoduleInfo.ref,
309
+ recursive: this.recursive,
310
+ processSubmodules: this.processSubmodules,
311
+ unknown: this.unknown,
312
+ ignoreFiles: this.ignoreFiles,
313
+ ignorePaths: this.ignorePaths,
314
+ verbose: this.verbose,
315
+ maxConcurrency: this.maxConcurrency,
316
+ maxRetries: this.maxRetries,
317
+ }).load();
318
+ }
150
319
  }
151
320
  /**
152
321
  * Determines whether a file or directory should be ignored based on its
@@ -186,7 +355,11 @@ export class GithubRepoLoader extends BaseDocumentLoader {
186
355
  });
187
356
  return {
188
357
  contents: fileContent || "",
189
- metadata: { source: file.path },
358
+ metadata: {
359
+ source: file.path,
360
+ repository: `${this.baseUrl}/${this.owner}/${this.repo}`,
361
+ branch: this.branch,
362
+ },
190
363
  };
191
364
  }
192
365
  /**
@@ -197,19 +370,24 @@ export class GithubRepoLoader extends BaseDocumentLoader {
197
370
  // Directories have nested files / directories, which is why this is a list of promises of promises
198
371
  const currentDirectoryDirectoryPromises = [];
199
372
  for (const file of files) {
200
- if (!this.shouldIgnore(file.path, file.type)) {
201
- if (file.type !== "dir") {
202
- try {
203
- currentDirectoryFilePromises.push(this.fetchFileContentWrapper(file));
204
- }
205
- catch (e) {
206
- this.handleError(`Failed to fetch file content: ${file.path}, ${e}`);
207
- }
373
+ if (this.shouldIgnore(file.path, file.type)) {
374
+ continue;
375
+ }
376
+ if (file.type === "file" && file.size === 0) {
377
+ // this is a submodule. ignoring for the moment. submodule processing is done separately
378
+ continue;
379
+ }
380
+ if (file.type !== "dir") {
381
+ try {
382
+ currentDirectoryFilePromises.push(this.fetchFileContentWrapper(file));
208
383
  }
209
- else if (this.recursive) {
210
- currentDirectoryDirectoryPromises.push(this.processDirectory(file.path));
384
+ catch (e) {
385
+ this.handleError(`Failed to fetch file content: ${file.path}, ${e}`);
211
386
  }
212
387
  }
388
+ else if (this.recursive) {
389
+ currentDirectoryDirectoryPromises.push(this.processDirectory(file.path));
390
+ }
213
391
  }
214
392
  const curDirDirectories = await Promise.all(currentDirectoryDirectoryPromises);
215
393
  return [...currentDirectoryFilePromises, ...curDirDirectories.flat()];
@@ -248,24 +426,25 @@ export class GithubRepoLoader extends BaseDocumentLoader {
248
426
  }
249
427
  /**
250
428
  * Fetches the files from a GitHub repository.
429
+ * If the path denotes a single file, the resulting array contains only one element.
251
430
  * @param path The path of the repository to fetch the files from.
252
431
  * @returns A promise that resolves to an array of GithubFile instances.
253
432
  */
254
433
  async fetchRepoFiles(path) {
255
- const url = `https://api.github.com/repos/${this.owner}/${this.repo}/contents/${path}?ref=${this.branch}`;
434
+ const url = `${this.apiUrl}/repos/${this.owner}/${this.repo}/contents/${path}?ref=${this.branch}`;
256
435
  return this.caller.call(async () => {
257
- if (this.verbose) {
258
- console.log("Fetching", url);
259
- }
436
+ this.log(`Fetching ${url}`);
260
437
  const response = await fetch(url, { headers: this.headers });
261
438
  const data = await response.json();
262
439
  if (!response.ok) {
263
440
  throw new Error(`Unable to fetch repository files: ${response.status} ${JSON.stringify(data)}`);
264
441
  }
265
- if (!Array.isArray(data)) {
266
- throw new Error("Unable to fetch repository files.");
442
+ if (Array.isArray(data)) {
443
+ return data;
444
+ }
445
+ else {
446
+ return [data];
267
447
  }
268
- return data;
269
448
  });
270
449
  }
271
450
  /**
@@ -275,9 +454,7 @@ export class GithubRepoLoader extends BaseDocumentLoader {
275
454
  */
276
455
  async fetchFileContent(file) {
277
456
  return this.caller.call(async () => {
278
- if (this.verbose) {
279
- console.log("Fetching", file.download_url);
280
- }
457
+ this.log(`Fetching ${file.download_url}`);
281
458
  const response = await fetch(file.download_url, {
282
459
  headers: this.headers,
283
460
  });
@@ -302,4 +479,13 @@ export class GithubRepoLoader extends BaseDocumentLoader {
302
479
  throw new Error(`Unknown unknown handling: ${this.unknown}`);
303
480
  }
304
481
  }
482
+ /**
483
+ * Logs the given message to the console, if parameter 'verbose' is set to true.
484
+ * @param message the message to be logged.
485
+ */
486
+ log(message) {
487
+ if (this.verbose) {
488
+ console.log(message);
489
+ }
490
+ }
305
491
  }
@@ -84,6 +84,19 @@ class RecursiveUrlLoader extends base_js_1.BaseDocumentLoader {
84
84
  if (invalidPrefixes.some((prefix) => link.startsWith(prefix)) ||
85
85
  invalidSuffixes.some((suffix) => link.endsWith(suffix)))
86
86
  continue;
87
+ let standardizedLink;
88
+ if (link.startsWith("http")) {
89
+ standardizedLink = link;
90
+ }
91
+ else if (link.startsWith("//")) {
92
+ const base = new URL(baseUrl);
93
+ standardizedLink = base.protocol + link;
94
+ }
95
+ else {
96
+ standardizedLink = new URL(link, baseUrl).href;
97
+ }
98
+ if (this.excludeDirs.some((exDir) => standardizedLink.startsWith(exDir)))
99
+ continue;
87
100
  if (link.startsWith("http")) {
88
101
  const isAllowed = !this.preventOutside || link.startsWith(baseUrl);
89
102
  if (isAllowed)
@@ -81,6 +81,19 @@ export class RecursiveUrlLoader extends BaseDocumentLoader {
81
81
  if (invalidPrefixes.some((prefix) => link.startsWith(prefix)) ||
82
82
  invalidSuffixes.some((suffix) => link.endsWith(suffix)))
83
83
  continue;
84
+ let standardizedLink;
85
+ if (link.startsWith("http")) {
86
+ standardizedLink = link;
87
+ }
88
+ else if (link.startsWith("//")) {
89
+ const base = new URL(baseUrl);
90
+ standardizedLink = base.protocol + link;
91
+ }
92
+ else {
93
+ standardizedLink = new URL(link, baseUrl).href;
94
+ }
95
+ if (this.excludeDirs.some((exDir) => standardizedLink.startsWith(exDir)))
96
+ continue;
84
97
  if (link.startsWith("http")) {
85
98
  const isAllowed = !this.preventOutside || link.startsWith(baseUrl);
86
99
  if (isAllowed)
@@ -0,0 +1,71 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.HuggingFaceTransformersEmbeddings = void 0;
4
+ const transformers_1 = require("@xenova/transformers");
5
+ const chunk_js_1 = require("../util/chunk.cjs");
6
+ const base_js_1 = require("./base.cjs");
7
/**
 * Embeddings implementation that runs a "feature-extraction" pipeline from
 * `@xenova/transformers` and returns mean-pooled, normalized vectors.
 */
class HuggingFaceTransformersEmbeddings extends base_js_1.Embeddings {
    /**
     * @param fields Optional settings. `modelName`, `batchSize`,
     *   `stripNewLines` and `timeout` are read here; the whole object is also
     *   forwarded to the base class constructor.
     */
    constructor(fields) {
        super(fields ?? {});
        Object.defineProperty(this, "modelName", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: "Xenova/all-MiniLM-L6-v2"
        });
        Object.defineProperty(this, "batchSize", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: 512
        });
        Object.defineProperty(this, "stripNewLines", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: true
        });
        Object.defineProperty(this, "timeout", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "pipelinePromise", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        this.modelName = fields?.modelName ?? this.modelName;
        // `batchSize` is part of the accepted fields but used to be dropped
        // here, so a caller-supplied value never took effect.
        this.batchSize = fields?.batchSize ?? this.batchSize;
        this.stripNewLines = fields?.stripNewLines ?? this.stripNewLines;
        // stored only; runEmbedding() does not currently apply it
        this.timeout = fields?.timeout;
    }
    /**
     * Embeds the given texts in batches of at most `batchSize` entries.
     * @param texts The texts to embed.
     * @returns One embedding per input text, in input order.
     */
    async embedDocuments(texts) {
        const prepared = this.stripNewLines
            ? texts.map((t) => t.replace(/\n/g, " "))
            : texts;
        const batches = (0, chunk_js_1.chunkArray)(prepared, this.batchSize);
        const batchResponses = await Promise.all(batches.map((batch) => this.runEmbedding(batch)));
        // each batch response is a list of embeddings; concatenate them
        return batchResponses.flat();
    }
    /**
     * Embeds a single text.
     * @param text The text to embed.
     * @returns The embedding of the text.
     */
    async embedQuery(text) {
        const data = await this.runEmbedding([
            this.stripNewLines ? text.replace(/\n/g, " ") : text,
        ]);
        return data[0];
    }
    /**
     * Runs the pipeline on the given texts through `this.caller`.
     * The pipeline is created on first use and the same promise is reused by
     * all later calls.
     */
    async runEmbedding(texts) {
        const pipe = await (this.pipelinePromise ??= (0, transformers_1.pipeline)("feature-extraction", this.modelName));
        return this.caller.call(async () => {
            const output = await pipe(texts, { pooling: "mean", normalize: true });
            return output.tolist();
        });
    }
}
71
+ exports.HuggingFaceTransformersEmbeddings = HuggingFaceTransformersEmbeddings;
@@ -0,0 +1,29 @@
1
+ import { Embeddings, EmbeddingsParams } from "./base.js";
2
/**
 * Settings accepted by HuggingFaceTransformersEmbeddings.
 */
export interface HuggingFaceTransformersEmbeddingsParams extends EmbeddingsParams {
    /** Model name to use */
    modelName: string;
    /**
     * Timeout value. It is stored on the instance; the shipped implementation
     * does not pass it on to the embedding pipeline.
     */
    timeout?: number;
    /**
     * The maximum number of documents to embed in a single pipeline call.
     */
    batchSize?: number;
    /**
     * Whether to replace new lines in the input text with spaces before
     * embedding. May not be suitable for all use cases.
     */
    stripNewLines?: boolean;
}
/**
 * Embeddings backed by a "feature-extraction" pipeline from
 * `@xenova/transformers`.
 */
export declare class HuggingFaceTransformersEmbeddings extends Embeddings implements HuggingFaceTransformersEmbeddingsParams {
    /** Name of the model handed to the pipeline. */
    modelName: string;
    /** Maximum number of texts per pipeline call. */
    batchSize: number;
    /** Whether new lines are replaced by spaces before embedding. */
    stripNewLines: boolean;
    /** Timeout value taken from the constructor fields. */
    timeout?: number;
    /** Lazily created pipeline, shared by all embedding calls. */
    private pipelinePromise;
    constructor(fields?: Partial<HuggingFaceTransformersEmbeddingsParams>);
    /** Embeds the given texts; returns one vector per text, in input order. */
    embedDocuments(texts: string[]): Promise<number[][]>;
    /** Embeds a single text. */
    embedQuery(text: string): Promise<number[]>;
    private runEmbedding;
}
@@ -0,0 +1,67 @@
1
+ import { pipeline } from "@xenova/transformers";
2
+ import { chunkArray } from "../util/chunk.js";
3
+ import { Embeddings } from "./base.js";
4
/**
 * Embeddings implementation that runs a "feature-extraction" pipeline from
 * `@xenova/transformers` and returns mean-pooled, normalized vectors.
 */
export class HuggingFaceTransformersEmbeddings extends Embeddings {
    /**
     * @param fields Optional settings. `modelName`, `batchSize`,
     *   `stripNewLines` and `timeout` are read here; the whole object is also
     *   forwarded to the base class constructor.
     */
    constructor(fields) {
        super(fields ?? {});
        Object.defineProperty(this, "modelName", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: "Xenova/all-MiniLM-L6-v2"
        });
        Object.defineProperty(this, "batchSize", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: 512
        });
        Object.defineProperty(this, "stripNewLines", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: true
        });
        Object.defineProperty(this, "timeout", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "pipelinePromise", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        this.modelName = fields?.modelName ?? this.modelName;
        // `batchSize` is part of the accepted fields but used to be dropped
        // here, so a caller-supplied value never took effect.
        this.batchSize = fields?.batchSize ?? this.batchSize;
        this.stripNewLines = fields?.stripNewLines ?? this.stripNewLines;
        // stored only; runEmbedding() does not currently apply it
        this.timeout = fields?.timeout;
    }
    /**
     * Embeds the given texts in batches of at most `batchSize` entries.
     * @param texts The texts to embed.
     * @returns One embedding per input text, in input order.
     */
    async embedDocuments(texts) {
        const prepared = this.stripNewLines
            ? texts.map((t) => t.replace(/\n/g, " "))
            : texts;
        const batches = chunkArray(prepared, this.batchSize);
        const batchResponses = await Promise.all(batches.map((batch) => this.runEmbedding(batch)));
        // each batch response is a list of embeddings; concatenate them
        return batchResponses.flat();
    }
    /**
     * Embeds a single text.
     * @param text The text to embed.
     * @returns The embedding of the text.
     */
    async embedQuery(text) {
        const data = await this.runEmbedding([
            this.stripNewLines ? text.replace(/\n/g, " ") : text,
        ]);
        return data[0];
    }
    /**
     * Runs the pipeline on the given texts through `this.caller`.
     * The pipeline is created on first use and the same promise is reused by
     * all later calls.
     */
    async runEmbedding(texts) {
        const pipe = await (this.pipelinePromise ??= pipeline("feature-extraction", this.modelName));
        return this.caller.call(async () => {
            const output = await pipe(texts, { pooling: "mean", normalize: true });
            return output.tolist();
        });
    }
}
@@ -82,7 +82,7 @@ class OpenAIEmbeddings extends base_js_1.Embeddings {
82
82
  writable: true,
83
83
  value: void 0
84
84
  });
85
- const apiKey = fieldsWithDefaults?.openAIApiKey ??
85
+ let apiKey = fieldsWithDefaults?.openAIApiKey ??
86
86
  (0, env_js_1.getEnvironmentVariable)("OPENAI_API_KEY");
87
87
  const azureApiKey = fieldsWithDefaults?.azureOpenAIApiKey ??
88
88
  (0, env_js_1.getEnvironmentVariable)("AZURE_OPENAI_API_KEY");
@@ -120,6 +120,7 @@ class OpenAIEmbeddings extends base_js_1.Embeddings {
120
120
  if (!this.azureOpenAIApiVersion) {
121
121
  throw new Error("Azure OpenAI API version not found");
122
122
  }
123
+ apiKey = apiKey ?? "";
123
124
  }
124
125
  this.clientConfig = {
125
126
  apiKey,
@@ -79,7 +79,7 @@ export class OpenAIEmbeddings extends Embeddings {
79
79
  writable: true,
80
80
  value: void 0
81
81
  });
82
- const apiKey = fieldsWithDefaults?.openAIApiKey ??
82
+ let apiKey = fieldsWithDefaults?.openAIApiKey ??
83
83
  getEnvironmentVariable("OPENAI_API_KEY");
84
84
  const azureApiKey = fieldsWithDefaults?.azureOpenAIApiKey ??
85
85
  getEnvironmentVariable("AZURE_OPENAI_API_KEY");
@@ -117,6 +117,7 @@ export class OpenAIEmbeddings extends Embeddings {
117
117
  if (!this.azureOpenAIApiVersion) {
118
118
  throw new Error("Azure OpenAI API version not found");
119
119
  }
120
+ apiKey = apiKey ?? "";
120
121
  }
121
122
  this.clientConfig = {
122
123
  apiKey,