langchain 0.0.142 → 0.0.144

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. package/callbacks/handlers/llmonitor.cjs +1 -0
  2. package/callbacks/handlers/llmonitor.d.ts +1 -0
  3. package/callbacks/handlers/llmonitor.js +1 -0
  4. package/dist/agents/mrkl/outputParser.cjs +1 -1
  5. package/dist/agents/mrkl/outputParser.js +1 -1
  6. package/dist/base_language/index.cjs +2 -1
  7. package/dist/base_language/index.d.ts +7 -2
  8. package/dist/base_language/index.js +2 -1
  9. package/dist/callbacks/handlers/llmonitor.cjs +223 -0
  10. package/dist/callbacks/handlers/llmonitor.d.ts +35 -0
  11. package/dist/callbacks/handlers/llmonitor.js +215 -0
  12. package/dist/chains/openai_functions/extraction.d.ts +4 -4
  13. package/dist/chains/openai_functions/openapi.d.ts +3 -3
  14. package/dist/chains/openai_functions/structured_output.d.ts +5 -4
  15. package/dist/chains/openai_functions/tagging.d.ts +4 -4
  16. package/dist/chains/openai_moderation.cjs +1 -0
  17. package/dist/chains/openai_moderation.js +1 -0
  18. package/dist/chat_models/base.cjs +4 -3
  19. package/dist/chat_models/base.d.ts +3 -3
  20. package/dist/chat_models/base.js +5 -4
  21. package/dist/chat_models/minimax.d.ts +6 -28
  22. package/dist/chat_models/openai.cjs +1 -0
  23. package/dist/chat_models/openai.d.ts +2 -3
  24. package/dist/chat_models/openai.js +1 -0
  25. package/dist/document_loaders/fs/openai_whisper_audio.cjs +32 -0
  26. package/dist/document_loaders/fs/openai_whisper_audio.d.ts +11 -0
  27. package/dist/document_loaders/fs/openai_whisper_audio.js +28 -0
  28. package/dist/document_loaders/web/github.cjs +210 -24
  29. package/dist/document_loaders/web/github.d.ts +44 -1
  30. package/dist/document_loaders/web/github.js +210 -24
  31. package/dist/document_loaders/web/recursive_url.cjs +13 -0
  32. package/dist/document_loaders/web/recursive_url.js +13 -0
  33. package/dist/embeddings/hf_transformers.cjs +71 -0
  34. package/dist/embeddings/hf_transformers.d.ts +29 -0
  35. package/dist/embeddings/hf_transformers.js +67 -0
  36. package/dist/embeddings/openai.cjs +2 -1
  37. package/dist/embeddings/openai.js +2 -1
  38. package/dist/experimental/chat_models/anthropic_functions.d.ts +2 -5
  39. package/dist/llms/openai-chat.cjs +1 -0
  40. package/dist/llms/openai-chat.js +1 -0
  41. package/dist/llms/openai.cjs +1 -0
  42. package/dist/llms/openai.js +1 -0
  43. package/dist/load/import_constants.cjs +3 -0
  44. package/dist/load/import_constants.js +3 -0
  45. package/dist/prompts/chat.cjs +27 -1
  46. package/dist/prompts/chat.d.ts +3 -2
  47. package/dist/prompts/chat.js +28 -2
  48. package/dist/schema/index.cjs +44 -1
  49. package/dist/schema/index.d.ts +10 -0
  50. package/dist/schema/index.js +41 -0
  51. package/dist/tools/serpapi.cjs +108 -13
  52. package/dist/tools/serpapi.js +108 -13
  53. package/dist/vectorstores/redis.cjs +12 -4
  54. package/dist/vectorstores/redis.d.ts +8 -0
  55. package/dist/vectorstores/redis.js +12 -4
  56. package/dist/vectorstores/tigris.cjs +2 -0
  57. package/dist/vectorstores/tigris.d.ts +2 -3
  58. package/dist/vectorstores/tigris.js +2 -0
  59. package/dist/vectorstores/vectara.cjs +30 -12
  60. package/dist/vectorstores/vectara.d.ts +1 -1
  61. package/dist/vectorstores/vectara.js +30 -12
  62. package/document_loaders/fs/openai_whisper_audio.cjs +1 -0
  63. package/document_loaders/fs/openai_whisper_audio.d.ts +1 -0
  64. package/document_loaders/fs/openai_whisper_audio.js +1 -0
  65. package/embeddings/hf_transformers.cjs +1 -0
  66. package/embeddings/hf_transformers.d.ts +1 -0
  67. package/embeddings/hf_transformers.js +1 -0
  68. package/package.json +36 -6
@@ -114,6 +114,7 @@ export class OpenAIModerationChain extends BaseChain {
114
114
  const output = this._moderate(text, mod.results[0]);
115
115
  return {
116
116
  [this.outputKey]: output,
117
+ results: mod.results,
117
118
  };
118
119
  }
119
120
  _chainType() {
@@ -110,6 +110,7 @@ class BaseChatModel extends index_js_2.BaseLanguageModel {
110
110
  else {
111
111
  parsedOptions = options;
112
112
  }
113
+ const baseMessages = messages.map((messageList) => messageList.map(index_js_1.coerceMessageLikeToMessage));
113
114
  const [runnableConfig, callOptions] = this._separateRunnableConfigFromCallOptions(parsedOptions);
114
115
  // create callback manager and start run
115
116
  const callbackManager_ = await manager_js_1.CallbackManager.configure(runnableConfig.callbacks ?? callbacks, this.callbacks, runnableConfig.tags, this.tags, runnableConfig.metadata, this.metadata, { verbose: this.verbose });
@@ -117,9 +118,9 @@ class BaseChatModel extends index_js_2.BaseLanguageModel {
117
118
  options: callOptions,
118
119
  invocation_params: this?.invocationParams(parsedOptions),
119
120
  };
120
- const runManagers = await callbackManager_?.handleChatModelStart(this.toJSON(), messages, undefined, undefined, extra);
121
+ const runManagers = await callbackManager_?.handleChatModelStart(this.toJSON(), baseMessages, undefined, undefined, extra);
121
122
  // generate results
122
- const results = await Promise.allSettled(messages.map((messageList, i) => this._generate(messageList, { ...callOptions, promptIndex: i }, runManagers?.[i])));
123
+ const results = await Promise.allSettled(baseMessages.map((messageList, i) => this._generate(messageList, { ...callOptions, promptIndex: i }, runManagers?.[i])));
123
124
  // handle results
124
125
  const generations = [];
125
126
  const llmOutputs = [];
@@ -183,7 +184,7 @@ class BaseChatModel extends index_js_2.BaseLanguageModel {
183
184
  * @returns A Promise that resolves to a BaseMessage.
184
185
  */
185
186
  async call(messages, options, callbacks) {
186
- const result = await this.generate([messages], options, callbacks);
187
+ const result = await this.generate([messages.map(index_js_1.coerceMessageLikeToMessage)], options, callbacks);
187
188
  const generations = result.generations;
188
189
  return generations[0][0].message;
189
190
  }
@@ -1,4 +1,4 @@
1
- import { BaseMessage, BasePromptValue, ChatResult, BaseMessageChunk, LLMResult, ChatGenerationChunk } from "../schema/index.js";
1
+ import { BaseMessage, BasePromptValue, ChatResult, BaseMessageChunk, LLMResult, ChatGenerationChunk, BaseMessageLike } from "../schema/index.js";
2
2
  import { BaseLanguageModel, BaseLanguageModelCallOptions, BaseLanguageModelInput, BaseLanguageModelParams } from "../base_language/index.js";
3
3
  import { CallbackManagerForLLMRun, Callbacks } from "../callbacks/manager.js";
4
4
  import { RunnableConfig } from "../schema/runnable.js";
@@ -56,7 +56,7 @@ export declare abstract class BaseChatModel<CallOptions extends BaseChatModelCal
56
56
  * @param callbacks The callbacks for the language model.
57
57
  * @returns A Promise that resolves to an LLMResult.
58
58
  */
59
- generate(messages: BaseMessage[][], options?: string[] | CallOptions, callbacks?: Callbacks): Promise<LLMResult>;
59
+ generate(messages: BaseMessageLike[][], options?: string[] | CallOptions, callbacks?: Callbacks): Promise<LLMResult>;
60
60
  /**
61
61
  * Get the parameters used to invoke the model
62
62
  */
@@ -79,7 +79,7 @@ export declare abstract class BaseChatModel<CallOptions extends BaseChatModelCal
79
79
  * @param callbacks The callbacks for the language model.
80
80
  * @returns A Promise that resolves to a BaseMessage.
81
81
  */
82
- call(messages: BaseMessage[], options?: string[] | CallOptions, callbacks?: Callbacks): Promise<BaseMessage>;
82
+ call(messages: BaseMessageLike[], options?: string[] | CallOptions, callbacks?: Callbacks): Promise<BaseMessage>;
83
83
  /**
84
84
  * Makes a single call to the chat model with a prompt value.
85
85
  * @param promptValue The value of the prompt.
@@ -1,4 +1,4 @@
1
- import { AIMessage, HumanMessage, RUN_KEY, } from "../schema/index.js";
1
+ import { AIMessage, HumanMessage, RUN_KEY, coerceMessageLikeToMessage, } from "../schema/index.js";
2
2
  import { BaseLanguageModel, } from "../base_language/index.js";
3
3
  import { CallbackManager, } from "../callbacks/manager.js";
4
4
  /**
@@ -106,6 +106,7 @@ export class BaseChatModel extends BaseLanguageModel {
106
106
  else {
107
107
  parsedOptions = options;
108
108
  }
109
+ const baseMessages = messages.map((messageList) => messageList.map(coerceMessageLikeToMessage));
109
110
  const [runnableConfig, callOptions] = this._separateRunnableConfigFromCallOptions(parsedOptions);
110
111
  // create callback manager and start run
111
112
  const callbackManager_ = await CallbackManager.configure(runnableConfig.callbacks ?? callbacks, this.callbacks, runnableConfig.tags, this.tags, runnableConfig.metadata, this.metadata, { verbose: this.verbose });
@@ -113,9 +114,9 @@ export class BaseChatModel extends BaseLanguageModel {
113
114
  options: callOptions,
114
115
  invocation_params: this?.invocationParams(parsedOptions),
115
116
  };
116
- const runManagers = await callbackManager_?.handleChatModelStart(this.toJSON(), messages, undefined, undefined, extra);
117
+ const runManagers = await callbackManager_?.handleChatModelStart(this.toJSON(), baseMessages, undefined, undefined, extra);
117
118
  // generate results
118
- const results = await Promise.allSettled(messages.map((messageList, i) => this._generate(messageList, { ...callOptions, promptIndex: i }, runManagers?.[i])));
119
+ const results = await Promise.allSettled(baseMessages.map((messageList, i) => this._generate(messageList, { ...callOptions, promptIndex: i }, runManagers?.[i])));
119
120
  // handle results
120
121
  const generations = [];
121
122
  const llmOutputs = [];
@@ -179,7 +180,7 @@ export class BaseChatModel extends BaseLanguageModel {
179
180
  * @returns A Promise that resolves to a BaseMessage.
180
181
  */
181
182
  async call(messages, options, callbacks) {
182
- const result = await this.generate([messages], options, callbacks);
183
+ const result = await this.generate([messages.map(coerceMessageLikeToMessage)], options, callbacks);
183
184
  const generations = result.generations;
184
185
  return generations[0][0].message;
185
186
  }
@@ -1,8 +1,9 @@
1
+ import type { OpenAI as OpenAIClient } from "openai";
1
2
  import { BaseChatModel, BaseChatModelParams } from "./base.js";
2
3
  import { BaseMessage, ChatResult } from "../schema/index.js";
3
4
  import { CallbackManagerForLLMRun } from "../callbacks/manager.js";
4
5
  import { StructuredTool } from "../tools/index.js";
5
- import { BaseLanguageModelCallOptions } from "../base_language/index.js";
6
+ import { BaseFunctionCallOptions } from "../base_language/index.js";
6
7
  /**
7
8
  * Type representing the sender_type of a message in the Minimax chat model.
8
9
  */
@@ -15,28 +16,6 @@ interface MinimaxChatCompletionRequestMessage {
15
16
  sender_name?: string;
16
17
  text: string;
17
18
  }
18
- export interface MinimaxChatCompletionRequestFunctions {
19
- /**
20
- * The name of the function to be called. Must be a-z, A-Z, 0-9, or contain underscores and dashes, with a maximum length of 64.
21
- * @type {string}
22
- * @memberof MinimaxChatCompletionRequestFunctions
23
- */
24
- name: string;
25
- /**
26
- * The description of what the function does.
27
- * @type {string}
28
- * @memberof MinimaxChatCompletionRequestFunctions
29
- */
30
- description?: string;
31
- /**
32
- * The parameters the functions accepts, described as a JSON Schema object.
33
- * @type {{ [key: string]: any; }}
34
- * @memberof MinimaxChatCompletionRequestFunctions
35
- */
36
- parameters?: {
37
- [key: string]: any;
38
- };
39
- }
40
19
  /**
41
20
  * Interface representing a request for a chat completion.
42
21
  */
@@ -58,9 +37,9 @@ interface MinimaxChatCompletionRequest {
58
37
  sample_messages?: MinimaxChatCompletionRequestMessage[];
59
38
  /**
60
39
  * A list of functions the model may generate JSON inputs for.
61
- * @type {Array<MinimaxChatCompletionRequestFunctions>}
40
+ * @type {Array<OpenAIClient.Chat.ChatCompletionCreateParams.Function[]>}
62
41
  */
63
- functions?: Array<MinimaxChatCompletionRequestFunctions>;
42
+ functions?: OpenAIClient.Chat.ChatCompletionCreateParams.Function[];
64
43
  plugins?: string[];
65
44
  }
66
45
  interface RoleMeta {
@@ -191,8 +170,7 @@ declare interface MinimaxChatInputPro extends MinimaxChatInputBase {
191
170
  replyConstraints?: ReplyConstraints;
192
171
  }
193
172
  type MinimaxChatInput = MinimaxChatInputNormal & MinimaxChatInputPro;
194
- export interface ChatMinimaxCallOptions extends BaseLanguageModelCallOptions {
195
- functions?: MinimaxChatCompletionRequestFunctions[];
173
+ export interface ChatMinimaxCallOptions extends BaseFunctionCallOptions {
196
174
  tools?: StructuredTool[];
197
175
  defaultUserName?: string;
198
176
  defaultBotName?: string;
@@ -252,7 +230,7 @@ export declare class ChatMinimax extends BaseChatModel<ChatMinimaxCallOptions> i
252
230
  identifyingParams(): {
253
231
  prompt?: string | undefined;
254
232
  stream?: boolean | undefined;
255
- functions?: MinimaxChatCompletionRequestFunctions[] | undefined;
233
+ functions?: OpenAIClient.Chat.Completions.ChatCompletionCreateParams.Function[] | undefined;
256
234
  model: string;
257
235
  temperature?: number | undefined;
258
236
  top_p?: number | undefined;
@@ -325,6 +325,7 @@ class ChatOpenAI extends base_js_1.BaseChatModel {
325
325
  if (!this.azureOpenAIApiVersion) {
326
326
  throw new Error("Azure OpenAI API version not found");
327
327
  }
328
+ this.openAIApiKey = this.openAIApiKey ?? "";
328
329
  }
329
330
  this.clientConfig = {
330
331
  apiKey: this.openAIApiKey,
@@ -4,6 +4,7 @@ import { BaseMessage, ChatGenerationChunk, ChatResult } from "../schema/index.js
4
4
  import { StructuredTool } from "../tools/base.js";
5
5
  import { AzureOpenAIInput, OpenAICallOptions, OpenAIChatInput, OpenAICoreRequestOptions, LegacyOpenAIInput } from "../types/openai-types.js";
6
6
  import { BaseChatModel, BaseChatModelParams } from "./base.js";
7
+ import { BaseFunctionCallOptions } from "../base_language/index.js";
7
8
  export { AzureOpenAIInput, OpenAICallOptions, OpenAIChatInput };
8
9
  interface TokenUsage {
9
10
  completionTokens?: number;
@@ -13,9 +14,7 @@ interface TokenUsage {
13
14
  interface OpenAILLMOutput {
14
15
  tokenUsage: TokenUsage;
15
16
  }
16
- export interface ChatOpenAICallOptions extends OpenAICallOptions {
17
- function_call?: OpenAIClient.Chat.ChatCompletionCreateParams.FunctionCallOption;
18
- functions?: OpenAIClient.Chat.ChatCompletionCreateParams.Function[];
17
+ export interface ChatOpenAICallOptions extends OpenAICallOptions, BaseFunctionCallOptions {
19
18
  tools?: StructuredTool[];
20
19
  promptIndex?: number;
21
20
  }
@@ -322,6 +322,7 @@ export class ChatOpenAI extends BaseChatModel {
322
322
  if (!this.azureOpenAIApiVersion) {
323
323
  throw new Error("Azure OpenAI API version not found");
324
324
  }
325
+ this.openAIApiKey = this.openAIApiKey ?? "";
325
326
  }
326
327
  this.clientConfig = {
327
328
  apiKey: this.openAIApiKey,
@@ -0,0 +1,32 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.OpenAIWhisperAudio = void 0;
4
+ const openai_1 = require("openai");
5
+ const document_js_1 = require("../../document.cjs");
6
+ const buffer_js_1 = require("./buffer.cjs");
7
+ const MODEL_NAME = "whisper-1";
8
+ class OpenAIWhisperAudio extends buffer_js_1.BufferLoader {
9
+ constructor(filePathOrBlob, fields) {
10
+ super(filePathOrBlob);
11
+ Object.defineProperty(this, "openAIClient", {
12
+ enumerable: true,
13
+ configurable: true,
14
+ writable: true,
15
+ value: void 0
16
+ });
17
+ this.openAIClient = new openai_1.OpenAI(fields?.clientOptions);
18
+ }
19
+ async parse(raw, metadata) {
20
+ const fileName = metadata.source === "blob" ? metadata.blobType : metadata.source;
21
+ const transcriptionResponse = await this.openAIClient.audio.transcriptions.create({
22
+ file: await (0, openai_1.toFile)(raw, fileName),
23
+ model: MODEL_NAME,
24
+ });
25
+ const document = new document_js_1.Document({
26
+ pageContent: transcriptionResponse.text,
27
+ metadata,
28
+ });
29
+ return [document];
30
+ }
31
+ }
32
+ exports.OpenAIWhisperAudio = OpenAIWhisperAudio;
@@ -0,0 +1,11 @@
1
+ /// <reference types="node" resolution-mode="require"/>
2
+ import { type ClientOptions } from "openai";
3
+ import { Document } from "../../document.js";
4
+ import { BufferLoader } from "./buffer.js";
5
+ export declare class OpenAIWhisperAudio extends BufferLoader {
6
+ private readonly openAIClient;
7
+ constructor(filePathOrBlob: string | Blob, fields?: {
8
+ clientOptions?: ClientOptions;
9
+ });
10
+ protected parse(raw: Buffer, metadata: Record<string, string>): Promise<Document[]>;
11
+ }
@@ -0,0 +1,28 @@
1
+ import { OpenAI as OpenAIClient, toFile } from "openai";
2
+ import { Document } from "../../document.js";
3
+ import { BufferLoader } from "./buffer.js";
4
+ const MODEL_NAME = "whisper-1";
5
+ export class OpenAIWhisperAudio extends BufferLoader {
6
+ constructor(filePathOrBlob, fields) {
7
+ super(filePathOrBlob);
8
+ Object.defineProperty(this, "openAIClient", {
9
+ enumerable: true,
10
+ configurable: true,
11
+ writable: true,
12
+ value: void 0
13
+ });
14
+ this.openAIClient = new OpenAIClient(fields?.clientOptions);
15
+ }
16
+ async parse(raw, metadata) {
17
+ const fileName = metadata.source === "blob" ? metadata.blobType : metadata.source;
18
+ const transcriptionResponse = await this.openAIClient.audio.transcriptions.create({
19
+ file: await toFile(raw, fileName),
20
+ model: MODEL_NAME,
21
+ });
22
+ const document = new Document({
23
+ pageContent: transcriptionResponse.text,
24
+ metadata,
25
+ });
26
+ return [document];
27
+ }
28
+ }
@@ -28,8 +28,20 @@ function isBinaryPath(name) {
28
28
  * loading files from a GitHub repository.
29
29
  */
30
30
  class GithubRepoLoader extends base_js_1.BaseDocumentLoader {
31
- constructor(githubUrl, { accessToken = (0, env_js_1.getEnvironmentVariable)("GITHUB_ACCESS_TOKEN"), branch = "main", recursive = true, unknown = directory_js_1.UnknownHandling.Warn, ignoreFiles = [], ignorePaths, verbose = false, maxConcurrency = 2, maxRetries = 2, ...rest } = {}) {
31
+ constructor(githubUrl, { accessToken = (0, env_js_1.getEnvironmentVariable)("GITHUB_ACCESS_TOKEN"), baseUrl = "https://github.com", apiUrl = "https://api.github.com", branch = "main", recursive = true, processSubmodules = false, unknown = directory_js_1.UnknownHandling.Warn, ignoreFiles = [], ignorePaths, verbose = false, maxConcurrency = 2, maxRetries = 2, ...rest } = {}) {
32
32
  super();
33
+ Object.defineProperty(this, "baseUrl", {
34
+ enumerable: true,
35
+ configurable: true,
36
+ writable: true,
37
+ value: void 0
38
+ });
39
+ Object.defineProperty(this, "apiUrl", {
40
+ enumerable: true,
41
+ configurable: true,
42
+ writable: true,
43
+ value: void 0
44
+ });
33
45
  Object.defineProperty(this, "owner", {
34
46
  enumerable: true,
35
47
  configurable: true,
@@ -66,6 +78,12 @@ class GithubRepoLoader extends base_js_1.BaseDocumentLoader {
66
78
  writable: true,
67
79
  value: void 0
68
80
  });
81
+ Object.defineProperty(this, "processSubmodules", {
82
+ enumerable: true,
83
+ configurable: true,
84
+ writable: true,
85
+ value: void 0
86
+ });
69
87
  Object.defineProperty(this, "unknown", {
70
88
  enumerable: true,
71
89
  configurable: true,
@@ -96,22 +114,55 @@ class GithubRepoLoader extends base_js_1.BaseDocumentLoader {
96
114
  writable: true,
97
115
  value: void 0
98
116
  });
117
+ Object.defineProperty(this, "maxConcurrency", {
118
+ enumerable: true,
119
+ configurable: true,
120
+ writable: true,
121
+ value: void 0
122
+ });
123
+ Object.defineProperty(this, "maxRetries", {
124
+ enumerable: true,
125
+ configurable: true,
126
+ writable: true,
127
+ value: void 0
128
+ });
99
129
  Object.defineProperty(this, "caller", {
100
130
  enumerable: true,
101
131
  configurable: true,
102
132
  writable: true,
103
133
  value: void 0
104
134
  });
135
+ Object.defineProperty(this, "ignorePaths", {
136
+ enumerable: true,
137
+ configurable: true,
138
+ writable: true,
139
+ value: void 0
140
+ });
141
+ Object.defineProperty(this, "submoduleInfos", {
142
+ enumerable: true,
143
+ configurable: true,
144
+ writable: true,
145
+ value: void 0
146
+ });
147
+ this.baseUrl = baseUrl;
148
+ this.apiUrl = apiUrl;
105
149
  const { owner, repo, path } = this.extractOwnerAndRepoAndPath(githubUrl);
106
150
  this.owner = owner;
107
151
  this.repo = repo;
108
152
  this.initialPath = path;
109
153
  this.branch = branch;
110
154
  this.recursive = recursive;
155
+ // processing submodules without processing contents of other directories makes no sense
156
+ if (processSubmodules && !recursive) {
157
+ throw new Error(`Input property "recursive" must be true if "processSubmodules" is true.`);
158
+ }
159
+ this.processSubmodules = processSubmodules;
111
160
  this.unknown = unknown;
112
161
  this.accessToken = accessToken;
113
162
  this.ignoreFiles = ignoreFiles;
114
163
  this.verbose = verbose;
164
+ this.maxConcurrency = maxConcurrency;
165
+ this.maxRetries = maxRetries;
115
166
  this.headers = {
116
167
  "User-Agent": "langchain",
117
168
  };
@@ -120,6 +171,7 @@ class GithubRepoLoader extends base_js_1.BaseDocumentLoader {
120
171
  maxRetries,
121
172
  ...rest,
122
173
  });
174
+ this.ignorePaths = ignorePaths;
123
175
  if (ignorePaths) {
124
176
  this.ignore = ignore_1.default.default().add(ignorePaths);
125
177
  }
@@ -136,7 +188,7 @@ class GithubRepoLoader extends base_js_1.BaseDocumentLoader {
136
188
  * @returns An object containing the owner, repository, and path extracted from the GitHub URL.
137
189
  */
138
190
  extractOwnerAndRepoAndPath(url) {
139
- const match = url.match(/https:\/\/github.com\/([^/]+)\/([^/]+)(\/tree\/[^/]+\/(.+))?/i);
191
+ const match = url.match(new RegExp(`${this.baseUrl}/([^/]+)/([^/]+)(/tree/[^/]+/(.+))?`, "i"));
140
192
  if (!match) {
141
193
  throw new Error("Invalid GitHub URL format.");
142
194
  }
@@ -149,10 +201,127 @@ class GithubRepoLoader extends base_js_1.BaseDocumentLoader {
149
201
  * @returns A promise that resolves to an array of Document instances.
150
202
  */
151
203
  async load() {
152
- return (await this.processRepo()).map((fileResponse) => new document_js_1.Document({
204
+ this.log(`Loading documents from ${this.baseUrl}/${this.owner}/${this.repo}/${this.initialPath}...`);
205
+ // process repository without submodules
206
+ const documents = (await this.processRepo()).map((fileResponse) => new document_js_1.Document({
153
207
  pageContent: fileResponse.contents,
154
208
  metadata: fileResponse.metadata,
155
209
  }));
210
+ if (this.processSubmodules) {
211
+ // process submodules
212
+ await this.getSubmoduleInfo();
213
+ for (const submoduleInfo of this.submoduleInfos) {
214
+ documents.push(...(await this.loadSubmodule(submoduleInfo)));
215
+ }
216
+ }
217
+ return documents;
218
+ }
219
+ /**
220
+ * Loads the information about Git submodules from the repository, if available.
221
+ */
222
+ async getSubmoduleInfo() {
223
+ this.log("Loading info about submodules...");
224
+ // we have to fetch the files of the root directory to get the download url of the .gitmodules file
225
+ // however, we cannot reuse the files retrieved in processRepo() as initialPath may be != ""
226
+ // so it may be that we end up fetching this file list twice
227
+ const repoFiles = await this.fetchRepoFiles("");
228
+ const gitmodulesFile = repoFiles.filter(({ name }) => name === ".gitmodules")?.[0];
229
+ if (gitmodulesFile) {
230
+ const gitmodulesContent = await this.fetchFileContent({
231
+ download_url: gitmodulesFile.download_url,
232
+ });
233
+ this.submoduleInfos = await this.parseGitmodules(gitmodulesContent);
234
+ }
235
+ else {
236
+ this.submoduleInfos = [];
237
+ }
238
+ this.log(`Found ${this.submoduleInfos.length} submodules:`);
239
+ for (const submoduleInfo of this.submoduleInfos) {
240
+ this.log(JSON.stringify(submoduleInfo));
241
+ }
242
+ }
243
+ /**
244
+ * Parses the given content of a .gitmodules file. Furthermore, queries the current SHA ref of all submodules.
245
+ * Returns the submodule information as array.
246
+ * @param gitmodulesContent the content of a .gitmodules file
247
+ */
248
+ async parseGitmodules(gitmodulesContent) {
249
+ // catches the initial line of submodule entries
250
+ const submodulePattern = /\[submodule "(.*?)"]\n((\s+.*?\s*=\s*.*?\n)*)/g;
251
+ // catches the properties of a submodule
252
+ const keyValuePattern = /\s+(.*?)\s*=\s*(.*?)\s/g;
253
+ const submoduleInfos = [];
254
+ for (const [, name, propertyLines] of gitmodulesContent.matchAll(submodulePattern)) {
255
+ if (!name || !propertyLines) {
256
+ throw new Error("Could not parse submodule entry");
257
+ }
258
+ const submodulePropertyLines = propertyLines.matchAll(keyValuePattern);
259
+ let path;
260
+ let url;
261
+ for (const [, key, value] of submodulePropertyLines) {
262
+ if (!key || !value) {
263
+ throw new Error(`Could not parse key/value pairs for submodule ${name}`);
264
+ }
265
+ switch (key) {
266
+ case "path":
267
+ path = value;
268
+ break;
269
+ case "url":
270
+ url = value;
271
+ if (url.endsWith(".git")) {
272
+ url = url.substring(0, url.length - 4);
273
+ }
274
+ break;
275
+ default:
276
+ // ignoring unused keys
277
+ }
278
+ }
279
+ if (!path || !url) {
280
+ throw new Error(`Missing properties for submodule ${name}`);
281
+ }
282
+ // fetch the current ref of the submodule
283
+ const files = await this.fetchRepoFiles(path);
284
+ const submoduleInfo = {
285
+ name,
286
+ path,
287
+ url,
288
+ ref: files[0].sha,
289
+ };
290
+ submoduleInfos.push(submoduleInfo);
291
+ }
292
+ return submoduleInfos;
293
+ }
294
+ /**
295
+ * Loads the documents of the given submodule. Uses the same parameters as for the current repository.
296
+ * External submodules, i.e. submodules pointing to another GitHub instance, are ignored.
297
+ * @param submoduleInfo the info about the submodule to be loaded
298
+ */
299
+ async loadSubmodule(submoduleInfo) {
300
+ if (!submoduleInfo.url.startsWith(this.baseUrl)) {
301
+ this.log(`Ignoring external submodule ${submoduleInfo.url}.`);
302
+ return [];
303
+ }
304
+ else if (!submoduleInfo.path.startsWith(this.initialPath)) {
305
+ this.log(`Ignoring submodule ${submoduleInfo.url}, as it is not on initial path.`);
306
+ return [];
307
+ }
308
+ else {
309
+ this.log(`Accessing submodule ${submoduleInfo.name} (${submoduleInfo.url})...`);
310
+ return new GithubRepoLoader(submoduleInfo.url, {
311
+ accessToken: this.accessToken,
312
+ apiUrl: this.apiUrl,
313
+ baseUrl: this.baseUrl,
314
+ branch: submoduleInfo.ref,
315
+ recursive: this.recursive,
316
+ processSubmodules: this.processSubmodules,
317
+ unknown: this.unknown,
318
+ ignoreFiles: this.ignoreFiles,
319
+ ignorePaths: this.ignorePaths,
320
+ verbose: this.verbose,
321
+ maxConcurrency: this.maxConcurrency,
322
+ maxRetries: this.maxRetries,
323
+ }).load();
324
+ }
156
325
  }
157
326
  /**
158
327
  * Determines whether a file or directory should be ignored based on its
@@ -192,7 +361,11 @@ class GithubRepoLoader extends base_js_1.BaseDocumentLoader {
192
361
  });
193
362
  return {
194
363
  contents: fileContent || "",
195
- metadata: { source: file.path },
364
+ metadata: {
365
+ source: file.path,
366
+ repository: `${this.baseUrl}/${this.owner}/${this.repo}`,
367
+ branch: this.branch,
368
+ },
196
369
  };
197
370
  }
198
371
  /**
@@ -203,19 +376,24 @@ class GithubRepoLoader extends base_js_1.BaseDocumentLoader {
203
376
  // Directories have nested files / directories, which is why this is a list of promises of promises
204
377
  const currentDirectoryDirectoryPromises = [];
205
378
  for (const file of files) {
206
- if (!this.shouldIgnore(file.path, file.type)) {
207
- if (file.type !== "dir") {
208
- try {
209
- currentDirectoryFilePromises.push(this.fetchFileContentWrapper(file));
210
- }
211
- catch (e) {
212
- this.handleError(`Failed to fetch file content: ${file.path}, ${e}`);
213
- }
379
+ if (this.shouldIgnore(file.path, file.type)) {
380
+ continue;
381
+ }
382
+ if (file.type === "file" && file.size === 0) {
383
+ // this is a submodule. ignoring for the moment. submodule processing is done separately
384
+ continue;
385
+ }
386
+ if (file.type !== "dir") {
387
+ try {
388
+ currentDirectoryFilePromises.push(this.fetchFileContentWrapper(file));
214
389
  }
215
- else if (this.recursive) {
216
- currentDirectoryDirectoryPromises.push(this.processDirectory(file.path));
390
+ catch (e) {
391
+ this.handleError(`Failed to fetch file content: ${file.path}, ${e}`);
217
392
  }
218
393
  }
394
+ else if (this.recursive) {
395
+ currentDirectoryDirectoryPromises.push(this.processDirectory(file.path));
396
+ }
219
397
  }
220
398
  const curDirDirectories = await Promise.all(currentDirectoryDirectoryPromises);
221
399
  return [...currentDirectoryFilePromises, ...curDirDirectories.flat()];
@@ -254,24 +432,25 @@ class GithubRepoLoader extends base_js_1.BaseDocumentLoader {
254
432
  }
255
433
  /**
256
434
  * Fetches the files from a GitHub repository.
435
+ * If the path denotes a single file, the resulting array contains only one element.
257
436
  * @param path The path of the repository to fetch the files from.
258
437
  * @returns A promise that resolves to an array of GithubFile instances.
259
438
  */
260
439
  async fetchRepoFiles(path) {
261
- const url = `https://api.github.com/repos/${this.owner}/${this.repo}/contents/${path}?ref=${this.branch}`;
440
+ const url = `${this.apiUrl}/repos/${this.owner}/${this.repo}/contents/${path}?ref=${this.branch}`;
262
441
  return this.caller.call(async () => {
263
- if (this.verbose) {
264
- console.log("Fetching", url);
265
- }
442
+ this.log(`Fetching ${url}`);
266
443
  const response = await fetch(url, { headers: this.headers });
267
444
  const data = await response.json();
268
445
  if (!response.ok) {
269
446
  throw new Error(`Unable to fetch repository files: ${response.status} ${JSON.stringify(data)}`);
270
447
  }
271
- if (!Array.isArray(data)) {
272
- throw new Error("Unable to fetch repository files.");
448
+ if (Array.isArray(data)) {
449
+ return data;
450
+ }
451
+ else {
452
+ return [data];
273
453
  }
274
- return data;
275
454
  });
276
455
  }
277
456
  /**
@@ -281,9 +460,7 @@ class GithubRepoLoader extends base_js_1.BaseDocumentLoader {
281
460
  */
282
461
  async fetchFileContent(file) {
283
462
  return this.caller.call(async () => {
284
- if (this.verbose) {
285
- console.log("Fetching", file.download_url);
286
- }
463
+ this.log(`Fetching ${file.download_url}`);
287
464
  const response = await fetch(file.download_url, {
288
465
  headers: this.headers,
289
466
  });
@@ -308,5 +485,14 @@ class GithubRepoLoader extends base_js_1.BaseDocumentLoader {
308
485
  throw new Error(`Unknown unknown handling: ${this.unknown}`);
309
486
  }
310
487
  }
488
+ /**
489
+ * Logs the given message to the console, if parameter 'verbose' is set to true.
490
+ * @param message the message to be logged.
491
+ */
492
+ log(message) {
493
+ if (this.verbose) {
494
+ console.log(message);
495
+ }
496
+ }
311
497
  }
312
498
  exports.GithubRepoLoader = GithubRepoLoader;