vectra 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -3
- package/bin/vectra.js +3 -0
- package/lib/GPT3Tokenizer.d.ts +9 -0
- package/lib/GPT3Tokenizer.d.ts.map +1 -0
- package/lib/GPT3Tokenizer.js +17 -0
- package/lib/GPT3Tokenizer.js.map +1 -0
- package/lib/ItemSelector.d.ts +41 -0
- package/lib/ItemSelector.d.ts.map +1 -0
- package/lib/ItemSelector.js +156 -0
- package/lib/ItemSelector.js.map +1 -0
- package/lib/LocalDocument.d.ts +16 -0
- package/lib/LocalDocument.d.ts.map +1 -0
- package/lib/LocalDocument.js +99 -0
- package/lib/LocalDocument.js.map +1 -0
- package/lib/LocalDocumentIndex.d.ts +48 -0
- package/lib/LocalDocumentIndex.d.ts.map +1 -0
- package/lib/LocalDocumentIndex.js +367 -0
- package/lib/LocalDocumentIndex.js.map +1 -0
- package/lib/LocalDocumentResult.d.ts +12 -0
- package/lib/LocalDocumentResult.d.ts.map +1 -0
- package/lib/LocalDocumentResult.js +186 -0
- package/lib/LocalDocumentResult.js.map +1 -0
- package/lib/LocalIndex.d.ts +130 -0
- package/lib/LocalIndex.d.ts.map +1 -0
- package/lib/LocalIndex.js +405 -0
- package/lib/LocalIndex.js.map +1 -0
- package/lib/OpenAIEmbeddings.d.ts +98 -0
- package/lib/OpenAIEmbeddings.d.ts.map +1 -0
- package/lib/OpenAIEmbeddings.js +139 -0
- package/lib/OpenAIEmbeddings.js.map +1 -0
- package/lib/TextSplitter.d.ts +17 -0
- package/lib/TextSplitter.d.ts.map +1 -0
- package/lib/TextSplitter.js +460 -0
- package/lib/TextSplitter.js.map +1 -0
- package/lib/WebFetcher.d.ts +16 -0
- package/lib/WebFetcher.d.ts.map +1 -0
- package/lib/WebFetcher.js +144 -0
- package/lib/WebFetcher.js.map +1 -0
- package/lib/index.d.ts +11 -0
- package/lib/index.d.ts.map +1 -0
- package/lib/index.js +27 -0
- package/lib/index.js.map +1 -0
- package/lib/internals/Colorize.d.ts +14 -0
- package/lib/internals/Colorize.d.ts.map +1 -0
- package/lib/internals/Colorize.js +64 -0
- package/lib/internals/Colorize.js.map +1 -0
- package/lib/internals/index.d.ts +3 -0
- package/lib/internals/index.d.ts.map +1 -0
- package/lib/internals/index.js +19 -0
- package/lib/internals/index.js.map +1 -0
- package/lib/internals/types.d.ts +42 -0
- package/lib/internals/types.d.ts.map +1 -0
- package/lib/internals/types.js +3 -0
- package/lib/internals/types.js.map +1 -0
- package/lib/types.d.ts +133 -0
- package/lib/types.d.ts.map +1 -0
- package/lib/types.js +3 -0
- package/lib/types.js.map +1 -0
- package/lib/vectra-cli.d.ts +2 -0
- package/lib/vectra-cli.d.ts.map +1 -0
- package/lib/vectra-cli.js +276 -0
- package/lib/vectra-cli.js.map +1 -0
- package/package.json +21 -3
- package/src/GPT3Tokenizer.ts +15 -0
- package/src/ItemSelector.ts +9 -9
- package/src/LocalDocument.ts +70 -0
- package/src/LocalDocumentIndex.ts +355 -0
- package/src/LocalDocumentResult.ts +206 -0
- package/src/LocalIndex.ts +12 -78
- package/src/OpenAIEmbeddings.ts +205 -0
- package/src/TextSplitter.ts +480 -0
- package/src/WebFetcher.ts +128 -0
- package/src/index.ts +8 -0
- package/src/internals/Colorize.ts +64 -0
- package/src/internals/index.ts +2 -0
- package/src/internals/types.ts +46 -0
- package/src/types.ts +160 -0
- package/src/vectra-cli.ts +238 -0
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
import axios, { AxiosInstance, AxiosResponse, AxiosRequestConfig } from 'axios';
|
|
2
|
+
import { EmbeddingsModel, EmbeddingsResponse } from "./types";
|
|
3
|
+
import { CreateEmbeddingRequest, CreateEmbeddingResponse, OpenAICreateEmbeddingRequest } from "./internals";
|
|
4
|
+
|
|
5
|
+
/**
 * Settings shared by the OpenAI and Azure OpenAI embeddings configurations.
 */
export interface BaseOpenAIEmbeddingsOptions {
    /**
     * Optional. Axios request settings to apply when calling the OpenAI API.
     */
    requestConfig?: AxiosRequestConfig;

    /**
     * Optional. Delays, in milliseconds, to wait before each successive retry of a call
     * to the OpenAI API.
     * @remarks
     * When omitted, `[2000, 5000]` is used: the first retry happens after 2 seconds and
     * the second retry after 5 seconds.
     */
    retryPolicy?: Array<number>;
}
|
|
19
|
+
|
|
20
|
+
/**
 * Options for configuring an `OpenAIEmbeddings` to generate embeddings using an OpenAI hosted model.
 */
export interface OpenAIEmbeddingsOptions extends BaseOpenAIEmbeddingsOptions {
    /**
     * API key to use when calling the OpenAI API.
     * @remarks
     * A new API key can be created at https://platform.openai.com/account/api-keys.
     * The key is sent as a `Bearer` token in the `Authorization` header.
     */
    apiKey: string;

    /**
     * Name of the model used to generate embeddings.
     * @remarks
     * The value is sent as the `model` field of every embeddings request.
     */
    model: string;

    /**
     * Optional. Organization to use when calling the OpenAI API.
     * @remarks
     * When set, the value is sent in the `OpenAI-Organization` header.
     */
    organization?: string;

    /**
     * Optional. Base URL of the OpenAI API.
     * @remarks
     * Defaults to `https://api.openai.com`. The path `/v1/embeddings` is appended to it, so
     * the value should not end with a trailing slash.
     */
    endpoint?: string;
}
|
|
50
|
+
|
|
51
|
+
/**
 * Configuration for an `OpenAIEmbeddings` instance that generates embeddings with a model
 * hosted on Azure OpenAI.
 */
export interface AzureOpenAIEmbeddingsOptions extends BaseOpenAIEmbeddingsOptions {
    /**
     * Key used to authenticate requests made to Azure OpenAI.
     */
    azureApiKey: string;

    /**
     * Endpoint of the Azure OpenAI deployment to call.
     */
    azureEndpoint: string;

    /**
     * Azure OpenAI deployment (model) name to use.
     */
    azureDeployment: string;

    /**
     * Optional. API version to call. `2023-05-15` is used when omitted.
     */
    azureApiVersion?: string;
}
|
|
75
|
+
|
|
76
|
+
/**
 * An `EmbeddingsModel` that generates embeddings by calling OpenAI or Azure OpenAI hosted models.
 * @remarks
 * Azure OpenAI is used when the supplied options contain an `azureApiKey`; otherwise the
 * OpenAI API is called. Requests answered with HTTP status 429 are retried according to the
 * configured `retryPolicy`.
 */
export class OpenAIEmbeddings implements EmbeddingsModel {
    private readonly _httpClient: AxiosInstance;
    private readonly _useAzure: boolean;

    private readonly UserAgent = 'AlphaWave';

    /**
     * Options the client was configured with (defaults applied).
     */
    public readonly options: OpenAIEmbeddingsOptions|AzureOpenAIEmbeddingsOptions;

    /**
     * Creates a new `OpenAIEmbeddings` instance.
     * @param options Options for configuring an `OpenAIEmbeddings`.
     * @throws Error when Azure options are supplied and `azureEndpoint` is not an HTTPS url.
     */
    public constructor(options: OpenAIEmbeddingsOptions|AzureOpenAIEmbeddingsOptions) {
        // Check for azure config
        if ((options as AzureOpenAIEmbeddingsOptions).azureApiKey) {
            this._useAzure = true;
            const azureOptions = Object.assign({
                retryPolicy: [2000, 5000],
                azureApiVersion: '2023-05-15',
            }, options) as AzureOpenAIEmbeddingsOptions;

            // Cleanup and validate endpoint: trim whitespace and drop a single trailing slash
            let endpoint = azureOptions.azureEndpoint.trim();
            if (endpoint.endsWith('/')) {
                endpoint = endpoint.substring(0, endpoint.length - 1);
            }

            if (!endpoint.toLowerCase().startsWith('https://')) {
                throw new Error(`Client created with an invalid endpoint of '${endpoint}'. The endpoint must be a valid HTTPS url.`);
            }

            azureOptions.azureEndpoint = endpoint;
            this.options = azureOptions;
        } else {
            this._useAzure = false;
            this.options = Object.assign({
                retryPolicy: [2000, 5000]
            }, options) as OpenAIEmbeddingsOptions;
        }

        // Create client. A 429 must not make axios throw so that `post()` can apply the
        // retry policy and `createEmbeddings()` can report `rate_limited`.
        this._httpClient = axios.create({
            validateStatus: (status) => status < 400 || status === 429
        });
    }

    /**
     * Creates embeddings for the given inputs using the OpenAI API.
     * @param inputs Text inputs to create embeddings for.
     * @returns A `EmbeddingsResponse` with a status and the generated embeddings or a message when an error occurs.
     */
    public async createEmbeddings(inputs: string | string[]): Promise<EmbeddingsResponse> {
        const response = await this.createEmbeddingRequest({
            input: inputs,
        });

        // Process response
        if (response.status < 300) {
            // Order the embeddings by their `index` field. Sort a copy so the response
            // payload itself is left untouched.
            const embeddings = [...response.data.data]
                .sort((a, b) => a.index - b.index)
                .map((item) => item.embedding);
            return { status: 'success', output: embeddings };
        } else if (response.status === 429) {
            return { status: 'rate_limited', message: `The embeddings API returned a rate limit error.` }
        } else {
            return { status: 'error', message: `The embeddings API returned an error status of ${response.status}: ${response.statusText}` };
        }
    }

    /**
     * Builds the service specific URL and body for an embeddings request and sends it.
     * @private
     * @param request Request to send. The object passed in is not modified.
     * @returns The raw HTTP response from the embeddings endpoint.
     */
    protected createEmbeddingRequest(request: CreateEmbeddingRequest): Promise<AxiosResponse<CreateEmbeddingResponse>> {
        if (this._useAzure) {
            // Azure selects the model through the deployment named in the URL
            const options = this.options as AzureOpenAIEmbeddingsOptions;
            const url = `${options.azureEndpoint}/openai/deployments/${options.azureDeployment}/embeddings?api-version=${options.azureApiVersion!}`;
            return this.post(url, request);
        } else {
            // OpenAI expects the model name in the body; add it to a copy of the request
            const options = this.options as OpenAIEmbeddingsOptions;
            const url = `${options.endpoint ?? 'https://api.openai.com'}/v1/embeddings`;
            const body = Object.assign({}, request) as OpenAICreateEmbeddingRequest;
            body.model = options.model;
            return this.post(url, body);
        }
    }

    /**
     * POSTs a JSON body and retries rate limited (429) responses using `retryPolicy`.
     * @private
     * @param url URL to post to.
     * @param body Request body.
     * @param retryCount Number of retries already attempted; indexes into `retryPolicy`.
     * @returns The last response received.
     */
    protected async post<TData>(url: string, body: object, retryCount = 0): Promise<AxiosResponse<TData>> {
        // Initialize request config. The headers are copied as well so that the credentials
        // added below never leak into the caller supplied `requestConfig.headers` object.
        const requestConfig: AxiosRequestConfig = Object.assign({}, this.options.requestConfig);
        requestConfig.headers = Object.assign({}, requestConfig.headers);

        // Initialize request headers
        if (!requestConfig.headers['Content-Type']) {
            requestConfig.headers['Content-Type'] = 'application/json';
        }
        if (!requestConfig.headers['User-Agent']) {
            requestConfig.headers['User-Agent'] = this.UserAgent;
        }
        if (this._useAzure) {
            const options = this.options as AzureOpenAIEmbeddingsOptions;
            requestConfig.headers['api-key'] = options.azureApiKey;
        } else {
            const options = this.options as OpenAIEmbeddingsOptions;
            requestConfig.headers['Authorization'] = `Bearer ${options.apiKey}`;
            if (options.organization) {
                requestConfig.headers['OpenAI-Organization'] = options.organization;
            }
        }

        // Send request
        const response = await this._httpClient.post(url, body, requestConfig);

        // Check for rate limit error
        const retryPolicy = this.options.retryPolicy;
        if (response.status === 429 && Array.isArray(retryPolicy) && retryCount < retryPolicy.length) {
            const delay = retryPolicy[retryCount];
            await new Promise((resolve) => setTimeout(resolve, delay));
            return this.post(url, body, retryCount + 1);
        } else {
            return response;
        }
    }
}
|
|
@@ -0,0 +1,480 @@
|
|
|
1
|
+
import { GPT3Tokenizer } from "./GPT3Tokenizer";
|
|
2
|
+
import { TextChunk, Tokenizer } from "./types";
|
|
3
|
+
|
|
4
|
+
/**
 * Settings that control how a `TextSplitter` breaks text into chunks.
 */
export interface TextSplitterConfig {
    /**
     * Separators to split on, tried in order: a part that is still too large after
     * splitting on one separator is split again using the separators that follow it.
     */
    separators: Array<string>;

    /**
     * When `true`, the separator is appended to the text of every part except the last one.
     */
    keepSeparators: boolean;

    /**
     * Maximum number of tokens a part may contain before it is split further. Must be >= 1.
     */
    chunkSize: number;

    /**
     * Number of tokens of overlap to record for a chunk from its neighboring chunks.
     * Must be >= 0 and <= `chunkSize`.
     */
    chunkOverlap: number;

    /**
     * Tokenizer used to encode chunk text into tokens.
     */
    tokenizer: Tokenizer;

    /**
     * Optional. Document type (for example `"markdown"` or `"python"`) used to pick a
     * separator list when `separators` is missing or empty.
     */
    docType?: string;
}
|
|
12
|
+
|
|
13
|
+
/**
 * Splits text into token-bounded chunks by recursively splitting on a prioritized list of
 * separators.
 * @remarks
 * The text is split on the first separator. Any part whose token count exceeds `chunkSize`
 * is split again using the remaining separators. Parts are not merged back together.
 */
export class TextSplitter {
    private readonly _config: TextSplitterConfig;

    /**
     * Creates a new `TextSplitter` instance.
     * @param config Optional. Settings overriding the defaults (`keepSeparators: false`,
     * `chunkSize: 400`, `chunkOverlap: 40`, a `GPT3Tokenizer`, and separators chosen from `docType`).
     * @throws Error when `chunkSize` < 1, `chunkOverlap` < 0, or `chunkOverlap` > `chunkSize`.
     */
    public constructor(config?: Partial<TextSplitterConfig>) {
        // No default for `separators` here: defaulting it would make the docType based
        // lookup below unreachable and silently ignore `docType`.
        this._config = Object.assign({
            keepSeparators: false,
            chunkSize: 400,
            chunkOverlap: 40,
        } as TextSplitterConfig, config);

        // Create a default tokenizer if none is provided
        if (!this._config.tokenizer) {
            this._config.tokenizer = new GPT3Tokenizer();
        }

        // Use default separators (based on docType) if none are provided
        if (!this._config.separators || this._config.separators.length === 0) {
            this._config.separators = this.getSeparators(this._config.docType);
        }

        // Validate the config settings
        if (this._config.chunkSize < 1) {
            throw new Error("chunkSize must be >= 1");
        } else if (this._config.chunkOverlap < 0) {
            throw new Error("chunkOverlap must be >= 0");
        } else if (this._config.chunkOverlap > this._config.chunkSize) {
            throw new Error("chunkOverlap must be <= chunkSize");
        }
    }

    /**
     * Splits text into chunks and records the overlap tokens of neighboring chunks.
     * @param text Text to split.
     * @returns The chunks in document order. When `chunkOverlap` > 0, `startOverlap` holds up
     * to `chunkOverlap` trailing tokens of the previous chunk and `endOverlap` holds up to
     * `chunkOverlap` leading tokens of the next chunk. Both are empty where no neighbor exists.
     */
    public split(text: string): TextChunk[] {
        // Get basic chunks
        const chunks = this.recursiveSplit(text, this._config.separators, 0);

        // Add overlap tokens to the start and end of each chunk
        const overlap = this._config.chunkOverlap;
        if (overlap > 0) {
            for (let i = 0; i < chunks.length; i++) {
                const chunk = chunks[i];
                if (i > 0) {
                    // Last `overlap` tokens of the previous chunk. slice() copies, so the
                    // previous chunk's own tokens are left untouched.
                    const previousTokens = chunks[i - 1].tokens;
                    chunk.startOverlap = previousTokens.slice(Math.max(0, previousTokens.length - overlap));
                }
                if (i < chunks.length - 1) {
                    // First `overlap` tokens of the next chunk
                    chunk.endOverlap = chunks[i + 1].tokens.slice(0, overlap);
                }
            }
        }

        return chunks;
    }

    /**
     * Splits text on the first separator and recurses into oversized parts with the
     * remaining separators.
     * @param text Text to split.
     * @param separators Separators still available, in priority order.
     * @param startPos Position assigned to the first character of `text`.
     * @returns Chunks for `text`; empty when `text` or `separators` is empty.
     */
    private recursiveSplit(text: string, separators: string[], startPos: number): TextChunk[] {
        const chunks: TextChunk[] = [];
        if (text.length === 0 || separators.length === 0) {
            return chunks;
        }

        const separator = separators[0];
        const nextSeparators = separators.slice(1);
        const parts = text.split(separator);
        for (let i = 0; i < parts.length; i++) {
            const lastChunk = (i === parts.length - 1);

            // Get chunk text and endPos. endPos accounts for the separator that followed
            // the part, whether or not it is kept in the chunk text.
            let chunk = parts[i];
            const endPos = (startPos + (chunk.length - 1)) + (lastChunk ? 0 : separator.length);
            if (this._config.keepSeparators && !lastChunk) {
                chunk += separator;
            }

            // Encode chunk text
            const tokens = this._config.tokenizer.encode(chunk);
            if (tokens.length > this._config.chunkSize && nextSeparators.length > 0) {
                // Break the text into smaller chunks
                chunks.push(...this.recursiveSplit(chunk, nextSeparators, startPos));
            } else {
                // Append chunk to output. An oversized chunk is kept as-is when no
                // separators remain, rather than silently dropping its text.
                chunks.push({
                    text: chunk,
                    tokens: tokens,
                    startPos: startPos,
                    endPos: endPos,
                    startOverlap: [],
                    endOverlap: [],
                });
            }

            // Update startPos
            startPos = endPos + 1;
        }

        return chunks;
    }

    /**
     * Returns the default separator list for a document type.
     * @param docType Optional. Document type such as `"markdown"`, `"python"` or `"html"`.
     * @returns A new array of separators in priority order. Unknown or missing types get
     * the generic paragraph / line / word / character separators.
     */
    private getSeparators(docType?: string): string[] {
        // Generic fallbacks shared by most document types: paragraphs, lines, words, characters
        const common = ["\n\n", "\n", " ", ""];
        switch (docType ?? '') {
            case "cpp":
                return [
                    // Split along class definitions
                    "\nclass ",
                    // Split along function definitions
                    "\nvoid ", "\nint ", "\nfloat ", "\ndouble ",
                    // Split along control flow statements
                    "\nif ", "\nfor ", "\nwhile ", "\nswitch ", "\ncase ",
                    // Split by the normal type of lines
                    ...common,
                ];
            case "go":
                return [
                    // Split along function definitions
                    "\nfunc ", "\nvar ", "\nconst ", "\ntype ",
                    // Split along control flow statements
                    "\nif ", "\nfor ", "\nswitch ", "\ncase ",
                    // Split by the normal type of lines
                    ...common,
                ];
            case "java":
            case "c#":
            case "csharp":
            case "cs":
            case "ts":
            case "tsx":
            case "typescript":
                return [
                    // Split along class definitions
                    "\nclass ",
                    // Split along method definitions
                    "\npublic ", "\nprotected ", "\nprivate ", "\nstatic ",
                    // Split along control flow statements
                    "\nif ", "\nfor ", "\nwhile ", "\nswitch ", "\ncase ",
                    // Split by the normal type of lines
                    ...common,
                ];
            case "js":
            case "jsx":
            case "javascript":
                return [
                    // Split along class definitions
                    "\nclass ",
                    // Split along function definitions
                    "\nfunction ", "\nconst ", "\nlet ", "\nvar ", "\nclass ",
                    // Split along control flow statements
                    "\nif ", "\nfor ", "\nwhile ", "\nswitch ", "\ncase ", "\ndefault ",
                    // Split by the normal type of lines
                    ...common,
                ];
            case "php":
                return [
                    // Split along function definitions
                    "\nfunction ",
                    // Split along class definitions
                    "\nclass ",
                    // Split along control flow statements
                    "\nif ", "\nforeach ", "\nwhile ", "\ndo ", "\nswitch ", "\ncase ",
                    // Split by the normal type of lines
                    ...common,
                ];
            case "proto":
                return [
                    // Split along message definitions
                    "\nmessage ",
                    // Split along service definitions
                    "\nservice ",
                    // Split along enum definitions
                    "\nenum ",
                    // Split along option definitions
                    "\noption ",
                    // Split along import statements
                    "\nimport ",
                    // Split along syntax declarations
                    "\nsyntax ",
                    // Split by the normal type of lines
                    ...common,
                ];
            case "python":
            case "py":
                return [
                    // First, try to split along class and function definitions
                    "\nclass ", "\ndef ", "\n\tdef ",
                    // Now split by the normal type of lines
                    ...common,
                ];
            case "rst":
                return [
                    // Split along section titles
                    "\n===\n", "\n---\n", "\n***\n",
                    // Split along directive markers
                    "\n.. ",
                    // Split by the normal type of lines
                    ...common,
                ];
            case "ruby":
                return [
                    // Split along method definitions
                    "\ndef ", "\nclass ",
                    // Split along control flow statements
                    "\nif ", "\nunless ", "\nwhile ", "\nfor ", "\ndo ", "\nbegin ", "\nrescue ",
                    // Split by the normal type of lines
                    ...common,
                ];
            case "rust":
                return [
                    // Split along function definitions
                    "\nfn ", "\nconst ", "\nlet ",
                    // Split along control flow statements
                    "\nif ", "\nwhile ", "\nfor ", "\nloop ", "\nmatch ", "\nconst ",
                    // Split by the normal type of lines
                    ...common,
                ];
            case "scala":
                return [
                    // Split along class definitions
                    "\nclass ", "\nobject ",
                    // Split along method definitions
                    "\ndef ", "\nval ", "\nvar ",
                    // Split along control flow statements
                    "\nif ", "\nfor ", "\nwhile ", "\nmatch ", "\ncase ",
                    // Split by the normal type of lines
                    ...common,
                ];
            case "swift":
                return [
                    // Split along function definitions
                    "\nfunc ",
                    // Split along class definitions
                    "\nclass ", "\nstruct ", "\nenum ",
                    // Split along control flow statements
                    "\nif ", "\nfor ", "\nwhile ", "\ndo ", "\nswitch ", "\ncase ",
                    // Split by the normal type of lines
                    ...common,
                ];
            case "markdown":
                return [
                    // First, try to split along Markdown headings (starting with level 2).
                    // The alternative (underlined) heading syntax is not handled here.
                    "\n## ", "\n### ", "\n#### ", "\n##### ", "\n###### ",
                    // End of code block
                    "```\n\n",
                    // Horizontal lines. Only lines of exactly three characters are handled.
                    "\n\n***\n\n", "\n\n---\n\n", "\n\n___\n\n",
                    // Split by the normal type of lines
                    ...common,
                ];
            case "latex":
                return [
                    // First, try to split along Latex sections
                    "\n\\chapter{", "\n\\section{", "\n\\subsection{", "\n\\subsubsection{",
                    // Now split by environments
                    "\n\\begin{enumerate}", "\n\\begin{itemize}", "\n\\begin{description}", "\n\\begin{list}",
                    "\n\\begin{quote}", "\n\\begin{quotation}", "\n\\begin{verse}", "\n\\begin{verbatim}",
                    // Now split by math environments
                    "\n\\begin{align}", "$$", "$",
                    // Now split by the normal type of lines
                    ...common,
                ];
            case "html":
                return [
                    // First, try to split along HTML tags
                    "<body>", "<div>", "<p>", "<br>", "<li>",
                    "<h1>", "<h2>", "<h3>", "<h4>", "<h5>", "<h6>",
                    "<span>", "<table>", "<tr>", "<td>", "<th>", "<ul>", "<ol>",
                    "<header>", "<footer>", "<nav>",
                    // Head
                    "<head>", "<style>", "<script>", "<meta>", "<title>",
                    // Normal type of lines (HTML does not fall back to line breaks)
                    " ", "",
                ];
            case "sol":
                return [
                    // Split along compiler informations definitions
                    "\npragma ", "\nusing ",
                    // Split along contract definitions
                    "\ncontract ", "\ninterface ", "\nlibrary ",
                    // Split along method definitions
                    "\nconstructor ", "\ntype ", "\nfunction ", "\nevent ", "\nmodifier ",
                    "\nerror ", "\nstruct ", "\nenum ",
                    // Split along control flow statements
                    "\nif ", "\nfor ", "\nwhile ", "\ndo while ", "\nassembly ",
                    // Split by the normal type of lines
                    ...common,
                ];
            default:
                // Split by the normal type of lines
                return common;
        }
    }
}
|