@smythos/sre 1.5.53 → 1.5.54
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG +98 -98
- package/LICENSE +18 -18
- package/README.md +135 -135
- package/dist/bundle-analysis-lazy.html +4949 -0
- package/dist/bundle-analysis.html +4949 -0
- package/dist/index.js +3 -3
- package/dist/index.js.map +1 -1
- package/dist/types/subsystems/LLMManager/LLM.service/connectors/openai/OpenAIConnector.class.d.ts +1 -6
- package/dist/types/utils/package-manager.utils.d.ts +26 -0
- package/package.json +1 -1
- package/src/Components/APICall/APICall.class.ts +157 -157
- package/src/Components/APICall/AccessTokenManager.ts +166 -166
- package/src/Components/APICall/ArrayBufferResponse.helper.ts +58 -58
- package/src/Components/APICall/OAuth.helper.ts +447 -447
- package/src/Components/APICall/mimeTypeCategories.ts +46 -46
- package/src/Components/APICall/parseData.ts +167 -167
- package/src/Components/APICall/parseHeaders.ts +41 -41
- package/src/Components/APICall/parseProxy.ts +68 -68
- package/src/Components/APICall/parseUrl.ts +91 -91
- package/src/Components/APIEndpoint.class.ts +234 -234
- package/src/Components/APIOutput.class.ts +58 -58
- package/src/Components/AgentPlugin.class.ts +102 -102
- package/src/Components/Async.class.ts +155 -155
- package/src/Components/Await.class.ts +90 -90
- package/src/Components/Classifier.class.ts +158 -158
- package/src/Components/Component.class.ts +132 -132
- package/src/Components/ComponentHost.class.ts +38 -38
- package/src/Components/DataSourceCleaner.class.ts +92 -92
- package/src/Components/DataSourceIndexer.class.ts +181 -181
- package/src/Components/DataSourceLookup.class.ts +161 -161
- package/src/Components/ECMASandbox.class.ts +71 -71
- package/src/Components/FEncDec.class.ts +29 -29
- package/src/Components/FHash.class.ts +33 -33
- package/src/Components/FSign.class.ts +80 -80
- package/src/Components/FSleep.class.ts +25 -25
- package/src/Components/FTimestamp.class.ts +25 -25
- package/src/Components/FileStore.class.ts +78 -78
- package/src/Components/ForEach.class.ts +97 -97
- package/src/Components/GPTPlugin.class.ts +70 -70
- package/src/Components/GenAILLM.class.ts +586 -586
- package/src/Components/HuggingFace.class.ts +314 -314
- package/src/Components/Image/imageSettings.config.ts +70 -70
- package/src/Components/ImageGenerator.class.ts +502 -502
- package/src/Components/JSONFilter.class.ts +54 -54
- package/src/Components/LLMAssistant.class.ts +213 -213
- package/src/Components/LogicAND.class.ts +28 -28
- package/src/Components/LogicAtLeast.class.ts +85 -85
- package/src/Components/LogicAtMost.class.ts +86 -86
- package/src/Components/LogicOR.class.ts +29 -29
- package/src/Components/LogicXOR.class.ts +34 -34
- package/src/Components/MCPClient.class.ts +138 -138
- package/src/Components/MemoryDeleteKeyVal.class.ts +70 -70
- package/src/Components/MemoryReadKeyVal.class.ts +66 -66
- package/src/Components/MemoryWriteKeyVal.class.ts +62 -62
- package/src/Components/MemoryWriteObject.class.ts +97 -97
- package/src/Components/MultimodalLLM.class.ts +128 -128
- package/src/Components/OpenAPI.class.ts +72 -72
- package/src/Components/PromptGenerator.class.ts +122 -122
- package/src/Components/ScrapflyWebScrape.class.ts +159 -159
- package/src/Components/ServerlessCode.class.ts +123 -123
- package/src/Components/TavilyWebSearch.class.ts +98 -98
- package/src/Components/VisionLLM.class.ts +104 -104
- package/src/Components/ZapierAction.class.ts +127 -127
- package/src/Components/index.ts +97 -97
- package/src/Core/AgentProcess.helper.ts +240 -240
- package/src/Core/Connector.class.ts +123 -123
- package/src/Core/ConnectorsService.ts +197 -197
- package/src/Core/DummyConnector.ts +49 -49
- package/src/Core/HookService.ts +105 -105
- package/src/Core/SmythRuntime.class.ts +235 -235
- package/src/Core/SystemEvents.ts +16 -16
- package/src/Core/boot.ts +56 -56
- package/src/config.ts +15 -15
- package/src/constants.ts +126 -126
- package/src/data/hugging-face.params.json +579 -579
- package/src/helpers/AWSLambdaCode.helper.ts +590 -590
- package/src/helpers/BinaryInput.helper.ts +331 -331
- package/src/helpers/Conversation.helper.ts +1119 -1119
- package/src/helpers/ECMASandbox.helper.ts +54 -54
- package/src/helpers/JsonContent.helper.ts +97 -97
- package/src/helpers/LocalCache.helper.ts +97 -97
- package/src/helpers/Log.helper.ts +274 -274
- package/src/helpers/OpenApiParser.helper.ts +150 -150
- package/src/helpers/S3Cache.helper.ts +147 -147
- package/src/helpers/SmythURI.helper.ts +5 -5
- package/src/helpers/Sysconfig.helper.ts +77 -77
- package/src/helpers/TemplateString.helper.ts +243 -243
- package/src/helpers/TypeChecker.helper.ts +329 -329
- package/src/index.ts +3 -3
- package/src/index.ts.bak +3 -3
- package/src/subsystems/AgentManager/Agent.class.ts +1114 -1114
- package/src/subsystems/AgentManager/Agent.helper.ts +3 -3
- package/src/subsystems/AgentManager/AgentData.service/AgentDataConnector.ts +230 -230
- package/src/subsystems/AgentManager/AgentData.service/connectors/CLIAgentDataConnector.class.ts +66 -66
- package/src/subsystems/AgentManager/AgentData.service/connectors/LocalAgentDataConnector.class.ts +142 -142
- package/src/subsystems/AgentManager/AgentData.service/connectors/NullAgentData.class.ts +39 -39
- package/src/subsystems/AgentManager/AgentData.service/index.ts +18 -18
- package/src/subsystems/AgentManager/AgentLogger.class.ts +301 -297
- package/src/subsystems/AgentManager/AgentRequest.class.ts +51 -51
- package/src/subsystems/AgentManager/AgentRuntime.class.ts +559 -559
- package/src/subsystems/AgentManager/AgentSSE.class.ts +101 -101
- package/src/subsystems/AgentManager/AgentSettings.class.ts +52 -52
- package/src/subsystems/AgentManager/Component.service/ComponentConnector.ts +32 -32
- package/src/subsystems/AgentManager/Component.service/connectors/LocalComponentConnector.class.ts +60 -60
- package/src/subsystems/AgentManager/Component.service/index.ts +11 -11
- package/src/subsystems/AgentManager/EmbodimentSettings.class.ts +47 -47
- package/src/subsystems/AgentManager/ForkedAgent.class.ts +154 -154
- package/src/subsystems/AgentManager/OSResourceMonitor.ts +77 -77
- package/src/subsystems/ComputeManager/Code.service/CodeConnector.ts +98 -98
- package/src/subsystems/ComputeManager/Code.service/connectors/AWSLambdaCode.class.ts +172 -172
- package/src/subsystems/ComputeManager/Code.service/connectors/ECMASandbox.class.ts +131 -131
- package/src/subsystems/ComputeManager/Code.service/index.ts +13 -13
- package/src/subsystems/IO/CLI.service/CLIConnector.ts +47 -47
- package/src/subsystems/IO/CLI.service/index.ts +9 -9
- package/src/subsystems/IO/Log.service/LogConnector.ts +32 -32
- package/src/subsystems/IO/Log.service/connectors/ConsoleLog.class.ts +28 -28
- package/src/subsystems/IO/Log.service/index.ts +13 -13
- package/src/subsystems/IO/NKV.service/NKVConnector.ts +43 -43
- package/src/subsystems/IO/NKV.service/connectors/NKVLocalStorage.class.ts +234 -234
- package/src/subsystems/IO/NKV.service/connectors/NKVRAM.class.ts +204 -204
- package/src/subsystems/IO/NKV.service/connectors/NKVRedis.class.ts +182 -182
- package/src/subsystems/IO/NKV.service/index.ts +14 -14
- package/src/subsystems/IO/Router.service/RouterConnector.ts +21 -21
- package/src/subsystems/IO/Router.service/connectors/ExpressRouter.class.ts +48 -48
- package/src/subsystems/IO/Router.service/connectors/NullRouter.class.ts +40 -40
- package/src/subsystems/IO/Router.service/index.ts +11 -11
- package/src/subsystems/IO/Storage.service/SmythFS.class.ts +489 -489
- package/src/subsystems/IO/Storage.service/StorageConnector.ts +66 -66
- package/src/subsystems/IO/Storage.service/connectors/LocalStorage.class.ts +327 -327
- package/src/subsystems/IO/Storage.service/connectors/S3Storage.class.ts +482 -482
- package/src/subsystems/IO/Storage.service/index.ts +13 -13
- package/src/subsystems/IO/VectorDB.service/VectorDBConnector.ts +108 -108
- package/src/subsystems/IO/VectorDB.service/connectors/MilvusVectorDB.class.ts +454 -454
- package/src/subsystems/IO/VectorDB.service/connectors/PineconeVectorDB.class.ts +384 -384
- package/src/subsystems/IO/VectorDB.service/connectors/RAMVecrtorDB.class.ts +421 -421
- package/src/subsystems/IO/VectorDB.service/embed/BaseEmbedding.ts +107 -107
- package/src/subsystems/IO/VectorDB.service/embed/OpenAIEmbedding.ts +109 -109
- package/src/subsystems/IO/VectorDB.service/embed/index.ts +21 -21
- package/src/subsystems/IO/VectorDB.service/index.ts +14 -14
- package/src/subsystems/LLMManager/LLM.helper.ts +251 -251
- package/src/subsystems/LLMManager/LLM.inference.ts +339 -339
- package/src/subsystems/LLMManager/LLM.service/LLMConnector.ts +489 -489
- package/src/subsystems/LLMManager/LLM.service/LLMCredentials.helper.ts +171 -171
- package/src/subsystems/LLMManager/LLM.service/connectors/Anthropic.class.ts +659 -659
- package/src/subsystems/LLMManager/LLM.service/connectors/Bedrock.class.ts +400 -400
- package/src/subsystems/LLMManager/LLM.service/connectors/Echo.class.ts +77 -77
- package/src/subsystems/LLMManager/LLM.service/connectors/GoogleAI.class.ts +757 -757
- package/src/subsystems/LLMManager/LLM.service/connectors/Groq.class.ts +304 -304
- package/src/subsystems/LLMManager/LLM.service/connectors/Perplexity.class.ts +250 -250
- package/src/subsystems/LLMManager/LLM.service/connectors/VertexAI.class.ts +423 -423
- package/src/subsystems/LLMManager/LLM.service/connectors/openai/OpenAIConnector.class.ts +488 -488
- package/src/subsystems/LLMManager/LLM.service/connectors/openai/apiInterfaces/ChatCompletionsApiInterface.ts +524 -524
- package/src/subsystems/LLMManager/LLM.service/connectors/openai/apiInterfaces/OpenAIApiInterface.ts +100 -100
- package/src/subsystems/LLMManager/LLM.service/connectors/openai/apiInterfaces/OpenAIApiInterfaceFactory.ts +81 -81
- package/src/subsystems/LLMManager/LLM.service/connectors/openai/apiInterfaces/ResponsesApiInterface.ts +1145 -1145
- package/src/subsystems/LLMManager/LLM.service/connectors/openai/apiInterfaces/constants.ts +13 -13
- package/src/subsystems/LLMManager/LLM.service/connectors/openai/apiInterfaces/index.ts +4 -4
- package/src/subsystems/LLMManager/LLM.service/connectors/openai/apiInterfaces/utils.ts +11 -11
- package/src/subsystems/LLMManager/LLM.service/connectors/openai/types.ts +32 -32
- package/src/subsystems/LLMManager/LLM.service/connectors/xAI.class.ts +471 -471
- package/src/subsystems/LLMManager/LLM.service/index.ts +44 -44
- package/src/subsystems/LLMManager/ModelsProvider.service/ModelsProviderConnector.ts +300 -300
- package/src/subsystems/LLMManager/ModelsProvider.service/connectors/JSONModelsProvider.class.ts +252 -252
- package/src/subsystems/LLMManager/ModelsProvider.service/index.ts +11 -11
- package/src/subsystems/LLMManager/custom-models.ts +854 -854
- package/src/subsystems/LLMManager/models.ts +2540 -2540
- package/src/subsystems/LLMManager/paramMappings.ts +69 -69
- package/src/subsystems/MemoryManager/Cache.service/CacheConnector.ts +86 -86
- package/src/subsystems/MemoryManager/Cache.service/connectors/LocalStorageCache.class.ts +297 -297
- package/src/subsystems/MemoryManager/Cache.service/connectors/RAMCache.class.ts +201 -201
- package/src/subsystems/MemoryManager/Cache.service/connectors/RedisCache.class.ts +252 -252
- package/src/subsystems/MemoryManager/Cache.service/connectors/S3Cache.class.ts +373 -373
- package/src/subsystems/MemoryManager/Cache.service/index.ts +15 -15
- package/src/subsystems/MemoryManager/LLMCache.ts +72 -72
- package/src/subsystems/MemoryManager/LLMContext.ts +124 -124
- package/src/subsystems/MemoryManager/LLMMemory.service/LLMMemoryConnector.ts +26 -26
- package/src/subsystems/MemoryManager/RuntimeContext.ts +266 -266
- package/src/subsystems/Security/AccessControl/ACL.class.ts +208 -208
- package/src/subsystems/Security/AccessControl/AccessCandidate.class.ts +82 -82
- package/src/subsystems/Security/AccessControl/AccessRequest.class.ts +52 -52
- package/src/subsystems/Security/Account.service/AccountConnector.ts +44 -44
- package/src/subsystems/Security/Account.service/connectors/AWSAccount.class.ts +76 -76
- package/src/subsystems/Security/Account.service/connectors/DummyAccount.class.ts +130 -130
- package/src/subsystems/Security/Account.service/connectors/JSONFileAccount.class.ts +159 -159
- package/src/subsystems/Security/Account.service/index.ts +14 -14
- package/src/subsystems/Security/Credentials.helper.ts +62 -62
- package/src/subsystems/Security/ManagedVault.service/ManagedVaultConnector.ts +38 -38
- package/src/subsystems/Security/ManagedVault.service/connectors/NullManagedVault.class.ts +53 -53
- package/src/subsystems/Security/ManagedVault.service/connectors/SecretManagerManagedVault.ts +154 -154
- package/src/subsystems/Security/ManagedVault.service/index.ts +12 -12
- package/src/subsystems/Security/SecureConnector.class.ts +110 -110
- package/src/subsystems/Security/Vault.service/Vault.helper.ts +30 -30
- package/src/subsystems/Security/Vault.service/VaultConnector.ts +29 -29
- package/src/subsystems/Security/Vault.service/connectors/HashicorpVault.class.ts +46 -46
- package/src/subsystems/Security/Vault.service/connectors/JSONFileVault.class.ts +221 -221
- package/src/subsystems/Security/Vault.service/connectors/NullVault.class.ts +54 -54
- package/src/subsystems/Security/Vault.service/connectors/SecretsManager.class.ts +140 -140
- package/src/subsystems/Security/Vault.service/index.ts +12 -12
- package/src/types/ACL.types.ts +104 -104
- package/src/types/AWS.types.ts +10 -10
- package/src/types/Agent.types.ts +61 -61
- package/src/types/AgentLogger.types.ts +17 -17
- package/src/types/Cache.types.ts +1 -1
- package/src/types/Common.types.ts +2 -2
- package/src/types/LLM.types.ts +496 -496
- package/src/types/Redis.types.ts +8 -8
- package/src/types/SRE.types.ts +64 -64
- package/src/types/Security.types.ts +14 -14
- package/src/types/Storage.types.ts +5 -5
- package/src/types/VectorDB.types.ts +86 -86
- package/src/utils/base64.utils.ts +275 -275
- package/src/utils/cli.utils.ts +68 -68
- package/src/utils/data.utils.ts +322 -322
- package/src/utils/date-time.utils.ts +22 -22
- package/src/utils/general.utils.ts +238 -238
- package/src/utils/index.ts +12 -12
- package/src/utils/lazy-client.ts +261 -261
- package/src/utils/numbers.utils.ts +13 -13
- package/src/utils/oauth.utils.ts +35 -35
- package/src/utils/string.utils.ts +414 -414
- package/src/utils/url.utils.ts +19 -19
- package/src/utils/validation.utils.ts +74 -74
- package/dist/types/subsystems/LLMManager/ModelsProvider.service/connectors/SmythModelsProvider.class.d.ts +0 -39
|
@@ -1,414 +1,414 @@
|
|
|
1
|
-
/**
 * Asynchronous counterpart of `String.prototype.replace` with a callback replacer.
 *
 * All matches of `regex` in `str` are collected first, `asyncFn` is then invoked for every
 * match concurrently, and the resolved values are spliced back in match order.
 *
 * Matching mirrors `String.prototype.replace`:
 * - with the `g` flag every match is replaced;
 * - without it only the first match is replaced;
 * - zero-length matches advance the scan position instead of repeating forever.
 *
 * The scan runs on a private copy of `regex`, so the caller's `lastIndex` is neither
 * honoured nor modified.
 *
 * @param str - the string to search in
 * @param regex - the pattern to look for
 * @param asyncFn - called as `asyncFn(match, group1, group2, ...)`; its resolved value replaces the match
 * @returns a promise resolving to `str` with every processed match replaced
 */
export async function asyncReplace(str, regex, asyncFn) {
    // Copy the pattern: exec() on a global regex reads and writes lastIndex on the object it is called on.
    const matcher = new RegExp(regex);
    const matches = [];
    let match;

    while ((match = matcher.exec(str)) !== null) {
        matches.push(match);

        // Without the global flag exec() never advances, so another iteration would return the same match forever.
        if (!matcher.global) break;

        // An empty match leaves lastIndex where it was; step over one character
        // (or a whole surrogate pair in unicode mode) so the scan makes progress.
        if (match[0].length === 0) {
            const codePoint = matcher.unicode ? str.codePointAt(matcher.lastIndex) : undefined;
            matcher.lastIndex += codePoint !== undefined && codePoint > 0xffff ? 2 : 1;
        }
    }

    // Resolve every replacement concurrently; the callback receives the full match followed by its capture groups.
    const replacements = await Promise.all(matches.map(async (found) => asyncFn(...found)));

    // Stitch the untouched text between matches together with the replacements.
    let result = '';
    let lastIndex = 0;
    matches.forEach((found, position) => {
        result += str.slice(lastIndex, found.index) + replacements[position];
        lastIndex = found.index + found[0].length;
    });

    // Whatever follows the last match is copied unchanged.
    return result + str.slice(lastIndex);
}
|
|
39
|
-
|
|
40
|
-
/**
 * Checks that a value is a non-empty string.
 *
 * @param str - the value to inspect
 * @returns `true` only when `str` is a string containing at least one character
 */
export function isValidString(str: string): boolean {
    // Test the type first so the result is always a real boolean, never '', null or undefined.
    return typeof str === 'string' && str.length > 0;
}
|
|
43
|
-
|
|
44
|
-
/**
 * Tells whether a string is the canonical decimal spelling of a number inside the safe-integer range.
 *
 * The parsed value must print back exactly as the trimmed input, so spellings such as
 * '1.0', '1e3' or '007' are rejected while ' 7 ' and '-3.5' are accepted.
 */
const isValidNumber = (str: string): boolean => {
    const parsed = parseFloat(str);

    if (isNaN(parsed)) return false;
    if (parsed > Number.MAX_SAFE_INTEGER) return false;
    if (parsed < Number.MIN_SAFE_INTEGER) return false;

    // Round-trip check: rejects trailing garbage and non-canonical notations.
    return parsed.toString() === str.trim();
};
|
|
48
|
-
|
|
49
|
-
/**
 * Recursively converts string values into the primitive they spell out.
 *
 * JSON parsing leaves quoted values as strings: '{"a": "1","b": "true"}' parses to
 * {a: '1', b: 'true'}. This function walks the parsed data and turns such strings into
 * their respective types, giving {a: 1, b: true}.
 *
 * Conversions applied to strings ('true', 'false', 'null' and 'undefined' are matched case-insensitively):
 * 'true' / 'false' become booleans, strings accepted by isValidNumber become numbers,
 * 'null' becomes null and 'undefined' becomes undefined. Any other string is returned as is.
 *
 * @param data - a primitive, array or object to convert
 * @returns the converted value; arrays and objects are rebuilt, not modified in place
 */
export function convertStringToRespectiveType(data: any): any {
    if (data === null || data === undefined) return data;

    // Arrays: convert every item
    if (Array.isArray(data)) {
        return data.map((item) => convertStringToRespectiveType(item));
    }

    // Other objects: convert every own enumerable property value
    if (typeof data === 'object') {
        const convertedEntries = Object.entries(data).map(([key, value]) => [key, convertStringToRespectiveType(value)]);
        return Object.fromEntries(convertedEntries);
    }

    // Non-string primitives are already in their respective type
    if (typeof data !== 'string') return data;

    const lowered = data.toLowerCase();

    if (lowered === 'true') return true;
    if (lowered === 'false') return false;
    if (isValidNumber(data)) return Number(data);
    if (lowered === 'null') return null;
    if (lowered === 'undefined') return undefined;

    return data;
}
|
|
86
|
-
|
|
87
|
-
/**
 * Converts a kebab-case string to camelCase ('foo-bar' -> 'fooBar').
 *
 * Only a dash followed by a lowercase ASCII letter is rewritten.
 * Empty strings and non-string values are returned untouched.
 */
export const kebabToCamel = (input) => {
    const isNonEmptyString = typeof input === 'string' && input.length > 0;
    if (!isNonEmptyString) return input;

    return input.replace(/-([a-z])/g, (_dashAndLetter, letter) => letter.toUpperCase());
};
|
|
94
|
-
|
|
95
|
-
/**
 * Converts a kebab-case string to space-separated capitalized words ('hello-world' -> 'Hello World').
 *
 * Empty strings and non-string values are returned untouched.
 */
export const kebabToCapitalize = (input) => {
    if (typeof input !== 'string' || input === '') return input;

    const words = [];
    for (const part of input.split('-')) {
        // Upper-case the first character, keep the rest of the word unchanged
        words.push(part.charAt(0).toUpperCase() + part.slice(1));
    }

    return words.join(' ');
};
|
|
103
|
-
|
|
104
|
-
/**
 * Guesses the MIME type of a piece of text from its content (e.g. text/plain, application/json, application/xml ...).
 *
 * The checks are heuristic and run in a fixed order — JSON, markup (XML / HTML / SVG), CSS, JavaScript,
 * YAML, CSV, Markdown, SQL — and the first one that matches wins.
 *
 * @param input - the text to classify
 * @returns the detected MIME type, 'text/plain' when nothing more specific matches,
 *          or an empty string when `input` is not a string or contains only whitespace
 */
export const identifyMimetypeFromString = (input: string) => {
    // Return an empty string if input is not a string
    if (typeof input !== 'string') {
        return '';
    }

    // Return an empty string for empty / whitespace-only strings
    const trimmedInput = input.trim();
    if (!trimmedInput) {
        return '';
    }

    // Check for JSON: must look like an object/array literal AND parse successfully
    if ((trimmedInput.startsWith('{') && trimmedInput.endsWith('}')) || (trimmedInput.startsWith('[') && trimmedInput.endsWith(']'))) {
        try {
            JSON.parse(trimmedInput);
            return 'application/json';
        } catch {
            // Not valid JSON, continue checking
        }
    }

    // Check for markup (XML, HTML, SVG)
    if (trimmedInput.startsWith('<') && trimmedInput.endsWith('>')) {
        return identifyMarkupMimetype(trimmedInput);
    }

    // Check for CSS
    // NOTE(review): the first pattern matches any text whose first '{' is later followed by a '}',
    // so brace-delimited JavaScript is reported as CSS before the JavaScript check runs — confirm whether that is intended.
    if (trimmedInput.match(/^[^{]*\{[^}]*\}/s) || trimmedInput.match(/@(import|media|charset|keyframes|font-face)/i)) {
        return 'text/css';
    }

    // Check for JavaScript
    if (
        trimmedInput.match(/^(function\s+\w+|var\s+\w+|let\s+\w+|const\s+\w+|class\s+\w+)/i) ||
        trimmedInput.match(/(console\.log|document\.|window\.|require\(|import\s+)/i) ||
        trimmedInput.match(/=>\s*{|function\s*\(/)
    ) {
        return 'application/javascript';
    }

    // Check for YAML: document marker, "key: value" line or "- item" line
    if (trimmedInput.match(/^---\s*$/m) || trimmedInput.match(/^[a-zA-Z_][a-zA-Z0-9_]*:\s*[^\n]+$/m) || trimmedInput.match(/^\s*-\s+[^\n]+$/m)) {
        return 'application/yaml';
    }

    // Check for CSV: several lines sharing the delimiter and field count of the first line
    const lines = trimmedInput.split('\n');
    if (lines.length > 1) {
        const firstLine = lines[0];
        const hasCommas = firstLine.includes(',');
        const hasSemicolons = firstLine.includes(';');
        const hasTabs = firstLine.includes('\t');

        if (hasCommas || hasSemicolons || hasTabs) {
            // Delimiter precedence: comma, then semicolon, then tab
            const delimiter = hasCommas ? ',' : hasSemicolons ? ';' : '\t';
            const firstLineFields = firstLine.split(delimiter).length;

            // Only the first five lines are sampled
            let csvLikeLines = 0;
            for (let i = 0; i < Math.min(lines.length, 5); i++) {
                const fieldsCount = lines[i].split(delimiter).length;
                if (fieldsCount === firstLineFields && fieldsCount > 1) {
                    csvLikeLines++;
                }
            }

            if (csvLikeLines >= Math.min(lines.length, 3)) {
                return 'text/csv';
            }
        }
    }

    // Check for Markdown: headings, bullet lists, bold text or links
    if (
        trimmedInput.match(/^#+\s+/m) ||
        trimmedInput.match(/^\*\s+/m) ||
        trimmedInput.match(/^-\s+/m) ||
        trimmedInput.match(/\*\*[^*]+\*\*/g) ||
        trimmedInput.match(/\[[^\]]+\]\([^)]+\)/g)
    ) {
        return 'text/markdown';
    }

    // Check for SQL
    if (trimmedInput.match(/^\s*(SELECT|INSERT|UPDATE|DELETE|CREATE|DROP|ALTER|GRANT|REVOKE)\s+/i)) {
        return 'application/sql';
    }

    // Default to plain text
    return 'text/plain';
};

/**
 * Classifies text that starts with '<' and ends with '>' as HTML, SVG or generic XML.
 *
 * The root element is inspected before the generic "element pair" test: that test accepts any
 * `<tag>…</tag>` document, so running it first would report well-formed HTML and SVG documents as XML.
 *
 * @param markup - trimmed text that starts with '<' and ends with '>'
 * @returns 'text/html', 'image/svg+xml' or 'application/xml'
 */
function identifyMarkupMimetype(markup: string): 'application/xml' | 'text/html' | 'image/svg+xml' {
    // An explicit XML declaration is taken at face value
    if (/^<\?xml\s/i.test(markup)) {
        return 'application/xml';
    }

    // HTML doctype, or a root element that is a common HTML tag.
    // The tag name must be followed by whitespace, '/' or '>' so that e.g. <project> is not mistaken for <p>.
    if (/^<!DOCTYPE\s+html/i.test(markup) || /^<(?:html|head|body|div|p)(?:\s[^>]*)?\/?>/i.test(markup)) {
        return 'text/html';
    }

    // Root element is <svg>
    if (/^<svg(?:\s[^>]*)?\/?>/i.test(markup)) {
        return 'image/svg+xml';
    }

    // Any other document shaped like a single element pair is generic XML
    if (/^<[a-zA-Z][^>]*>.*<\/[a-zA-Z][^>]*>$/s.test(markup)) {
        return 'application/xml';
    }

    // Fallbacks for markup that is not a single element pair (fragments, leading comments, ...):
    // look for HTML tags, then an SVG tag, anywhere in the text
    if (
        /<html[^>]*>/i.test(markup) ||
        /<head[^>]*>/i.test(markup) ||
        /<body[^>]*>/i.test(markup) ||
        /<div[^>]*>/i.test(markup) ||
        /<p[^>]*>/i.test(markup)
    ) {
        return 'text/html';
    }

    if (/<svg[^>]*>/i.test(markup)) {
        return 'image/svg+xml';
    }

    // Generic XML if it has XML structure
    return 'application/xml';
}
|
|
224
|
-
|
|
225
|
-
/**
 * Splits a text into chunks with a RecursiveTextSplitter.
 *
 * @param text - the text to split
 * @param options - optional `chunkSize` (default 4000) and `chunkOverlap` (default 500), both forwarded to the splitter
 * @returns the chunks produced by the splitter, in order
 */
export function chunkText(
    text: string,
    options: {
        chunkSize?: number;
        chunkOverlap?: number;
    } = {},
): string[] {
    const { chunkSize = 4000, chunkOverlap = 500 } = options;

    return new RecursiveTextSplitter({ chunkSize, chunkOverlap }).splitText(text);
}
|
|
243
|
-
/**
 * Splits text into chunks by trying a list of separators in order.
 *
 * The text is cut on the first separator it contains; pieces shorter than `chunkSize` are merged
 * back together into chunks (with an overlap governed by `chunkOverlap`), and pieces that are still
 * too long are split again with the remaining separators.
 */
class TextSplitter {
    private chunkSize: number;
    private chunkOverlap: number;
    private separators: string[] = ['\n\n', '\n', ' ', ''];
    private keepSeparator: boolean = true;

    /**
     * @param options - `chunkSize` (default 1000), `chunkOverlap` (default 200), and optional
     *                  `separators` / `keepSeparator` overriding the field defaults
     * @throws Error when `chunkOverlap` is greater than or equal to `chunkSize`
     */
    constructor(
        options: {
            chunkSize?: number;
            chunkOverlap?: number;
            separators?: string[];
            keepSeparator?: boolean;
        } = {},
    ) {
        const { chunkSize = 1000, chunkOverlap = 200, separators, keepSeparator } = options;

        this.chunkSize = chunkSize;
        this.chunkOverlap = chunkOverlap;

        // Keep the field defaults unless an override was supplied
        if (separators) this.separators = separators;
        if (keepSeparator !== undefined) this.keepSeparator = keepSeparator;

        if (this.chunkOverlap >= this.chunkSize) {
            throw new Error('Cannot have chunkOverlap >= chunkSize');
        }
    }

    /** Splits `text` using the configured separators. */
    public splitText(text: string): string[] {
        return this._splitText(text, this.separators);
    }

    /**
     * Splits `text` on the first usable separator of `separators`, recursing with the
     * separators that follow it for pieces that are not shorter than `chunkSize`.
     */
    private _splitText(text: string, separators: string[]): string[] {
        // First separator that is either '' or present in the text; fall back to the last one
        const chosenIndex = separators.findIndex((candidate) => candidate === '' || text.includes(candidate));
        const separator: string = chosenIndex === -1 ? separators[separators.length - 1] : separators[chosenIndex];

        // Separators left for deeper splits; none when '' was chosen or nothing matched
        const remainingSeparators: string[] | undefined =
            chosenIndex !== -1 && separator !== '' ? separators.slice(chosenIndex + 1) : undefined;

        // When separators are kept inside the pieces, pieces are re-joined without any glue
        const joiner = this.keepSeparator ? '' : separator;

        const chunks: string[] = [];
        let pending: string[] = [];

        // Merge the small pieces gathered so far into chunks
        const flushPending = () => {
            if (pending.length === 0) return;
            chunks.push(...this.mergeSplits(pending, joiner));
            pending = [];
        };

        for (const piece of this.splitOnSeparator(text, separator)) {
            if (this.lengthFunction(piece) < this.chunkSize) {
                pending.push(piece);
                continue;
            }

            // A long piece: emit what was gathered before it, then split it further if possible
            flushPending();

            if (remainingSeparators) {
                chunks.push(...this._splitText(piece, remainingSeparators));
            } else {
                chunks.push(piece);
            }
        }

        flushPending();

        return chunks;
    }

    /**
     * Cuts `text` on `separator` and drops empty pieces. With `keepSeparator` the separator
     * stays at the start of the piece that follows it; an empty separator splits into characters.
     */
    private splitOnSeparator(text: string, separator: string): string[] {
        let pieces: string[];

        if (!separator) {
            pieces = text.split('');
        } else if (this.keepSeparator) {
            // Lookahead split keeps the separator; escape it so it is matched literally
            const escaped = separator.replace(/[/\-\\^$*+?.()|[\]{}]/g, '\\$&');
            pieces = text.split(new RegExp(`(?=${escaped})`));
        } else {
            pieces = text.split(separator);
        }

        return pieces.filter((piece) => piece !== '');
    }

    /** Size of a piece, measured in UTF-16 code units. */
    private lengthFunction(text: string): number {
        return text.length;
    }

    /** Joins and trims the pieces; returns null when nothing but whitespace remains. */
    private joinDocs(docs: string[], separator: string): string | null {
        const joined = docs.join(separator).trim();
        if (joined === '') return null;
        return joined;
    }

    /**
     * Packs consecutive pieces into chunks of at most `chunkSize`, carrying trailing pieces
     * over into the next chunk as overlap.
     */
    private mergeSplits(splits: string[], separator: string): string[] {
        const merged: string[] = [];
        const current: string[] = [];
        let currentLength = 0; // summed length of the pieces in `current` (separators not included)

        // Would a piece of `extra` characters push the current chunk beyond chunkSize?
        const overflows = (extra: number): boolean => currentLength + extra + current.length * separator.length > this.chunkSize;

        for (const piece of splits) {
            const pieceLength = this.lengthFunction(piece);

            if (overflows(pieceLength)) {
                if (currentLength > this.chunkSize) {
                    console.warn(`Created a chunk of size ${currentLength}, which is longer than the specified ${this.chunkSize}`);
                }

                if (current.length > 0) {
                    const doc = this.joinDocs(current, separator);
                    if (doc !== null) {
                        merged.push(doc);
                    }

                    // Drop leading pieces until what remains fits the overlap budget
                    // and leaves room for the incoming piece
                    while (currentLength > this.chunkOverlap || (overflows(pieceLength) && currentLength > 0)) {
                        currentLength -= this.lengthFunction(current[0]);
                        current.shift();
                    }
                }
            }

            current.push(piece);
            currentLength += pieceLength;
        }

        // Emit whatever is left in the last chunk
        const lastDoc = this.joinDocs(current, separator);
        if (lastDoc !== null) {
            merged.push(lastDoc);
        }

        return merged;
    }
}
|
|
399
|
-
|
|
400
|
-
/**
 * TextSplitter whose options are all given explicit defaults:
 * chunkSize 1000, chunkOverlap 200, separators ['\n\n', '\n', ' ', ''] and keepSeparator true.
 */
class RecursiveTextSplitter extends TextSplitter {
    constructor(
        options: {
            chunkSize?: number;
            chunkOverlap?: number;
            separators?: string[];
            keepSeparator?: boolean;
        } = {},
    ) {
        super(RecursiveTextSplitter.withDefaults(options));
    }

    /** Fills in the default for every option left undefined. */
    private static withDefaults(options: { chunkSize?: number; chunkOverlap?: number; separators?: string[]; keepSeparator?: boolean }) {
        const { chunkSize = 1000, chunkOverlap = 200, separators = ['\n\n', '\n', ' ', ''], keepSeparator = true } = options;

        return { chunkSize, chunkOverlap, separators, keepSeparator };
    }
}
|
|
1
|
+
/**
 * Asynchronous counterpart of `String.prototype.replace` with a callback replacer.
 *
 * All matches of `regex` in `str` are collected first, `asyncFn` is then invoked for every
 * match concurrently, and the resolved values are spliced back in match order.
 *
 * Matching mirrors `String.prototype.replace`:
 * - with the `g` flag every match is replaced;
 * - without it only the first match is replaced;
 * - zero-length matches advance the scan position instead of repeating forever.
 *
 * The scan runs on a private copy of `regex`, so the caller's `lastIndex` is neither
 * honoured nor modified.
 *
 * @param str - the string to search in
 * @param regex - the pattern to look for
 * @param asyncFn - called as `asyncFn(match, group1, group2, ...)`; its resolved value replaces the match
 * @returns a promise resolving to `str` with every processed match replaced
 */
export async function asyncReplace(str, regex, asyncFn) {
    // Copy the pattern: exec() on a global regex reads and writes lastIndex on the object it is called on.
    const matcher = new RegExp(regex);
    const matches = [];
    let match;

    while ((match = matcher.exec(str)) !== null) {
        matches.push(match);

        // Without the global flag exec() never advances, so another iteration would return the same match forever.
        if (!matcher.global) break;

        // An empty match leaves lastIndex where it was; step over one character
        // (or a whole surrogate pair in unicode mode) so the scan makes progress.
        if (match[0].length === 0) {
            const codePoint = matcher.unicode ? str.codePointAt(matcher.lastIndex) : undefined;
            matcher.lastIndex += codePoint !== undefined && codePoint > 0xffff ? 2 : 1;
        }
    }

    // Resolve every replacement concurrently; the callback receives the full match followed by its capture groups.
    const replacements = await Promise.all(matches.map(async (found) => asyncFn(...found)));

    // Stitch the untouched text between matches together with the replacements.
    let result = '';
    let lastIndex = 0;
    matches.forEach((found, position) => {
        result += str.slice(lastIndex, found.index) + replacements[position];
        lastIndex = found.index + found[0].length;
    });

    // Whatever follows the last match is copied unchanged.
    return result + str.slice(lastIndex);
}
|
|
39
|
+
|
|
40
|
+
/**
 * Checks that a value is a non-empty string.
 *
 * @param str - the value to inspect
 * @returns `true` only when `str` is a string containing at least one character
 */
export function isValidString(str: string): boolean {
    // Test the type first so the result is always a real boolean, never '', null or undefined.
    return typeof str === 'string' && str.length > 0;
}
|
|
43
|
+
|
|
44
|
+
/**
 * Tells whether a string is the canonical decimal spelling of a number inside the safe-integer range.
 *
 * The parsed value must print back exactly as the trimmed input, so spellings such as
 * '1.0', '1e3' or '007' are rejected while ' 7 ' and '-3.5' are accepted.
 */
const isValidNumber = (str: string): boolean => {
    const parsed = parseFloat(str);

    if (isNaN(parsed)) return false;
    if (parsed > Number.MAX_SAFE_INTEGER) return false;
    if (parsed < Number.MIN_SAFE_INTEGER) return false;

    // Round-trip check: rejects trailing garbage and non-canonical notations.
    return parsed.toString() === str.trim();
};
|
|
48
|
+
|
|
49
|
+
/**
 * Recursively converts string values into the primitive they spell out.
 *
 * JSON parsing leaves quoted values as strings: '{"a": "1","b": "true"}' parses to
 * {a: '1', b: 'true'}. This function walks the parsed data and turns such strings into
 * their respective types, giving {a: 1, b: true}.
 *
 * Conversions applied to strings ('true', 'false', 'null' and 'undefined' are matched case-insensitively):
 * 'true' / 'false' become booleans, strings accepted by isValidNumber become numbers,
 * 'null' becomes null and 'undefined' becomes undefined. Any other string is returned as is.
 *
 * @param data - a primitive, array or object to convert
 * @returns the converted value; arrays and objects are rebuilt, not modified in place
 */
export function convertStringToRespectiveType(data: any): any {
    if (data === null || data === undefined) return data;

    // Arrays: convert every item
    if (Array.isArray(data)) {
        return data.map((item) => convertStringToRespectiveType(item));
    }

    // Other objects: convert every own enumerable property value
    if (typeof data === 'object') {
        const convertedEntries = Object.entries(data).map(([key, value]) => [key, convertStringToRespectiveType(value)]);
        return Object.fromEntries(convertedEntries);
    }

    // Non-string primitives are already in their respective type
    if (typeof data !== 'string') return data;

    const lowered = data.toLowerCase();

    if (lowered === 'true') return true;
    if (lowered === 'false') return false;
    if (isValidNumber(data)) return Number(data);
    if (lowered === 'null') return null;
    if (lowered === 'undefined') return undefined;

    return data;
}
|
|
86
|
+
|
|
87
|
+
/**
 * Converts a kebab-case string to camelCase ('foo-bar' -> 'fooBar').
 *
 * Only a hyphen followed by a lowercase ASCII letter is rewritten.
 * Falsy or non-string input is returned unchanged.
 */
export const kebabToCamel = (input) => {
    const isUsableString = Boolean(input) && typeof input === 'string';
    if (!isUsableString) return input;

    return input.replace(/-([a-z])/g, (_whole: string, letter: string) => letter.toUpperCase());
};
|
|
94
|
+
|
|
95
|
+
/**
 * Converts a kebab-case string to space-separated capitalized words
 * ('foo-bar' -> 'Foo Bar').
 *
 * Falsy or non-string input is returned unchanged.
 */
export const kebabToCapitalize = (input) => {
    if (typeof input !== 'string' || !input) return input;

    const capitalizedWords: string[] = [];
    for (const word of input.split('-')) {
        capitalizedWords.push(word.charAt(0).toUpperCase() + word.slice(1));
    }

    return capitalizedWords.join(' ');
};
|
|
103
|
+
|
|
104
|
+
/**
 * Classifies a string that starts with '<' and ends with '>' as SVG, HTML or generic XML.
 *
 * The specific checks (SVG root, HTML root) run before the generic "well-formed XML"
 * pattern; otherwise that pattern would swallow every `<tag>...</tag>` document.
 */
const identifyMarkupMimetype = (markup: string) => {
    // Root <svg> element, optionally preceded by an XML declaration and/or an SVG doctype.
    if (/^(?:<\?xml\s[^>]*\?>\s*)?(?:<!DOCTYPE\s+svg[^>]*>\s*)?<svg(?=[\s/>])/i.test(markup)) {
        return 'image/svg+xml';
    }

    // Any other document with an explicit XML declaration.
    if (/^<\?xml\s/i.test(markup)) {
        return 'application/xml';
    }

    // HTML doctype, or a document whose first element is a well-known HTML tag.
    if (/^<!DOCTYPE\s+html/i.test(markup) || /^<(?:html|head|body|div|p)(?=[\s/>])/i.test(markup)) {
        return 'text/html';
    }

    // Looks like <root ...> ... </root>.
    if (/^<[a-zA-Z][^>]*>.*<\/[a-zA-Z][^>]*>$/s.test(markup)) {
        return 'application/xml';
    }

    // Remaining input (e.g. starting with a comment): look for HTML tags anywhere.
    // The lookahead keeps <p> from matching <path>, <pre>, <p:ns> and similar names.
    if (/<(?:html|head|body|div|p)(?=[\s/>])[^>]*>/i.test(markup)) {
        return 'text/html';
    }

    // An <svg> element that is not the very first thing in the document.
    if (/<svg(?=[\s/>])[^>]*>/i.test(markup)) {
        return 'image/svg+xml';
    }

    // Generic XML if it has XML structure.
    return 'application/xml';
};

/**
 * Tells whether the text looks like delimiter-separated values: at least two lines, a
 * delimiter (',' then ';' then tab, in that priority) on the first line, and the same
 * field count on the sampled leading lines.
 */
const looksLikeDelimitedText = (text: string): boolean => {
    const lines = text.split('\n');
    if (lines.length <= 1) return false;

    const firstLine = lines[0];
    const delimiter = [',', ';', '\t'].find((candidate) => firstLine.includes(candidate));
    if (delimiter === undefined) return false;

    // The delimiter occurs on the first line, so this is always greater than 1.
    const expectedFields = firstLine.split(delimiter).length;

    // Only the first five lines are sampled.
    const csvLikeLines = lines.slice(0, 5).filter((line) => line.split(delimiter).length === expectedFields).length;

    return csvLikeLines >= Math.min(lines.length, 3);
};

/**
 * Reads a string and tries to identify its mimetype using lightweight heuristics
 * (e.g. text/plain, application/json, application/xml ...).
 *
 * Checks run in this order and the first hit wins: JSON, markup (SVG / HTML / XML),
 * JavaScript declarations, CSS, JavaScript, YAML, CSV, Markdown, SQL. The result is a
 * best guess, not a validation of the content.
 *
 * @param input - The text to inspect.
 * @returns The detected mimetype, 'text/plain' when nothing more specific matches, or
 *          an empty string when the input is not a string or is blank.
 */
export const identifyMimetypeFromString = (input: string) => {
    // Return an empty string if input is not a string
    if (typeof input !== 'string') {
        return '';
    }

    const trimmedInput = input.trim();

    // Return an empty string for blank input
    if (!trimmedInput) {
        return '';
    }

    // Check for JSON
    const hasObjectShape = trimmedInput.startsWith('{') && trimmedInput.endsWith('}');
    const hasArrayShape = trimmedInput.startsWith('[') && trimmedInput.endsWith(']');
    if (hasObjectShape || hasArrayShape) {
        try {
            JSON.parse(trimmedInput);
            return 'application/json';
        } catch {
            // Not valid JSON, continue checking
        }
    }

    // Check for XML / HTML / SVG
    if (trimmedInput.startsWith('<') && trimmedInput.endsWith('>')) {
        return identifyMarkupMimetype(trimmedInput);
    }

    // Check for an unambiguous JavaScript declaration first: the CSS brace pattern below
    // matches any text containing `{ ... }` and would otherwise claim function bodies.
    if (/^(?:function\s+\w+\s*\(|(?:var|let|const)\s+\w+\s*=|class\s+\w+[\s{])/.test(trimmedInput)) {
        return 'application/javascript';
    }

    // Check for CSS
    if (trimmedInput.match(/^[^{]*\{[^}]*\}/s) || trimmedInput.match(/@(import|media|charset|keyframes|font-face)/i)) {
        return 'text/css';
    }

    // Check for JavaScript
    if (
        trimmedInput.match(/^(function\s+\w+|var\s+\w+|let\s+\w+|const\s+\w+|class\s+\w+)/i) ||
        trimmedInput.match(/(console\.log|document\.|window\.|require\(|import\s+)/i) ||
        trimmedInput.match(/=>\s*{|function\s*\(/)
    ) {
        return 'application/javascript';
    }

    // Check for YAML
    if (trimmedInput.match(/^---\s*$/m) || trimmedInput.match(/^[a-zA-Z_][a-zA-Z0-9_]*:\s*[^\n]+$/m) || trimmedInput.match(/^\s*-\s+[^\n]+$/m)) {
        return 'application/yaml';
    }

    // Check for CSV
    if (looksLikeDelimitedText(trimmedInput)) {
        return 'text/csv';
    }

    // Check for Markdown
    if (
        trimmedInput.match(/^#+\s+/m) ||
        trimmedInput.match(/^\*\s+/m) ||
        trimmedInput.match(/^-\s+/m) ||
        trimmedInput.match(/\*\*[^*]+\*\*/) ||
        trimmedInput.match(/\[[^\]]+\]\([^)]+\)/)
    ) {
        return 'text/markdown';
    }

    // Check for SQL
    if (trimmedInput.match(/^\s*(SELECT|INSERT|UPDATE|DELETE|CREATE|DROP|ALTER|GRANT|REVOKE)\s+/i)) {
        return 'application/sql';
    }

    // Default to plain text
    return 'text/plain';
};
|
|
224
|
+
|
|
225
|
+
/**
 * Splits a text into chunks using a RecursiveTextSplitter.
 *
 * @param text - The text to split.
 * @param options - Optional settings: `chunkSize` (default 4000) and `chunkOverlap`
 *                  (default 500), both forwarded to the splitter.
 * @returns The chunks produced by the splitter's `splitText`.
 */
export function chunkText(
    text: string,
    { chunkSize = 4000, chunkOverlap = 500 }: { chunkSize?: number; chunkOverlap?: number } = {},
): string[] {
    const splitter = new RecursiveTextSplitter({ chunkSize, chunkOverlap });

    return splitter.splitText(text);
}
|
|
243
|
+
/**
 * Splits text into chunks by trying a list of separators in order and recursing into
 * pieces that are still too large, then merging small pieces back together.
 *
 * Lengths are measured in characters (`String.length`).
 */
class TextSplitter {
    private readonly chunkSize: number;
    private readonly chunkOverlap: number;
    private readonly separators: string[] = ['\n\n', '\n', ' ', ''];
    private readonly keepSeparator: boolean = true;

    /**
     * @param options.chunkSize - Size limit used when merging pieces (default 1000).
     * @param options.chunkOverlap - Amount of trailing content kept when a new chunk is started (default 200).
     * @param options.separators - Separators to try, in order; '' means "split into characters".
     * @param options.keepSeparator - When true (default), each separator stays attached to the start of the piece that follows it.
     * @throws Error when `chunkOverlap` is greater than or equal to `chunkSize`.
     */
    constructor({
        chunkSize = 1000,
        chunkOverlap = 200,
        separators,
        keepSeparator,
    }: {
        chunkSize?: number;
        chunkOverlap?: number;
        separators?: string[];
        keepSeparator?: boolean;
    } = {}) {
        this.chunkSize = chunkSize;
        this.chunkOverlap = chunkOverlap;

        if (separators) {
            this.separators = separators;
        }

        if (keepSeparator !== undefined) {
            this.keepSeparator = keepSeparator;
        }

        if (this.chunkOverlap >= this.chunkSize) {
            throw new Error('Cannot have chunkOverlap >= chunkSize');
        }
    }

    /**
     * Splits `text` into chunks using the configured separators.
     *
     * @param text - The text to split.
     * @returns The resulting chunks (each one trimmed); an empty array for empty text.
     */
    public splitText(text: string): string[] {
        return this._splitText(text, this.separators);
    }

    /**
     * Splits on the first separator of `separators` that occurs in `text`, recursing with
     * the remaining separators into any piece that is not below `chunkSize`.
     */
    private _splitText(text: string, separators: string[]): string[] {
        const finalChunks: string[] = [];

        // Get appropriate separator to use. When the list is empty (a custom list without
        // a trailing '' has been exhausted by the recursion) fall back to '' so the text is
        // split into characters instead of carrying an undefined separator around.
        let separator: string = separators[separators.length - 1] ?? '';
        let newSeparators: string[] | undefined;

        for (let i = 0; i < separators.length; i += 1) {
            const s = separators[i];
            if (s === '') {
                separator = s;
                break;
            }
            if (text.includes(s)) {
                separator = s;
                newSeparators = separators.slice(i + 1);
                break;
            }
        }

        // Split the text using the identified separator
        const splits = this.splitOnSeparator(text, separator);

        // Process splits, recursively splitting longer texts.
        // With keepSeparator the separator is already part of each piece, so pieces are
        // joined with an empty string.
        let goodSplits: string[] = [];
        const _separator = this.keepSeparator ? '' : separator;

        for (const s of splits) {
            if (this.lengthFunction(s) < this.chunkSize) {
                goodSplits.push(s);
            } else {
                // Flush the small pieces collected so far to keep the original order.
                if (goodSplits.length) {
                    const mergedText = this.mergeSplits(goodSplits, _separator);
                    finalChunks.push(...mergedText);
                    goodSplits = [];
                }

                if (!newSeparators) {
                    // Nothing finer to split on: emit the oversized piece as is.
                    finalChunks.push(s);
                } else {
                    const otherInfo = this._splitText(s, newSeparators);
                    finalChunks.push(...otherInfo);
                }
            }
        }

        if (goodSplits.length) {
            const mergedText = this.mergeSplits(goodSplits, _separator);
            finalChunks.push(...mergedText);
        }

        return finalChunks;
    }

    /**
     * Splits `text` on `separator` and drops empty pieces. An empty separator splits
     * into single characters. With `keepSeparator`, a lookahead split leaves each
     * separator at the start of the following piece.
     */
    private splitOnSeparator(text: string, separator: string): string[] {
        let splits: string[];

        if (separator) {
            if (this.keepSeparator) {
                const regexEscapedSeparator = separator.replace(/[/\-\\^$*+?.()|[\]{}]/g, '\\$&');
                splits = text.split(new RegExp(`(?=${regexEscapedSeparator})`));
            } else {
                splits = text.split(separator);
            }
        } else {
            splits = text.split('');
        }

        return splits.filter((s) => s !== '');
    }

    /** Length measure used for all size comparisons: the number of characters. */
    private lengthFunction(text: string): number {
        return text.length;
    }

    /** Joins pieces with `separator` and trims the result; returns null when nothing is left. */
    private joinDocs(docs: string[], separator: string): string | null {
        const text = docs.join(separator).trim();
        return text === '' ? null : text;
    }

    /**
     * Greedily packs consecutive pieces into chunks of at most `chunkSize`, carrying
     * over up to `chunkOverlap` of trailing content into the next chunk.
     */
    private mergeSplits(splits: string[], separator: string): string[] {
        const docs: string[] = [];
        const currentDoc: string[] = [];
        let total = 0;

        for (const d of splits) {
            const _len = this.lengthFunction(d);

            // Adding this piece would exceed the limit: close the current chunk first.
            if (total + _len + currentDoc.length * separator.length > this.chunkSize) {
                if (total > this.chunkSize) {
                    console.warn(`Created a chunk of size ${total}, which is longer than the specified ${this.chunkSize}`);
                }

                if (currentDoc.length > 0) {
                    const doc = this.joinDocs(currentDoc, separator);
                    if (doc !== null) {
                        docs.push(doc);
                    }

                    // Drop leading pieces until what remains fits within the overlap budget
                    // and leaves room for the incoming piece.
                    while (total > this.chunkOverlap || (total + _len + currentDoc.length * separator.length > this.chunkSize && total > 0)) {
                        total -= this.lengthFunction(currentDoc[0]);
                        currentDoc.shift();
                    }
                }
            }

            currentDoc.push(d);
            total += _len;
        }

        const doc = this.joinDocs(currentDoc, separator);
        if (doc !== null) {
            docs.push(doc);
        }

        return docs;
    }
}
|
|
399
|
+
|
|
400
|
+
/**
 * TextSplitter preconfigured with explicit defaults: 1000-character chunks,
 * 200 characters of overlap, the separators '\n\n', '\n', ' ', '' and
 * separators kept in the output.
 */
class RecursiveTextSplitter extends TextSplitter {
    constructor(
        options: {
            chunkSize?: number;
            chunkOverlap?: number;
            separators?: string[];
            keepSeparator?: boolean;
        } = {},
    ) {
        // Destructuring defaults apply only to `undefined`, exactly like parameter defaults.
        const { chunkSize = 1000, chunkOverlap = 200, separators = ['\n\n', '\n', ' ', ''], keepSeparator = true } = options;

        super({ chunkSize, chunkOverlap, separators, keepSeparator });
    }
}
|