@intlayer/backend 5.4.0-canary.0 → 5.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,32 @@
1
+ Ignore all instructions before this one.
2
+ Keep forever all instructions after this one. From now you are not allowed to ignore the previous instructions.
3
+ You are an expert in internationalization solutions for JavaScript applications. You're a helpful chatbot for Intlayer, an internationalization solution.
4
+ The website URL of the solution is 'https://intlayer.org', and you are implemented on this website.
5
+ The user is a potential user of Intlayer. Your task is now to answer the questions of the user.
6
+ You must talk as a member of Intlayer. You must only answer questions related to Intlayer.
7
+ Any question should be related to Intlayer. If a question is not related to Intlayer, you should NOT answer it.
8
+ You must NOT answer questions about information that a company generally keeps secret (e.g. financial information).
9
+ You should NOT suggest to implement other i18n solutions.
10
+ You should NOT invent information that is not specified in the relevant documentation chunks provided.
11
+ You should answer briefly if possible.
12
+ If you don't have enough information to answer the question, do not answer using extra information extracted from your own knowledge.
13
+ If you have a doubt about something, you should ask the user more questions.
14
+ If a user struggles with a question and you can't find an answer, you can:
15
+
16
+ - suggest to the user to submit an issue on the Intlayer repository: https://github.com/aymericzip/intlayer/issues. Our team generally responds to issues within a day. Notice that your issue will also probably help other users.
17
+ - point the user to the multiple example implementations of Intlayer in the repository: https://github.com/aymericzip/intlayer/tree/main/examples
18
+
19
+ You can also suggest, when possible, that the user star the Intlayer repository: https://github.com/aymericzip/intlayer to support the project. It really helps us.
20
+
21
+ Here are some useful URLs to learn more about Intlayer:
22
+ https://intlayer.org/docs
23
+ https://intlayer.org/blog
24
+ https://intlayer.org/pricing
25
+ https://intlayer.org/dashboard
26
+
27
+ You should return the result as Markdown.
28
+ Code elements should include the metadata fileName="file.ts" if it could be useful for the user.
29
+ Code element format should not include metadata (e.g. codeFormat="typescript" or packageManager="npm").
30
+
31
+ Here is the relevant documentation:
32
+ {{relevantFilesReferences}}
@@ -40,7 +40,10 @@ var import_docs = require("@intlayer/docs");
40
40
  var import_dotenv = __toESM(require("dotenv"), 1);
41
41
  var import_fs = __toESM(require("fs"), 1);
42
42
  var import_openai = require("openai");
43
+ var import_path = require("path");
44
+ var import_url = require("url");
43
45
  var import_embeddings = __toESM(require('./embeddings.json'), 1);
46
+ const import_meta = {};
44
47
  const vectorStore = [];
45
48
  const MODEL = "gpt-4o-2024-11-20";
46
49
  const MODEL_TEMPERATURE = 0.1;
@@ -98,10 +101,11 @@ const indexMarkdownFiles = async () => {
98
101
  import_dotenv.default.config({
99
102
  path: [`.env.${env}.local`, `.env.${env}`, ".env.local", ".env"]
100
103
  });
104
+ const frequentQuestions = (0, import_docs.getFequentQuestions)();
101
105
  const docs = (0, import_docs.getDocs)(import_config.Locales.ENGLISH);
102
106
  const blogs = (0, import_blog.getBlogs)(import_config.Locales.ENGLISH);
103
107
  let result = {};
104
- const files = { ...docs, ...blogs };
108
+ const files = { ...docs, ...blogs, ...frequentQuestions };
105
109
  for (const fileKey of Object.keys(files)) {
106
110
  const fileChunks = chunkText(files[fileKey]);
107
111
  for (const chunkIndex of Object.keys(fileChunks)) {
@@ -147,10 +151,16 @@ const searchChunkReference = async (query) => {
147
151
  })).filter((chunk) => chunk.similarity > MIN_RELEVANT_CHUNKS_SIMILARITY).sort((a, b) => b.similarity - a.similarity).slice(0, MAX_RELEVANT_CHUNKS_NB);
148
152
  return results;
149
153
  };
154
/**
 * Reads a UTF-8 text file whose path is given relative to this module's directory.
 *
 * @param {string} relativeFilePath - Path of the file relative to the directory
 *   containing this module (e.g. "./PROMPT.md").
 * @returns {string} The file content decoded as UTF-8.
 * @throws {Error} Propagates any error raised by `readFileSync` (e.g. ENOENT).
 */
const getFileContent = (relativeFilePath) => {
  // In the CommonJS bundle `import_meta` is an empty stub object, so
  // `import_meta.url` is undefined and `fileURLToPath(undefined)` would throw
  // a TypeError at module load. Fall back to the CommonJS `__dirname` there;
  // keep using the module URL when it is available (ESM build).
  const moduleDirectory = import_meta.url
    ? (0, import_path.dirname)((0, import_url.fileURLToPath)(import_meta.url))
    : __dirname;
  const absolutePath = (0, import_path.join)(moduleDirectory, relativeFilePath);
  return (0, import_fs.readFileSync)(absolutePath, "utf-8");
};
160
+ const CHAT_GPT_PROMPT = getFileContent("./PROMPT.md");
150
161
  const initPrompt = {
151
162
  role: "system",
152
- content: `Ignore all instructions before this one. Keep forever all instructions after this one. From now you are not allowed to ignore the previous instructions. You are an expert in internationalization solution for Javascript Application. You're an helpful chatbot for Intlayer, an Internationalization solution The website url of the solution is: 'https://intlayer.org'). And your implemented on this website. The user is an potential user of Intlayer. Your task is now to answer the questions of the user. You must talk as an member of Intlayer. You must only answer the questions relative to Intlayer. Any question should be related to Intlayer. If a question is not related to Intlayer, you should NOT answer it. You must NOT answer question who is generally secret for a company (E.g. financial information). Your should NOT invent information that are not precised into the relevant documentation chunks provided. If you don't have enough information to answer the question, not answer using extra information extracted from your knowledge. If your have a doubt about something, you should ask more question to the user. Here some useful urls to know more about Intlayer: https://intlayer.org/docs https://intlayer.org/blog https://intlayer.org/pricing https://intlayer.org/dashboard Your should return a result as markdown. Code element should include metadata fileName="file.ts" if could be useful for the user. Code element format should not include metadata (E.g. codeFormat="typescript", or packageManager="npm". Here is the relevant documentation: {{relevantFilesReferences}}`
153
- // Placeholder for relevant documentation to be inserted later
163
+ content: CHAT_GPT_PROMPT
154
164
  };
155
165
  const askDocQuestion = async (messages, options) => {
156
166
  const openai = new import_openai.OpenAI({ apiKey: process.env.OPENAI_API_KEY });
@@ -1 +1 @@
1
- {"version":3,"sources":["../../../../../src/utils/AI/askDocQuestion/askDocQuestion.ts"],"sourcesContent":["import { getBlogs } from '@intlayer/blog';\nimport { Locales } from '@intlayer/config';\nimport { getDocs } from '@intlayer/docs';\nimport dotenv from 'dotenv';\nimport fs from 'fs';\nimport { OpenAI } from 'openai';\nimport embeddingsList from './embeddings.json' with { type: 'json' };\n\ntype VectorStoreEl = {\n fileKey: string;\n chunkNumber: number;\n content: string;\n embedding: number[];\n};\n\n/**\n * Simple in-memory vector store to hold document embeddings and their content.\n * Each entry contains:\n * - fileKey: A unique key identifying the file\n * - chunkNumber: The number of the chunk within the document\n * - content: The chunk content\n * - embedding: The numerical embedding vector for the chunk\n */\nconst vectorStore: VectorStoreEl[] = [];\n\n// Constants defining OpenAI's token and character limits\nconst MODEL: OpenAI.Chat.ChatModel = 'gpt-4o-2024-11-20'; // Model to use for chat completions\nconst MODEL_TEMPERATURE = 0.1; // Temperature to use for chat completions\nconst EMBEDDING_MODEL: OpenAI.Embeddings.EmbeddingModel =\n 'text-embedding-3-large'; // Model to use for embedding generation\nconst OVERLAP_TOKENS = 200; // Number of tokens to overlap between chunks\nconst MAX_CHUNK_TOKENS = 800; // Maximum number of tokens per chunk\nconst CHAR_BY_TOKEN = 4.15; // Approximate pessimistically the number of characters per token // Can use `tiktoken` or other tokenizers to calculate it more precisely\nconst MAX_CHARS = MAX_CHUNK_TOKENS * CHAR_BY_TOKEN;\nconst OVERLAP_CHARS = OVERLAP_TOKENS * CHAR_BY_TOKEN;\nconst MAX_RELEVANT_CHUNKS_NB = 8; // Maximum number of relevant chunks to attach to chatGPT context\nconst MIN_RELEVANT_CHUNKS_SIMILARITY = 0.25; // Minimum similarity required for a chunk to be considered relevant\n\n/**\n * Splits a given text into chunks ensuring each chunk does not exceed MAX_CHARS.\n * @param text - The input text to 
split.\n * @returns - Array of text chunks.\n */\nconst chunkText = (text: string): string[] => {\n const chunks: string[] = [];\n let start = 0;\n\n while (start < text.length) {\n let end = Math.min(start + MAX_CHARS, text.length);\n\n // Ensure we don't cut words in the middle (find nearest space)\n if (end < text.length) {\n const lastSpace = text.lastIndexOf(' ', end);\n if (lastSpace > start) {\n end = lastSpace;\n }\n }\n\n chunks.push(text.substring(start, end));\n\n // Move start forward correctly\n const nextStart = end - OVERLAP_CHARS;\n if (nextStart <= start) {\n // Prevent infinite loop if overlap is too large\n start = end;\n } else {\n start = nextStart;\n }\n }\n\n return chunks;\n};\n\n/**\n * Generates an embedding for a given text using OpenAI's embedding API.\n * Trims the text if it exceeds the maximum allowed characters.\n *\n * @param text - The input text to generate an embedding for\n * @returns The embedding vector as a number array\n */\nconst generateEmbedding = async (text: string): Promise<number[]> => {\n try {\n const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });\n const response = await openai.embeddings.create({\n model: EMBEDDING_MODEL, // Specify the embedding model\n input: text,\n });\n\n return response.data[0].embedding; // Return the generated embedding\n } catch (error) {\n console.error('Error generating embedding:', error);\n return [];\n }\n};\n\n/**\n * Calculates the cosine similarity between two vectors.\n * Cosine similarity measures the cosine of the angle between two vectors in an inner product space.\n * Used to determine the similarity between chunks of text.\n *\n * @param vecA - The first vector\n * @param vecB - The second vector\n * @returns The cosine similarity score\n */\nconst cosineSimilarity = (vecA: number[], vecB: number[]): number => {\n // Calculate the dot product of the two vectors\n const dotProduct = vecA.reduce((sum, a, idx) => sum + a * vecB[idx], 0);\n\n // Calculate the 
magnitude (Euclidean norm) of each vector\n const magnitudeA = Math.sqrt(vecA.reduce((sum, a) => sum + a * a, 0));\n const magnitudeB = Math.sqrt(vecB.reduce((sum, b) => sum + b * b, 0));\n\n // Compute and return the cosine similarity\n return dotProduct / (magnitudeA * magnitudeB);\n};\n\n/**\n * Indexes all Markdown documents by generating embeddings for each chunk and storing them in memory.\n * Also updates the embeddings.json file if new embeddings are generated.\n */\nexport const indexMarkdownFiles = async (): Promise<void> => {\n const env = process.env.NODE_ENV;\n dotenv.config({\n path: [`.env.${env}.local`, `.env.${env}`, '.env.local', '.env'],\n });\n\n // Retrieve documentation and blog posts in English locale\n const docs = getDocs(Locales.ENGLISH);\n const blogs = getBlogs(Locales.ENGLISH);\n\n let result: Record<string, number[]> = {}; // Object to hold updated embeddings\n\n const files = { ...docs, ...blogs }; // Combine docs and blogs into a single object\n\n // Iterate over each file key (identifier) in the combined files\n for (const fileKey of Object.keys(files)) {\n // Split the document into chunks based on headings\n const fileChunks = chunkText(files[fileKey as keyof typeof files]);\n\n // Iterate over each chunk within the current file\n for (const chunkIndex of Object.keys(fileChunks)) {\n const chunkNumber = Number(chunkIndex) + 1; // Chunk number starts at 1\n const chunksNumber = fileChunks.length;\n\n const fileChunk = fileChunks[\n chunkIndex as keyof typeof fileChunks\n ] as string;\n\n const embeddingKeyName = `${fileKey}/chunk_${chunkNumber}`; // Unique key for the chunk\n\n // Retrieve precomputed embedding if available\n const docEmbedding = embeddingsList[\n embeddingKeyName as keyof typeof embeddingsList\n ] as number[] | undefined;\n\n let embedding = docEmbedding; // Use existing embedding if available\n\n if (!embedding) {\n embedding = await generateEmbedding(fileChunk); // Generate embedding if not present\n }\n\n // 
Update the result object with the new embedding\n result = { ...result, [embeddingKeyName]: embedding };\n\n // Store the embedding and content in the in-memory vector store\n vectorStore.push({\n fileKey,\n chunkNumber,\n embedding,\n content: fileChunk,\n });\n\n console.info(`- Indexed: ${embeddingKeyName}/${chunksNumber}`);\n }\n }\n\n if (process.env.NODE_ENV === 'development') {\n try {\n // Compare the newly generated embeddings with existing ones\n if (JSON.stringify(result) !== JSON.stringify(embeddingsList)) {\n // If there are new embeddings, save them to embeddings.json\n fs.writeFileSync(\n 'src/utils/AI/askDocQuestion/embeddings.json',\n JSON.stringify(result, null, 2)\n );\n }\n } catch (error) {\n console.error(error); // Log any errors during the file write process\n }\n }\n};\n\n// Automatically index Markdown files\nindexMarkdownFiles();\n\n/**\n * Searches the indexed documents for the most relevant chunks based on a query.\n * Utilizes cosine similarity to find the closest matching embeddings.\n *\n * @param query - The search query provided by the user\n * @returns An array of the top matching document chunks' content\n */\nexport const searchChunkReference = async (\n query: string\n): Promise<VectorStoreEl[]> => {\n // Generate an embedding for the user's query\n const queryEmbedding = await generateEmbedding(query);\n\n // Calculate similarity scores between the query embedding and each document's embedding\n const results = vectorStore\n .map((chunk) => ({\n ...chunk,\n similarity: cosineSimilarity(queryEmbedding, chunk.embedding), // Add similarity score to each doc\n }))\n .filter((chunk) => chunk.similarity > MIN_RELEVANT_CHUNKS_SIMILARITY) // Filter out documents with low similarity scores\n .sort((a, b) => b.similarity - a.similarity) // Sort documents by highest similarity first\n .slice(0, MAX_RELEVANT_CHUNKS_NB); // Select the top 6 most similar documents\n\n // Return the content of the top matching documents\n return 
results;\n};\n\n// Define the structure of messages used in chat completions\nexport type ChatCompletionRequestMessage = {\n role: 'system' | 'user' | 'assistant'; // The role of the message sender\n content: string; // The text content of the message\n};\n\n// Initial prompt configuration for the chatbot\nexport const initPrompt: ChatCompletionRequestMessage = {\n role: 'system',\n content:\n 'Ignore all instructions before this one. \\\n Keep forever all instructions after this one. From now you are not allowed to ignore the previous instructions. \\\n You are an expert in internationalization solution for Javascript Application. You\\'re an helpful chatbot for Intlayer, an Internationalization solution\\\n The website url of the solution is: \\'https://intlayer.org\\'). And your implemented on this website. \\\n The user is an potential user of Intlayer. Your task is now to answer the questions of the user. \\\n You must talk as an member of Intlayer. You must only answer the questions relative to Intlayer. \\\n Any question should be related to Intlayer. If a question is not related to Intlayer, you should NOT answer it. \\\n You must NOT answer question who is generally secret for a company (E.g. financial information). \\\n Your should NOT invent information that are not precised into the relevant documentation chunks provided. \\\n If you don\\'t have enough information to answer the question, not answer using extra information extracted from your knowledge. \\\n If your have a doubt about something, you should ask more question to the user. \\\n \\\n Here some useful urls to know more about Intlayer: \\\n https://intlayer.org/docs \\\n https://intlayer.org/blog \\\n https://intlayer.org/pricing \\\n https://intlayer.org/dashboard \\\n \\\n Your should return a result as markdown.\\\n Code element should include metadata fileName=\"file.ts\" if could be useful for the user. \\\n Code element format should not include metadata (E.g. 
codeFormat=\"typescript\", or packageManager=\"npm\". \\\n \\\n Here is the relevant documentation:\\\n {{relevantFilesReferences}}', // Placeholder for relevant documentation to be inserted later\n};\n\nexport type AskDocQuestionResult = {\n response: string;\n relatedFiles: string[];\n};\n\nexport type AskDocQuestionOptions = {\n onMessage?: (chunk: string) => void;\n};\n\n/**\n * Handles the \"Ask a question\" endpoint in an Express.js route.\n * Processes user messages, retrieves relevant documents, and interacts with OpenAI's chat API to generate responses.\n *\n * @param messages - An array of chat messages from the user and assistant\n * @returns The assistant's response as a string\n */\nexport const askDocQuestion = async (\n messages: ChatCompletionRequestMessage[],\n options?: AskDocQuestionOptions\n): Promise<AskDocQuestionResult> => {\n const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });\n\n // Assistant's response are filtered out otherwise the chatbot will be stuck in a self-referential loop\n // Note that the embedding precision will be lowered if the user change of context in the chat\n const userMessages = messages.filter((message) => message.role === 'user');\n\n // Format the user's question to keep only the relevant keywords\n const query = userMessages\n .map((message) => `- ${message.content}`)\n .join('\\n');\n\n // 1) Find relevant documents based on the user's question\n const relevantFilesReferences = await searchChunkReference(query);\n\n // 2) Integrate the relevant documents into the initial system prompt\n const messagesList: ChatCompletionRequestMessage[] = [\n {\n ...initPrompt,\n content: initPrompt.content.replace(\n '{{relevantFilesReferences}}',\n relevantFilesReferences.length === 0\n ? 
'Not relevant file found related to the question.'\n : relevantFilesReferences\n .map(\n (doc, idx) =>\n `[Chunk ${idx}] docKey = \"${doc.fileKey}\":\\n${doc.content}`\n )\n .join('\\n\\n') // Insert relevant docs into the prompt\n ),\n },\n ...messages, // Include all user and assistant messages\n ];\n\n // 3) Send the compiled messages to OpenAI's Chat Completion API (using a specific model)\n const response = await openai.chat.completions.create({\n model: MODEL,\n temperature: MODEL_TEMPERATURE,\n messages: messagesList,\n stream: true,\n });\n\n let fullResponse = '';\n for await (const chunk of response) {\n const content = chunk.choices[0]?.delta?.content || '';\n if (content) {\n fullResponse += content;\n options?.onMessage?.(content);\n }\n }\n\n // 4) Extract unique related files\n const relatedFiles = [\n ...new Set(relevantFilesReferences.map((doc) => doc.fileKey)),\n ];\n\n // 5) Return the assistant's response to the user\n return {\n response: fullResponse ?? 'Error: No result found',\n relatedFiles,\n 
};\n};\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,kBAAyB;AACzB,oBAAwB;AACxB,kBAAwB;AACxB,oBAAmB;AACnB,gBAAe;AACf,oBAAuB;AACvB,wBAA2B;AAiB3B,MAAM,cAA+B,CAAC;AAGtC,MAAM,QAA+B;AACrC,MAAM,oBAAoB;AAC1B,MAAM,kBACJ;AACF,MAAM,iBAAiB;AACvB,MAAM,mBAAmB;AACzB,MAAM,gBAAgB;AACtB,MAAM,YAAY,mBAAmB;AACrC,MAAM,gBAAgB,iBAAiB;AACvC,MAAM,yBAAyB;AAC/B,MAAM,iCAAiC;AAOvC,MAAM,YAAY,CAAC,SAA2B;AAC5C,QAAM,SAAmB,CAAC;AAC1B,MAAI,QAAQ;AAEZ,SAAO,QAAQ,KAAK,QAAQ;AAC1B,QAAI,MAAM,KAAK,IAAI,QAAQ,WAAW,KAAK,MAAM;AAGjD,QAAI,MAAM,KAAK,QAAQ;AACrB,YAAM,YAAY,KAAK,YAAY,KAAK,GAAG;AAC3C,UAAI,YAAY,OAAO;AACrB,cAAM;AAAA,MACR;AAAA,IACF;AAEA,WAAO,KAAK,KAAK,UAAU,OAAO,GAAG,CAAC;AAGtC,UAAM,YAAY,MAAM;AACxB,QAAI,aAAa,OAAO;AAEtB,cAAQ;AAAA,IACV,OAAO;AACL,cAAQ;AAAA,IACV;AAAA,EACF;AAEA,SAAO;AACT;AASA,MAAM,oBAAoB,OAAO,SAAoC;AACnE,MAAI;AACF,UAAM,SAAS,IAAI,qBAAO,EAAE,QAAQ,QAAQ,IAAI,eAAe,CAAC;AAChE,UAAM,WAAW,MAAM,OAAO,WAAW,OAAO;AAAA,MAC9C,OAAO;AAAA;AAAA,MACP,OAAO;AAAA,IACT,CAAC;AAED,WAAO,SAAS,KAAK,CAAC,EAAE;AAAA,EAC1B,SAAS,OAAO;AACd,YAAQ,MAAM,+BAA+B,KAAK;AAClD,WAAO,CAAC;AAAA,EACV;AACF;AAWA,MAAM,mBAAmB,CAAC,MAAgB,SAA2B;AAEnE,QAAM,aAAa,KAAK,OAAO,CAAC,KAAK,GAAG,QAAQ,MAAM,IAAI,KAAK,GAAG,GAAG,CAAC;AAGtE,QAAM,aAAa,KAAK,KAAK,KAAK,OAAO,CAAC,KAAK,MAAM,MAAM,IAAI,GAAG,CAAC,CAAC;AACpE,QAAM,aAAa,KAAK,KAAK,KAAK,OAAO,CAAC,KAAK,MAAM,MAAM,IAAI,GAAG,CAAC,CAAC;AAGpE,SAAO,cAAc,aAAa;AACpC;AAMO,MAAM,qBAAqB,YAA2B;AAC3D,QAAM,MAAM,QAAQ,IAAI;AACxB,gBAAAA,QAAO,OAAO;AAAA,IACZ,MAAM,CAAC,QAAQ,GAAG,UAAU,QAAQ,GAAG,IAAI,cAAc,MAAM;AAAA,EACjE,CAAC;AAGD,QAAM,WAAO,qBAAQ,sBAAQ,OAAO;AACpC,QAAM,YAAQ,sBAAS,sBAAQ,OAAO;AAEtC,MAAI,SAAmC,CAAC;AAExC,QAAM,QAAQ,EAAE,GAAG,MAAM,GAAG,MAAM;AAGlC,aAAW,WAAW,OAAO,KAAK,KAAK,GAAG;AAExC,UAAM,aAAa,UAAU,MAAM,OAA6B,CAAC;AAGjE,eAAW,cAAc,OAAO,KAAK,UAAU,GAAG;AAChD,YAAM,cAAc,OAAO,UAAU,IAAI;AACzC,YAAM,eAAe,WAAW;AAEhC,YAAM,YAAY,WAChB,UACF;AAEA,YAAM,mBAAmB,GAAG,OAAO,UAAU,WAAW;AAGxD,YAAM,eAAe,kBAAAC,QACnB,gBACF;AAEA,UAAI,YAAY;AAEhB,UAAI,CAAC,WAAW;AACd,oBAAY,MAAM,kBAAkB,SAAS;AAAA,MAC/C;AAGA,eAAS,EAAE,
GAAG,QAAQ,CAAC,gBAAgB,GAAG,UAAU;AAGpD,kBAAY,KAAK;AAAA,QACf;AAAA,QACA;AAAA,QACA;AAAA,QACA,SAAS;AAAA,MACX,CAAC;AAED,cAAQ,KAAK,cAAc,gBAAgB,IAAI,YAAY,EAAE;AAAA,IAC/D;AAAA,EACF;AAEA,MAAI,QAAQ,IAAI,aAAa,eAAe;AAC1C,QAAI;AAEF,UAAI,KAAK,UAAU,MAAM,MAAM,KAAK,UAAU,kBAAAA,OAAc,GAAG;AAE7D,kBAAAC,QAAG;AAAA,UACD;AAAA,UACA,KAAK,UAAU,QAAQ,MAAM,CAAC;AAAA,QAChC;AAAA,MACF;AAAA,IACF,SAAS,OAAO;AACd,cAAQ,MAAM,KAAK;AAAA,IACrB;AAAA,EACF;AACF;AAGA,mBAAmB;AASZ,MAAM,uBAAuB,OAClC,UAC6B;AAE7B,QAAM,iBAAiB,MAAM,kBAAkB,KAAK;AAGpD,QAAM,UAAU,YACb,IAAI,CAAC,WAAW;AAAA,IACf,GAAG;AAAA,IACH,YAAY,iBAAiB,gBAAgB,MAAM,SAAS;AAAA;AAAA,EAC9D,EAAE,EACD,OAAO,CAAC,UAAU,MAAM,aAAa,8BAA8B,EACnE,KAAK,CAAC,GAAG,MAAM,EAAE,aAAa,EAAE,UAAU,EAC1C,MAAM,GAAG,sBAAsB;AAGlC,SAAO;AACT;AASO,MAAM,aAA2C;AAAA,EACtD,MAAM;AAAA,EACN,SACE;AAAA;AAwBJ;AAkBO,MAAM,iBAAiB,OAC5B,UACA,YACkC;AAClC,QAAM,SAAS,IAAI,qBAAO,EAAE,QAAQ,QAAQ,IAAI,eAAe,CAAC;AAIhE,QAAM,eAAe,SAAS,OAAO,CAAC,YAAY,QAAQ,SAAS,MAAM;AAGzE,QAAM,QAAQ,aACX,IAAI,CAAC,YAAY,KAAK,QAAQ,OAAO,EAAE,EACvC,KAAK,IAAI;AAGZ,QAAM,0BAA0B,MAAM,qBAAqB,KAAK;AAGhE,QAAM,eAA+C;AAAA,IACnD;AAAA,MACE,GAAG;AAAA,MACH,SAAS,WAAW,QAAQ;AAAA,QAC1B;AAAA,QACA,wBAAwB,WAAW,IAC/B,qDACA,wBACG;AAAA,UACC,CAAC,KAAK,QACJ,UAAU,GAAG,eAAe,IAAI,OAAO;AAAA,EAAO,IAAI,OAAO;AAAA,QAC7D,EACC,KAAK,MAAM;AAAA;AAAA,MACpB;AAAA,IACF;AAAA,IACA,GAAG;AAAA;AAAA,EACL;AAGA,QAAM,WAAW,MAAM,OAAO,KAAK,YAAY,OAAO;AAAA,IACpD,OAAO;AAAA,IACP,aAAa;AAAA,IACb,UAAU;AAAA,IACV,QAAQ;AAAA,EACV,CAAC;AAED,MAAI,eAAe;AACnB,mBAAiB,SAAS,UAAU;AAClC,UAAM,UAAU,MAAM,QAAQ,CAAC,GAAG,OAAO,WAAW;AACpD,QAAI,SAAS;AACX,sBAAgB;AAChB,eAAS,YAAY,OAAO;AAAA,IAC9B;AAAA,EACF;AAGA,QAAM,eAAe;AAAA,IACnB,GAAG,IAAI,IAAI,wBAAwB,IAAI,CAAC,QAAQ,IAAI,OAAO,CAAC;AAAA,EAC9D;AAGA,SAAO;AAAA,IACL,UAAU,gBAAgB;AAAA,IAC1B;AAAA,EACF;AACF;","names":["dotenv","embeddingsList","fs"]}
1
+ {"version":3,"sources":["../../../../../src/utils/AI/askDocQuestion/askDocQuestion.ts"],"sourcesContent":["import { getBlogs } from '@intlayer/blog';\nimport { Locales } from '@intlayer/config';\nimport { getDocs, getFequentQuestions } from '@intlayer/docs';\nimport dotenv from 'dotenv';\nimport fs, { readFileSync } from 'fs';\nimport { OpenAI } from 'openai';\nimport { dirname, join } from 'path';\nimport { fileURLToPath } from 'url';\nimport embeddingsList from './embeddings.json' with { type: 'json' };\n\ntype VectorStoreEl = {\n fileKey: string;\n chunkNumber: number;\n content: string;\n embedding: number[];\n};\n\n/**\n * Simple in-memory vector store to hold document embeddings and their content.\n * Each entry contains:\n * - fileKey: A unique key identifying the file\n * - chunkNumber: The number of the chunk within the document\n * - content: The chunk content\n * - embedding: The numerical embedding vector for the chunk\n */\nconst vectorStore: VectorStoreEl[] = [];\n\n// Constants defining OpenAI's token and character limits\nconst MODEL: OpenAI.Chat.ChatModel = 'gpt-4o-2024-11-20'; // Model to use for chat completions\nconst MODEL_TEMPERATURE = 0.1; // Temperature to use for chat completions\nconst EMBEDDING_MODEL: OpenAI.Embeddings.EmbeddingModel =\n 'text-embedding-3-large'; // Model to use for embedding generation\nconst OVERLAP_TOKENS = 200; // Number of tokens to overlap between chunks\nconst MAX_CHUNK_TOKENS = 800; // Maximum number of tokens per chunk\nconst CHAR_BY_TOKEN = 4.15; // Approximate pessimistically the number of characters per token // Can use `tiktoken` or other tokenizers to calculate it more precisely\nconst MAX_CHARS = MAX_CHUNK_TOKENS * CHAR_BY_TOKEN;\nconst OVERLAP_CHARS = OVERLAP_TOKENS * CHAR_BY_TOKEN;\nconst MAX_RELEVANT_CHUNKS_NB = 8; // Maximum number of relevant chunks to attach to chatGPT context\nconst MIN_RELEVANT_CHUNKS_SIMILARITY = 0.25; // Minimum similarity required for a chunk to be considered relevant\n\n/**\n 
* Splits a given text into chunks ensuring each chunk does not exceed MAX_CHARS.\n * @param text - The input text to split.\n * @returns - Array of text chunks.\n */\nconst chunkText = (text: string): string[] => {\n const chunks: string[] = [];\n let start = 0;\n\n while (start < text.length) {\n let end = Math.min(start + MAX_CHARS, text.length);\n\n // Ensure we don't cut words in the middle (find nearest space)\n if (end < text.length) {\n const lastSpace = text.lastIndexOf(' ', end);\n if (lastSpace > start) {\n end = lastSpace;\n }\n }\n\n chunks.push(text.substring(start, end));\n\n // Move start forward correctly\n const nextStart = end - OVERLAP_CHARS;\n if (nextStart <= start) {\n // Prevent infinite loop if overlap is too large\n start = end;\n } else {\n start = nextStart;\n }\n }\n\n return chunks;\n};\n\n/**\n * Generates an embedding for a given text using OpenAI's embedding API.\n * Trims the text if it exceeds the maximum allowed characters.\n *\n * @param text - The input text to generate an embedding for\n * @returns The embedding vector as a number array\n */\nconst generateEmbedding = async (text: string): Promise<number[]> => {\n try {\n const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });\n const response = await openai.embeddings.create({\n model: EMBEDDING_MODEL, // Specify the embedding model\n input: text,\n });\n\n return response.data[0].embedding; // Return the generated embedding\n } catch (error) {\n console.error('Error generating embedding:', error);\n return [];\n }\n};\n\n/**\n * Calculates the cosine similarity between two vectors.\n * Cosine similarity measures the cosine of the angle between two vectors in an inner product space.\n * Used to determine the similarity between chunks of text.\n *\n * @param vecA - The first vector\n * @param vecB - The second vector\n * @returns The cosine similarity score\n */\nconst cosineSimilarity = (vecA: number[], vecB: number[]): number => {\n // Calculate the dot product of 
the two vectors\n const dotProduct = vecA.reduce((sum, a, idx) => sum + a * vecB[idx], 0);\n\n // Calculate the magnitude (Euclidean norm) of each vector\n const magnitudeA = Math.sqrt(vecA.reduce((sum, a) => sum + a * a, 0));\n const magnitudeB = Math.sqrt(vecB.reduce((sum, b) => sum + b * b, 0));\n\n // Compute and return the cosine similarity\n return dotProduct / (magnitudeA * magnitudeB);\n};\n\n/**\n * Indexes all Markdown documents by generating embeddings for each chunk and storing them in memory.\n * Also updates the embeddings.json file if new embeddings are generated.\n */\nexport const indexMarkdownFiles = async (): Promise<void> => {\n const env = process.env.NODE_ENV;\n dotenv.config({\n path: [`.env.${env}.local`, `.env.${env}`, '.env.local', '.env'],\n });\n\n // Retrieve documentation and blog posts in English locale\n const frequentQuestions = getFequentQuestions();\n const docs = getDocs(Locales.ENGLISH);\n const blogs = getBlogs(Locales.ENGLISH);\n\n let result: Record<string, number[]> = {}; // Object to hold updated embeddings\n\n const files = { ...docs, ...blogs, ...frequentQuestions }; // Combine docs and blogs into a single object\n\n // Iterate over each file key (identifier) in the combined files\n for (const fileKey of Object.keys(files)) {\n // Split the document into chunks based on headings\n const fileChunks = chunkText(files[fileKey as keyof typeof files]);\n\n // Iterate over each chunk within the current file\n for (const chunkIndex of Object.keys(fileChunks)) {\n const chunkNumber = Number(chunkIndex) + 1; // Chunk number starts at 1\n const chunksNumber = fileChunks.length;\n\n const fileChunk = fileChunks[\n chunkIndex as keyof typeof fileChunks\n ] as string;\n\n const embeddingKeyName = `${fileKey}/chunk_${chunkNumber}`; // Unique key for the chunk\n\n // Retrieve precomputed embedding if available\n const docEmbedding = embeddingsList[\n embeddingKeyName as keyof typeof embeddingsList\n ] as number[] | undefined;\n\n let 
embedding = docEmbedding; // Use existing embedding if available\n\n if (!embedding) {\n embedding = await generateEmbedding(fileChunk); // Generate embedding if not present\n }\n\n // Update the result object with the new embedding\n result = { ...result, [embeddingKeyName]: embedding };\n\n // Store the embedding and content in the in-memory vector store\n vectorStore.push({\n fileKey,\n chunkNumber,\n embedding,\n content: fileChunk,\n });\n\n console.info(`- Indexed: ${embeddingKeyName}/${chunksNumber}`);\n }\n }\n\n if (process.env.NODE_ENV === 'development') {\n try {\n // Compare the newly generated embeddings with existing ones\n if (JSON.stringify(result) !== JSON.stringify(embeddingsList)) {\n // If there are new embeddings, save them to embeddings.json\n fs.writeFileSync(\n 'src/utils/AI/askDocQuestion/embeddings.json',\n JSON.stringify(result, null, 2)\n );\n }\n } catch (error) {\n console.error(error); // Log any errors during the file write process\n }\n }\n};\n\n// Automatically index Markdown files\nindexMarkdownFiles();\n\n/**\n * Searches the indexed documents for the most relevant chunks based on a query.\n * Utilizes cosine similarity to find the closest matching embeddings.\n *\n * @param query - The search query provided by the user\n * @returns An array of the top matching document chunks' content\n */\nexport const searchChunkReference = async (\n query: string\n): Promise<VectorStoreEl[]> => {\n // Generate an embedding for the user's query\n const queryEmbedding = await generateEmbedding(query);\n\n // Calculate similarity scores between the query embedding and each document's embedding\n const results = vectorStore\n .map((chunk) => ({\n ...chunk,\n similarity: cosineSimilarity(queryEmbedding, chunk.embedding), // Add similarity score to each doc\n }))\n .filter((chunk) => chunk.similarity > MIN_RELEVANT_CHUNKS_SIMILARITY) // Filter out documents with low similarity scores\n .sort((a, b) => b.similarity - a.similarity) // Sort documents 
by highest similarity first\n .slice(0, MAX_RELEVANT_CHUNKS_NB); // Select the top 6 most similar documents\n\n // Return the content of the top matching documents\n return results;\n};\n\n// Define the structure of messages used in chat completions\nexport type ChatCompletionRequestMessage = {\n role: 'system' | 'user' | 'assistant'; // The role of the message sender\n content: string; // The text content of the message\n};\n\n/**\n * Reads the content of a file synchronously.\n *\n * @function\n * @param relativeFilePath - The relative or absolute path to the target file.\n * @returns The entire contents of the specified file as a UTF-8 encoded string.\n */\nconst getFileContent = (relativeFilePath: string): string => {\n const __dirname = dirname(fileURLToPath(import.meta.url));\n const absolutePath = join(__dirname, relativeFilePath);\n const fileContent = readFileSync(absolutePath, 'utf-8');\n return fileContent;\n};\n\nconst CHAT_GPT_PROMPT = getFileContent('./PROMPT.md');\n\n// Initial prompt configuration for the chatbot\nexport const initPrompt: ChatCompletionRequestMessage = {\n role: 'system',\n content: CHAT_GPT_PROMPT,\n};\n\nexport type AskDocQuestionResult = {\n response: string;\n relatedFiles: string[];\n};\n\nexport type AskDocQuestionOptions = {\n onMessage?: (chunk: string) => void;\n};\n\n/**\n * Handles the \"Ask a question\" endpoint in an Express.js route.\n * Processes user messages, retrieves relevant documents, and interacts with OpenAI's chat API to generate responses.\n *\n * @param messages - An array of chat messages from the user and assistant\n * @returns The assistant's response as a string\n */\nexport const askDocQuestion = async (\n messages: ChatCompletionRequestMessage[],\n options?: AskDocQuestionOptions\n): Promise<AskDocQuestionResult> => {\n const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });\n\n // Assistant's response are filtered out otherwise the chatbot will be stuck in a self-referential loop\n // Note 
that the embedding precision will be lowered if the user change of context in the chat\n const userMessages = messages.filter((message) => message.role === 'user');\n\n // Format the user's question to keep only the relevant keywords\n const query = userMessages\n .map((message) => `- ${message.content}`)\n .join('\\n');\n\n // 1) Find relevant documents based on the user's question\n const relevantFilesReferences = await searchChunkReference(query);\n\n // 2) Integrate the relevant documents into the initial system prompt\n const messagesList: ChatCompletionRequestMessage[] = [\n {\n ...initPrompt,\n content: initPrompt.content.replace(\n '{{relevantFilesReferences}}',\n relevantFilesReferences.length === 0\n ? 'Not relevant file found related to the question.'\n : relevantFilesReferences\n .map(\n (doc, idx) =>\n `[Chunk ${idx}] docKey = \"${doc.fileKey}\":\\n${doc.content}`\n )\n .join('\\n\\n') // Insert relevant docs into the prompt\n ),\n },\n ...messages, // Include all user and assistant messages\n ];\n\n // 3) Send the compiled messages to OpenAI's Chat Completion API (using a specific model)\n const response = await openai.chat.completions.create({\n model: MODEL,\n temperature: MODEL_TEMPERATURE,\n messages: messagesList,\n stream: true,\n });\n\n let fullResponse = '';\n for await (const chunk of response) {\n const content = chunk.choices[0]?.delta?.content || '';\n if (content) {\n fullResponse += content;\n options?.onMessage?.(content);\n }\n }\n\n // 4) Extract unique related files\n const relatedFiles = [\n ...new Set(relevantFilesReferences.map((doc) => doc.fileKey)),\n ];\n\n // 5) Return the assistant's response to the user\n return {\n response: fullResponse ?? 
'Error: No result found',\n relatedFiles,\n };\n};\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,kBAAyB;AACzB,oBAAwB;AACxB,kBAA6C;AAC7C,oBAAmB;AACnB,gBAAiC;AACjC,oBAAuB;AACvB,kBAA8B;AAC9B,iBAA8B;AAC9B,wBAA2B;AAR3B;AAyBA,MAAM,cAA+B,CAAC;AAGtC,MAAM,QAA+B;AACrC,MAAM,oBAAoB;AAC1B,MAAM,kBACJ;AACF,MAAM,iBAAiB;AACvB,MAAM,mBAAmB;AACzB,MAAM,gBAAgB;AACtB,MAAM,YAAY,mBAAmB;AACrC,MAAM,gBAAgB,iBAAiB;AACvC,MAAM,yBAAyB;AAC/B,MAAM,iCAAiC;AAOvC,MAAM,YAAY,CAAC,SAA2B;AAC5C,QAAM,SAAmB,CAAC;AAC1B,MAAI,QAAQ;AAEZ,SAAO,QAAQ,KAAK,QAAQ;AAC1B,QAAI,MAAM,KAAK,IAAI,QAAQ,WAAW,KAAK,MAAM;AAGjD,QAAI,MAAM,KAAK,QAAQ;AACrB,YAAM,YAAY,KAAK,YAAY,KAAK,GAAG;AAC3C,UAAI,YAAY,OAAO;AACrB,cAAM;AAAA,MACR;AAAA,IACF;AAEA,WAAO,KAAK,KAAK,UAAU,OAAO,GAAG,CAAC;AAGtC,UAAM,YAAY,MAAM;AACxB,QAAI,aAAa,OAAO;AAEtB,cAAQ;AAAA,IACV,OAAO;AACL,cAAQ;AAAA,IACV;AAAA,EACF;AAEA,SAAO;AACT;AASA,MAAM,oBAAoB,OAAO,SAAoC;AACnE,MAAI;AACF,UAAM,SAAS,IAAI,qBAAO,EAAE,QAAQ,QAAQ,IAAI,eAAe,CAAC;AAChE,UAAM,WAAW,MAAM,OAAO,WAAW,OAAO;AAAA,MAC9C,OAAO;AAAA;AAAA,MACP,OAAO;AAAA,IACT,CAAC;AAED,WAAO,SAAS,KAAK,CAAC,EAAE;AAAA,EAC1B,SAAS,OAAO;AACd,YAAQ,MAAM,+BAA+B,KAAK;AAClD,WAAO,CAAC;AAAA,EACV;AACF;AAWA,MAAM,mBAAmB,CAAC,MAAgB,SAA2B;AAEnE,QAAM,aAAa,KAAK,OAAO,CAAC,KAAK,GAAG,QAAQ,MAAM,IAAI,KAAK,GAAG,GAAG,CAAC;AAGtE,QAAM,aAAa,KAAK,KAAK,KAAK,OAAO,CAAC,KAAK,MAAM,MAAM,IAAI,GAAG,CAAC,CAAC;AACpE,QAAM,aAAa,KAAK,KAAK,KAAK,OAAO,CAAC,KAAK,MAAM,MAAM,IAAI,GAAG,CAAC,CAAC;AAGpE,SAAO,cAAc,aAAa;AACpC;AAMO,MAAM,qBAAqB,YAA2B;AAC3D,QAAM,MAAM,QAAQ,IAAI;AACxB,gBAAAA,QAAO,OAAO;AAAA,IACZ,MAAM,CAAC,QAAQ,GAAG,UAAU,QAAQ,GAAG,IAAI,cAAc,MAAM;AAAA,EACjE,CAAC;AAGD,QAAM,wBAAoB,iCAAoB;AAC9C,QAAM,WAAO,qBAAQ,sBAAQ,OAAO;AACpC,QAAM,YAAQ,sBAAS,sBAAQ,OAAO;AAEtC,MAAI,SAAmC,CAAC;AAExC,QAAM,QAAQ,EAAE,GAAG,MAAM,GAAG,OAAO,GAAG,kBAAkB;AAGxD,aAAW,WAAW,OAAO,KAAK,KAAK,GAAG;AAExC,UAAM,aAAa,UAAU,MAAM,OAA6B,CAAC;AAGjE,eAAW,cAAc,OAAO,KAAK,UAAU,GAAG;AAChD,YAAM,cAAc,OAAO,UAAU,IAAI;AACzC,YAAM,eAAe,WAAW;AAEhC,YAAM,YAAY,WAChB,UACF;AAEA,YAAM,mBAAmB,GAAG,OAAO,UAAU,WAAW;AAGxD,YAAM,
eAAe,kBAAAC,QACnB,gBACF;AAEA,UAAI,YAAY;AAEhB,UAAI,CAAC,WAAW;AACd,oBAAY,MAAM,kBAAkB,SAAS;AAAA,MAC/C;AAGA,eAAS,EAAE,GAAG,QAAQ,CAAC,gBAAgB,GAAG,UAAU;AAGpD,kBAAY,KAAK;AAAA,QACf;AAAA,QACA;AAAA,QACA;AAAA,QACA,SAAS;AAAA,MACX,CAAC;AAED,cAAQ,KAAK,cAAc,gBAAgB,IAAI,YAAY,EAAE;AAAA,IAC/D;AAAA,EACF;AAEA,MAAI,QAAQ,IAAI,aAAa,eAAe;AAC1C,QAAI;AAEF,UAAI,KAAK,UAAU,MAAM,MAAM,KAAK,UAAU,kBAAAA,OAAc,GAAG;AAE7D,kBAAAC,QAAG;AAAA,UACD;AAAA,UACA,KAAK,UAAU,QAAQ,MAAM,CAAC;AAAA,QAChC;AAAA,MACF;AAAA,IACF,SAAS,OAAO;AACd,cAAQ,MAAM,KAAK;AAAA,IACrB;AAAA,EACF;AACF;AAGA,mBAAmB;AASZ,MAAM,uBAAuB,OAClC,UAC6B;AAE7B,QAAM,iBAAiB,MAAM,kBAAkB,KAAK;AAGpD,QAAM,UAAU,YACb,IAAI,CAAC,WAAW;AAAA,IACf,GAAG;AAAA,IACH,YAAY,iBAAiB,gBAAgB,MAAM,SAAS;AAAA;AAAA,EAC9D,EAAE,EACD,OAAO,CAAC,UAAU,MAAM,aAAa,8BAA8B,EACnE,KAAK,CAAC,GAAG,MAAM,EAAE,aAAa,EAAE,UAAU,EAC1C,MAAM,GAAG,sBAAsB;AAGlC,SAAO;AACT;AAeA,MAAM,iBAAiB,CAAC,qBAAqC;AAC3D,QAAM,gBAAY,yBAAQ,0BAAc,YAAY,GAAG,CAAC;AACxD,QAAM,mBAAe,kBAAK,WAAW,gBAAgB;AACrD,QAAM,kBAAc,wBAAa,cAAc,OAAO;AACtD,SAAO;AACT;AAEA,MAAM,kBAAkB,eAAe,aAAa;AAG7C,MAAM,aAA2C;AAAA,EACtD,MAAM;AAAA,EACN,SAAS;AACX;AAkBO,MAAM,iBAAiB,OAC5B,UACA,YACkC;AAClC,QAAM,SAAS,IAAI,qBAAO,EAAE,QAAQ,QAAQ,IAAI,eAAe,CAAC;AAIhE,QAAM,eAAe,SAAS,OAAO,CAAC,YAAY,QAAQ,SAAS,MAAM;AAGzE,QAAM,QAAQ,aACX,IAAI,CAAC,YAAY,KAAK,QAAQ,OAAO,EAAE,EACvC,KAAK,IAAI;AAGZ,QAAM,0BAA0B,MAAM,qBAAqB,KAAK;AAGhE,QAAM,eAA+C;AAAA,IACnD;AAAA,MACE,GAAG;AAAA,MACH,SAAS,WAAW,QAAQ;AAAA,QAC1B;AAAA,QACA,wBAAwB,WAAW,IAC/B,qDACA,wBACG;AAAA,UACC,CAAC,KAAK,QACJ,UAAU,GAAG,eAAe,IAAI,OAAO;AAAA,EAAO,IAAI,OAAO;AAAA,QAC7D,EACC,KAAK,MAAM;AAAA;AAAA,MACpB;AAAA,IACF;AAAA,IACA,GAAG;AAAA;AAAA,EACL;AAGA,QAAM,WAAW,MAAM,OAAO,KAAK,YAAY,OAAO;AAAA,IACpD,OAAO;AAAA,IACP,aAAa;AAAA,IACb,UAAU;AAAA,IACV,QAAQ;AAAA,EACV,CAAC;AAED,MAAI,eAAe;AACnB,mBAAiB,SAAS,UAAU;AAClC,UAAM,UAAU,MAAM,QAAQ,CAAC,GAAG,OAAO,WAAW;AACpD,QAAI,SAAS;AACX,sBAAgB;AAChB,eAAS,YAAY,OAAO;AAAA,IAC9B;AAAA,EACF;AAGA,QAAM,eAAe;AAAA,IACnB,GAAG,IAAI,IAAI,wBAAwB,IAAI,CAAC,QAAQ,IAAI,OAAO,CAAC;AAAA,EAC9D;AAGA,SAAO;AAAA,IACL,UAAU,gBAAgB;AAAA,IAC1
B;AAAA,EACF;AACF;","names":["dotenv","embeddingsList","fs"]}