@dakshp1234/langchain-textsplitters 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +11 -0
- package/LICENSE +21 -0
- package/README.md +53 -0
- package/dist/index.cjs +12 -0
- package/dist/index.d.cts +3 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.js +3 -0
- package/dist/semantic_text_splitter.cjs +179 -0
- package/dist/semantic_text_splitter.cjs.map +1 -0
- package/dist/semantic_text_splitter.d.cts +44 -0
- package/dist/semantic_text_splitter.d.cts.map +1 -0
- package/dist/semantic_text_splitter.d.ts +44 -0
- package/dist/semantic_text_splitter.d.ts.map +1 -0
- package/dist/semantic_text_splitter.js +178 -0
- package/dist/semantic_text_splitter.js.map +1 -0
- package/dist/text_splitter.cjs +536 -0
- package/dist/text_splitter.cjs.map +1 -0
- package/dist/text_splitter.d.cts +82 -0
- package/dist/text_splitter.d.cts.map +1 -0
- package/dist/text_splitter.d.ts +82 -0
- package/dist/text_splitter.d.ts.map +1 -0
- package/dist/text_splitter.js +530 -0
- package/dist/text_splitter.js.map +1 -0
- package/package.json +65 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"text_splitter.js","names":[],"sources":["../src/text_splitter.ts"],"sourcesContent":["import type * as tiktoken from \"js-tiktoken\";\nimport { Document, BaseDocumentTransformer } from \"@langchain/core/documents\";\nimport { getEncoding } from \"@langchain/core/utils/tiktoken\";\n\nexport interface TextSplitterParams {\n chunkSize: number;\n chunkOverlap: number;\n keepSeparator: boolean;\n lengthFunction?:\n | ((text: string) => number)\n | ((text: string) => Promise<number>);\n}\n\nexport type TextSplitterChunkHeaderOptions = {\n chunkHeader?: string;\n chunkOverlapHeader?: string;\n appendChunkOverlapHeader?: boolean;\n};\n\nexport abstract class TextSplitter\n extends BaseDocumentTransformer\n implements TextSplitterParams\n{\n lc_namespace = [\"langchain\", \"document_transformers\", \"text_splitters\"];\n\n chunkSize = 1000;\n\n chunkOverlap = 200;\n\n keepSeparator = false;\n\n lengthFunction:\n | ((text: string) => number)\n | ((text: string) => Promise<number>);\n\n constructor(fields?: Partial<TextSplitterParams>) {\n super(fields);\n this.chunkSize = fields?.chunkSize ?? this.chunkSize;\n this.chunkOverlap = fields?.chunkOverlap ?? this.chunkOverlap;\n this.keepSeparator = fields?.keepSeparator ?? this.keepSeparator;\n this.lengthFunction =\n fields?.lengthFunction ?? ((text: string) => text.length);\n if (this.chunkOverlap >= this.chunkSize) {\n throw new Error(\"Cannot have chunkOverlap >= chunkSize\");\n }\n }\n\n async transformDocuments(\n documents: Document[],\n chunkHeaderOptions: TextSplitterChunkHeaderOptions = {}\n ): Promise<Document[]> {\n return this.splitDocuments(documents, chunkHeaderOptions);\n }\n\n abstract splitText(text: string): Promise<string[]>;\n\n protected splitOnSeparator(text: string, separator: string): string[] {\n let splits;\n if (separator) {\n if (this.keepSeparator) {\n const regexEscapedSeparator = separator.replace(\n /[/\\-\\\\^$*+?.()|[\\]{}]/g,\n \"\\\\$&\"\n );\n splits = text.split(new RegExp(`(?=${regexEscapedSeparator})`));\n } else {\n splits = text.split(separator);\n }\n } else {\n splits = text.split(\"\");\n }\n return splits.filter((s) => s !== \"\");\n }\n\n async createDocuments(\n texts: string[],\n // oxlint-disable-next-line @typescript-eslint/no-explicit-any\n metadatas: Record<string, any>[] = [],\n chunkHeaderOptions: TextSplitterChunkHeaderOptions = {}\n ): Promise<Document[]> {\n // if no metadata is provided, we create an empty one for each text\n // oxlint-disable-next-line @typescript-eslint/no-explicit-any\n const _metadatas: Record<string, any>[] =\n metadatas.length > 0\n ? metadatas\n : [...Array(texts.length)].map(() => ({}));\n const {\n chunkHeader = \"\",\n chunkOverlapHeader = \"(cont'd) \",\n appendChunkOverlapHeader = false,\n } = chunkHeaderOptions;\n const documents = new Array<Document>();\n for (let i = 0; i < texts.length; i += 1) {\n const text = texts[i];\n let lineCounterIndex = 1;\n let prevChunk = null;\n let indexPrevChunk = -1;\n for (const chunk of await this.splitText(text)) {\n let pageContent = chunkHeader;\n\n // we need to count the \\n that are in the text before getting removed by the splitting\n const indexChunk = text.indexOf(chunk, indexPrevChunk + 1);\n if (prevChunk === null) {\n const newLinesBeforeFirstChunk = this.numberOfNewLines(\n text,\n 0,\n indexChunk\n );\n lineCounterIndex += newLinesBeforeFirstChunk;\n } else {\n const indexEndPrevChunk =\n indexPrevChunk + (await this.lengthFunction(prevChunk));\n if (indexEndPrevChunk < indexChunk) {\n const numberOfIntermediateNewLines = this.numberOfNewLines(\n text,\n indexEndPrevChunk,\n indexChunk\n );\n lineCounterIndex += numberOfIntermediateNewLines;\n } else if (indexEndPrevChunk > indexChunk) {\n const numberOfIntermediateNewLines = this.numberOfNewLines(\n text,\n indexChunk,\n indexEndPrevChunk\n );\n lineCounterIndex -= numberOfIntermediateNewLines;\n }\n if (appendChunkOverlapHeader) {\n pageContent += chunkOverlapHeader;\n }\n }\n const newLinesCount = this.numberOfNewLines(chunk);\n\n const loc =\n _metadatas[i].loc && typeof _metadatas[i].loc === \"object\"\n ? { ..._metadatas[i].loc }\n : {};\n loc.lines = {\n from: lineCounterIndex,\n to: lineCounterIndex + newLinesCount,\n };\n const metadataWithLinesNumber = {\n ..._metadatas[i],\n loc,\n };\n\n pageContent += chunk;\n documents.push(\n new Document({\n pageContent,\n metadata: metadataWithLinesNumber,\n })\n );\n lineCounterIndex += newLinesCount;\n prevChunk = chunk;\n indexPrevChunk = indexChunk;\n }\n }\n return documents;\n }\n\n private numberOfNewLines(text: string, start?: number, end?: number) {\n const textSection = text.slice(start, end);\n return (textSection.match(/\\n/g) || []).length;\n }\n\n async splitDocuments(\n documents: Document[],\n chunkHeaderOptions: TextSplitterChunkHeaderOptions = {}\n ): Promise<Document[]> {\n const selectedDocuments = documents.filter(\n (doc) => doc.pageContent !== undefined\n );\n const texts = selectedDocuments.map((doc) => doc.pageContent);\n const metadatas = selectedDocuments.map((doc) => doc.metadata);\n return this.createDocuments(texts, metadatas, chunkHeaderOptions);\n }\n\n private joinDocs(docs: string[], separator: string): string | null {\n const text = docs.join(separator).trim();\n return text === \"\" ? null : text;\n }\n\n async mergeSplits(splits: string[], separator: string): Promise<string[]> {\n const docs: string[] = [];\n const currentDoc: string[] = [];\n let total = 0;\n for (const d of splits) {\n const _len = await this.lengthFunction(d);\n if (\n total + _len + currentDoc.length * separator.length >\n this.chunkSize\n ) {\n if (total > this.chunkSize) {\n console.warn(\n `Created a chunk of size ${total}, +\nwhich is longer than the specified ${this.chunkSize}`\n );\n }\n if (currentDoc.length > 0) {\n const doc = this.joinDocs(currentDoc, separator);\n if (doc !== null) {\n docs.push(doc);\n }\n // Keep on popping if:\n // - we have a larger chunk than in the chunk overlap\n // - or if we still have any chunks and the length is long\n while (\n total > this.chunkOverlap ||\n (total + _len + currentDoc.length * separator.length >\n this.chunkSize &&\n total > 0)\n ) {\n total -= await this.lengthFunction(currentDoc[0]);\n currentDoc.shift();\n }\n }\n }\n currentDoc.push(d);\n total += _len;\n }\n const doc = this.joinDocs(currentDoc, separator);\n if (doc !== null) {\n docs.push(doc);\n }\n return docs;\n }\n}\n\nexport interface CharacterTextSplitterParams extends TextSplitterParams {\n separator: string;\n}\n\nexport class CharacterTextSplitter\n extends TextSplitter\n implements CharacterTextSplitterParams\n{\n static lc_name() {\n return \"CharacterTextSplitter\";\n }\n\n separator = \"\\n\\n\";\n\n constructor(fields?: Partial<CharacterTextSplitterParams>) {\n super(fields);\n this.separator = fields?.separator ?? this.separator;\n }\n\n async splitText(text: string): Promise<string[]> {\n // First we naively split the large input into a bunch of smaller ones.\n const splits = this.splitOnSeparator(text, this.separator);\n return this.mergeSplits(splits, this.keepSeparator ? \"\" : this.separator);\n }\n}\n\nexport interface RecursiveCharacterTextSplitterParams extends TextSplitterParams {\n separators: string[];\n}\n\nexport const SupportedTextSplitterLanguages = [\n \"cpp\",\n \"go\",\n \"java\",\n \"js\",\n \"php\",\n \"proto\",\n \"python\",\n \"rst\",\n \"ruby\",\n \"rust\",\n \"scala\",\n \"swift\",\n \"markdown\",\n \"latex\",\n \"html\",\n \"sol\",\n] as const;\n\nexport type SupportedTextSplitterLanguage =\n (typeof SupportedTextSplitterLanguages)[number];\n\nexport class RecursiveCharacterTextSplitter\n extends TextSplitter\n implements RecursiveCharacterTextSplitterParams\n{\n static lc_name() {\n return \"RecursiveCharacterTextSplitter\";\n }\n\n separators: string[] = [\"\\n\\n\", \"\\n\", \" \", \"\"];\n\n constructor(fields?: Partial<RecursiveCharacterTextSplitterParams>) {\n super(fields);\n this.separators = fields?.separators ?? this.separators;\n this.keepSeparator = fields?.keepSeparator ?? true;\n }\n\n private async _splitText(text: string, separators: string[]) {\n const finalChunks: string[] = [];\n\n // Get appropriate separator to use\n let separator: string = separators[separators.length - 1];\n let newSeparators;\n for (let i = 0; i < separators.length; i += 1) {\n const s = separators[i];\n if (s === \"\") {\n separator = s;\n break;\n }\n if (text.includes(s)) {\n separator = s;\n newSeparators = separators.slice(i + 1);\n break;\n }\n }\n\n // Now that we have the separator, split the text\n const splits = this.splitOnSeparator(text, separator);\n\n // Now go merging things, recursively splitting longer texts.\n let goodSplits: string[] = [];\n const _separator = this.keepSeparator ? \"\" : separator;\n for (const s of splits) {\n if ((await this.lengthFunction(s)) < this.chunkSize) {\n goodSplits.push(s);\n } else {\n if (goodSplits.length) {\n const mergedText = await this.mergeSplits(goodSplits, _separator);\n finalChunks.push(...mergedText);\n goodSplits = [];\n }\n if (!newSeparators) {\n finalChunks.push(s);\n } else {\n const otherInfo = await this._splitText(s, newSeparators);\n finalChunks.push(...otherInfo);\n }\n }\n }\n if (goodSplits.length) {\n const mergedText = await this.mergeSplits(goodSplits, _separator);\n finalChunks.push(...mergedText);\n }\n return finalChunks;\n }\n\n async splitText(text: string): Promise<string[]> {\n return this._splitText(text, this.separators);\n }\n\n static fromLanguage(\n language: SupportedTextSplitterLanguage,\n options?: Partial<RecursiveCharacterTextSplitterParams>\n ) {\n return new RecursiveCharacterTextSplitter({\n ...options,\n separators:\n RecursiveCharacterTextSplitter.getSeparatorsForLanguage(language),\n });\n }\n\n static getSeparatorsForLanguage(language: SupportedTextSplitterLanguage) {\n if (language === \"cpp\") {\n return [\n // Split along class definitions\n \"\\nclass \",\n // Split along function definitions\n \"\\nvoid \",\n \"\\nint \",\n \"\\nfloat \",\n \"\\ndouble \",\n // Split along control flow statements\n \"\\nif \",\n \"\\nfor \",\n \"\\nwhile \",\n \"\\nswitch \",\n \"\\ncase \",\n // Split by the normal type of lines\n \"\\n\\n\",\n \"\\n\",\n \" \",\n \"\",\n ];\n } else if (language === \"go\") {\n return [\n // Split along function definitions\n \"\\nfunc \",\n \"\\nvar \",\n \"\\nconst \",\n \"\\ntype \",\n // Split along control flow statements\n \"\\nif \",\n \"\\nfor \",\n \"\\nswitch \",\n \"\\ncase \",\n // Split by the normal type of lines\n \"\\n\\n\",\n \"\\n\",\n \" \",\n \"\",\n ];\n } else if (language === \"java\") {\n return [\n // Split along class definitions\n \"\\nclass \",\n // Split along method definitions\n \"\\npublic \",\n \"\\nprotected \",\n \"\\nprivate \",\n \"\\nstatic \",\n // Split along control flow statements\n \"\\nif \",\n \"\\nfor \",\n \"\\nwhile \",\n \"\\nswitch \",\n \"\\ncase \",\n // Split by the normal type of lines\n \"\\n\\n\",\n \"\\n\",\n \" \",\n \"\",\n ];\n } else if (language === \"js\") {\n return [\n // Split along function definitions\n \"\\nfunction \",\n \"\\nconst \",\n \"\\nlet \",\n \"\\nvar \",\n \"\\nclass \",\n // Split along control flow statements\n \"\\nif \",\n \"\\nfor \",\n \"\\nwhile \",\n \"\\nswitch \",\n \"\\ncase \",\n \"\\ndefault \",\n // Split by the normal type of lines\n \"\\n\\n\",\n \"\\n\",\n \" \",\n \"\",\n ];\n } else if (language === \"php\") {\n return [\n // Split along function definitions\n \"\\nfunction \",\n // Split along class definitions\n \"\\nclass \",\n // Split along control flow statements\n \"\\nif \",\n \"\\nforeach \",\n \"\\nwhile \",\n \"\\ndo \",\n \"\\nswitch \",\n \"\\ncase \",\n // Split by the normal type of lines\n \"\\n\\n\",\n \"\\n\",\n \" \",\n \"\",\n ];\n } else if (language === \"proto\") {\n return [\n // Split along message definitions\n \"\\nmessage \",\n // Split along service definitions\n \"\\nservice \",\n // Split along enum definitions\n \"\\nenum \",\n // Split along option definitions\n \"\\noption \",\n // Split along import statements\n \"\\nimport \",\n // Split along syntax declarations\n \"\\nsyntax \",\n // Split by the normal type of lines\n \"\\n\\n\",\n \"\\n\",\n \" \",\n \"\",\n ];\n } else if (language === \"python\") {\n return [\n // First, try to split along class definitions\n \"\\nclass \",\n \"\\ndef \",\n \"\\n\\tdef \",\n // Now split by the normal type of lines\n \"\\n\\n\",\n \"\\n\",\n \" \",\n \"\",\n ];\n } else if (language === \"rst\") {\n return [\n // Split along section titles\n \"\\n===\\n\",\n \"\\n---\\n\",\n \"\\n***\\n\",\n // Split along directive markers\n \"\\n.. \",\n // Split by the normal type of lines\n \"\\n\\n\",\n \"\\n\",\n \" \",\n \"\",\n ];\n } else if (language === \"ruby\") {\n return [\n // Split along method definitions\n \"\\ndef \",\n \"\\nclass \",\n // Split along control flow statements\n \"\\nif \",\n \"\\nunless \",\n \"\\nwhile \",\n \"\\nfor \",\n \"\\ndo \",\n \"\\nbegin \",\n \"\\nrescue \",\n // Split by the normal type of lines\n \"\\n\\n\",\n \"\\n\",\n \" \",\n \"\",\n ];\n } else if (language === \"rust\") {\n return [\n // Split along function definitions\n \"\\nfn \",\n \"\\nconst \",\n \"\\nlet \",\n // Split along control flow statements\n \"\\nif \",\n \"\\nwhile \",\n \"\\nfor \",\n \"\\nloop \",\n \"\\nmatch \",\n \"\\nconst \",\n // Split by the normal type of lines\n \"\\n\\n\",\n \"\\n\",\n \" \",\n \"\",\n ];\n } else if (language === \"scala\") {\n return [\n // Split along class definitions\n \"\\nclass \",\n \"\\nobject \",\n // Split along method definitions\n \"\\ndef \",\n \"\\nval \",\n \"\\nvar \",\n // Split along control flow statements\n \"\\nif \",\n \"\\nfor \",\n \"\\nwhile \",\n \"\\nmatch \",\n \"\\ncase \",\n // Split by the normal type of lines\n \"\\n\\n\",\n \"\\n\",\n \" \",\n \"\",\n ];\n } else if (language === \"swift\") {\n return [\n // Split along function definitions\n \"\\nfunc \",\n // Split along class definitions\n \"\\nclass \",\n \"\\nstruct \",\n \"\\nenum \",\n // Split along control flow statements\n \"\\nif \",\n \"\\nfor \",\n \"\\nwhile \",\n \"\\ndo \",\n \"\\nswitch \",\n \"\\ncase \",\n // Split by the normal type of lines\n \"\\n\\n\",\n \"\\n\",\n \" \",\n \"\",\n ];\n } else if (language === \"markdown\") {\n return [\n // First, try to split along Markdown headings (starting with level 2)\n \"\\n## \",\n \"\\n### \",\n \"\\n#### \",\n \"\\n##### \",\n \"\\n###### \",\n // Note the alternative syntax for headings (below) is not handled here\n // Heading level 2\n // ---------------\n // End of code block\n \"```\\n\\n\",\n // Horizontal lines\n \"\\n\\n***\\n\\n\",\n \"\\n\\n---\\n\\n\",\n \"\\n\\n___\\n\\n\",\n // Note that this splitter doesn't handle horizontal lines defined\n // by *three or more* of ***, ---, or ___, but this is not handled\n \"\\n\\n\",\n \"\\n\",\n \" \",\n \"\",\n ];\n } else if (language === \"latex\") {\n return [\n // First, try to split along Latex sections\n \"\\n\\\\chapter{\",\n \"\\n\\\\section{\",\n \"\\n\\\\subsection{\",\n \"\\n\\\\subsubsection{\",\n\n // Now split by environments\n \"\\n\\\\begin{enumerate}\",\n \"\\n\\\\begin{itemize}\",\n \"\\n\\\\begin{description}\",\n \"\\n\\\\begin{list}\",\n \"\\n\\\\begin{quote}\",\n \"\\n\\\\begin{quotation}\",\n \"\\n\\\\begin{verse}\",\n \"\\n\\\\begin{verbatim}\",\n\n // Now split by math environments\n \"\\n\\\\begin{align}\",\n \"$$\",\n \"$\",\n\n // Now split by the normal type of lines\n \"\\n\\n\",\n \"\\n\",\n \" \",\n \"\",\n ];\n } else if (language === \"html\") {\n return [\n // First, try to split along HTML tags\n \"<body>\",\n \"<div>\",\n \"<p>\",\n \"<br>\",\n \"<li>\",\n \"<h1>\",\n \"<h2>\",\n \"<h3>\",\n \"<h4>\",\n \"<h5>\",\n \"<h6>\",\n \"<span>\",\n \"<table>\",\n \"<tr>\",\n \"<td>\",\n \"<th>\",\n \"<ul>\",\n \"<ol>\",\n \"<header>\",\n \"<footer>\",\n \"<nav>\",\n // Head\n \"<head>\",\n \"<style>\",\n \"<script>\",\n \"<meta>\",\n \"<title>\",\n // Normal type of lines\n \" \",\n \"\",\n ];\n } else if (language === \"sol\") {\n return [\n // Split along compiler informations definitions\n \"\\npragma \",\n \"\\nusing \",\n // Split along contract definitions\n \"\\ncontract \",\n \"\\ninterface \",\n \"\\nlibrary \",\n // Split along method definitions\n \"\\nconstructor \",\n \"\\ntype \",\n \"\\nfunction \",\n \"\\nevent \",\n \"\\nmodifier \",\n \"\\nerror \",\n \"\\nstruct \",\n \"\\nenum \",\n // Split along control flow statements\n \"\\nif \",\n \"\\nfor \",\n \"\\nwhile \",\n \"\\ndo while \",\n \"\\nassembly \",\n // Split by the normal type of lines\n \"\\n\\n\",\n \"\\n\",\n \" \",\n \"\",\n ];\n } else {\n throw new Error(`Language ${language} is not supported.`);\n }\n }\n}\n\nexport interface TokenTextSplitterParams extends TextSplitterParams {\n encodingName: tiktoken.TiktokenEncoding;\n allowedSpecial: \"all\" | Array<string>;\n disallowedSpecial: \"all\" | Array<string>;\n}\n\n/**\n * Implementation of splitter which looks at tokens.\n */\nexport class TokenTextSplitter\n extends TextSplitter\n implements TokenTextSplitterParams\n{\n static lc_name() {\n return \"TokenTextSplitter\";\n }\n\n encodingName: tiktoken.TiktokenEncoding;\n\n allowedSpecial: \"all\" | Array<string>;\n\n disallowedSpecial: \"all\" | Array<string>;\n\n private tokenizer: tiktoken.Tiktoken;\n\n constructor(fields?: Partial<TokenTextSplitterParams>) {\n super(fields);\n\n this.encodingName = fields?.encodingName ?? \"gpt2\";\n this.allowedSpecial = fields?.allowedSpecial ?? [];\n this.disallowedSpecial = fields?.disallowedSpecial ?? \"all\";\n }\n\n async splitText(text: string): Promise<string[]> {\n if (!this.tokenizer) {\n this.tokenizer = await getEncoding(this.encodingName);\n }\n\n const splits: string[] = [];\n\n const input_ids = this.tokenizer.encode(\n text,\n this.allowedSpecial,\n this.disallowedSpecial\n );\n\n let start_idx = 0;\n\n while (start_idx < input_ids.length) {\n if (start_idx > 0) {\n start_idx -= this.chunkOverlap;\n }\n const end_idx = Math.min(start_idx + this.chunkSize, input_ids.length);\n const chunk_ids = input_ids.slice(start_idx, end_idx);\n splits.push(this.tokenizer.decode(chunk_ids));\n start_idx = end_idx;\n }\n\n return splits;\n }\n}\n\nexport type MarkdownTextSplitterParams = TextSplitterParams;\n\nexport class MarkdownTextSplitter\n extends RecursiveCharacterTextSplitter\n implements MarkdownTextSplitterParams\n{\n constructor(fields?: Partial<MarkdownTextSplitterParams>) {\n super({\n ...fields,\n separators:\n RecursiveCharacterTextSplitter.getSeparatorsForLanguage(\"markdown\"),\n });\n }\n}\n\nexport type LatexTextSplitterParams = TextSplitterParams;\n\nexport class LatexTextSplitter\n extends RecursiveCharacterTextSplitter\n implements LatexTextSplitterParams\n{\n constructor(fields?: Partial<LatexTextSplitterParams>) {\n super({\n ...fields,\n separators:\n RecursiveCharacterTextSplitter.getSeparatorsForLanguage(\"latex\"),\n });\n }\n}\n"],"mappings":";;;AAmBA,IAAsB,eAAtB,cACU,wBAEV;CACE,eAAe;EAAC;EAAa;EAAyB;EAAiB;CAEvE,YAAY;CAEZ,eAAe;CAEf,gBAAgB;CAEhB;CAIA,YAAY,QAAsC;AAChD,QAAM,OAAO;AACb,OAAK,YAAY,QAAQ,aAAa,KAAK;AAC3C,OAAK,eAAe,QAAQ,gBAAgB,KAAK;AACjD,OAAK,gBAAgB,QAAQ,iBAAiB,KAAK;AACnD,OAAK,iBACH,QAAQ,oBAAoB,SAAiB,KAAK;AACpD,MAAI,KAAK,gBAAgB,KAAK,UAC5B,OAAM,IAAI,MAAM,wCAAwC;;CAI5D,MAAM,mBACJ,WACA,qBAAqD,EAAE,EAClC;AACrB,SAAO,KAAK,eAAe,WAAW,mBAAmB;;CAK3D,iBAA2B,MAAc,WAA6B;EACpE,IAAI;AACJ,MAAI,UACF,KAAI,KAAK,eAAe;GACtB,MAAM,wBAAwB,UAAU,QACtC,0BACA,OACD;AACD,YAAS,KAAK,MAAM,IAAI,OAAO,MAAM,sBAAsB,GAAG,CAAC;QAE/D,UAAS,KAAK,MAAM,UAAU;MAGhC,UAAS,KAAK,MAAM,GAAG;AAEzB,SAAO,OAAO,QAAQ,MAAM,MAAM,GAAG;;CAGvC,MAAM,gBACJ,OAEA,YAAmC,EAAE,EACrC,qBAAqD,EAAE,EAClC;EAGrB,MAAM,aACJ,UAAU,SAAS,IACf,YACA,CAAC,GAAG,MAAM,MAAM,OAAO,CAAC,CAAC,WAAW,EAAE,EAAE;EAC9C,MAAM,EACJ,cAAc,IACd,qBAAqB,aACrB,2BAA2B,UACzB;EACJ,MAAM,YAAY,IAAI,OAAiB;AACvC,OAAK,IAAI,IAAI,GAAG,IAAI,MAAM,QAAQ,KAAK,GAAG;GACxC,MAAM,OAAO,MAAM;GACnB,IAAI,mBAAmB;GACvB,IAAI,YAAY;GAChB,IAAI,iBAAiB;AACrB,QAAK,MAAM,SAAS,MAAM,KAAK,UAAU,KAAK,EAAE;IAC9C,IAAI,cAAc;IAGlB,MAAM,aAAa,KAAK,QAAQ,OAAO,iBAAiB,EAAE;AAC1D,QAAI,cAAc,MAAM;KACtB,MAAM,2BAA2B,KAAK,iBACpC,MACA,GACA,WACD;AACD,yBAAoB;WACf;KACL,MAAM,oBACJ,iBAAkB,MAAM,KAAK,eAAe,UAAU;AACxD,SAAI,oBAAoB,YAAY;MAClC,MAAM,+BAA+B,KAAK,iBACxC,MACA,mBACA,WACD;AACD,0BAAoB;gBACX,oBAAoB,YAAY;MACzC,MAAM,+BAA+B,KAAK,iBACxC,MACA,YACA,kBACD;AACD,0BAAoB;;AAEtB,SAAI,yBACF,gBAAe;;IAGnB,MAAM,gBAAgB,KAAK,iBAAiB,MAAM;IAElD,MAAM,MACJ,WAAW,GAAG,OAAO,OAAO,WAAW,GAAG,QAAQ,WAC9C,EAAE,GAAG,WAAW,GAAG,KAAK,GACxB,EAAE;AACR,QAAI,QAAQ;KACV,MAAM;KACN,IAAI,mBAAmB;KACxB;IACD,MAAM,0BAA0B;KAC9B,GAAG,WAAW;KACd;KACD;AAED,mBAAe;AACf,cAAU,KACR,IAAI,SAAS;KACX;KACA,UAAU;KACX,CAAC,CACH;AACD,wBAAoB;AACpB,gBAAY;AACZ,qBAAiB;;;AAGrB,SAAO;;CAGT,iBAAyB,MAAc,OAAgB,KAAc;AAEnE,UADoB,KAAK,MAAM,OAAO,IAAI,CACtB,MAAM,MAAM,IAAI,EAAE,EAAE;;CAG1C,MAAM,eACJ,WACA,qBAAqD,EAAE,EAClC;EACrB,MAAM,oBAAoB,UAAU,QACjC,QAAQ,IAAI,gBAAgB,KAAA,EAC9B;EACD,MAAM,QAAQ,kBAAkB,KAAK,QAAQ,IAAI,YAAY;EAC7D,MAAM,YAAY,kBAAkB,KAAK,QAAQ,IAAI,SAAS;AAC9D,SAAO,KAAK,gBAAgB,OAAO,WAAW,mBAAmB;;CAGnE,SAAiB,MAAgB,WAAkC;EACjE,MAAM,OAAO,KAAK,KAAK,UAAU,CAAC,MAAM;AACxC,SAAO,SAAS,KAAK,OAAO;;CAG9B,MAAM,YAAY,QAAkB,WAAsC;EACxE,MAAM,OAAiB,EAAE;EACzB,MAAM,aAAuB,EAAE;EAC/B,IAAI,QAAQ;AACZ,OAAK,MAAM,KAAK,QAAQ;GACtB,MAAM,OAAO,MAAM,KAAK,eAAe,EAAE;AACzC,OACE,QAAQ,OAAO,WAAW,SAAS,UAAU,SAC7C,KAAK,WACL;AACA,QAAI,QAAQ,KAAK,UACf,SAAQ,KACN,2BAA2B,MAAM;qCACR,KAAK,YAC/B;AAEH,QAAI,WAAW,SAAS,GAAG;KACzB,MAAM,MAAM,KAAK,SAAS,YAAY,UAAU;AAChD,SAAI,QAAQ,KACV,MAAK,KAAK,IAAI;AAKhB,YACE,QAAQ,KAAK,gBACZ,QAAQ,OAAO,WAAW,SAAS,UAAU,SAC5C,KAAK,aACL,QAAQ,GACV;AACA,eAAS,MAAM,KAAK,eAAe,WAAW,GAAG;AACjD,iBAAW,OAAO;;;;AAIxB,cAAW,KAAK,EAAE;AAClB,YAAS;;EAEX,MAAM,MAAM,KAAK,SAAS,YAAY,UAAU;AAChD,MAAI,QAAQ,KACV,MAAK,KAAK,IAAI;AAEhB,SAAO;;;AAQX,IAAa,wBAAb,cACU,aAEV;CACE,OAAO,UAAU;AACf,SAAO;;CAGT,YAAY;CAEZ,YAAY,QAA+C;AACzD,QAAM,OAAO;AACb,OAAK,YAAY,QAAQ,aAAa,KAAK;;CAG7C,MAAM,UAAU,MAAiC;EAE/C,MAAM,SAAS,KAAK,iBAAiB,MAAM,KAAK,UAAU;AAC1D,SAAO,KAAK,YAAY,QAAQ,KAAK,gBAAgB,KAAK,KAAK,UAAU;;;AAQ7E,MAAa,iCAAiC;CAC5C;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACD;AAKD,IAAa,iCAAb,MAAa,uCACH,aAEV;CACE,OAAO,UAAU;AACf,SAAO;;CAGT,aAAuB;EAAC;EAAQ;EAAM;EAAK;EAAG;CAE9C,YAAY,QAAwD;AAClE,QAAM,OAAO;AACb,OAAK,aAAa,QAAQ,cAAc,KAAK;AAC7C,OAAK,gBAAgB,QAAQ,iBAAiB;;CAGhD,MAAc,WAAW,MAAc,YAAsB;EAC3D,MAAM,cAAwB,EAAE;EAGhC,IAAI,YAAoB,WAAW,WAAW,SAAS;EACvD,IAAI;AACJ,OAAK,IAAI,IAAI,GAAG,IAAI,WAAW,QAAQ,KAAK,GAAG;GAC7C,MAAM,IAAI,WAAW;AACrB,OAAI,MAAM,IAAI;AACZ,gBAAY;AACZ;;AAEF,OAAI,KAAK,SAAS,EAAE,EAAE;AACpB,gBAAY;AACZ,oBAAgB,WAAW,MAAM,IAAI,EAAE;AACvC;;;EAKJ,MAAM,SAAS,KAAK,iBAAiB,MAAM,UAAU;EAGrD,IAAI,aAAuB,EAAE;EAC7B,MAAM,aAAa,KAAK,gBAAgB,KAAK;AAC7C,OAAK,MAAM,KAAK,OACd,KAAK,MAAM,KAAK,eAAe,EAAE,GAAI,KAAK,UACxC,YAAW,KAAK,EAAE;OACb;AACL,OAAI,WAAW,QAAQ;IACrB,MAAM,aAAa,MAAM,KAAK,YAAY,YAAY,WAAW;AACjE,gBAAY,KAAK,GAAG,WAAW;AAC/B,iBAAa,EAAE;;AAEjB,OAAI,CAAC,cACH,aAAY,KAAK,EAAE;QACd;IACL,MAAM,YAAY,MAAM,KAAK,WAAW,GAAG,cAAc;AACzD,gBAAY,KAAK,GAAG,UAAU;;;AAIpC,MAAI,WAAW,QAAQ;GACrB,MAAM,aAAa,MAAM,KAAK,YAAY,YAAY,WAAW;AACjE,eAAY,KAAK,GAAG,WAAW;;AAEjC,SAAO;;CAGT,MAAM,UAAU,MAAiC;AAC/C,SAAO,KAAK,WAAW,MAAM,KAAK,WAAW;;CAG/C,OAAO,aACL,UACA,SACA;AACA,SAAO,IAAI,+BAA+B;GACxC,GAAG;GACH,YACE,+BAA+B,yBAAyB,SAAS;GACpE,CAAC;;CAGJ,OAAO,yBAAyB,UAAyC;AACvE,MAAI,aAAa,MACf,QAAO;GAEL;GAEA;GACA;GACA;GACA;GAEA;GACA;GACA;GACA;GACA;GAEA;GACA;GACA;GACA;GACD;WACQ,aAAa,KACtB,QAAO;GAEL;GACA;GACA;GACA;GAEA;GACA;GACA;GACA;GAEA;GACA;GACA;GACA;GACD;WACQ,aAAa,OACtB,QAAO;GAEL;GAEA;GACA;GACA;GACA;GAEA;GACA;GACA;GACA;GACA;GAEA;GACA;GACA;GACA;GACD;WACQ,aAAa,KACtB,QAAO;GAEL;GACA;GACA;GACA;GACA;GAEA;GACA;GACA;GACA;GACA;GACA;GAEA;GACA;GACA;GACA;GACD;WACQ,aAAa,MACtB,QAAO;GAEL;GAEA;GAEA;GACA;GACA;GACA;GACA;GACA;GAEA;GACA;GACA;GACA;GACD;WACQ,aAAa,QACtB,QAAO;GAEL;GAEA;GAEA;GAEA;GAEA;GAEA;GAEA;GACA;GACA;GACA;GACD;WACQ,aAAa,SACtB,QAAO;GAEL;GACA;GACA;GAEA;GACA;GACA;GACA;GACD;WACQ,aAAa,MACtB,QAAO;GAEL;GACA;GACA;GAEA;GAEA;GACA;GACA;GACA;GACD;WACQ,aAAa,OACtB,QAAO;GAEL;GACA;GAEA;GACA;GACA;GACA;GACA;GACA;GACA;GAEA;GACA;GACA;GACA;GACD;WACQ,aAAa,OACtB,QAAO;GAEL;GACA;GACA;GAEA;GACA;GACA;GACA;GACA;GACA;GAEA;GACA;GACA;GACA;GACD;WACQ,aAAa,QACtB,QAAO;GAEL;GACA;GAEA;GACA;GACA;GAEA;GACA;GACA;GACA;GACA;GAEA;GACA;GACA;GACA;GACD;WACQ,aAAa,QACtB,QAAO;GAEL;GAEA;GACA;GACA;GAEA;GACA;GACA;GACA;GACA;GACA;GAEA;GACA;GACA;GACA;GACD;WACQ,aAAa,WACtB,QAAO;GAEL;GACA;GACA;GACA;GACA;GAKA;GAEA;GACA;GACA;GAGA;GACA;GACA;GACA;GACD;WACQ,aAAa,QACtB,QAAO;GAEL;GACA;GACA;GACA;GAGA;GACA;GACA;GACA;GACA;GACA;GACA;GACA;GAGA;GACA;GACA;GAGA;GACA;GACA;GACA;GACD;WACQ,aAAa,OACtB,QAAO;GAEL;GACA;GACA;GACA;GACA;GACA;GACA;GACA;GACA;GACA;GACA;GACA;GACA;GACA;GACA;GACA;GACA;GACA;GACA;GACA;GACA;GAEA;GACA;GACA;GACA;GACA;GAEA;GACA;GACD;WACQ,aAAa,MACtB,QAAO;GAEL;GACA;GAEA;GACA;GACA;GAEA;GACA;GACA;GACA;GACA;GACA;GACA;GACA;GAEA;GACA;GACA;GACA;GACA;GAEA;GACA;GACA;GACA;GACD;MAED,OAAM,IAAI,MAAM,YAAY,SAAS,oBAAoB;;;;;;AAc/D,IAAa,oBAAb,cACU,aAEV;CACE,OAAO,UAAU;AACf,SAAO;;CAGT;CAEA;CAEA;CAEA;CAEA,YAAY,QAA2C;AACrD,QAAM,OAAO;AAEb,OAAK,eAAe,QAAQ,gBAAgB;AAC5C,OAAK,iBAAiB,QAAQ,kBAAkB,EAAE;AAClD,OAAK,oBAAoB,QAAQ,qBAAqB;;CAGxD,MAAM,UAAU,MAAiC;AAC/C,MAAI,CAAC,KAAK,UACR,MAAK,YAAY,MAAM,YAAY,KAAK,aAAa;EAGvD,MAAM,SAAmB,EAAE;EAE3B,MAAM,YAAY,KAAK,UAAU,OAC/B,MACA,KAAK,gBACL,KAAK,kBACN;EAED,IAAI,YAAY;AAEhB,SAAO,YAAY,UAAU,QAAQ;AACnC,OAAI,YAAY,EACd,cAAa,KAAK;GAEpB,MAAM,UAAU,KAAK,IAAI,YAAY,KAAK,WAAW,UAAU,OAAO;GACtE,MAAM,YAAY,UAAU,MAAM,WAAW,QAAQ;AACrD,UAAO,KAAK,KAAK,UAAU,OAAO,UAAU,CAAC;AAC7C,eAAY;;AAGd,SAAO;;;AAMX,IAAa,uBAAb,cACU,+BAEV;CACE,YAAY,QAA8C;AACxD,QAAM;GACJ,GAAG;GACH,YACE,+BAA+B,yBAAyB,WAAW;GACtE,CAAC;;;AAMN,IAAa,oBAAb,cACU,+BAEV;CACE,YAAY,QAA2C;AACrD,QAAM;GACJ,GAAG;GACH,YACE,+BAA+B,yBAAyB,QAAQ;GACnE,CAAC"}
|
package/package.json
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@dakshp1234/langchain-textsplitters",
|
|
3
|
+
"version": "1.0.1",
|
|
4
|
+
"description": "Various implementations of LangChain.js text splitters",
|
|
5
|
+
"author": "LangChain",
|
|
6
|
+
"license": "MIT",
|
|
7
|
+
"type": "module",
|
|
8
|
+
"engines": {
|
|
9
|
+
"node": ">=20"
|
|
10
|
+
},
|
|
11
|
+
"repository": {
|
|
12
|
+
"type": "git",
|
|
13
|
+
"url": "git@github.com:langchain-ai/langchainjs.git"
|
|
14
|
+
},
|
|
15
|
+
"homepage": "https://github.com/langchain-ai/langchainjs/tree/main/libs/langchain-textsplitters/",
|
|
16
|
+
"scripts": {
|
|
17
|
+
"build": "turbo build:compile --filter @dakshp1234/langchain-textsplitters --output-logs new-only",
|
|
18
|
+
"build:compile": "tsdown",
|
|
19
|
+
"clean": "rm -rf .turbo dist/",
|
|
20
|
+
"test": "vitest --run",
|
|
21
|
+
"test:watch": "vitest --watch"
|
|
22
|
+
},
|
|
23
|
+
"dependencies": {
|
|
24
|
+
"js-tiktoken": "^1.0.12"
|
|
25
|
+
},
|
|
26
|
+
"peerDependencies": {
|
|
27
|
+
"@langchain/core": "^1.0.0"
|
|
28
|
+
},
|
|
29
|
+
"devDependencies": {
|
|
30
|
+
"@langchain/core": "1.1.0",
|
|
31
|
+
"@langchain/tsconfig": "^1.0.3",
|
|
32
|
+
"@tsconfig/recommended": "^1.0.3",
|
|
33
|
+
"@vitest/coverage-v8": "^3.2.4",
|
|
34
|
+
"dotenv": "^17.4.0",
|
|
35
|
+
"dpdm": "^3.14.0",
|
|
36
|
+
"typescript": "~5.8.3",
|
|
37
|
+
"vitest": "^4.1.2"
|
|
38
|
+
},
|
|
39
|
+
"publishConfig": {
|
|
40
|
+
"access": "public"
|
|
41
|
+
},
|
|
42
|
+
"main": "./dist/index.cjs",
|
|
43
|
+
"types": "./dist/index.d.cts",
|
|
44
|
+
"exports": {
|
|
45
|
+
".": {
|
|
46
|
+
"input": "./src/index.ts",
|
|
47
|
+
"require": {
|
|
48
|
+
"types": "./dist/index.d.cts",
|
|
49
|
+
"default": "./dist/index.cjs"
|
|
50
|
+
},
|
|
51
|
+
"import": {
|
|
52
|
+
"types": "./dist/index.d.ts",
|
|
53
|
+
"default": "./dist/index.js"
|
|
54
|
+
}
|
|
55
|
+
},
|
|
56
|
+
"./package.json": "./package.json"
|
|
57
|
+
},
|
|
58
|
+
"files": [
|
|
59
|
+
"dist/",
|
|
60
|
+
"CHANGELOG.md",
|
|
61
|
+
"README.md",
|
|
62
|
+
"LICENSE"
|
|
63
|
+
],
|
|
64
|
+
"module": "./dist/index.js"
|
|
65
|
+
}
|