langchain-text-splitters 0.3.0.dev1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,58 @@
1
+ Metadata-Version: 2.1
2
+ Name: langchain-text-splitters
3
+ Version: 0.3.0.dev1
4
+ Summary: LangChain text splitting utilities
5
+ Home-page: https://github.com/langchain-ai/langchain
6
+ License: MIT
7
+ Requires-Python: >=3.9,<4.0
8
+ Classifier: License :: OSI Approved :: MIT License
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Programming Language :: Python :: 3.9
11
+ Classifier: Programming Language :: Python :: 3.10
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Requires-Dist: langchain-core (>=0.3.0.dev1,<0.4.0)
15
+ Project-URL: Repository, https://github.com/langchain-ai/langchain
16
+ Project-URL: Release Notes, https://github.com/langchain-ai/langchain/releases?q=tag%3A%22langchain-text-splitters%3D%3D0%22&expanded=true
17
+ Project-URL: Source Code, https://github.com/langchain-ai/langchain/tree/master/libs/text-splitters
18
+ Description-Content-Type: text/markdown
19
+
20
+ # 🦜✂️ LangChain Text Splitters
21
+
22
+ [![Downloads](https://static.pepy.tech/badge/langchain_text_splitters/month)](https://pepy.tech/project/langchain_text_splitters)
23
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
24
+
25
+ ## Quick Install
26
+
27
+ ```bash
28
+ pip install langchain-text-splitters
29
+ ```
30
+
31
+ ## What is it?
32
+
33
+ LangChain Text Splitters contains utilities for splitting a wide variety of text documents into chunks.
34
+
35
+ For full documentation see the [API reference](https://api.python.langchain.com/en/stable/text_splitters_api_reference.html)
36
+ and the [Text Splitters](https://python.langchain.com/docs/modules/data_connection/document_transformers/) module in the main docs.
37
+
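For a quick illustration, here is a minimal usage sketch based on the splitter classes exported by this package; the text and chunk sizes are arbitrary examples:

```python
# Minimal sketch; chunk_size/chunk_overlap values are illustrative only.
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20)

text = "LangChain Text Splitters breaks long documents into smaller chunks. " * 5
chunks = splitter.split_text(text)        # list of overlapping string chunks
docs = splitter.create_documents([text])  # the same chunks wrapped as Documents
```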
38
+ ## 📕 Releases & Versioning
39
+
40
+ `langchain-text-splitters` is currently on version `0.3.x`.
41
+
42
+ Minor version increases will occur for:
43
+
44
+ - Breaking changes for any public interfaces NOT marked `beta`
45
+
46
+ Patch version increases will occur for:
47
+
48
+ - Bug fixes
49
+ - New features
50
+ - Any changes to private interfaces
51
+ - Any changes to `beta` features
52
+
53
+ ## 💁 Contributing
54
+
55
+ As an open-source project in a rapidly developing field, we are extremely open to contributions, whether it be in the form of a new feature, improved infrastructure, or better documentation.
56
+
57
+ For detailed information on how to contribute, see the [Contributing Guide](https://python.langchain.com/docs/contributing/).
58
+
@@ -0,0 +1,38 @@
1
+ # 🦜✂️ LangChain Text Splitters
2
+
3
+ [![Downloads](https://static.pepy.tech/badge/langchain_text_splitters/month)](https://pepy.tech/project/langchain_text_splitters)
4
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
5
+
6
+ ## Quick Install
7
+
8
+ ```bash
9
+ pip install langchain-text-splitters
10
+ ```
11
+
12
+ ## What is it?
13
+
14
+ LangChain Text Splitters contains utilities for splitting a wide variety of text documents into chunks.
15
+
16
+ For full documentation see the [API reference](https://api.python.langchain.com/en/stable/text_splitters_api_reference.html)
17
+ and the [Text Splitters](https://python.langchain.com/docs/modules/data_connection/document_transformers/) module in the main docs.
18
+
19
+ ## 📕 Releases & Versioning
20
+
21
+ `langchain-text-splitters` is currently on version `0.3.x`.
22
+
23
+ Minor version increases will occur for:
24
+
25
+ - Breaking changes for any public interfaces NOT marked `beta`
26
+
27
+ Patch version increases will occur for:
28
+
29
+ - Bug fixes
30
+ - New features
31
+ - Any changes to private interfaces
32
+ - Any changes to `beta` features
33
+
34
+ ## 💁 Contributing
35
+
36
+ As an open-source project in a rapidly developing field, we are extremely open to contributions, whether it be in the form of a new feature, improved infrastructure, or better documentation.
37
+
38
+ For detailed information on how to contribute, see the [Contributing Guide](https://python.langchain.com/docs/contributing/).
@@ -0,0 +1,76 @@
1
+ """**Text Splitters** are classes for splitting text.
2
+
3
+
4
+ **Class hierarchy:**
5
+
6
+ .. code-block::
7
+
8
+ BaseDocumentTransformer --> TextSplitter --> <name>TextSplitter # Example: CharacterTextSplitter
9
+ RecursiveCharacterTextSplitter --> <name>TextSplitter
10
+
11
+ Note: **MarkdownHeaderTextSplitter** and **HTMLHeaderTextSplitter** do not derive from TextSplitter.
12
+
13
+
14
+ **Main helpers:**
15
+
16
+ .. code-block::
17
+
18
+ Document, Tokenizer, Language, LineType, HeaderType
19
+
20
+ """ # noqa: E501
21
+
22
+ from langchain_text_splitters.base import (
23
+ Language,
24
+ TextSplitter,
25
+ Tokenizer,
26
+ TokenTextSplitter,
27
+ split_text_on_tokens,
28
+ )
29
+ from langchain_text_splitters.character import (
30
+ CharacterTextSplitter,
31
+ RecursiveCharacterTextSplitter,
32
+ )
33
+ from langchain_text_splitters.html import (
34
+ ElementType,
35
+ HTMLHeaderTextSplitter,
36
+ HTMLSectionSplitter,
37
+ )
38
+ from langchain_text_splitters.json import RecursiveJsonSplitter
39
+ from langchain_text_splitters.konlpy import KonlpyTextSplitter
40
+ from langchain_text_splitters.latex import LatexTextSplitter
41
+ from langchain_text_splitters.markdown import (
42
+ HeaderType,
43
+ LineType,
44
+ MarkdownHeaderTextSplitter,
45
+ MarkdownTextSplitter,
46
+ )
47
+ from langchain_text_splitters.nltk import NLTKTextSplitter
48
+ from langchain_text_splitters.python import PythonCodeTextSplitter
49
+ from langchain_text_splitters.sentence_transformers import (
50
+ SentenceTransformersTokenTextSplitter,
51
+ )
52
+ from langchain_text_splitters.spacy import SpacyTextSplitter
53
+
54
+ __all__ = [
55
+ "TokenTextSplitter",
56
+ "TextSplitter",
57
+ "Tokenizer",
58
+ "Language",
59
+ "RecursiveCharacterTextSplitter",
60
+ "RecursiveJsonSplitter",
61
+ "LatexTextSplitter",
62
+ "PythonCodeTextSplitter",
63
+ "KonlpyTextSplitter",
64
+ "SpacyTextSplitter",
65
+ "NLTKTextSplitter",
66
+ "split_text_on_tokens",
67
+ "SentenceTransformersTokenTextSplitter",
68
+ "ElementType",
69
+ "HeaderType",
70
+ "LineType",
71
+ "HTMLHeaderTextSplitter",
72
+ "HTMLSectionSplitter",
73
+ "MarkdownHeaderTextSplitter",
74
+ "MarkdownTextSplitter",
75
+ "CharacterTextSplitter",
76
+ ]
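As the module docstring notes, the header-based splitters sit outside the `TextSplitter` hierarchy. A short sketch of how `MarkdownHeaderTextSplitter` is typically used; the header tuples and markdown text are illustrative:

```python
# Sketch only: MarkdownHeaderTextSplitter is not a TextSplitter subclass.
# split_text() returns Documents whose metadata records the enclosing headers.
from langchain_text_splitters import MarkdownHeaderTextSplitter

headers_to_split_on = [("#", "Header 1"), ("##", "Header 2")]
splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

markdown = "# Intro\nSome text.\n\n## Details\nMore text."
for doc in splitter.split_text(markdown):
    print(doc.metadata, doc.page_content)
```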
@@ -0,0 +1,328 @@
1
+ from __future__ import annotations
2
+
3
+ import copy
4
+ import logging
5
+ from abc import ABC, abstractmethod
6
+ from dataclasses import dataclass
7
+ from enum import Enum
8
+ from typing import (
9
+ AbstractSet,
10
+ Any,
11
+ Callable,
12
+ Collection,
13
+ Iterable,
14
+ List,
15
+ Literal,
16
+ Optional,
17
+ Sequence,
18
+ Type,
19
+ TypeVar,
20
+ Union,
21
+ )
22
+
23
+ from langchain_core.documents import BaseDocumentTransformer, Document
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+ TS = TypeVar("TS", bound="TextSplitter")
28
+
29
+
30
+ class TextSplitter(BaseDocumentTransformer, ABC):
31
+ """Interface for splitting text into chunks."""
32
+
33
+ def __init__(
34
+ self,
35
+ chunk_size: int = 4000,
36
+ chunk_overlap: int = 200,
37
+ length_function: Callable[[str], int] = len,
38
+ keep_separator: Union[bool, Literal["start", "end"]] = False,
39
+ add_start_index: bool = False,
40
+ strip_whitespace: bool = True,
41
+ ) -> None:
42
+ """Create a new TextSplitter.
43
+
44
+ Args:
45
+ chunk_size: Maximum size of chunks to return
46
+ chunk_overlap: Overlap in characters between chunks
47
+ length_function: Function that measures the length of given chunks
48
+ keep_separator: Whether to keep the separator and where to place it
49
+ in each corresponding chunk (True='start')
50
+ add_start_index: If `True`, includes chunk's start index in metadata
51
+ strip_whitespace: If `True`, strips whitespace from the start and end of
52
+ every document
53
+ """
54
+ if chunk_overlap > chunk_size:
55
+ raise ValueError(
56
+ f"Got a larger chunk overlap ({chunk_overlap}) than chunk size "
57
+ f"({chunk_size}), should be smaller."
58
+ )
59
+ self._chunk_size = chunk_size
60
+ self._chunk_overlap = chunk_overlap
61
+ self._length_function = length_function
62
+ self._keep_separator = keep_separator
63
+ self._add_start_index = add_start_index
64
+ self._strip_whitespace = strip_whitespace
65
+
66
+ @abstractmethod
67
+ def split_text(self, text: str) -> List[str]:
68
+ """Split text into multiple components."""
69
+
70
+ def create_documents(
71
+ self, texts: List[str], metadatas: Optional[List[dict]] = None
72
+ ) -> List[Document]:
73
+ """Create documents from a list of texts."""
74
+ _metadatas = metadatas or [{}] * len(texts)
75
+ documents = []
76
+ for i, text in enumerate(texts):
77
+ index = 0
78
+ previous_chunk_len = 0
79
+ for chunk in self.split_text(text):
80
+ metadata = copy.deepcopy(_metadatas[i])
81
+ if self._add_start_index:
82
+ offset = index + previous_chunk_len - self._chunk_overlap
83
+ index = text.find(chunk, max(0, offset))
84
+ metadata["start_index"] = index
85
+ previous_chunk_len = len(chunk)
86
+ new_doc = Document(page_content=chunk, metadata=metadata)
87
+ documents.append(new_doc)
88
+ return documents
89
+
90
+ def split_documents(self, documents: Iterable[Document]) -> List[Document]:
91
+ """Split documents."""
92
+ texts, metadatas = [], []
93
+ for doc in documents:
94
+ texts.append(doc.page_content)
95
+ metadatas.append(doc.metadata)
96
+ return self.create_documents(texts, metadatas=metadatas)
97
+
98
+ def _join_docs(self, docs: List[str], separator: str) -> Optional[str]:
99
+ text = separator.join(docs)
100
+ if self._strip_whitespace:
101
+ text = text.strip()
102
+ if text == "":
103
+ return None
104
+ else:
105
+ return text
106
+
107
+ def _merge_splits(self, splits: Iterable[str], separator: str) -> List[str]:
108
+ # We now want to combine these smaller pieces into medium size
109
+ # chunks to send to the LLM.
110
+ separator_len = self._length_function(separator)
111
+
112
+ docs = []
113
+ current_doc: List[str] = []
114
+ total = 0
115
+ for d in splits:
116
+ _len = self._length_function(d)
117
+ if (
118
+ total + _len + (separator_len if len(current_doc) > 0 else 0)
119
+ > self._chunk_size
120
+ ):
121
+ if total > self._chunk_size:
122
+ logger.warning(
123
+ f"Created a chunk of size {total}, "
124
+ f"which is longer than the specified {self._chunk_size}"
125
+ )
126
+ if len(current_doc) > 0:
127
+ doc = self._join_docs(current_doc, separator)
128
+ if doc is not None:
129
+ docs.append(doc)
130
+ # Keep on popping if:
131
+ # - the accumulated total still exceeds the chunk overlap
133
+ # - or adding the next split would still exceed the chunk size (and there is something to pop)
133
+ while total > self._chunk_overlap or (
134
+ total + _len + (separator_len if len(current_doc) > 0 else 0)
135
+ > self._chunk_size
136
+ and total > 0
137
+ ):
138
+ total -= self._length_function(current_doc[0]) + (
139
+ separator_len if len(current_doc) > 1 else 0
140
+ )
141
+ current_doc = current_doc[1:]
142
+ current_doc.append(d)
143
+ total += _len + (separator_len if len(current_doc) > 1 else 0)
144
+ doc = self._join_docs(current_doc, separator)
145
+ if doc is not None:
146
+ docs.append(doc)
147
+ return docs
148
+
149
+ @classmethod
150
+ def from_huggingface_tokenizer(cls, tokenizer: Any, **kwargs: Any) -> TextSplitter:
151
+ """Text splitter that uses HuggingFace tokenizer to count length."""
152
+ try:
153
+ from transformers import PreTrainedTokenizerBase
154
+
155
+ if not isinstance(tokenizer, PreTrainedTokenizerBase):
156
+ raise ValueError(
157
+ "Tokenizer received was not an instance of PreTrainedTokenizerBase"
158
+ )
159
+
160
+ def _huggingface_tokenizer_length(text: str) -> int:
161
+ return len(tokenizer.encode(text))
162
+
163
+ except ImportError:
164
+ raise ValueError(
165
+ "Could not import transformers python package. "
166
+ "Please install it with `pip install transformers`."
167
+ )
168
+ return cls(length_function=_huggingface_tokenizer_length, **kwargs)
169
+
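A hedged usage sketch for `from_huggingface_tokenizer`: any `PreTrainedTokenizerBase` works; `bert-base-uncased` and the chunk sizes are only examples, and `CharacterTextSplitter` is one concrete subclass this could be called on.

```python
# Illustrative only: measure chunk length in Hugging Face tokens.
from transformers import AutoTokenizer
from langchain_text_splitters import CharacterTextSplitter

hf_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # example model
splitter = CharacterTextSplitter.from_huggingface_tokenizer(
    hf_tokenizer, chunk_size=256, chunk_overlap=32
)
chunks = splitter.split_text("Some long document text ...")
```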
170
+ @classmethod
171
+ def from_tiktoken_encoder(
172
+ cls: Type[TS],
173
+ encoding_name: str = "gpt2",
174
+ model_name: Optional[str] = None,
175
+ allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),
176
+ disallowed_special: Union[Literal["all"], Collection[str]] = "all",
177
+ **kwargs: Any,
178
+ ) -> TS:
179
+ """Text splitter that uses tiktoken encoder to count length."""
180
+ try:
181
+ import tiktoken
182
+ except ImportError:
183
+ raise ImportError(
184
+ "Could not import tiktoken python package. "
185
+ "This is needed in order to calculate max_tokens_for_prompt. "
186
+ "Please install it with `pip install tiktoken`."
187
+ )
188
+
189
+ if model_name is not None:
190
+ enc = tiktoken.encoding_for_model(model_name)
191
+ else:
192
+ enc = tiktoken.get_encoding(encoding_name)
193
+
194
+ def _tiktoken_encoder(text: str) -> int:
195
+ return len(
196
+ enc.encode(
197
+ text,
198
+ allowed_special=allowed_special,
199
+ disallowed_special=disallowed_special,
200
+ )
201
+ )
202
+
203
+ if issubclass(cls, TokenTextSplitter):
204
+ extra_kwargs = {
205
+ "encoding_name": encoding_name,
206
+ "model_name": model_name,
207
+ "allowed_special": allowed_special,
208
+ "disallowed_special": disallowed_special,
209
+ }
210
+ kwargs = {**kwargs, **extra_kwargs}
211
+
212
+ return cls(length_function=_tiktoken_encoder, **kwargs)
213
+
214
+ def transform_documents(
215
+ self, documents: Sequence[Document], **kwargs: Any
216
+ ) -> Sequence[Document]:
217
+ """Transform sequence of documents by splitting them."""
218
+ return self.split_documents(list(documents))
219
+
220
+
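Because `TextSplitter` is abstract, it is used through concrete subclasses. A minimal sketch of the contract, under the assumption that the hypothetical `ParagraphSplitter` below exists only for illustration (the real splitters live in the other modules of this package):

```python
# Illustrative only: the one abstract method is split_text; everything else
# (create_documents, split_documents, start-index metadata) is inherited.
from typing import List

from langchain_text_splitters import TextSplitter


class ParagraphSplitter(TextSplitter):  # hypothetical example class
    def split_text(self, text: str) -> List[str]:
        # naive split on blank lines; a real splitter would re-pack pieces
        # with the inherited _merge_splits to respect chunk_size/chunk_overlap
        return [p for p in text.split("\n\n") if p.strip()]


splitter = ParagraphSplitter(chunk_size=500, chunk_overlap=0, add_start_index=True)
docs = splitter.create_documents(["first paragraph\n\nsecond paragraph"])
# each Document carries metadata["start_index"] because add_start_index=True
```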
221
+ class TokenTextSplitter(TextSplitter):
222
+ """Splitting text to tokens using model tokenizer."""
223
+
224
+ def __init__(
225
+ self,
226
+ encoding_name: str = "gpt2",
227
+ model_name: Optional[str] = None,
228
+ allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),
229
+ disallowed_special: Union[Literal["all"], Collection[str]] = "all",
230
+ **kwargs: Any,
231
+ ) -> None:
232
+ """Create a new TextSplitter."""
233
+ super().__init__(**kwargs)
234
+ try:
235
+ import tiktoken
236
+ except ImportError:
237
+ raise ImportError(
238
+ "Could not import tiktoken python package. "
239
+ "This is needed in order to for TokenTextSplitter. "
240
+ "Please install it with `pip install tiktoken`."
241
+ )
242
+
243
+ if model_name is not None:
244
+ enc = tiktoken.encoding_for_model(model_name)
245
+ else:
246
+ enc = tiktoken.get_encoding(encoding_name)
247
+ self._tokenizer = enc
248
+ self._allowed_special = allowed_special
249
+ self._disallowed_special = disallowed_special
250
+
251
+ def split_text(self, text: str) -> List[str]:
252
+ def _encode(_text: str) -> List[int]:
253
+ return self._tokenizer.encode(
254
+ _text,
255
+ allowed_special=self._allowed_special,
256
+ disallowed_special=self._disallowed_special,
257
+ )
258
+
259
+ tokenizer = Tokenizer(
260
+ chunk_overlap=self._chunk_overlap,
261
+ tokens_per_chunk=self._chunk_size,
262
+ decode=self._tokenizer.decode,
263
+ encode=_encode,
264
+ )
265
+
266
+ return split_text_on_tokens(text=text, tokenizer=tokenizer)
267
+
268
+
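A hedged sketch of token-based splitting. Both paths require the optional `tiktoken` package; `cl100k_base` is one of tiktoken's encodings and the sizes are illustrative:

```python
# Illustrative only: split on token boundaries, or keep character-based
# splitting while measuring chunk length in tokens.
from langchain_text_splitters import RecursiveCharacterTextSplitter, TokenTextSplitter

# Split directly on token boundaries:
token_splitter = TokenTextSplitter(
    encoding_name="cl100k_base", chunk_size=256, chunk_overlap=32
)
token_chunks = token_splitter.split_text("Some long text ...")

# Or keep character-based splitting but count length in tokens:
char_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    encoding_name="cl100k_base", chunk_size=256, chunk_overlap=32
)
char_chunks = char_splitter.split_text("Some long text ...")
```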
269
+ class Language(str, Enum):
270
+ """Enum of the programming languages."""
271
+
272
+ CPP = "cpp"
273
+ GO = "go"
274
+ JAVA = "java"
275
+ KOTLIN = "kotlin"
276
+ JS = "js"
277
+ TS = "ts"
278
+ PHP = "php"
279
+ PROTO = "proto"
280
+ PYTHON = "python"
281
+ RST = "rst"
282
+ RUBY = "ruby"
283
+ RUST = "rust"
284
+ SCALA = "scala"
285
+ SWIFT = "swift"
286
+ MARKDOWN = "markdown"
287
+ LATEX = "latex"
288
+ HTML = "html"
289
+ SOL = "sol"
290
+ CSHARP = "csharp"
291
+ COBOL = "cobol"
292
+ C = "c"
293
+ LUA = "lua"
294
+ PERL = "perl"
295
+ HASKELL = "haskell"
296
+ ELIXIR = "elixir"
297
+ POWERSHELL = "powershell"
298
+
299
+
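The `Language` values above are consumed by `RecursiveCharacterTextSplitter.from_language`, which is defined in this package's character module rather than in this file. A hedged usage sketch with illustrative sizes:

```python
# Illustrative only: language-aware separators for source code.
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter

python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON, chunk_size=200, chunk_overlap=0
)
code_chunks = python_splitter.split_text("def hello():\n    print('hello world')\n")
```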
300
+ @dataclass(frozen=True)
301
+ class Tokenizer:
302
+ """Tokenizer data class."""
303
+
304
+ chunk_overlap: int
305
+ """Overlap in tokens between chunks"""
306
+ tokens_per_chunk: int
307
+ """Maximum number of tokens per chunk"""
308
+ decode: Callable[[List[int]], str]
309
+ """ Function to decode a list of token ids to a string"""
310
+ encode: Callable[[str], List[int]]
311
+ """ Function to encode a string to a list of token ids"""
312
+
313
+
314
+ def split_text_on_tokens(*, text: str, tokenizer: Tokenizer) -> List[str]:
315
+ """Split incoming text and return chunks using tokenizer."""
316
+ splits: List[str] = []
317
+ input_ids = tokenizer.encode(text)
318
+ start_idx = 0
319
+ cur_idx = min(start_idx + tokenizer.tokens_per_chunk, len(input_ids))
320
+ chunk_ids = input_ids[start_idx:cur_idx]
321
+ while start_idx < len(input_ids):
322
+ splits.append(tokenizer.decode(chunk_ids))
323
+ if cur_idx == len(input_ids):
324
+ break
325
+ start_idx += tokenizer.tokens_per_chunk - tokenizer.chunk_overlap
326
+ cur_idx = min(start_idx + tokenizer.tokens_per_chunk, len(input_ids))
327
+ chunk_ids = input_ids[start_idx:cur_idx]
328
+ return splits
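To make the sliding window in `split_text_on_tokens` concrete, a toy sketch: the whitespace "tokenizer" below is purely illustrative, not a real tokenizer. With `tokens_per_chunk=5` and `chunk_overlap=2`, the window advances by 3 tokens per step.

```python
# Toy example only: ids are word positions in a vocabulary built on the fly.
from langchain_text_splitters import Tokenizer, split_text_on_tokens

vocab: dict = {}

def _encode(text: str) -> list:
    return [vocab.setdefault(word, len(vocab)) for word in text.split()]

def _decode(ids: list) -> str:
    reverse = {i: word for word, i in vocab.items()}
    return " ".join(reverse[i] for i in ids)

toy = Tokenizer(chunk_overlap=2, tokens_per_chunk=5, decode=_decode, encode=_encode)
chunks = split_text_on_tokens(text="one two three four five six seven eight", tokenizer=toy)
# Window of 5 tokens, stride of 3 (tokens_per_chunk - chunk_overlap):
# ['one two three four five', 'four five six seven eight']
```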