langchain-text-splitters 0.3.0.dev1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langchain_text_splitters-0.3.0.dev1/PKG-INFO +58 -0
- langchain_text_splitters-0.3.0.dev1/README.md +38 -0
- langchain_text_splitters-0.3.0.dev1/langchain_text_splitters/__init__.py +76 -0
- langchain_text_splitters-0.3.0.dev1/langchain_text_splitters/base.py +328 -0
- langchain_text_splitters-0.3.0.dev1/langchain_text_splitters/character.py +692 -0
- langchain_text_splitters-0.3.0.dev1/langchain_text_splitters/html.py +321 -0
- langchain_text_splitters-0.3.0.dev1/langchain_text_splitters/json.py +128 -0
- langchain_text_splitters-0.3.0.dev1/langchain_text_splitters/konlpy.py +36 -0
- langchain_text_splitters-0.3.0.dev1/langchain_text_splitters/latex.py +15 -0
- langchain_text_splitters-0.3.0.dev1/langchain_text_splitters/markdown.py +382 -0
- langchain_text_splitters-0.3.0.dev1/langchain_text_splitters/nltk.py +31 -0
- langchain_text_splitters-0.3.0.dev1/langchain_text_splitters/py.typed +0 -0
- langchain_text_splitters-0.3.0.dev1/langchain_text_splitters/python.py +15 -0
- langchain_text_splitters-0.3.0.dev1/langchain_text_splitters/sentence_transformers.py +77 -0
- langchain_text_splitters-0.3.0.dev1/langchain_text_splitters/spacy.py +61 -0
- langchain_text_splitters-0.3.0.dev1/langchain_text_splitters/xsl/converting_to_header.xslt +29 -0
- langchain_text_splitters-0.3.0.dev1/langchain_text_splitters/xsl/html_chunks_with_headers.xslt +199 -0
- langchain_text_splitters-0.3.0.dev1/pyproject.toml +84 -0

langchain_text_splitters-0.3.0.dev1/PKG-INFO
@@ -0,0 +1,58 @@
+Metadata-Version: 2.1
+Name: langchain-text-splitters
+Version: 0.3.0.dev1
+Summary: LangChain text splitting utilities
+Home-page: https://github.com/langchain-ai/langchain
+License: MIT
+Requires-Python: >=3.9,<4.0
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Requires-Dist: langchain-core (>=0.3.0.dev1,<0.4.0)
+Project-URL: Repository, https://github.com/langchain-ai/langchain
+Project-URL: Release Notes, https://github.com/langchain-ai/langchain/releases?q=tag%3A%22langchain-text-splitters%3D%3D0%22&expanded=true
+Project-URL: Source Code, https://github.com/langchain-ai/langchain/tree/master/libs/text-splitters
+Description-Content-Type: text/markdown
+
+# 🦜✂️ LangChain Text Splitters
+
+[](https://pepy.tech/project/langchain_text_splitters)
+[](https://opensource.org/licenses/MIT)
+
+## Quick Install
+
+```bash
+pip install langchain-text-splitters
+```
+
+## What is it?
+
+LangChain Text Splitters contains utilities for splitting a wide variety of text documents into chunks.
+
+For full documentation see the [API reference](https://api.python.langchain.com/en/stable/text_splitters_api_reference.html)
+and the [Text Splitters](https://python.langchain.com/docs/modules/data_connection/document_transformers/) module in the main docs.
+
+## 📕 Releases & Versioning
+
+`langchain-text-splitters` is currently on version `0.0.x`.
+
+Minor version increases will occur for:
+
+- Breaking changes for any public interfaces NOT marked `beta`
+
+Patch version increases will occur for:
+
+- Bug fixes
+- New features
+- Any changes to private interfaces
+- Any changes to `beta` features
+
+## 💁 Contributing
+
+As an open-source project in a rapidly developing field, we are extremely open to contributions, whether it be in the form of a new feature, improved infrastructure, or better documentation.
+
+For detailed information on how to contribute, see the [Contributing Guide](https://python.langchain.com/docs/contributing/).
+
langchain_text_splitters-0.3.0.dev1/README.md
@@ -0,0 +1,38 @@
+# 🦜✂️ LangChain Text Splitters
+
+[](https://pepy.tech/project/langchain_text_splitters)
+[](https://opensource.org/licenses/MIT)
+
+## Quick Install
+
+```bash
+pip install langchain-text-splitters
+```
+
+## What is it?
+
+LangChain Text Splitters contains utilities for splitting a wide variety of text documents into chunks.
+
+For full documentation see the [API reference](https://api.python.langchain.com/en/stable/text_splitters_api_reference.html)
+and the [Text Splitters](https://python.langchain.com/docs/modules/data_connection/document_transformers/) module in the main docs.
+
+## 📕 Releases & Versioning
+
+`langchain-text-splitters` is currently on version `0.0.x`.
+
+Minor version increases will occur for:
+
+- Breaking changes for any public interfaces NOT marked `beta`
+
+Patch version increases will occur for:
+
+- Bug fixes
+- New features
+- Any changes to private interfaces
+- Any changes to `beta` features
+
+## 💁 Contributing
+
+As an open-source project in a rapidly developing field, we are extremely open to contributions, whether it be in the form of a new feature, improved infrastructure, or better documentation.
+
+For detailed information on how to contribute, see the [Contributing Guide](https://python.langchain.com/docs/contributing/).
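
As a quick illustration of how the package described above is typically used, the sketch below installs nothing beyond `langchain-text-splitters` itself and calls the most common splitter it exports. It is not part of the packaged files, and the size parameters are arbitrary example values.

```python
# Illustrative usage sketch (not included in the package diff above).
# Assumes `pip install langchain-text-splitters` has been run.
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,    # maximum characters per chunk (example value)
    chunk_overlap=20,  # characters shared between neighbouring chunks
)

text = (
    "LangChain Text Splitters breaks long documents into smaller, "
    "overlapping chunks that fit into a model's context window."
)

for chunk in splitter.split_text(text):
    print(repr(chunk))
```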

langchain_text_splitters-0.3.0.dev1/langchain_text_splitters/__init__.py
@@ -0,0 +1,76 @@
+"""**Text Splitters** are classes for splitting text.
+
+
+**Class hierarchy:**
+
+.. code-block::
+
+    BaseDocumentTransformer --> TextSplitter --> <name>TextSplitter  # Example: CharacterTextSplitter
+    RecursiveCharacterTextSplitter --> <name>TextSplitter
+
+Note: **MarkdownHeaderTextSplitter** and **HTMLHeaderTextSplitter** do not derive from TextSplitter.
+
+
+**Main helpers:**
+
+.. code-block::
+
+    Document, Tokenizer, Language, LineType, HeaderType
+
+"""  # noqa: E501
+
+from langchain_text_splitters.base import (
+    Language,
+    TextSplitter,
+    Tokenizer,
+    TokenTextSplitter,
+    split_text_on_tokens,
+)
+from langchain_text_splitters.character import (
+    CharacterTextSplitter,
+    RecursiveCharacterTextSplitter,
+)
+from langchain_text_splitters.html import (
+    ElementType,
+    HTMLHeaderTextSplitter,
+    HTMLSectionSplitter,
+)
+from langchain_text_splitters.json import RecursiveJsonSplitter
+from langchain_text_splitters.konlpy import KonlpyTextSplitter
+from langchain_text_splitters.latex import LatexTextSplitter
+from langchain_text_splitters.markdown import (
+    HeaderType,
+    LineType,
+    MarkdownHeaderTextSplitter,
+    MarkdownTextSplitter,
+)
+from langchain_text_splitters.nltk import NLTKTextSplitter
+from langchain_text_splitters.python import PythonCodeTextSplitter
+from langchain_text_splitters.sentence_transformers import (
+    SentenceTransformersTokenTextSplitter,
+)
+from langchain_text_splitters.spacy import SpacyTextSplitter
+
+__all__ = [
+    "TokenTextSplitter",
+    "TextSplitter",
+    "Tokenizer",
+    "Language",
+    "RecursiveCharacterTextSplitter",
+    "RecursiveJsonSplitter",
+    "LatexTextSplitter",
+    "PythonCodeTextSplitter",
+    "KonlpyTextSplitter",
+    "SpacyTextSplitter",
+    "NLTKTextSplitter",
+    "split_text_on_tokens",
+    "SentenceTransformersTokenTextSplitter",
+    "ElementType",
+    "HeaderType",
+    "LineType",
+    "HTMLHeaderTextSplitter",
+    "HTMLSectionSplitter",
+    "MarkdownHeaderTextSplitter",
+    "MarkdownTextSplitter",
+    "CharacterTextSplitter",
+]
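
The module docstring above documents the class hierarchy (BaseDocumentTransformer --> TextSplitter --> concrete splitters) and notes that the header-based splitters sit outside it. A small sketch checking that hierarchy at the package's public import surface; the `headers_to_split_on` argument is just an illustrative value.

```python
# Illustrative check of the class hierarchy described in the __init__ docstring.
from langchain_core.documents import BaseDocumentTransformer
from langchain_text_splitters import (
    CharacterTextSplitter,
    MarkdownHeaderTextSplitter,
    TextSplitter,
)

splitter = CharacterTextSplitter()
assert isinstance(splitter, TextSplitter)             # concrete splitter -> TextSplitter
assert isinstance(splitter, BaseDocumentTransformer)  # TextSplitter -> BaseDocumentTransformer

# MarkdownHeaderTextSplitter is exported alongside the others but, as the
# docstring notes, it does not derive from TextSplitter.
md_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=[("#", "Header 1")])
assert not isinstance(md_splitter, TextSplitter)
```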

langchain_text_splitters-0.3.0.dev1/langchain_text_splitters/base.py
@@ -0,0 +1,328 @@
+from __future__ import annotations
+
+import copy
+import logging
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from enum import Enum
+from typing import (
+    AbstractSet,
+    Any,
+    Callable,
+    Collection,
+    Iterable,
+    List,
+    Literal,
+    Optional,
+    Sequence,
+    Type,
+    TypeVar,
+    Union,
+)
+
+from langchain_core.documents import BaseDocumentTransformer, Document
+
+logger = logging.getLogger(__name__)
+
+TS = TypeVar("TS", bound="TextSplitter")
+
+
+class TextSplitter(BaseDocumentTransformer, ABC):
+    """Interface for splitting text into chunks."""
+
+    def __init__(
+        self,
+        chunk_size: int = 4000,
+        chunk_overlap: int = 200,
+        length_function: Callable[[str], int] = len,
+        keep_separator: Union[bool, Literal["start", "end"]] = False,
+        add_start_index: bool = False,
+        strip_whitespace: bool = True,
+    ) -> None:
+        """Create a new TextSplitter.
+
+        Args:
+            chunk_size: Maximum size of chunks to return
+            chunk_overlap: Overlap in characters between chunks
+            length_function: Function that measures the length of given chunks
+            keep_separator: Whether to keep the separator and where to place it
+                in each corresponding chunk (True='start')
+            add_start_index: If `True`, includes chunk's start index in metadata
+            strip_whitespace: If `True`, strips whitespace from the start and end of
+                every document
+        """
+        if chunk_overlap > chunk_size:
+            raise ValueError(
+                f"Got a larger chunk overlap ({chunk_overlap}) than chunk size "
+                f"({chunk_size}), should be smaller."
+            )
+        self._chunk_size = chunk_size
+        self._chunk_overlap = chunk_overlap
+        self._length_function = length_function
+        self._keep_separator = keep_separator
+        self._add_start_index = add_start_index
+        self._strip_whitespace = strip_whitespace
+
+    @abstractmethod
+    def split_text(self, text: str) -> List[str]:
+        """Split text into multiple components."""
+
+    def create_documents(
+        self, texts: List[str], metadatas: Optional[List[dict]] = None
+    ) -> List[Document]:
+        """Create documents from a list of texts."""
+        _metadatas = metadatas or [{}] * len(texts)
+        documents = []
+        for i, text in enumerate(texts):
+            index = 0
+            previous_chunk_len = 0
+            for chunk in self.split_text(text):
+                metadata = copy.deepcopy(_metadatas[i])
+                if self._add_start_index:
+                    offset = index + previous_chunk_len - self._chunk_overlap
+                    index = text.find(chunk, max(0, offset))
+                    metadata["start_index"] = index
+                    previous_chunk_len = len(chunk)
+                new_doc = Document(page_content=chunk, metadata=metadata)
+                documents.append(new_doc)
+        return documents
+
+    def split_documents(self, documents: Iterable[Document]) -> List[Document]:
+        """Split documents."""
+        texts, metadatas = [], []
+        for doc in documents:
+            texts.append(doc.page_content)
+            metadatas.append(doc.metadata)
+        return self.create_documents(texts, metadatas=metadatas)
+
+    def _join_docs(self, docs: List[str], separator: str) -> Optional[str]:
+        text = separator.join(docs)
+        if self._strip_whitespace:
+            text = text.strip()
+        if text == "":
+            return None
+        else:
+            return text
+
+    def _merge_splits(self, splits: Iterable[str], separator: str) -> List[str]:
+        # We now want to combine these smaller pieces into medium size
+        # chunks to send to the LLM.
+        separator_len = self._length_function(separator)
+
+        docs = []
+        current_doc: List[str] = []
+        total = 0
+        for d in splits:
+            _len = self._length_function(d)
+            if (
+                total + _len + (separator_len if len(current_doc) > 0 else 0)
+                > self._chunk_size
+            ):
+                if total > self._chunk_size:
+                    logger.warning(
+                        f"Created a chunk of size {total}, "
+                        f"which is longer than the specified {self._chunk_size}"
+                    )
+                if len(current_doc) > 0:
+                    doc = self._join_docs(current_doc, separator)
+                    if doc is not None:
+                        docs.append(doc)
+                    # Keep on popping if:
+                    # - we have a larger chunk than in the chunk overlap
+                    # - or if we still have any chunks and the length is long
+                    while total > self._chunk_overlap or (
+                        total + _len + (separator_len if len(current_doc) > 0 else 0)
+                        > self._chunk_size
+                        and total > 0
+                    ):
+                        total -= self._length_function(current_doc[0]) + (
+                            separator_len if len(current_doc) > 1 else 0
+                        )
+                        current_doc = current_doc[1:]
+            current_doc.append(d)
+            total += _len + (separator_len if len(current_doc) > 1 else 0)
+        doc = self._join_docs(current_doc, separator)
+        if doc is not None:
+            docs.append(doc)
+        return docs
+
+    @classmethod
+    def from_huggingface_tokenizer(cls, tokenizer: Any, **kwargs: Any) -> TextSplitter:
+        """Text splitter that uses HuggingFace tokenizer to count length."""
+        try:
+            from transformers import PreTrainedTokenizerBase
+
+            if not isinstance(tokenizer, PreTrainedTokenizerBase):
+                raise ValueError(
+                    "Tokenizer received was not an instance of PreTrainedTokenizerBase"
+                )
+
+            def _huggingface_tokenizer_length(text: str) -> int:
+                return len(tokenizer.encode(text))
+
+        except ImportError:
+            raise ValueError(
+                "Could not import transformers python package. "
+                "Please install it with `pip install transformers`."
+            )
+        return cls(length_function=_huggingface_tokenizer_length, **kwargs)
+
+    @classmethod
+    def from_tiktoken_encoder(
+        cls: Type[TS],
+        encoding_name: str = "gpt2",
+        model_name: Optional[str] = None,
+        allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),
+        disallowed_special: Union[Literal["all"], Collection[str]] = "all",
+        **kwargs: Any,
+    ) -> TS:
+        """Text splitter that uses tiktoken encoder to count length."""
+        try:
+            import tiktoken
+        except ImportError:
+            raise ImportError(
+                "Could not import tiktoken python package. "
+                "This is needed in order to calculate max_tokens_for_prompt. "
+                "Please install it with `pip install tiktoken`."
+            )
+
+        if model_name is not None:
+            enc = tiktoken.encoding_for_model(model_name)
+        else:
+            enc = tiktoken.get_encoding(encoding_name)
+
+        def _tiktoken_encoder(text: str) -> int:
+            return len(
+                enc.encode(
+                    text,
+                    allowed_special=allowed_special,
+                    disallowed_special=disallowed_special,
+                )
+            )
+
+        if issubclass(cls, TokenTextSplitter):
+            extra_kwargs = {
+                "encoding_name": encoding_name,
+                "model_name": model_name,
+                "allowed_special": allowed_special,
+                "disallowed_special": disallowed_special,
+            }
+            kwargs = {**kwargs, **extra_kwargs}
+
+        return cls(length_function=_tiktoken_encoder, **kwargs)
+
+    def transform_documents(
+        self, documents: Sequence[Document], **kwargs: Any
+    ) -> Sequence[Document]:
+        """Transform sequence of documents by splitting them."""
+        return self.split_documents(list(documents))
+
+
+class TokenTextSplitter(TextSplitter):
+    """Splitting text to tokens using model tokenizer."""
+
+    def __init__(
+        self,
+        encoding_name: str = "gpt2",
+        model_name: Optional[str] = None,
+        allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),
+        disallowed_special: Union[Literal["all"], Collection[str]] = "all",
+        **kwargs: Any,
+    ) -> None:
+        """Create a new TextSplitter."""
+        super().__init__(**kwargs)
+        try:
+            import tiktoken
+        except ImportError:
+            raise ImportError(
+                "Could not import tiktoken python package. "
+                "This is needed for TokenTextSplitter. "
+                "Please install it with `pip install tiktoken`."
+            )
+
+        if model_name is not None:
+            enc = tiktoken.encoding_for_model(model_name)
+        else:
+            enc = tiktoken.get_encoding(encoding_name)
+        self._tokenizer = enc
+        self._allowed_special = allowed_special
+        self._disallowed_special = disallowed_special
+
+    def split_text(self, text: str) -> List[str]:
+        def _encode(_text: str) -> List[int]:
+            return self._tokenizer.encode(
+                _text,
+                allowed_special=self._allowed_special,
+                disallowed_special=self._disallowed_special,
+            )
+
+        tokenizer = Tokenizer(
+            chunk_overlap=self._chunk_overlap,
+            tokens_per_chunk=self._chunk_size,
+            decode=self._tokenizer.decode,
+            encode=_encode,
+        )
+
+        return split_text_on_tokens(text=text, tokenizer=tokenizer)
+
+
+class Language(str, Enum):
+    """Enum of the programming languages."""
+
+    CPP = "cpp"
+    GO = "go"
+    JAVA = "java"
+    KOTLIN = "kotlin"
+    JS = "js"
+    TS = "ts"
+    PHP = "php"
+    PROTO = "proto"
+    PYTHON = "python"
+    RST = "rst"
+    RUBY = "ruby"
+    RUST = "rust"
+    SCALA = "scala"
+    SWIFT = "swift"
+    MARKDOWN = "markdown"
+    LATEX = "latex"
+    HTML = "html"
+    SOL = "sol"
+    CSHARP = "csharp"
+    COBOL = "cobol"
+    C = "c"
+    LUA = "lua"
+    PERL = "perl"
+    HASKELL = "haskell"
+    ELIXIR = "elixir"
+    POWERSHELL = "powershell"
+
+
+@dataclass(frozen=True)
+class Tokenizer:
+    """Tokenizer data class."""
+
+    chunk_overlap: int
+    """Overlap in tokens between chunks"""
+    tokens_per_chunk: int
+    """Maximum number of tokens per chunk"""
+    decode: Callable[[List[int]], str]
+    """ Function to decode a list of token ids to a string"""
+    encode: Callable[[str], List[int]]
+    """ Function to encode a string to a list of token ids"""
+
+
+def split_text_on_tokens(*, text: str, tokenizer: Tokenizer) -> List[str]:
+    """Split incoming text and return chunks using tokenizer."""
+    splits: List[str] = []
+    input_ids = tokenizer.encode(text)
+    start_idx = 0
+    cur_idx = min(start_idx + tokenizer.tokens_per_chunk, len(input_ids))
+    chunk_ids = input_ids[start_idx:cur_idx]
+    while start_idx < len(input_ids):
+        splits.append(tokenizer.decode(chunk_ids))
+        if cur_idx == len(input_ids):
+            break
+        start_idx += tokenizer.tokens_per_chunk - tokenizer.chunk_overlap
+        cur_idx = min(start_idx + tokenizer.tokens_per_chunk, len(input_ids))
+        chunk_ids = input_ids[start_idx:cur_idx]
+    return splits
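
The `base.py` hunk above defines the options every splitter shares (`chunk_size`, `chunk_overlap`, `length_function`, `add_start_index`, `strip_whitespace`). A minimal sketch of how they surface through `create_documents`, using `CharacterTextSplitter` from `character.py` (that file is not displayed in this section); the sizes and sample text are arbitrary example values.

```python
# Illustrative sketch of the TextSplitter options defined in base.py.
from langchain_text_splitters import CharacterTextSplitter

splitter = CharacterTextSplitter(
    separator=" ",         # split on single spaces (example value)
    chunk_size=40,         # measured with length_function, len() by default
    chunk_overlap=10,
    add_start_index=True,  # record each chunk's offset in metadata["start_index"]
)

docs = splitter.create_documents(
    ["LangChain text splitters break long documents into overlapping chunks."],
    metadatas=[{"source": "example.txt"}],
)
for doc in docs:
    print(doc.metadata["start_index"], repr(doc.page_content))

# For token-based lengths, the classmethods above can be used instead, e.g.
# CharacterTextSplitter.from_tiktoken_encoder(model_name="gpt-4", chunk_size=40,
# chunk_overlap=10), which counts tiktoken tokens rather than characters
# (requires the optional tiktoken dependency).
```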
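The `Tokenizer` dataclass and `split_text_on_tokens` at the end of `base.py` implement the sliding token window that `TokenTextSplitter` builds on. The following self-contained sketch uses a toy word-level "tokenizer" purely to make the windowing visible; real usage goes through `TokenTextSplitter` or `from_tiktoken_encoder`.

```python
# Toy demonstration of split_text_on_tokens / Tokenizer from base.py.
# The "tokenizer" below is a fake word-level one, used only to show the windowing.
from langchain_text_splitters import Tokenizer, split_text_on_tokens

text = "one two three four five six seven eight nine ten"
words = text.split()

toy = Tokenizer(
    chunk_overlap=2,     # tokens shared between consecutive chunks
    tokens_per_chunk=4,  # window size in tokens
    decode=lambda ids: " ".join(words[i] for i in ids),
    encode=lambda s: list(range(len(s.split()))),
)

print(split_text_on_tokens(text=text, tokenizer=toy))
# ['one two three four', 'three four five six',
#  'five six seven eight', 'seven eight nine ten']
```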