omni_split-0.0.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- omni_split/__init__.py +16 -0
- omni_split/base/__init__.py +0 -0
- omni_split/base/chonkie_base.py +139 -0
- omni_split/base/chonkie_tokenizer.py +285 -0
- omni_split/base/chonkie_types.py +519 -0
- omni_split/base/md2json_list.py +303 -0
- omni_split/base/md_json_list2chunk.py +348 -0
- omni_split/main.py +73 -0
- omni_split/model/text_chunker_tokenizer/qwen_tokenizer.json +303282 -0
- omni_split/omni_split.py +93 -0
- omni_split/sub_chunker/__init__.py +0 -0
- omni_split/sub_chunker/document_split.py +32 -0
- omni_split/sub_chunker/markdown_split.py +47 -0
- omni_split/sub_chunker/text_split.py +343 -0
- omni_split/test.py +80 -0
- omni_split/utils/__init__.py +0 -0
- omni_split/utils/base_utils.py +181 -0
- omni_split/utils/download_test_doc.py +61 -0
- omni_split-0.0.3.dist-info/METADATA +147 -0
- omni_split-0.0.3.dist-info/RECORD +23 -0
- omni_split-0.0.3.dist-info/WHEEL +5 -0
- omni_split-0.0.3.dist-info/licenses/LICENSE +21 -0
- omni_split-0.0.3.dist-info/top_level.txt +1 -0
omni_split/omni_split.py
ADDED
@@ -0,0 +1,93 @@
import os
from transformers import PreTrainedTokenizerFast
from .sub_chunker.document_split import DocumentChunker
from .sub_chunker.markdown_split import MarkdownChunker
from .sub_chunker.text_split import SentenceChunker
from .utils.base_utils import save_local_images_func
from importlib.resources import files


class OmniSplit:
    def __init__(self, tokenizer_json_path=None, txt_chunk_size=512):
        if tokenizer_json_path is None:
            # Resolve this file's directory, then build the path to the bundled tokenizer.
            current_dir = os.path.dirname(os.path.abspath(__file__))
            tokenizer_json_path = os.path.join(current_dir, "model", "text_chunker_tokenizer", "qwen_tokenizer.json")
        self.tokenizer_json_path = tokenizer_json_path
        self.txt_chunk_size = txt_chunk_size
        self.tokenizer = PreTrainedTokenizerFast(tokenizer_file=self.tokenizer_json_path)

    def get_text_len_func(self, text):
        """
        * @description: return the token length of a text string
        * @param text: input string
        * @return: number of tokens (special tokens excluded)
        """
        if isinstance(text, str):
            return len(self.tokenizer.encode(text, add_special_tokens=False))
        else:
            raise ValueError("text must be str")

    def text_chunk_func(self, text, txt_chunk_size=None):
        """
        * @description: chunking method for plain text
        * @param text: input string
        * @return: list of chunk dicts
        """
        if txt_chunk_size is None:
            txt_chunk_size = self.txt_chunk_size
        text_chunker = SentenceChunker(tokenizer_or_token_counter=self.tokenizer, chunk_size=txt_chunk_size, delim=["!", "?", "\n", "。", ";", ";"], return_type="texts")
        temp_data_list = text_chunker.chunk(text)
        ret_data = []
        for item in temp_data_list:
            ret_data.append({
                "type": "text",
                "text": item,
                "text_len": self.get_text_len_func(item)
            })
        return ret_data

    def markdown_json_chunk_func(self, markdown_json, txt_chunk_size=None, clear_model=False):
        if txt_chunk_size is None:
            txt_chunk_size = self.txt_chunk_size
        md_chunker = MarkdownChunker(max_chunk_words=txt_chunk_size, clear_model=clear_model)
        ret_data = md_chunker.convert_json_list2chunk_list_func(markdown_json)
        for item in ret_data:
            item["text_len"] = self.get_text_len_func(item["text"])
        return ret_data

    def markdown_chunk_func(self, markdown_text, txt_chunk_size=None, clear_model=False):
        """
        * @description: chunking method for Markdown text
        * @param markdown_text: Markdown source string
        * @return: list of chunk dicts
        """
        if txt_chunk_size is None:
            txt_chunk_size = self.txt_chunk_size
        md_chunker = MarkdownChunker(max_chunk_words=txt_chunk_size, clear_model=clear_model)
        ret_data = md_chunker.chunk(markdown_text)
        for item in ret_data:
            item["text_len"] = self.get_text_len_func(item["text"])
        return ret_data

    def document_chunk_func(self, document_content, txt_chunk_size=None, clear_model=False, save_local_images_dir=""):
        """
        * @description: chunking method for office documents
        * @param document_content: document input (path or stream)
        * @return: list of chunk dicts
        """
        if txt_chunk_size is None:
            txt_chunk_size = self.txt_chunk_size
        doc_chunker = DocumentChunker(max_chunk_words=txt_chunk_size, clear_model=clear_model)
        ret_data = doc_chunker.chunk(document_content)

        if save_local_images_dir != "" and not clear_model:
            ret_data = save_local_images_func(ret_data, save_local_images_dir)
        for item in ret_data:
            item["text_len"] = self.get_text_len_func(item["text"])
        return ret_data
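For reference, a minimal usage sketch of the OmniSplit facade above, not taken from the package itself; the input string and chunk size are illustrative, and it assumes the wheel is installed so the bundled qwen_tokenizer.json resolves from the package directory:

from omni_split.omni_split import OmniSplit

# Illustrative only: any plain-text string works here.
splitter = OmniSplit(txt_chunk_size=256)   # defaults to the bundled qwen_tokenizer.json
chunks = splitter.text_chunk_func("第一句。第二句!\nThird sentence?")
for chunk in chunks:
    print(chunk["type"], chunk["text_len"], chunk["text"])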
omni_split/sub_chunker/__init__.py
File without changes
omni_split/sub_chunker/document_split.py
ADDED
@@ -0,0 +1,32 @@
from markitdown import MarkItDown
from .markdown_split import MarkdownChunker


class DocumentChunker:
    def __init__(self, max_chunk_words=1000, soft_chunk_words=None, hard_limit=None, clear_model=False):
        self.max_chunk_words = max_chunk_words
        if soft_chunk_words is None:
            self.soft_chunk_words = max_chunk_words * 0.4
        else:
            self.soft_chunk_words = soft_chunk_words
        if hard_limit is None:
            self.hard_limit = max_chunk_words * 1.4
        else:
            self.hard_limit = hard_limit
        self.clear_model = clear_model
        self.markitdown = MarkItDown(enable_plugins=False)
        self.markdown_chunker = MarkdownChunker(max_chunk_words=self.max_chunk_words, clear_model=self.clear_model)

    def convert_document2md_func(self, document_content):
        # Convert the document (e.g. a .docx path or stream) to Markdown, keeping images as data URIs.
        markdown_result = self.markitdown.convert(document_content, keep_data_uris=True)
        content = markdown_result.markdown
        return content

    def chunk(self, document_content):
        markdown_text = self.convert_document2md_func(document_content)
        chunk_list = self.markdown_chunker.chunk(markdown_text)
        assert isinstance(chunk_list, list)
        return chunk_list
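A hedged sketch of driving DocumentChunker directly, assuming MarkItDown can open the given input; the path mirrors the test script later in this diff and is illustrative:

from omni_split.sub_chunker.document_split import DocumentChunker

chunker = DocumentChunker(max_chunk_words=1000, clear_model=False)
# Document -> Markdown (via MarkItDown) -> chunk list.
chunks = chunker.chunk("./test/docx_test.docx")
for chunk in chunks[:3]:
    print(chunk)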
omni_split/sub_chunker/markdown_split.py
ADDED
@@ -0,0 +1,47 @@
from ..base.md_json_list2chunk import markdown_json_list2chunk_list
from ..base.md2json_list import md2json_list_func


class MarkdownChunker:
    def __init__(self, max_chunk_words=1000, soft_chunk_words=None, hard_limit=None, clear_model=False):
        self.max_chunk_words = max_chunk_words
        if soft_chunk_words is None:
            self.soft_chunk_words = max_chunk_words * 0.4
        else:
            self.soft_chunk_words = soft_chunk_words
        if hard_limit is None:
            self.hard_limit = max_chunk_words * 1.4
        else:
            self.hard_limit = hard_limit
        self.clear_model = clear_model
        # self.chunk_markdown_json = self.convert_json_list2chunk_list_func

    def convert_markdown2json_list_func(self, markdown_text):
        markdown_json_list = md2json_list_func(markdown_text)
        assert isinstance(markdown_json_list, list)
        return markdown_json_list

    def convert_json_list2chunk_list_func(self, json_list):
        if self.clear_model:
            # TODO: "clear" mode -- drop content that cannot be embedded, such as images.
            json_clear_list = []
            for item in json_list:
                if item["type"] == "image":
                    continue
                # note: everything else is treated as regular text.
                json_clear_list.append(item)
            json_list = json_clear_list
        chunk_list = markdown_json_list2chunk_list(json_list, MAX_CHUNK_WORDS=self.max_chunk_words, SOFT_CHUNK_WORDS=self.soft_chunk_words, HARD_LIMIT=self.hard_limit)
        return chunk_list

    def chunk(self, text_content):
        markdown_json_list = self.convert_markdown2json_list_func(text_content)
        for item in markdown_json_list:
            if item.get("text_level", None) is not None:
                item["text_level"] = 1
        chunk_list = self.convert_json_list2chunk_list_func(markdown_json_list)
        assert isinstance(chunk_list, list)
        return chunk_list
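For orientation, a small sketch of the two entry points above. The item keys shown ("type", "text", "text_level") mirror what the surrounding code reads, but the full schema produced by md2json_list_func is not visible in this diff, so treat the hand-written list as illustrative:

from omni_split.sub_chunker.markdown_split import MarkdownChunker

# Path 1: raw Markdown text -> JSON list -> chunk list.
chunker = MarkdownChunker(max_chunk_words=500)
chunks = chunker.chunk("# Title\n\nSome paragraph.\n\nAnother paragraph.")

# Path 2: a pre-parsed JSON list; with clear_model=True, image items are dropped before chunking.
json_list = [
    {"type": "text", "text": "Heading", "text_level": 1},
    {"type": "text", "text": "Body paragraph."},
    {"type": "image", "text": "![fig](data:image/png;base64,...)"},
]
chunks = MarkdownChunker(max_chunk_words=500, clear_model=True).convert_json_list2chunk_list_func(json_list)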
omni_split/sub_chunker/text_split.py
ADDED
@@ -0,0 +1,343 @@
"""Sentence chunker."""

import warnings
from bisect import bisect_left
from itertools import accumulate
from typing import Any, Callable, List, Literal, Union

from ..base.chonkie_types import Chunk, Sentence, SentenceChunk
from ..base.chonkie_base import BaseChunker

from transformers import PreTrainedTokenizerFast


class SentenceChunker(BaseChunker):
    """SentenceChunker splits the sentences in a text based on token limits and sentence boundaries.

    Args:
        tokenizer_or_token_counter: The tokenizer instance to use for encoding/decoding
        chunk_size: Maximum number of tokens per chunk
        chunk_overlap: Number of tokens to overlap between chunks
        min_sentences_per_chunk: Minimum number of sentences per chunk (defaults to 1)
        min_characters_per_sentence: Minimum number of characters per sentence
        approximate: Whether to use approximate token counting (defaults to True)
        delim: Delimiters to split sentences on
        include_delim: Whether to include delimiters in the current chunk, the next chunk, or not at all (defaults to "prev")
        return_type: Whether to return chunks or texts

    Raises:
        ValueError: If parameters are invalid

    """

    def __init__(
        self,
        tokenizer_or_token_counter: Union[str, Callable, Any] = "gpt2",
        chunk_size: int = 512,
        chunk_overlap: int = 0,
        min_sentences_per_chunk: int = 1,
        min_characters_per_sentence: int = 12,
        approximate: bool = True,
        delim: Union[str, List[str]] = [".", "!", "?", "\n"],
        include_delim: Union[Literal["prev", "next"], None] = "prev",
        return_type: Literal["chunks", "texts"] = "chunks",
    ):
        """Initialize the SentenceChunker with configuration parameters.

        SentenceChunker splits the sentences in a text based on token limits and sentence boundaries.

        Args:
            tokenizer_or_token_counter: The tokenizer instance to use for encoding/decoding (defaults to "gpt2")
            chunk_size: Maximum number of tokens per chunk (defaults to 512)
            chunk_overlap: Number of tokens to overlap between chunks (defaults to 0)
            min_sentences_per_chunk: Minimum number of sentences per chunk (defaults to 1)
            min_characters_per_sentence: Minimum number of characters per sentence (defaults to 12)
            approximate: Whether to use approximate token counting (defaults to True)
            delim: Delimiters to split sentences on (defaults to [".", "!", "?", "\n"])
            include_delim: Whether to include delimiters in the current chunk, the next chunk, or not at all (defaults to "prev")
            return_type: Whether to return chunks or texts (defaults to "chunks")

        Raises:
            ValueError: If parameters are invalid

        """
        super().__init__(tokenizer_or_token_counter=tokenizer_or_token_counter)

        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if chunk_overlap >= chunk_size:
            raise ValueError("chunk_overlap must be less than chunk_size")
        if min_sentences_per_chunk < 1:
            raise ValueError("min_sentences_per_chunk must be at least 1")
        if min_characters_per_sentence < 1:
            raise ValueError("min_characters_per_sentence must be at least 1")
        if delim is None:
            raise ValueError("delim must be a list of strings or a string")
        if include_delim not in ["prev", "next", None]:
            raise ValueError("include_delim must be 'prev', 'next' or None")
        if return_type not in ["chunks", "texts"]:
            raise ValueError("Invalid return_type. Must be either 'chunks' or 'texts'.")

        # Add chunk_overlap deprecation warning
        if chunk_overlap > 0:
            warnings.warn(
                "chunk_overlap is getting deprecated in v0.6.0. " + "🦛 Chonkie advises you to use OverlapRefinery instead which is more flexible and powerful!",
                DeprecationWarning,
            )
        # Assign the values if they make sense
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.min_sentences_per_chunk = min_sentences_per_chunk
        self.min_characters_per_sentence = min_characters_per_sentence
        self.approximate = approximate
        self.delim = delim
        self.include_delim = include_delim
        self.sep = "🦛"
        self.return_type = return_type

    def _split_sentences(self, text: str) -> List[str]:
        """Fast sentence splitting while maintaining accuracy.

        This method is faster than using regex for sentence splitting and is more accurate than using the spaCy sentence tokenizer.

        Args:
            text: Input text to be split into sentences

        Returns:
            List of sentences

        """
        t = text
        for c in self.delim:
            if self.include_delim == "prev":
                t = t.replace(c, c + self.sep)
            elif self.include_delim == "next":
                t = t.replace(c, self.sep + c)
            else:
                t = t.replace(c, self.sep)

        # Initial split
        splits = [s for s in t.split(self.sep) if s != ""]

        # Combine short splits with the previous sentence
        current = ""
        sentences = []
        for s in splits:
            # If the split is short, accumulate it in current; otherwise flush it to sentences
            if len(s) < self.min_characters_per_sentence:
                current += s
            elif current:
                current += s
                sentences.append(current)
                current = ""
            else:
                sentences.append(s)

            # At any point, if the current buffer is longer than min_characters_per_sentence,
            # add it to the sentences
            if len(current) >= self.min_characters_per_sentence:
                sentences.append(current)
                current = ""

        # If there is a leftover buffer, add it to the sentences
        if current:
            sentences.append(current)

        return sentences

    def _estimate_token_counts(self, sentences: Union[str, List[str]]) -> int:
        """Estimate token count using character length."""
        CHARS_PER_TOKEN = 6.0  # Avg. chars per token for llama3 is between 6 and 7
        if type(sentences) is str:
            return max(1, len(sentences) // CHARS_PER_TOKEN)
        elif type(sentences) is list and type(sentences[0]) is str:
            return [max(1, len(t) // CHARS_PER_TOKEN) for t in sentences]
        else:
            raise ValueError(f"Unknown type passed to _estimate_token_count: {type(sentences)}")

    def _get_feedback(self, estimate: int, actual: int) -> float:
        """Validate against the actual token counts and correct the estimates."""
        estimate, actual = max(1, estimate), max(1, actual)
        feedback = max(0.01, 1 - ((estimate - actual) / estimate))
        return feedback

    def _prepare_sentences(self, text: str) -> List[Sentence]:
        """Split text into sentences and calculate token counts for each sentence.

        Args:
            text: Input text to be split into sentences

        Returns:
            List of Sentence objects

        """
        # Split text into sentences
        sentence_texts = self._split_sentences(text)
        if not sentence_texts:
            return []

        # Calculate positions once
        positions = []
        current_pos = 0
        for sent in sentence_texts:
            positions.append(current_pos)
            current_pos += len(sent)  # No +1 space because sentences are already separated by spaces

        if not self.approximate:
            # Get accurate token counts in batch
            token_counts = self.tokenizer.count_tokens_batch(sentence_texts)
        else:
            # Estimate token counts using character length
            token_counts = self._estimate_token_counts(sentence_texts)

        # Create sentence objects
        return [Sentence(text=sent, start_index=pos, end_index=pos + len(sent), token_count=count) for sent, pos, count in zip(sentence_texts, positions, token_counts)]

    def _create_chunk(self, sentences: List[Sentence], token_count: int) -> Chunk:
        """Create a chunk from a list of sentences.

        Args:
            sentences: List of sentences to create chunk from
            token_count: Total token count for the chunk

        Returns:
            Chunk object

        """
        chunk_text = "".join([sentence.text for sentence in sentences])
        if self.return_type == "texts":
            return chunk_text
        else:
            return SentenceChunk(
                text=chunk_text,
                start_index=sentences[0].start_index,
                end_index=sentences[-1].end_index,
                token_count=token_count,
                sentences=sentences,
            )

    def chunk(self, text: str) -> List[Chunk]:
        """Split text into overlapping chunks based on sentences while respecting token limits.

        Args:
            text: Input text to be chunked

        Returns:
            List of Chunk objects containing the chunked text and metadata

        """
        if not text.strip():
            return []

        # Get prepared sentences with token counts
        sentences = self._prepare_sentences(text)  # ~28 microseconds
        if not sentences:
            return []

        # Pre-calculate cumulative token counts for bisect
        token_sums = list(accumulate([s.token_count for s in sentences], lambda a, b: a + b, initial=0))

        chunks = []
        feedback = 1.0
        pos = 0

        while pos < len(sentences):
            # Apply the latest feedback factor to the cumulative token estimates
            token_sums = [int(s * feedback) for s in token_sums]

            # Use bisect_left to find the initial split point
            target_tokens = token_sums[pos] + self.chunk_size
            split_idx = bisect_left(token_sums, target_tokens) - 1
            split_idx = min(split_idx, len(sentences))

            # Ensure we include at least one sentence beyond pos
            split_idx = max(split_idx, pos + 1)

            # Handle the minimum-sentences requirement
            if split_idx - pos < self.min_sentences_per_chunk:
                # If the minimum sentences per chunk can be met, set the split index accordingly;
                # otherwise, warn the user that the minimum could not be met for all chunks
                if pos + self.min_sentences_per_chunk <= len(sentences):
                    split_idx = pos + self.min_sentences_per_chunk
                else:
                    warnings.warn(
                        f"Minimum sentences per chunk as {self.min_sentences_per_chunk} could not be met for all chunks. "
                        + f"Last chunk of the text will have only {len(sentences) - pos} sentences. "
                        + "Consider increasing the chunk_size or decreasing the min_sentences_per_chunk."
                    )
                    split_idx = len(sentences)

            # Get the estimated token count
            estimate = token_sums[split_idx] - token_sums[pos]

            # Get candidate sentences and verify the actual token count
            chunk_sentences = sentences[pos:split_idx]
            chunk_text = "".join(s.text for s in chunk_sentences)
            actual = self.tokenizer.count_tokens(chunk_text)

            # Given the actual token count and the estimate, compute a feedback value for the next loop
            feedback = self._get_feedback(estimate, actual)

            # Back off one sentence at a time if we exceeded the chunk size
            while actual > self.chunk_size and len(chunk_sentences) > self.min_sentences_per_chunk:
                split_idx -= 1
                chunk_sentences = sentences[pos:split_idx]
                chunk_text = "".join(s.text for s in chunk_sentences)
                actual = self.tokenizer.count_tokens(chunk_text)

            chunks.append(self._create_chunk(chunk_sentences, actual))

            # Calculate the next position with overlap
            if self.chunk_overlap > 0 and split_idx < len(sentences):
                # Calculate how many sentences we need for the overlap
                overlap_tokens = 0
                overlap_idx = split_idx - 1

                while overlap_idx > pos and overlap_tokens < self.chunk_overlap:
                    sent = sentences[overlap_idx]
                    next_tokens = overlap_tokens + sent.token_count + 1  # +1 for space
                    if next_tokens > self.chunk_overlap:
                        break
                    overlap_tokens = next_tokens
                    overlap_idx -= 1

                # Move the position to just after the overlap
                pos = overlap_idx + 1
            else:
                pos = split_idx

        return chunks

    def __repr__(self) -> str:
        """Return a string representation of the SentenceChunker."""
        return (
            f"SentenceChunker(tokenizer={self.tokenizer}, "
            f"chunk_size={self.chunk_size}, "
            f"chunk_overlap={self.chunk_overlap}, "
            f"min_sentences_per_chunk={self.min_sentences_per_chunk}, "
            f"min_characters_per_sentence={self.min_characters_per_sentence}, "
            f"approximate={self.approximate}, delim={self.delim}, "
            f"include_delim={self.include_delim}, "
            f"return_type={self.return_type})"
        )


if __name__ == "__main__":

    tokenizer = PreTrainedTokenizerFast(tokenizer_file="./model/qwen_tokenizer.json")  # type: ignore
    print("tokenizer load success")

    file_name = "test/txt.txt"
    with open(file_name, "r") as f:
        text_content = "".join(f.readlines())
    # Create the chunker and split the file into text chunks
    text_chunker = SentenceChunker(tokenizer_or_token_counter=tokenizer, chunk_size=512, delim=["!", "?", "\n", "。", ";", ";"], return_type="texts")
    ret_data = text_chunker.chunk(text_content)
    for item in ret_data:
        print(item)
        print("=======")
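One detail of SentenceChunker.chunk worth spelling out: token counts are first estimated from character length (roughly 6 characters per token), and after each chunk the estimate is compared with the tokenizer's actual count to produce a correction factor that rescales the cumulative estimates on the next iteration. A small worked example of the _get_feedback formula, with illustrative numbers only:

# Illustrative numbers only.
estimate, actual = 600, 480                       # estimated vs. measured tokens for one chunk
feedback = max(0.01, 1 - ((estimate - actual) / estimate))
print(feedback)                                   # 0.8 -> later cumulative estimates are scaled down by 20%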
omni_split/test.py
ADDED
@@ -0,0 +1,80 @@
import json
from io import BytesIO

# from pprint import pprint
from omni_split.omni_split import OmniSplit
import docx

from utils.base_utils import word_preprocessing_and_return_bytesIO
from omni_split import download_tokenizer_from_network

omni_spliter = OmniSplit()

## note: test text split
test_text = True
if test_text:
    with open("test/text_test.txt", "r") as f:
        text_content = "".join(f.readlines())
    res = omni_spliter.text_chunk_func(text_content)
    for item in res:
        print(item)
        print("------------")
    print("=" * 10)

## note: test markdown json split
test_markdown = True
if test_markdown:
    with open("./test/json_list_test.json", "r") as f:
        md_content_json = json.load(f)
    res = omni_spliter.markdown_json_chunk_func(md_content_json)
    for item in res:
        print(item)
        print("------------")
    print("=" * 10)

    res = omni_spliter.markdown_json_chunk_func(md_content_json, clear_model=True)
    for item in res:
        print(item)
        print("------------")
    print("=" * 10)

## note: test markdown split
test_markdown = True
if test_markdown:
    with open("./test/markdown_test.md", "r") as f:
        md_content = f.read()
    res = omni_spliter.markdown_chunk_func(md_content)
    for item in res:
        print(item)
        print("------------")
    print("=" * 10)

    res = omni_spliter.markdown_chunk_func(md_content, clear_model=True)
    for item in res:
        print(item)
        print("------------")
    print("=" * 10)

## note: test word split
test_document = True
if test_document:
    new_doc_io = word_preprocessing_and_return_bytesIO("./test/docx_test.docx")
    res = omni_spliter.document_chunk_func(new_doc_io, txt_chunk_size=1000, clear_model=False)
    for item in res:
        print(item)
        print("------------")
    print("=" * 10)

    res = omni_spliter.document_chunk_func(new_doc_io, txt_chunk_size=1000, clear_model=False, save_local_images_dir="./images")
    for item in res:
        print(item)
        print("------------")
    print("=" * 10)

    res = omni_spliter.document_chunk_func(new_doc_io, txt_chunk_size=1000, clear_model=True)
    for item in res:
        print(item)
        print("------------")
    print("=" * 10)
omni_split/utils/__init__.py
File without changes