omni-split 0.0.1rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of omni-split might be problematic.

omni_split/omni_split.py ADDED
@@ -0,0 +1,93 @@
+ import os
+ from transformers import PreTrainedTokenizerFast
+ from .sub_chunker.document_split import DocumentChunker
+ from .sub_chunker.markdown_split import MarkdownChunker
+ from .sub_chunker.text_split import SentenceChunker
+ from .utils.base_utils import save_local_images_func
+
+ class OmniSplit:
+     def __init__(self, tokenizer_json_path=None, txt_chunk_size=512):
+         if tokenizer_json_path is None:
+             # Get the absolute path of this file, then build the bundled tokenizer path.
+             current_dir = os.path.dirname(os.path.abspath(__file__))
+             tokenizer_json_path = os.path.join(current_dir, "model", "text_chunker_tokenizer", "qwen_tokenizer.json")
+         self.tokenizer_json_path = tokenizer_json_path
+         self.txt_chunk_size = txt_chunk_size
+         self.tokenizer = PreTrainedTokenizerFast(tokenizer_file=self.tokenizer_json_path)
+
+     def get_text_len_func(self, text):
+         """
+         * @description: Return the token count of a text string.
+         * @param text : input string
+         * @return token count as int
+         """
+         if isinstance(text, str):
+             return len(self.tokenizer.encode(text, add_special_tokens=False))
+         else:
+             raise ValueError("text must be str")
+
+     def text_chunk_func(self, text, txt_chunk_size=None):
+         """
+         * @description: Chunking method for plain text.
+         * @param text : input string
+         * @param txt_chunk_size : chunk size in tokens (defaults to self.txt_chunk_size)
+         * @return list of chunk dicts
+         """
+         if txt_chunk_size is None:
+             txt_chunk_size = self.txt_chunk_size
+         text_chunker = SentenceChunker(tokenizer_or_token_counter=self.tokenizer, chunk_size=txt_chunk_size, delim=["!", "?", "\n", "。", ";", ";"], return_type="texts")
+         temp_data_list = text_chunker.chunk(text)
+         ret_data = []
+         for item in temp_data_list:
+             ret_data.append({
+                 "type": "text",
+                 "text": item,
+                 "text_len": self.get_text_len_func(item)
+             })
+         return ret_data
+
+     def markdown_json_chunk_func(self, markdown_json, txt_chunk_size=None, clear_model=False):
+         if txt_chunk_size is None:
+             txt_chunk_size = self.txt_chunk_size
+         md_chunker = MarkdownChunker(max_chunk_words=txt_chunk_size, clear_model=clear_model)
+         ret_data = md_chunker.convert_json_list2chunk_list_func(markdown_json)
+         for item in ret_data:
+             item["text_len"] = self.get_text_len_func(item["text"])
+         return ret_data
+
+     def markdown_chunk_func(self, markdown_text, txt_chunk_size=None, clear_model=False):
+         """
+         * @description: Chunking method for Markdown text.
+         * @param markdown_text : Markdown source string
+         * @param txt_chunk_size : chunk size in tokens (defaults to self.txt_chunk_size)
+         * @return list of chunk dicts
+         """
+         if txt_chunk_size is None:
+             txt_chunk_size = self.txt_chunk_size
+         md_chunker = MarkdownChunker(max_chunk_words=txt_chunk_size, clear_model=clear_model)
+         ret_data = md_chunker.chunk(markdown_text)
+         for item in ret_data:
+             item["text_len"] = self.get_text_len_func(item["text"])
+         return ret_data
+
+     def document_chunk_func(self, document_content, txt_chunk_size=None, clear_model=False, save_local_images_dir=""):
+         """
+         * @description: Chunking method for Office documents.
+         * @param document_content : document path or file-like object
+         * @param txt_chunk_size : chunk size in tokens (defaults to self.txt_chunk_size)
+         * @param save_local_images_dir : if non-empty, extracted images are saved here
+         * @return list of chunk dicts
+         """
+         if txt_chunk_size is None:
+             txt_chunk_size = self.txt_chunk_size
+         doc_chunker = DocumentChunker(max_chunk_words=txt_chunk_size, clear_model=clear_model)
+         ret_data = doc_chunker.chunk(document_content)
+
+         if save_local_images_dir != "" and not clear_model:
+             ret_data = save_local_images_func(ret_data, save_local_images_dir)
+         for item in ret_data:
+             item["text_len"] = self.get_text_len_func(item["text"])
+         return ret_data
File without changes
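
For orientation, a minimal usage sketch of the OmniSplit facade above, mirroring the calls exercised in omni_split/test.py further down; the input path is a placeholder:

    from omni_split.omni_split import OmniSplit

    splitter = OmniSplit()  # falls back to the bundled qwen_tokenizer.json

    # Plain-text chunking returns dicts with "type", "text" and "text_len" keys.
    with open("some_article.txt", "r") as f:  # placeholder input file
        chunks = splitter.text_chunk_func(f.read(), txt_chunk_size=256)
    for chunk in chunks:
        print(chunk["text_len"], chunk["text"][:40])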
omni_split/sub_chunker/document_split.py ADDED
@@ -0,0 +1,32 @@
+ from markitdown import MarkItDown
+ from .markdown_split import MarkdownChunker
+
+
+ class DocumentChunker:
+     def __init__(self, max_chunk_words=1000, soft_chunk_words=None, hard_limit=None, clear_model=False):
+         self.max_chunk_words = max_chunk_words
+         if soft_chunk_words is None:
+             self.soft_chunk_words = max_chunk_words * 0.4
+         else:
+             self.soft_chunk_words = soft_chunk_words
+         if hard_limit is None:
+             self.hard_limit = max_chunk_words * 1.4
+         else:
+             self.hard_limit = hard_limit
+         self.clear_model = clear_model
+         self.markitdown = MarkItDown(enable_plugins=False)
+         self.markdown_chunker = MarkdownChunker(max_chunk_words=self.max_chunk_words, clear_model=self.clear_model)
+
+     def convert_document2md_func(self, document_content):
+         # Convert the document to Markdown, keeping images inline as data URIs.
+         markdown_result = self.markitdown.convert(document_content, keep_data_uris=True)
+         content = markdown_result.markdown
+         return content
+
+     def chunk(self, document_content):
+         markdown_text = self.convert_document2md_func(document_content)
+         chunk_list = self.markdown_chunker.chunk(markdown_text)
+         assert isinstance(chunk_list, list)
+         return chunk_list
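
DocumentChunker is a thin adapter: MarkItDown converts the source document to Markdown (keeping images inline as data URIs), and MarkdownChunker does the actual splitting. A sketch of the conversion half in isolation, with a placeholder input path:

    from markitdown import MarkItDown

    md = MarkItDown(enable_plugins=False)
    result = md.convert("report.docx", keep_data_uris=True)  # placeholder path
    markdown_text = result.markdown  # this string is what MarkdownChunker.chunk(...) receives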
omni_split/sub_chunker/markdown_split.py ADDED
@@ -0,0 +1,47 @@
+ from ..base.md_json_list2chunk import markdown_json_list2chunk_list
+ from ..base.md2json_list import md2json_list_func
+
+
+ class MarkdownChunker:
+     def __init__(self, max_chunk_words=1000, soft_chunk_words=None, hard_limit=None, clear_model=False):
+         self.max_chunk_words = max_chunk_words
+         if soft_chunk_words is None:
+             self.soft_chunk_words = max_chunk_words * 0.4
+         else:
+             self.soft_chunk_words = soft_chunk_words
+         if hard_limit is None:
+             self.hard_limit = max_chunk_words * 1.4
+         else:
+             self.hard_limit = hard_limit
+         self.clear_model = clear_model
+
+     def convert_markdown2json_list_func(self, markdown_text):
+         markdown_json_list = md2json_list_func(markdown_text)
+         assert isinstance(markdown_json_list, list)
+         return markdown_json_list
+
+     def convert_json_list2chunk_list_func(self, json_list):
+         if self.clear_model:
+             # TODO: clear mode strips content that cannot be embedded, such as images.
+             json_clear_list = []
+             for item in json_list:
+                 if item["type"] == "image":
+                     continue
+                 # note: everything else is treated as standard text.
+                 json_clear_list.append(item)
+             json_list = json_clear_list
+         chunk_list = markdown_json_list2chunk_list(json_list, MAX_CHUNK_WORDS=self.max_chunk_words, SOFT_CHUNK_WORDS=self.soft_chunk_words, HARD_LIMIT=self.hard_limit)
+         return chunk_list
+
+     def chunk(self, text_content):
+         markdown_json_list = self.convert_markdown2json_list_func(text_content)
+         for item in markdown_json_list:
+             # Flatten every heading to level 1 before chunking.
+             if item.get("text_level", None) is not None:
+                 item["text_level"] = 1
+         chunk_list = self.convert_json_list2chunk_list_func(markdown_json_list)
+         assert isinstance(chunk_list, list)
+         return chunk_list
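
The clear_model flag only takes effect in convert_json_list2chunk_list_func: blocks whose "type" is "image" are dropped before chunking so every chunk stays embeddable. A small illustration; apart from the "type" and "text" keys used by the code above, the dict shapes are assumptions:

    from omni_split.sub_chunker.markdown_split import MarkdownChunker

    json_list = [
        {"type": "text", "text": "Intro paragraph."},
        {"type": "image", "text": "![fig](data:image/png;base64,...)"},  # assumed shape
        {"type": "text", "text": "Closing paragraph."},
    ]

    chunker = MarkdownChunker(max_chunk_words=200, clear_model=True)
    chunks = chunker.convert_json_list2chunk_list_func(json_list)
    # With clear_model=True the image item never reaches markdown_json_list2chunk_list.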
omni_split/sub_chunker/text_split.py ADDED
@@ -0,0 +1,343 @@
+ """Sentence chunker."""
+
+ import warnings
+ from bisect import bisect_left
+ from itertools import accumulate
+ from typing import Any, Callable, List, Literal, Union
+
+ from ..base.chonkie_types import Chunk, Sentence, SentenceChunk
+ from ..base.chonkie_base import BaseChunker
+
+ from transformers import PreTrainedTokenizerFast
+
+
+ class SentenceChunker(BaseChunker):
+     """SentenceChunker splits the sentences in a text based on token limits and sentence boundaries.
+
+     Args:
+         tokenizer_or_token_counter: The tokenizer instance to use for encoding/decoding
+         chunk_size: Maximum number of tokens per chunk
+         chunk_overlap: Number of tokens to overlap between chunks
+         min_sentences_per_chunk: Minimum number of sentences per chunk (defaults to 1)
+         min_characters_per_sentence: Minimum number of characters per sentence
+         approximate: Whether to use approximate token counting (defaults to True)
+         delim: Delimiters to split sentences on
+         include_delim: Whether to include delimiters in the current chunk, the next chunk, or not at all (defaults to "prev")
+         return_type: Whether to return chunks or texts
+
+     Raises:
+         ValueError: If parameters are invalid
+
+     """
+
+     def __init__(
+         self,
+         tokenizer_or_token_counter: Union[str, Callable, Any] = "gpt2",
+         chunk_size: int = 512,
+         chunk_overlap: int = 0,
+         min_sentences_per_chunk: int = 1,
+         min_characters_per_sentence: int = 12,
+         approximate: bool = True,
+         delim: Union[str, List[str]] = [".", "!", "?", "\n"],
+         include_delim: Union[Literal["prev", "next"], None] = "prev",
+         return_type: Literal["chunks", "texts"] = "chunks",
+     ):
+         """Initialize the SentenceChunker with configuration parameters.
+
+         SentenceChunker splits the sentences in a text based on token limits and sentence boundaries.
+
+         Args:
+             tokenizer_or_token_counter: The tokenizer instance to use for encoding/decoding (defaults to "gpt2")
+             chunk_size: Maximum number of tokens per chunk (defaults to 512)
+             chunk_overlap: Number of tokens to overlap between chunks (defaults to 0)
+             min_sentences_per_chunk: Minimum number of sentences per chunk (defaults to 1)
+             min_characters_per_sentence: Minimum number of characters per sentence (defaults to 12)
+             approximate: Whether to use approximate token counting (defaults to True)
+             delim: Delimiters to split sentences on (defaults to [".", "!", "?", "newline"])
+             include_delim: Whether to include delimiters in the current chunk, the next chunk, or not at all (defaults to "prev")
+             return_type: Whether to return chunks or texts (defaults to "chunks")
+
+         Raises:
+             ValueError: If parameters are invalid
+
+         """
+         super().__init__(tokenizer_or_token_counter=tokenizer_or_token_counter)
+
+         if chunk_size <= 0:
+             raise ValueError("chunk_size must be positive")
+         if chunk_overlap >= chunk_size:
+             raise ValueError("chunk_overlap must be less than chunk_size")
+         if min_sentences_per_chunk < 1:
+             raise ValueError("min_sentences_per_chunk must be at least 1")
+         if min_characters_per_sentence < 1:
+             raise ValueError("min_characters_per_sentence must be at least 1")
+         if delim is None:
+             raise ValueError("delim must be a list of strings or a string")
+         if include_delim not in ["prev", "next", None]:
+             raise ValueError("include_delim must be 'prev', 'next' or None")
+         if return_type not in ["chunks", "texts"]:
+             raise ValueError("Invalid return_type. Must be either 'chunks' or 'texts'.")
+
+         # chunk_overlap deprecation warning
+         if chunk_overlap > 0:
+             warnings.warn(
+                 "chunk_overlap is getting deprecated in v0.6.0. "
+                 "🦛 Chonkie advises you to use OverlapRefinery instead, which is more flexible and powerful!",
+                 DeprecationWarning,
+             )
+
+         # Assign the values once they have been validated
+         self.chunk_size = chunk_size
+         self.chunk_overlap = chunk_overlap
+         self.min_sentences_per_chunk = min_sentences_per_chunk
+         self.min_characters_per_sentence = min_characters_per_sentence
+         self.approximate = approximate
+         self.delim = delim
+         self.include_delim = include_delim
+         self.sep = "🦛"
+         self.return_type = return_type
+
+     def _split_sentences(self, text: str) -> List[str]:
+         """Fast sentence splitting while maintaining accuracy.
+
+         This method is faster than regex-based sentence splitting and more accurate than the spaCy sentence tokenizer.
+
+         Args:
+             text: Input text to be split into sentences
+
+         Returns:
+             List of sentences
+
+         """
+         t = text
+         for c in self.delim:
+             if self.include_delim == "prev":
+                 t = t.replace(c, c + self.sep)
+             elif self.include_delim == "next":
+                 t = t.replace(c, self.sep + c)
+             else:
+                 t = t.replace(c, self.sep)
+
+         # Initial split
+         splits = [s for s in t.split(self.sep) if s != ""]
+
+         # Combine short splits with the previous sentence
+         current = ""
+         sentences = []
+         for s in splits:
+             # If the split is short, accumulate it; once current is long enough, flush it
+             if len(s) < self.min_characters_per_sentence:
+                 current += s
+             elif current:
+                 current += s
+                 sentences.append(current)
+                 current = ""
+             else:
+                 sentences.append(s)
+
+             # If at any point current grows past min_characters_per_sentence,
+             # add it to the sentences
+             if len(current) >= self.min_characters_per_sentence:
+                 sentences.append(current)
+                 current = ""
+
+         # If there is a leftover split, add it to the sentences
+         if current:
+             sentences.append(current)
+
+         return sentences
+
+     def _estimate_token_counts(self, sentences: Union[str, List[str]]) -> Union[int, List[int]]:
+         """Estimate token counts from character length."""
+         CHARS_PER_TOKEN = 6  # Avg. chars per token for llama3 is between 6 and 7
+         if isinstance(sentences, str):
+             return max(1, len(sentences) // CHARS_PER_TOKEN)
+         elif isinstance(sentences, list) and isinstance(sentences[0], str):
+             return [max(1, len(t) // CHARS_PER_TOKEN) for t in sentences]
+         else:
+             raise ValueError(f"Unknown type passed to _estimate_token_counts: {type(sentences)}")
+
+     def _get_feedback(self, estimate: int, actual: int) -> float:
+         """Validate against the actual token counts and correct the estimates."""
+         estimate, actual = max(1, estimate), max(1, actual)
+         feedback = max(0.01, 1 - ((estimate - actual) / estimate))
+         return feedback
+
+     def _prepare_sentences(self, text: str) -> List[Sentence]:
+         """Split text into sentences and calculate token counts for each sentence.
+
+         Args:
+             text: Input text to be split into sentences
+
+         Returns:
+             List of Sentence objects
+
+         """
+         # Split text into sentences
+         sentence_texts = self._split_sentences(text)
+         if not sentence_texts:
+             return []
+
+         # Calculate positions once
+         positions = []
+         current_pos = 0
+         for sent in sentence_texts:
+             positions.append(current_pos)
+             current_pos += len(sent)  # No +1 needed: the delimiters stay inside the sentences
+
+         if not self.approximate:
+             # Get accurate token counts in batch
+             token_counts = self.tokenizer.count_tokens_batch(sentence_texts)
+         else:
+             # Estimate token counts using character length
+             token_counts = self._estimate_token_counts(sentence_texts)
+
+         # Create Sentence objects
+         return [Sentence(text=sent, start_index=pos, end_index=pos + len(sent), token_count=count) for sent, pos, count in zip(sentence_texts, positions, token_counts)]
+
+     def _create_chunk(self, sentences: List[Sentence], token_count: int) -> Chunk:
+         """Create a chunk from a list of sentences.
+
+         Args:
+             sentences: List of sentences to create the chunk from
+             token_count: Total token count for the chunk
+
+         Returns:
+             Chunk object
+
+         """
+         chunk_text = "".join([sentence.text for sentence in sentences])
+         if self.return_type == "texts":
+             return chunk_text
+         else:
+             return SentenceChunk(
+                 text=chunk_text,
+                 start_index=sentences[0].start_index,
+                 end_index=sentences[-1].end_index,
+                 token_count=token_count,
+                 sentences=sentences,
+             )
+
+     def chunk(self, text: str) -> List[Chunk]:
+         """Split text into overlapping chunks based on sentences while respecting token limits.
+
+         Args:
+             text: Input text to be chunked
+
+         Returns:
+             List of Chunk objects containing the chunked text and metadata
+
+         """
+         if not text.strip():
+             return []
+
+         # Get prepared sentences with token counts
+         sentences = self._prepare_sentences(text)
+         if not sentences:
+             return []
+
+         # Pre-calculate cumulative token counts for bisect
+         token_sums = list(accumulate([s.token_count for s in sentences], lambda a, b: a + b, initial=0))
+
+         chunks = []
+         feedback = 1.0
+         pos = 0
+
+         while pos < len(sentences):
+             # Rescale the cumulative sums with the latest feedback
+             token_sums = [int(s * feedback) for s in token_sums]
+
+             # Use bisect_left to find the initial split point
+             target_tokens = token_sums[pos] + self.chunk_size
+             split_idx = bisect_left(token_sums, target_tokens) - 1
+             split_idx = min(split_idx, len(sentences))
+
+             # Ensure we include at least one sentence beyond pos
+             split_idx = max(split_idx, pos + 1)
+
+             # Handle the minimum sentences requirement
+             if split_idx - pos < self.min_sentences_per_chunk:
+                 # If the minimum sentences per chunk can be met, extend the split index accordingly;
+                 # otherwise, warn the user that the minimum could not be met for all chunks
+                 if pos + self.min_sentences_per_chunk <= len(sentences):
+                     split_idx = pos + self.min_sentences_per_chunk
+                 else:
+                     warnings.warn(
+                         f"Minimum sentences per chunk as {self.min_sentences_per_chunk} could not be met for all chunks. "
+                         f"Last chunk of the text will have only {len(sentences) - pos} sentences. "
+                         "Consider increasing the chunk_size or decreasing the min_sentences_per_chunk."
+                     )
+                     split_idx = len(sentences)
+
+             # Get the estimated token count
+             estimate = token_sums[split_idx] - token_sums[pos]
+
+             # Get candidate sentences and verify the actual token count
+             chunk_sentences = sentences[pos:split_idx]
+             chunk_text = "".join(s.text for s in chunk_sentences)
+             actual = self.tokenizer.count_tokens(chunk_text)
+
+             # Given the actual token count and the estimate, compute a feedback value for the next loop
+             feedback = self._get_feedback(estimate, actual)
+
+             # Back off one sentence at a time if we exceeded the chunk size
+             while actual > self.chunk_size and len(chunk_sentences) > self.min_sentences_per_chunk:
+                 split_idx -= 1
+                 chunk_sentences = sentences[pos:split_idx]
+                 chunk_text = "".join(s.text for s in chunk_sentences)
+                 actual = self.tokenizer.count_tokens(chunk_text)
+
+             chunks.append(self._create_chunk(chunk_sentences, actual))
+
+             # Calculate the next position with overlap
+             if self.chunk_overlap > 0 and split_idx < len(sentences):
+                 # Walk backwards to find how many sentences we need for the overlap
+                 overlap_tokens = 0
+                 overlap_idx = split_idx - 1
+
+                 while overlap_idx > pos and overlap_tokens < self.chunk_overlap:
+                     sent = sentences[overlap_idx]
+                     next_tokens = overlap_tokens + sent.token_count + 1  # +1 for space
+                     if next_tokens > self.chunk_overlap:
+                         break
+                     overlap_tokens = next_tokens
+                     overlap_idx -= 1
+
+                 # Move the position to after the overlap
+                 pos = overlap_idx + 1
+             else:
+                 pos = split_idx
+
+         return chunks
+
+     def __repr__(self) -> str:
+         """Return a string representation of the SentenceChunker."""
+         return (
+             f"SentenceChunker(tokenizer={self.tokenizer}, "
+             f"chunk_size={self.chunk_size}, "
+             f"chunk_overlap={self.chunk_overlap}, "
+             f"min_sentences_per_chunk={self.min_sentences_per_chunk}, "
+             f"min_characters_per_sentence={self.min_characters_per_sentence}, "
+             f"approximate={self.approximate}, delim={self.delim}, "
+             f"include_delim={self.include_delim}, "
+             f"return_type={self.return_type})"
+         )
+
+
+ if __name__ == "__main__":
+     tokenizer = PreTrainedTokenizerFast(tokenizer_file="./model/qwen_tokenizer.json")  # type: ignore
+     print("tokenizer load success")
+
+     file_name = "test/txt.txt"
+     with open(file_name, "r") as f:
+         text_content = f.read()
+
+     # Create the chunker
+     text_chunker = SentenceChunker(tokenizer_or_token_counter=tokenizer, chunk_size=512, delim=["!", "?", "\n", "。", ";", ";"], return_type="texts")
+     ret_data = text_chunker.chunk(text_content)
+     for item in ret_data:
+         print(item)
+         print("=======")
omni_split/test.py ADDED
@@ -0,0 +1,80 @@
+ import json
+
+ from omni_split.omni_split import OmniSplit
+ from utils.base_utils import word_preprocessing_and_return_bytesIO
+
+ omni_spliter = OmniSplit()
+
+ ## note: test text split
+ test_text = True
+ if test_text:
+     with open("test/text_test.txt", "r") as f:
+         text_content = f.read()
+     res = omni_spliter.text_chunk_func(text_content)
+     for item in res:
+         print(item)
+         print("------------")
+     print("=" * 10)
+
+ ## note: test markdown json split
+ test_markdown = True
+ if test_markdown:
+     with open("./test/json_list_test.json", "r") as f:
+         md_content_json = json.load(f)
+     res = omni_spliter.markdown_json_chunk_func(md_content_json)
+     for item in res:
+         print(item)
+         print("------------")
+     print("=" * 10)
+
+     res = omni_spliter.markdown_json_chunk_func(md_content_json, clear_model=True)
+     for item in res:
+         print(item)
+         print("------------")
+     print("=" * 10)
+
+ ## note: test markdown split
+ test_markdown = True
+ if test_markdown:
+     with open("./test/markdown_test.md", "r") as f:
+         md_content = f.read()
+     res = omni_spliter.markdown_chunk_func(md_content)
+     for item in res:
+         print(item)
+         print("------------")
+     print("=" * 10)
+
+     res = omni_spliter.markdown_chunk_func(md_content, clear_model=True)
+     for item in res:
+         print(item)
+         print("------------")
+     print("=" * 10)
+
+ ## note: test word split
+ test_document = True
+ if test_document:
+     new_doc_io = word_preprocessing_and_return_bytesIO("./test/docx_test.docx")
+     res = omni_spliter.document_chunk_func(new_doc_io, txt_chunk_size=1000, clear_model=False)
+     for item in res:
+         print(item)
+         print("------------")
+     print("=" * 10)
+
+     res = omni_spliter.document_chunk_func(new_doc_io, txt_chunk_size=1000, clear_model=False, save_local_images_dir="./images")
+     for item in res:
+         print(item)
+         print("------------")
+     print("=" * 10)
+
+     res = omni_spliter.document_chunk_func(new_doc_io, txt_chunk_size=1000, clear_model=True)
+     for item in res:
+         print(item)
+         print("------------")
+     print("=" * 10)
File without changes