chunknorris 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chunknorris/__init__.py +4 -0
- chunknorris/__init__.pyi +4 -0
- chunknorris/chunkers/__init__.py +2 -0
- chunknorris/chunkers/__init__.pyi +2 -0
- chunknorris/chunkers/html_chunknorris.py +25 -0
- chunknorris/chunkers/html_chunknorris.pyi +6 -0
- chunknorris/chunkers/markdown_chunknorris.py +543 -0
- chunknorris/chunkers/markdown_chunknorris.pyi +26 -0
- chunknorris/custom_chunkers/__init__.py +1 -0
- chunknorris/custom_chunkers/__init__.pyi +1 -0
- chunknorris/custom_chunkers/wikit_chunknorris.py +178 -0
- chunknorris/custom_chunkers/wikit_chunknorris.pyi +14 -0
- chunknorris/exceptions/__init__.py +1 -0
- chunknorris/exceptions/__init__.pyi +1 -0
- chunknorris/exceptions/exceptions.py +8 -0
- chunknorris/exceptions/exceptions.pyi +5 -0
- chunknorris/types/types.py +12 -0
- chunknorris/types/types.pyi +11 -0
- chunknorris/utils/__init__.py +0 -0
- chunknorris/utils/__init__.pyi +0 -0
- chunknorris/utils/utils.py +64 -0
- chunknorris/utils/utils.pyi +10 -0
- chunknorris-0.0.1.dist-info/LICENCE +15 -0
- chunknorris-0.0.1.dist-info/METADATA +142 -0
- chunknorris-0.0.1.dist-info/RECORD +27 -0
- chunknorris-0.0.1.dist-info/WHEEL +5 -0
- chunknorris-0.0.1.dist-info/top_level.txt +1 -0
chunknorris/__init__.py
ADDED
chunknorris/__init__.pyi
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from markdownify import markdownify
|
|
2
|
+
|
|
3
|
+
from .markdown_chunknorris import MarkdownChunkNorris
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class HTMLChunkNorris(MarkdownChunkNorris):
    """Chunker for HTML input.

    The HTML is converted to markdown with ``markdownify`` and the result is
    chunked by the inherited ``MarkdownChunkNorris`` logic.
    """

    def __call__(self, html_text: str, **kwargs) -> list[str]:
        """Gets chunks from an HTML string

        Args:
            html_text (str): the html content to chunk
            **kwargs: options forwarded unchanged to MarkdownChunkNorris.__call__

        Returns:
            list[str]: the chunks' texts, as returned by the parent chunker
        """
        text = HTMLChunkNorris.apply_markdownify(html_text)

        return super().__call__(text, **kwargs)

    @staticmethod
    def apply_markdownify(html_text: str) -> str:
        """Applies markdownify to the html text

        Args:
            html_text (str): the text of the html file

        Returns:
            str: the markdownified string
        """
        # "figure" and "img" tags are stripped; nested list levels cycle
        # through the "-", "*" and "+" bullet characters.
        # NOTE(review): presumably relies on markdownify's default heading style
        # matching the parent's default header_style="setext" - confirm.
        md_text = markdownify(html_text, strip=["figure", "img"], bullets="-*+")

        return md_text
|
|
@@ -0,0 +1,543 @@
|
|
|
1
|
+
import copy
|
|
2
|
+
import json
|
|
3
|
+
import re
|
|
4
|
+
import tiktoken
|
|
5
|
+
from unicodedata import normalize
|
|
6
|
+
|
|
7
|
+
from ..exceptions.exceptions import ChunkSizeExceeded
|
|
8
|
+
from ..types.types import Chunk, Chunks, TocTree
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class MarkdownChunkNorris:
    """Splits a markdown document into chunks that follow its header hierarchy.

    The document is parsed into a table-of-content tree (one node per header).
    Chunks are then built from that tree: a section is kept whole when it is
    small enough, otherwise it is subdivided using its sub-sections. Each chunk
    is prefixed with the titles of its parent sections.
    """

    # Values accepted by each string option, keyed by option name.
    _ALLOWED_VALUES = {
        "header_style": ["setext", "atx"],
        "max_title_level_to_use": ["h1", "h2", "h3", "h4", "h5"],
        "link_placement": [
            "remove",
            "end_of_chunk",
            "in_sentence",
            "leave_as_markdown",
        ],
        "chunk_tokens_exceeded_handling": ["raise_error", "split"],
    }

    # Text inserted before the list of links when link_placement="end_of_chunk".
    _LINKS_SECTION_HEADER = "\nPour plus d'informations:\n"

    def __init__(self):
        # Tokenizer used to measure chunks against the hard token limit.
        self.tokenizer = tiktoken.get_encoding("cl100k_base")

    def __call__(self, md_text: str, **kwargs) -> Chunks:
        """Gets chunks from markdown string

        Args:
            md_text (str): the markdown string
            **kwargs: options forwarded to both get_toc_tree() and get_chunks()

        Returns:
            Chunks: the list of chunk's texts
        """
        toc_tree = self.get_toc_tree(md_text, **kwargs)
        chunks = self.get_chunks(toc_tree, **kwargs)

        return chunks

    @staticmethod
    def _check_string_argument_is_valid(argname: str, argvalue: str):
        """Checks that an argument has a valid value

        Args:
            argname (str): the name of the argument
            argvalue (str): the value of the argument

        Raises:
            KeyError: if argname is not one of the known option names
            ValueError: if argvalue is not allowed for that option
        """
        allowed_values = MarkdownChunkNorris._ALLOWED_VALUES[argname]
        # An explicit raise (rather than assert) so the check survives "python -O"
        # and callers get the ValueError the message was built for.
        if argvalue not in allowed_values:
            raise ValueError(
                f"Argument '{argname}' should be one of {allowed_values}. Got '{argvalue}'"
            )

    def _get_header_regex_patterns(self, header_style: str = "setext") -> dict:
        """Get the header regex patterns depending on the header_style

        With "setext", h1 and h2 are a text line followed by a line of at least
        three "=" (h1) or "-" (h2). h3 to h5 are always matched in atx form
        ("###", "####", "#####"), optionally preceded by "- ".

        Args:
            header_style (str, optional): the markdown header style. Must be 'atx' or 'setext'.
                Defaults to "setext".

        Returns:
            (dict) : a mapping between header name and regex patterns
        """
        self._check_string_argument_is_valid("header_style", header_style)
        patterns = {
            "h1": re.compile(r"(.+?)\n={3,}", re.MULTILINE),
            "h2": re.compile(r"(.+?)\n-{3,}", re.MULTILINE),
            "h3": re.compile(r"^(?:- )?(#{3} .+)", re.MULTILINE),
            "h4": re.compile(r"^(?:- )?(#{4} .+)", re.MULTILINE),
            "h5": re.compile(r"^(?:- )?(#{5} .+)", re.MULTILINE),
        }
        if header_style == "atx":
            patterns["h1"] = re.compile(r"^(?:- )?(#{1} .+)", re.MULTILINE)
            patterns["h2"] = re.compile(r"^(?:- )?(#{2} .+)", re.MULTILINE)

        return patterns

    def _convert_setext_to_atx(self, markdown_string: str) -> str:
        """Converts h1 and h2 headers from setext style to atx style

        Args:
            markdown_string (str): the markdown string

        Return:
            str: the string with formatted headers
        """
        regex_patterns = self._get_header_regex_patterns("setext")
        # Substitute each regex match in place. Replacing the matched text with
        # str.replace() would also rewrite any other header that merely contains
        # it as a substring. A callable replacement keeps backslashes in titles
        # from being interpreted as escapes.
        markdown_string = regex_patterns["h1"].sub(
            lambda m: f"# {m[1]}", markdown_string
        )
        markdown_string = regex_patterns["h2"].sub(
            lambda m: f"## {m[1]}", markdown_string
        )

        return markdown_string

    def get_toc_tree(
        self,
        markdown_string: str,
        max_title_level_to_use: str = "h4",
        header_style: str = "setext",
        **kwargs,
    ) -> TocTree:
        """Builds the table of content tree based on header

        Every node is a dict with the keys "title", "children", "content", "id",
        "line_index", "level" and "parent". The returned root node has an empty
        title, id/level/line_index of -1 and an empty dict as parent.

        Args:
            markdown_string (str): the string, as markdown
            max_title_level_to_use (str, optional): the maximum title level to use. Headers with lower
                level than this won't be considered as headers. Defaults to "h4".
            header_style (str, optional): the type of header format. Defaults to "setext".

        Returns:
            TocTree: the table of content
        """
        MarkdownChunkNorris._check_string_argument_is_valid(
            "header_style", header_style
        )
        MarkdownChunkNorris._check_string_argument_is_valid(
            "max_title_level_to_use", max_title_level_to_use
        )
        if header_style == "setext":
            markdown_string = self._convert_setext_to_atx(markdown_string)
        # "h4" -> ["h1", "h2", "h3", "h4"]; the list index is the node level.
        title_types_to_consider = [
            f"h{i}" for i in range(1, int(max_title_level_to_use[1]) + 1)
        ]
        # Headers are all atx at this point (converted above if needed).
        regex_patterns = self._get_header_regex_patterns("atx")
        tree = {
            "title": "",
            "children": [],
            "content": "",
            "id": -1,
            "line_index": -1,
            "level": -1,
            "parent": {},
        }
        current_node = tree
        id_cntr = 0
        for line_idx, line in enumerate(markdown_string.splitlines()):
            for level, title_type in enumerate(title_types_to_consider):
                match = regex_patterns[title_type].match(line)
                if not match:
                    continue
                # Climb back up until the current node is a valid parent,
                # i.e. has a strictly smaller level than the new header.
                while level <= current_node["level"]:
                    current_node = current_node["parent"]
                new_node = {
                    "title": match.group(1),
                    "children": [],
                    "content": "",
                    "id": id_cntr,
                    "line_index": line_idx,
                    "level": level,
                    "parent": current_node,
                }
                current_node["children"].append(new_node)
                current_node = new_node
                id_cntr += 1
                break
            else:
                # Not a header: the line belongs to the section being read.
                current_node["content"] += line + "\n"

        MarkdownChunkNorris._cleanup_tree_texts(tree)

        return tree

    @staticmethod
    def save_toc_tree(toc_tree: TocTree, output_path: str = "toc_tree.json"):
        """Save the toc tree as json

        The tree is deep-copied first, so the "parent" references of the tree
        passed in are left untouched.

        Args:
            toc_tree (TocTree): the toc tree to save
            output_path (str, optional): the output path. Defaults to "toc_tree.json".
        """
        dumpable_dict = copy.deepcopy(toc_tree)
        MarkdownChunkNorris._remove_circular_refs(dumpable_dict)
        with open(output_path, "w", encoding="UTF-8") as f:
            json.dump(dumpable_dict, f)

    @staticmethod
    def _remove_circular_refs(toc_tree_element: TocTree):
        """Recursively removes the circular ref in dict
        (used to save as json). The element is modified in place.

        Args:
            toc_tree_element (TocTree): the element of toc tree
        """
        if "parent" in toc_tree_element:
            del toc_tree_element["parent"]
        if "children" in toc_tree_element:
            for child in toc_tree_element["children"]:
                MarkdownChunkNorris._remove_circular_refs(child)

    @staticmethod
    def _cleanup_tree_texts(tree: TocTree):
        """Cleans up the texts at each 'content' key of the toc tree
        Uses recursion to go through each child. The tree is modified in place.

        Args:
            tree (TocTree): the toc tree
        """
        text = normalize("NFKD", tree["content"])
        # remove special characters: bold markers and escaped asterisks
        for special_chars in ("**", r"\*"):
            text = text.replace(special_chars, "")
        # collapse blank lines, then trim surrounding white spaces and newlines
        text = re.sub(r"\n\s*\n", "\n", text)
        tree["content"] = text.strip()
        for child in tree["children"]:
            MarkdownChunkNorris._cleanup_tree_texts(child)

    @staticmethod
    def get_title_by_id(toc_tree: TocTree, id: int) -> TocTree | None:
        """Gets a toc tree using its id

        Args:
            toc_tree (TocTree): the toc tree
            id (int): the id of the title we are looking for.

        Returns:
            TocTree | None: the element of title we were looking for,
                or None when no element has this id
        """
        if toc_tree.get("id") == id:
            return toc_tree
        for child in toc_tree.get("children", []):
            target = MarkdownChunkNorris.get_title_by_id(child, id)
            if target:
                return target

        return None

    def get_chunks(self, toc_tree: TocTree, **kwargs) -> Chunks:
        """Wrapper that build the chunk's texts, check
        that they fit in size, replace links formatting.

        Args:
            toc_tree (TocTree): the toc tree of the document
            **kwargs: options forwarded to get_chunks_texts(), change_links_format(),
                remove_small_chunks() and split_big_chunks()

        Returns:
            Chunks: the chunks text, formatted

        Raises:
            ChunkSizeExceeded: if a chunk exceeds the token limit and
                chunk_tokens_exceeded_handling is "raise_error" (the default)
        """
        chunks = MarkdownChunkNorris.get_chunks_texts(toc_tree, **kwargs)
        chunks = [MarkdownChunkNorris.change_links_format(c, **kwargs) for c in chunks]
        chunks = MarkdownChunkNorris.remove_small_chunks(chunks, **kwargs)
        # Enforce the hard token limit (this step used to be a second,
        # redundant call to remove_small_chunks).
        chunks = self.split_big_chunks(chunks, **kwargs)

        return chunks

    @staticmethod
    def get_chunks_texts(
        toc_tree_element: TocTree, already_ok_chunks: Chunks | None = None, **kwargs
    ) -> Chunks:
        """Uses the toc tree to build the chunks. Uses recursion.
        Method :
        - build the chunk (= titles from sections above + section content + content of subsections)
        - if the chunk is too big:
            - save the section as title + content (if section has content)
            - subdivide section recursively using subsections
        - else save it as is

        Args:
            toc_tree_element (TocTree): the toc tree, or the element of it to chunk
            already_ok_chunks (Chunks, optional): the chunks already built.
                Used for recursion. Defaults to None.
            **kwargs: forwarded to _chunk_is_too_big() (word limit)

        Returns:
            Chunks: list of chunk's texts
        """
        if already_ok_chunks is None:
            already_ok_chunks = []

        current_chunk = MarkdownChunkNorris.build_chunk_text(toc_tree_element)

        if not MarkdownChunkNorris._chunk_is_too_big(current_chunk, **kwargs):
            already_ok_chunks.append(current_chunk)
            return already_ok_chunks

        # Too big: keep the section's own text (if any) as one chunk...
        if toc_tree_element["content"]:
            parents = MarkdownChunkNorris.get_parents_headers(toc_tree_element)
            current_chunk = "\n".join(
                parents + [toc_tree_element["title"], toc_tree_element["content"]]
            )
            already_ok_chunks.append(current_chunk)
        # ...and chunk every sub-section on its own.
        for child in toc_tree_element["children"]:
            already_ok_chunks = MarkdownChunkNorris.get_chunks_texts(
                child, already_ok_chunks, **kwargs
            )

        return already_ok_chunks

    @staticmethod
    def build_chunk_text(toc_tree_element: TocTree) -> Chunk:
        """Builds a chunk by apposing the text of headers
        and recursively getting the content of children

        Args:
            toc_tree_element (TocTree): the toc tree element

        Returns:
            str: the chunk content. parent's headers + content
        """
        parents = MarkdownChunkNorris.get_parents_headers(toc_tree_element)
        content = MarkdownChunkNorris._build_chunk_content(toc_tree_element)

        return "\n".join(parents) + "\n" + content

    @staticmethod
    def _build_chunk_content(toc_tree_element: TocTree) -> str:
        """Builds a chunk content (i.e without headers above)
        from a toc tree. It uses the toc tree's content, and recursively
        adds the header and content of its children

        Args:
            toc_tree_element (TocTree): the toc tree (or element of toc tree)

        Returns:
            str: the text of the chunk (without the headers of parents)
        """
        text = toc_tree_element.get("title") + "\n" + toc_tree_element.get("content")
        for child in toc_tree_element.get("children") or []:
            text += "\n" + MarkdownChunkNorris._build_chunk_content(child)

        return text

    @staticmethod
    def get_parents_headers(toc_tree_element: TocTree) -> list[str]:
        """Gets a list of the titles that are parent
        of the provided toc tree element. The list
        goes from the outermost header down to the direct parent.

        Args:
            toc_tree_element (TocTree): the toc tree element

        Returns:
            list[str]: the list of header's text
        """
        headers = []
        node = toc_tree_element.get("parent")
        while node:
            title = node.get("title")
            # skip empty string header, such as root header
            if title:
                headers.append(title)
            node = node.get("parent")

        return headers[::-1]

    @staticmethod
    def _chunk_is_too_big(
        chunk: str, max_chunk_word_length: int = 200, **kwargs
    ) -> bool:
        """Returns True if the chunk is bigger than the
        allowed size, in terms of word count

        Args:
            chunk (str): a chunk of text
            max_chunk_word_length (int): the max size of the chunk in words. Default to 200.
            **kwargs: "max_chunk_word_count", when present and not None, is
                accepted as an alias of max_chunk_word_length and takes
                precedence. This is the option name exposed by the command line.

        Returns:
            bool: True if the chunk is too big, else false
        """
        word_limit = kwargs.get("max_chunk_word_count")
        if word_limit is None:
            word_limit = max_chunk_word_length

        return len(chunk.split()) > word_limit

    @staticmethod
    def _header_has_children(toc_tree_element: TocTree) -> bool:
        """Mostly used for code comprehension.
        Returns True if the title has children.

        Args:
            toc_tree_element (TocTree): the header to check for children

        Returns:
            bool: True if the header has children
        """
        return bool(toc_tree_element.get("children"))

    @staticmethod
    def change_links_format(
        text: str, link_placement: str = "in_sentence", **kwargs
    ) -> str:
        """Removes the markdown format of the links in the text.
        The links are treated as specified by 'link_placement':
        - remove : links are removed
        - in_sentence : the link is placed in the sentence, between parenthesis
        - end_of_chunk : all links are added at the end of the text
        - leave_as_markdown: leave links as markdown format

        Args:
            text (str): the text to find the links in
            link_placement (str, optional): How the links should be handled. Defaults to in_sentence.

        Returns:
            str: the formated text

        Raises:
            ValueError: if link_placement is not one of the values above
        """
        MarkdownChunkNorris._check_string_argument_is_valid(
            "link_placement", link_placement
        )
        if link_placement == "leave_as_markdown":
            return text

        # Applied in this order: a linked image contains both an image and a
        # link, so it has to be rewritten before the two simpler patterns run.
        patterns = {
            "image_as_link": r"\[!\[(.*?)\]\(.*?\)\]\((.*?)\)",
            "image": r"!\[(.*?)\]\((https?:[^\s'()]+).*?\)",
            "link": r"\[(.+?)\]\((https?:.+?)\)",
        }
        links_for_end_of_chunk = []
        for pattern in patterns.values():
            replacements = [(m[0], m[1], m[2]) for m in re.finditer(pattern, text)]
            if not replacements:
                continue
            if link_placement == "end_of_chunk":
                # Strip the links now and gather them, so that a single links
                # section is appended once all patterns have been processed.
                text = MarkdownChunkNorris._handle_link_replacements(
                    text, replacements, link_placement="remove"
                )
                links_for_end_of_chunk.extend(
                    f"- {label}: {url}" for _, label, url in replacements
                )
            else:
                text = MarkdownChunkNorris._handle_link_replacements(
                    text, replacements, link_placement=link_placement
                )
        if links_for_end_of_chunk:
            text += MarkdownChunkNorris._LINKS_SECTION_HEADER + "\n".join(
                links_for_end_of_chunk
            )

        return text

    @staticmethod
    def _handle_link_replacements(
        text: str,
        replacements: list[tuple[str, str, str]],
        link_placement: str = "in_sentence",
    ) -> str:
        """Handles the replacement of links in the text
        according to specified format

        Args:
            text (str): the text to replace the links in
            replacements (list[tuple[str, str, str]]): the replacements. Is is a list of
                [(entire regex match, link's text, link's url)]
            link_placement (str, optional): "remove", "end_of_chunk" or "in_sentence".
                Any other value leaves the text unchanged. Defaults to "in_sentence".

        Returns:
            str: the text with links modified
        """
        end_of_chunk_links = []
        for whole_match, label, url in replacements:
            if link_placement == "remove":
                text = text.replace(whole_match, label)
            elif link_placement == "end_of_chunk":
                text = text.replace(whole_match, label)
                end_of_chunk_links.append(f"- {label}: {url}")
            elif link_placement == "in_sentence":
                text = text.replace(whole_match, f"{label} (lien : {url})")
        if end_of_chunk_links:
            # One link per line, after a single header.
            text += MarkdownChunkNorris._LINKS_SECTION_HEADER + "\n".join(
                end_of_chunk_links
            )

        return text

    @staticmethod
    def remove_small_chunks(
        chunks: Chunks, min_chunk_word_count: int = 15, **kwargs
    ) -> Chunks:
        """Removes chunks that have less words than the specified limit

        Args:
            chunks (Chunks): the chunks to filter
            min_chunk_word_count (int, optional): the minimum size a chunk is allowed to be,
                in words. Chunks with less words than this are discarded. Defaults to 15.

        Returns:
            Chunks: the chunks with at least as many words as the specified threshold
        """
        return [c for c in chunks if len(c.split()) >= min_chunk_word_count]

    def split_big_chunks(
        self,
        chunks: Chunks,
        max_chunk_tokens: int = 8191,
        chunk_tokens_exceeded_handling: str = "raise_error",
        **kwargs,
    ) -> Chunks:
        """Checks that the chunks do not exceed the token limit, considered as a hard limit
        If chunk_tokens_exceeded_handling is:
        - "raise_error" -> it will raise an error in case a chunk to big is found
        for it to be investigated.
        - "split" -> Chunks exceeding the max size will be split to fit max_chunk_tokens

        Args:
            chunks (Chunks): The chunks obtained from the get_chunks() method
            max_chunk_tokens (int, optional): the maximum size a chunk is allowed to be,
                in tokens. A chunk of exactly this size is accepted. Defaults to 8191.
            chunk_tokens_exceeded_handling (str, optional): whether or not error should be raised if a big
                chunk is encountered, or split. Defaults to raise_error.

        Returns:
            Chunks: the chunks, all within the token limit

        Raises:
            ValueError: if chunk_tokens_exceeded_handling is not an allowed value
            ChunkSizeExceeded: if a chunk is too big and handling is "raise_error"
        """
        MarkdownChunkNorris._check_string_argument_is_valid(
            "chunk_tokens_exceeded_handling", chunk_tokens_exceeded_handling
        )
        splitted_chunks = []
        for chunk in chunks:
            if self.get_token_count(chunk) <= max_chunk_tokens:
                splitted_chunks.append(chunk)
            elif chunk_tokens_exceeded_handling == "raise_error":
                raise ChunkSizeExceeded(
                    f"Found chunk bigger than the specified token limit {max_chunk_tokens}. "
                    "You can disable this error and allow dummy splitting of this chunk "
                    "by passing chunk_tokens_exceeded_handling='split'. "
                    f"The chunk : {chunk}"
                )
            else:
                splitted_chunks.extend(
                    self._dummy_split_big_chunk(chunk, max_chunk_tokens)
                )

        return splitted_chunks

    def get_token_count(self, text: Chunk) -> int:
        """Get the token count of a chunk

        Args:
            text (Chunk): the text to get the token count from

        Returns:
            int: the token count
        """
        return len(self.tokenizer.encode(text))

    def _dummy_split_big_chunk(
        self,
        chunk: Chunk,
        max_chunk_tokens: int = 8191,
    ) -> Chunks:
        """Splits the chunk so that the subchunks fit in max_chunk_tokens

        The split is done on token boundaries only, in pieces of roughly equal
        size; it does not look at sentences or words.

        Args:
            chunk (Chunk): the chunk to split
            max_chunk_tokens (int, optional): maximum chunk size. Defaults to 8191.

        Returns:
            Chunks: the subdivided chunks

        Raises:
            ValueError: if max_chunk_tokens is lower than 1
        """
        if max_chunk_tokens < 1:
            raise ValueError(
                f"Argument 'max_chunk_tokens' should be at least 1. Got '{max_chunk_tokens}'"
            )
        tokenized_text = self.tokenizer.encode(chunk)
        token_count = len(tokenized_text)
        if token_count <= max_chunk_tokens:
            return [chunk]

        # Ceil divisions: the smallest number of pieces that fits the limit,
        # then a piece size that covers every token (flooring the size used to
        # drop the tokens left over at the end of the chunk).
        split_count = -(-token_count // max_chunk_tokens)
        split_token_size = -(-token_count // split_count)
        # NOTE(review): decoding an arbitrary token slice may cut through a
        # multi-token character - confirm this is acceptable for the tokenizer.
        splitted_text = [
            self.tokenizer.decode(tokenized_text[start : start + split_token_size])
            for start in range(0, token_count, split_token_size)
        ]

        return splitted_text
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
from ..exceptions.exceptions import ChunkSizeExceeded as ChunkSizeExceeded
|
|
2
|
+
from ..types.types import Chunk as Chunk, Chunks as Chunks, TocTree as TocTree
|
|
3
|
+
from _typeshed import Incomplete
|
|
4
|
+
|
|
5
|
+
class MarkdownChunkNorris:
    # Tokenizer created in __init__ and used by get_token_count().
    tokenizer: Incomplete
    def __init__(self) -> None: ...
    def __call__(self, md_text: str, **kwargs) -> Chunks: ...
    def get_toc_tree(self, markdown_string: str, max_title_level_to_use: str = 'h4', header_style: str = 'setext', **kwargs) -> TocTree: ...
    @staticmethod
    def save_toc_tree(toc_tree: TocTree, output_path: str = 'toc_tree.json') -> None: ...
    # Returns None when no element of the tree has the requested id.
    @staticmethod
    def get_title_by_id(toc_tree: TocTree, id: int) -> TocTree | None: ...
    def get_chunks(self, toc_tree: TocTree, **kwargs) -> Chunks: ...
    @staticmethod
    def get_chunks_texts(toc_tree_element: TocTree, already_ok_chunks: Chunks | None = None, **kwargs) -> Chunks: ...
    @staticmethod
    def build_chunk_text(toc_tree_element: TocTree) -> Chunk: ...
    @staticmethod
    def get_parents_headers(toc_tree_element: TocTree) -> list[str]: ...
    @staticmethod
    def change_links_format(text: str, link_placement: str = 'in_sentence', **kwargs) -> str: ...
    @staticmethod
    def remove_small_chunks(chunks: Chunks, min_chunk_word_count: int = 15, **kwargs) -> Chunks: ...
    def split_big_chunks(self, chunks: Chunks, max_chunk_tokens: int = 8191, chunk_tokens_exceeded_handling: str = 'raise_error', **kwargs) -> Chunks: ...
    def get_token_count(self, text: Chunk) -> int: ...
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .wikit_chunknorris import WikitChunkNorris
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .wikit_chunknorris import WikitChunkNorris as WikitChunkNorris
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
import glob
|
|
2
|
+
import os
|
|
3
|
+
import json
|
|
4
|
+
from argparse import ArgumentParser
|
|
5
|
+
from ..chunkers.html_chunknorris import HTMLChunkNorris
|
|
6
|
+
from ..exceptions.exceptions import ChunkNorrisException
|
|
7
|
+
from ..types.types import Chunks
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class WikitChunkNorris(HTMLChunkNorris):
    """Chunker for JSON files whose HTML text is stored under ["hasPart"][0]["text"].

    The chunks replace the content of the "hasPart" key and the result is
    written back as JSON.
    """

    @staticmethod
    def read_file(filepath: str, return_full_content: bool = False) -> str | dict:
        """Reads a html or json file

        Args:
            filepath (str): path to a file. must end with .json or .html
            return_full_content (bool): Only applies to JSON files. Indicates whether or not
                the entire content of the file is returned or just the text content.

        Returns:
            str | dict: the text read from the file (the value under ["hasPart"][0]["text"]
                for a JSON file), or the whole parsed JSON when return_full_content is True

        Raises:
            ChunkNorrisException: if the file can't be opened or parsed, lacks the
                expected keys, or is neither a .json nor a .html file
        """
        try:
            with open(filepath, "r", encoding="UTF-8") as f:
                if filepath.endswith(".json"):
                    content = json.load(f)
                    if not return_full_content:
                        content = content["hasPart"][0]["text"]
                elif filepath.endswith(".html"):
                    content = f.read()
                else:
                    raise ValueError("Can only open JSON or HTML files")
        except Exception as e:
            # Every failure is surfaced as the package's own exception type.
            raise ChunkNorrisException(f"Can't open file '{filepath}': {e}") from e

        return content

    def chunk_file(self, filepath: str, output_filepath: str | None = None, **kwargs):
        """Chunks a json file and save the chunked file

        Args:
            filepath (str): path to the file
            output_filepath (str, optional): path of the output file. When not provided, the
                file is written under its original name in a sibling directory named
                "<directory of filepath>-chunked". Missing directories are created.
            **kwargs: chunking options forwarded to the chunker
        """
        filepath = os.path.normpath(filepath)
        html_text = WikitChunkNorris.read_file(filepath)
        chunks = self(html_text, **kwargs)
        file_content = WikitChunkNorris.format_output(filepath, chunks)

        if not output_filepath:
            output_dir = os.path.dirname(filepath) + "-chunked"
            output_filepath = os.path.join(output_dir, os.path.basename(filepath))
        else:
            output_dir = os.path.dirname(output_filepath)
        # output_dir is "" when output_filepath has no directory part, and
        # os.makedirs("") would fail; exist_ok avoids a check-then-create race.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        with open(output_filepath, "w", encoding="UTF-8") as f:
            json.dump(file_content, f, ensure_ascii=False)

    def chunk_directory(
        self, input_dir: str, output_dir: str | None = None, **kwargs
    ) -> None:
        """Chunks the json files of entire directory, recursively

        The directory layout found under input_dir is reproduced under output_dir.

        Args:
            input_dir (str): the path to directory
            output_dir (str, optional): the directory where chunks will be saved.
                Defaults to "<input_dir>-chunked".
            **kwargs: chunking options forwarded to chunk_file()
        """
        input_dir = os.path.normpath(input_dir)
        if not output_dir:
            output_dir = f"{input_dir}-chunked"

        # glob.escape keeps directory names containing "*", "?" or "[" literal.
        filepaths = [
            json_path
            for dirpath, _, _ in os.walk(input_dir)
            for json_path in glob.glob(os.path.join(glob.escape(dirpath), "*.json"))
        ]
        for fp in filepaths:
            # Re-root the path on output_dir. A plain str.replace(input_dir, ...)
            # would also rewrite any later occurrence of input_dir in the path
            # (e.g. every "." when input_dir is ".").
            output_filepath = os.path.join(output_dir, os.path.relpath(fp, input_dir))
            self.chunk_file(fp, output_filepath, **kwargs)

    @staticmethod
    def format_output(filepath: str, chunks: Chunks) -> dict:
        """Formats the chunks according to the input json file
        i.e places the chunks inside the key [hasPart]

        Args:
            filepath (str): path toward the json file
            chunks (Chunks): the chunks

        Returns:
            dict: the formatted file content
        """
        # NOTE(review): for a .html input read_file() returns a str, so the item
        # assignment below would raise TypeError - JSON inputs only.
        file_content = WikitChunkNorris.read_file(filepath, return_full_content=True)
        file_content["hasPart"] = [
            {
                "@type": "DocumentChunk",
                "text": chunk_text,
                "position": position,
            }
            for position, chunk_text in enumerate(chunks)
        ]

        return file_content
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def parse_arguments():
    """Parse the given command-line arguments.

    Returns:
        argparse.Namespace: the parsed options. Their names match the keyword
            arguments that main() passes on to the chunker.
    """

    parser = ArgumentParser(
        description="Chunking a folder of json files containing a HTML test under the key ['hasPart'][0]['text']"
    )

    parser.add_argument(
        "--input_dir",
        type=str,
        required=True,
        help="Path to folder containing the files to chunk",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default=None,
        help="Path to output folder where the chunked files will be stored",
    )
    parser.add_argument(
        "--max_title_level_to_use",
        type=str,
        choices=["h1", "h2", "h3", "h4", "h5"],
        default="h4",
        help="The maximum level of titles to use for chunking",
    )
    parser.add_argument(
        "--max_chunk_tokens",
        type=int,
        default=8191,
        help="Hard limit of the token size of a chunk. Chunks bigger than that will be handled according to chunk_tokens_exceeded_handling",
    )
    parser.add_argument(
        "--chunk_tokens_exceeded_handling",
        type=str,
        choices=["split", "raise_error"],
        default="raise_error",
        help="Whether a big chunk with not headers should be split or error should be raised",
    )
    parser.add_argument(
        "--link_placement",
        type=str,
        # Must stay in sync with the values the markdown chunker accepts for
        # link_placement ("end_of_sentence" was never one of them).
        choices=["remove", "end_of_chunk", "in_sentence", "leave_as_markdown"],
        default="end_of_chunk",
        help="Where the links are placed in the chunks",
    )
    parser.add_argument(
        "--max_chunk_word_count",
        type=int,
        default=250,
        help="Soft limit of chunk size. Chunks bigger than this limit will be subdivided with lower level headers.",
    )
    parser.add_argument(
        "--min_chunk_word_count",
        type=int,
        default=15,
        help="Minium amount a word a chunk must have. If lower, the chunk is discarded.",
    )

    return parser.parse_args()
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def main():
    """Command-line entry point: parse the CLI options and chunk a directory.

    Every option returned by ``parse_arguments`` is forwarded, as a keyword
    argument, to ``WikitChunkNorris.chunk_directory``.
    """
    # Parse first so that a bad command line fails before the chunker is built.
    cli_options = vars(parse_arguments())
    WikitChunkNorris().chunk_directory(**cli_options)


# Run the CLI only when this module is executed as a script.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from ..chunkers.html_chunknorris import HTMLChunkNorris as HTMLChunkNorris
|
|
2
|
+
from ..exceptions.exceptions import ChunkNorrisException as ChunkNorrisException
|
|
3
|
+
from ..types.types import Chunks as Chunks
|
|
4
|
+
|
|
5
|
+
# Type stub for the Wikit-specific chunker, which extends HTMLChunkNorris.
class WikitChunkNorris(HTMLChunkNorris):
    @staticmethod
    def read_file(filepath: str, return_full_content: bool = False) -> str: ...
    # output_filepath defaults to None, so it is annotated as optional.
    def chunk_file(self, filepath: str, output_filepath: str | None = None, **kwargs): ...
    # output_dir defaults to None, so it is annotated as optional.
    def chunk_directory(self, input_dir: str, output_dir: str | None = None, **kwargs) -> None: ...
    @staticmethod
    def format_output(filepath: str, chunks: Chunks) -> dict: ...
|
|
12
|
+
|
|
13
|
+
# Module-level entry points of the command-line script.
def parse_arguments(): ...
def main() -> None: ...
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .exceptions import *
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .exceptions import *
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
# Regex fragments used by split_into_sentences. They are raw strings so that
# regex escapes such as ``\s`` are not parsed as (invalid) string escapes.
alphabets = r"([A-Za-z])"
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = r"[.](com|net|org|io|gov|edu|me)"
digits = r"([0-9])"
multiple_dots = r"\.{2,}"


def split_into_sentences(text: str) -> list[str]:
    """
    Split the text into sentences.

    Periods that do not end a sentence (titles, decimals, initials,
    acronyms, web domains, ellipses) are temporarily replaced by "<prd>",
    sentence boundaries are marked with "<stop>", and the text is finally
    split on "<stop>".

    If the text contains substrings "<prd>" or "<stop>", they would lead
    to incorrect splitting because they are used as markers for splitting.

    :param text: text to be split into sentences
    :type text: str

    :return: list of sentences, each stripped of surrounding whitespace
    :rtype: list[str]
    """
    # Pad with spaces so that patterns anchored on a space also match at the
    # very beginning / end of the text; newlines count as plain spaces.
    text = " " + text + " "
    text = text.replace("\n", " ")

    # Protect periods that are not sentence boundaries.
    text = re.sub(prefixes, r"\1<prd>", text)
    text = re.sub(websites, r"<prd>\1", text)
    text = re.sub(digits + "[.]" + digits, r"\1<prd>\2", text)
    # An ellipsis keeps all of its dots and ends the sentence.
    text = re.sub(
        multiple_dots, lambda match: "<prd>" * len(match.group(0)) + "<stop>", text
    )
    text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + alphabets + "[.] ", r" \1<prd> ", text)
    # An acronym followed by a typical sentence starter ends the sentence.
    text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
    text = re.sub(
        alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]",
        r"\1<prd>\2<prd>\3<prd>",
        text,
    )
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>", text)
    # A suffix followed by a sentence starter ends the sentence. The period is
    # kept as "<prd>" so it is restored below instead of being dropped.
    text = re.sub(" " + suffixes + "[.] " + starters, r" \1<prd><stop> \2", text)
    text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", r" \1<prd>", text)

    # Move terminal punctuation outside of closing quotes so that the quote
    # stays attached to the sentence it closes.
    text = text.replace(".”", "”.")
    text = text.replace('."', '".')
    text = text.replace('!"', '"!')
    text = text.replace('?"', '"?')

    # Every remaining terminator is a sentence boundary.
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    # Restore the protected periods.
    text = text.replace("<prd>", ".")

    sentences = [s.strip() for s in text.split("<stop>")]
    # The padding added above leaves an empty trailing item; drop it.
    if sentences and not sentences[-1]:
        sentences = sentences[:-1]

    return sentences
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
ChunkNorris - A package for reliable chunking of documents
|
|
2
|
+
Copyright (C) 2024 Wikit.ai
|
|
3
|
+
|
|
4
|
+
This program is free software: you can redistribute it and/or modify
|
|
5
|
+
it under the terms of the GNU Affero General Public License as
|
|
6
|
+
published by the Free Software Foundation, either version 3 of the
|
|
7
|
+
License, or (at your option) any later version.
|
|
8
|
+
|
|
9
|
+
This program is distributed in the hope that it will be useful,
|
|
10
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
11
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
12
|
+
GNU Affero General Public License for more details.
|
|
13
|
+
|
|
14
|
+
You should have received a copy of the GNU Affero General Public License
|
|
15
|
+
along with this program. If not, see <https://www.gnu.org/licenses/>
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: chunknorris
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: A package for chunking documents from various formats
|
|
5
|
+
Author-email: Wikit <dev@wikit.ai>
|
|
6
|
+
Project-URL: Homepage, https://gitlab.com/wikit/research-and-development/chunk-norris
|
|
7
|
+
Project-URL: Issues, https://gitlab.com/wikit/research-and-development/chunk-norris/-/issues
|
|
8
|
+
Keywords: chunk,document,split,html,markdown,pdf,header
|
|
9
|
+
Classifier: Natural Language :: English
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
11
|
+
Classifier: Framework :: Pytest
|
|
12
|
+
Classifier: License :: OSI Approved :: GNU Affero General Public License v3
|
|
13
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
14
|
+
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Requires-Python: >=3.10
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
License-File: LICENCE
|
|
19
|
+
Requires-Dist: markdownify >=0.11.6
|
|
20
|
+
Requires-Dist: tiktoken >=0.5.2
|
|
21
|
+
Requires-Dist: PyMuPDF >=1.23.16
|
|
22
|
+
|
|
23
|
+
# Chunk Norris
|
|
24
|
+
|
|
25
|
+
## Goal
|
|
26
|
+
|
|
27
|
+
This project aims at improving the method of chunking documents from various sources (HTML, PDFs, ...)
|
|
28
|
+
An optimized chunking method might lead to smaller chunks, meaning :
|
|
29
|
+
- **Better relevancy of chunks** (and thus easier identification of useful chunks through embedding cosine similarity)
|
|
30
|
+
- **Fewer errors** caused by chunks exceeding the API limit on the number of tokens
|
|
31
|
+
- **Fewer hallucinations** from generation models caused by superfluous information in the prompt
|
|
32
|
+
- **Reduced cost** as the prompt would have reduced size
|
|
33
|
+
|
|
34
|
+
## Installation
|
|
35
|
+
|
|
36
|
+
Using PyPI, just run the following command:
|
|
37
|
+
```pip install chunknorris```
|
|
38
|
+
|
|
39
|
+
## Chunkers
|
|
40
|
+
|
|
41
|
+
The package features multiple ***chunkers*** that can be used independently depending on the type of document needed.
|
|
42
|
+
|
|
43
|
+
All chunkers follow a similar logic :
|
|
44
|
+
- Extract table of contents (= headers)
|
|
45
|
+
- Build chunks using the text content of a part, and put the titles of the parts it belongs to on top
|
|
46
|
+
|
|
47
|
+

|
|
48
|
+
|
|
49
|
+
### MarkdownChunkNorris
|
|
50
|
+
|
|
51
|
+
This chunker is meant to be used **on markdown-formatted text**.
|
|
52
|
+
|
|
53
|
+
Note: When calling the chunker, **you need to specify the header style** of your markdown text ([ATX or Setext](https://golem.ph.utexas.edu/~distler/maruku/markdown_syntax.html#header)). By default it will consider "Setext" heading style.
|
|
54
|
+
|
|
55
|
+
#### Usage
|
|
56
|
+
|
|
57
|
+
```py
|
|
58
|
+
from chunknorris.chunkers import MarkdownChunkNorris
|
|
59
|
+
|
|
60
|
+
text = """
|
|
61
|
+
# This is a header
|
|
62
|
+
This is a text
|
|
63
|
+
## This is another header
|
|
64
|
+
And another text
|
|
65
|
+
## With this final header
|
|
66
|
+
And this last text
|
|
67
|
+
"""
|
|
68
|
+
chunker = MarkdownChunkNorris()
|
|
69
|
+
header_style = "atx" # or "setext" depending on headers in your text
|
|
70
|
+
chunks = chunker(text, header_style=header_style)
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
### HTMLChunkNorris
|
|
74
|
+
|
|
75
|
+
This chunker is meant to be used **on html-formatted text**. Behind the scenes, it uses markdownify to transform the text to markdown with "setext"-style headers and uses MarkdownChunkNorris to process it.
|
|
76
|
+
|
|
77
|
+
#### Usage
|
|
78
|
+
|
|
79
|
+
```py
|
|
80
|
+
from chunknorris.chunkers import HTMLChunkNorris
|
|
81
|
+
|
|
82
|
+
text = """
|
|
83
|
+
<h1>This is 1st level heading</h1>
|
|
84
|
+
<p>This is a test paragraph.</p>
|
|
85
|
+
<h2>This is 2nd level heading</h2>
|
|
86
|
+
<p>This is a test paragraph.</p>
|
|
87
|
+
<h2>This is another level heading</h2>
|
|
88
|
+
<p>This is another test paragraph.</p>
|
|
89
|
+
"""
|
|
90
|
+
hcn = HTMLChunkNorris()
|
|
91
|
+
chunks = hcn(text)
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
### Advanced usage of chunkers
|
|
95
|
+
|
|
96
|
+
Additionally, the chunkers can take a number of arguments that modify their behavior:
|
|
97
|
+
|
|
98
|
+
```py
|
|
99
|
+
from chunknorris.chunkers import MarkdownChunkNorris
|
|
100
|
+
|
|
101
|
+
mystring = "# header\nThis is a markdown string"
|
|
102
|
+
|
|
103
|
+
chunker = MarkdownChunkNorris() # or any other chunker
|
|
104
|
+
chunks = chunker(
|
|
105
|
+
mystring,
|
|
106
|
+
max_title_level_to_use="h3",
|
|
107
|
+
max_chunk_word_length=200,
|
|
108
|
+
link_placement="in_sentence",
|
|
109
|
+
max_chunk_tokens=8191,
|
|
110
|
+
chunk_tokens_exceeded_handling="split",
|
|
111
|
+
min_chunk_wordcount=15,
|
|
112
|
+
)
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
***max_title_level_to_use***
|
|
116
|
+
(str): The maximum (included) level of headers to take into account for chunking. For example, if "h3" is set, then "h4" and "h5" titles won't be used. Must be a string of type "hx" with x being the title level. Defaults to "h4".
|
|
117
|
+
|
|
118
|
+
***max_chunk_word_length***
|
|
119
|
+
(int): The maximum size (soft limit, in words) a chunk can be. Chunks bigger than this size will be chunked using lower level headers, until no lower level headers are available. Defaults to 200.
|
|
120
|
+
|
|
121
|
+
***link_placement***
|
|
122
|
+
(str): How the links should be handled. Defaults to in_sentence.
|
|
123
|
+
Options :
|
|
124
|
+
- "remove" : text is kept but links are removed
|
|
125
|
+
- "end_of_chunk" : adds a paragraph at the end of the chunk containing all the links
|
|
126
|
+
- "in_sentence" : the link is added in parentheses inside the sentence
|
|
127
|
+
|
|
128
|
+
***max_chunk_tokens***
|
|
129
|
+
(int): The hard maximum number of tokens a chunk can have. Chunks bigger than this limit will be handled according to chunk_tokens_exceeded_handling. Defaults to 8191.
|
|
130
|
+
|
|
131
|
+
***chunk_tokens_exceeded_handling***
|
|
132
|
+
(str): How chunks bigger than the size specified by max_chunk_tokens should be handled. Defaults to "raise_error".
|
|
133
|
+
Options:
|
|
134
|
+
- "raise_error": raises an error, indicating that the chunk could not be split according to headers
|
|
135
|
+
- "split": split the chunks arbitrarily so that each chunk has a size lower than max_chunk_tokens
|
|
136
|
+
|
|
137
|
+
***min_chunk_wordcount***
|
|
138
|
+
(int): Minimum number of words to consider keeping the chunks. Chunks with less words will be discarded. Defaults to 15.
|
|
139
|
+
|
|
140
|
+
### PDFChunkNorris
|
|
141
|
+
|
|
142
|
+
#TODO:
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
chunknorris/__init__.py,sha256=GMddZWF-AhcCso9CLeuZzDYLH1wF_mWB_Z1UALPS6i4,102
|
|
2
|
+
chunknorris/__init__.pyi,sha256=GMddZWF-AhcCso9CLeuZzDYLH1wF_mWB_Z1UALPS6i4,102
|
|
3
|
+
chunknorris/chunkers/__init__.py,sha256=pm6GiSXWpn09xJP1xlKbnU7bJglY4l_3g-CBfr0adVE,100
|
|
4
|
+
chunknorris/chunkers/__init__.pyi,sha256=Pn5uEk6TwEE_dMb1yzwPZeIcbwMB8zRPVdhKO8ntKg0,142
|
|
5
|
+
chunknorris/chunkers/html_chunknorris.py,sha256=X9LYqd2maej_7bFUa0pY_CAGWdanM_7_tUY6yoyRUR0,663
|
|
6
|
+
chunknorris/chunkers/html_chunknorris.pyi,sha256=M2xQe5xvoysJCHB1HtGpQ7jv4sj-JFB7vpOJegw0LI4,250
|
|
7
|
+
chunknorris/chunkers/markdown_chunknorris.py,sha256=ZdB1NpZASTDiEMfv3CBR8FRvrGl8Rd6r-GZcmBqCRBk,20268
|
|
8
|
+
chunknorris/chunkers/markdown_chunknorris.pyi,sha256=kk8I3La__qcvCyRVtwxD-5NuxFn721LnZRZhUVDx2WM,1486
|
|
9
|
+
chunknorris/custom_chunkers/__init__.py,sha256=qMlMdmT4KHJnyRr42ZWT0g-RnhDdwDCENdJ9BztLF00,48
|
|
10
|
+
chunknorris/custom_chunkers/__init__.pyi,sha256=sley_LOReM3qVfDtSqKa_13soCe32eDBHGZUgAoiASI,68
|
|
11
|
+
chunknorris/custom_chunkers/wikit_chunknorris.py,sha256=IA-7Xb0ogXmxgqp8ITJM-P7n4clvVaSRAV_DRtT92nE,5887
|
|
12
|
+
chunknorris/custom_chunkers/wikit_chunknorris.pyi,sha256=YlZTYx8iAVqNHbjGw8kbSQZ_kw550qPzYjUNNTuPdok,652
|
|
13
|
+
chunknorris/exceptions/__init__.py,sha256=nWnJwEphtAjFsYg_dIYrrbWdkIB0G2Nu89d7d6DJGKc,25
|
|
14
|
+
chunknorris/exceptions/__init__.pyi,sha256=y6AJu3xWHud92ZK_pfU3WzDj8gLIYvXfFNJ-phZmjJo,26
|
|
15
|
+
chunknorris/exceptions/exceptions.py,sha256=OUj1qxRcQU0RdYTJMs3L2abOarD16nhjkAPTMRhr9V0,169
|
|
16
|
+
chunknorris/exceptions/exceptions.pyi,sha256=HZeW31MY2y1BijHfSlHElqpTkn1jxawO6LvdMa1Ft-s,166
|
|
17
|
+
chunknorris/types/types.py,sha256=ii8PICCROmuRSZSH2lM_iAr4PYBMBEI5ePm1S9eG8Cw,173
|
|
18
|
+
chunknorris/types/types.pyi,sha256=iW3elE5TLioEZLL6HUSPP9LH8izJCWE0JY-73eqAe1I,170
|
|
19
|
+
chunknorris/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
20
|
+
chunknorris/utils/__init__.pyi,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
21
|
+
chunknorris/utils/utils.py,sha256=nA-qCm22dAYRtynCJwWa7S9AK8SCYoDzG6Xh296uX1g,2288
|
|
22
|
+
chunknorris/utils/utils.pyi,sha256=ApwdRg48ZViJhdr-g91E6F76Qdj5EBfqVVKHQ6qtSJI,171
|
|
23
|
+
chunknorris-0.0.1.dist-info/LICENCE,sha256=Z4Dj4xyHOkzx-Ggiig2yGWeOy-gnR7OF_hkJ6kCcCNw,720
|
|
24
|
+
chunknorris-0.0.1.dist-info/METADATA,sha256=hGylMSJp_O-uaIRAWXGybjRILsPxdeGF14a59eMpEZk,5200
|
|
25
|
+
chunknorris-0.0.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
26
|
+
chunknorris-0.0.1.dist-info/top_level.txt,sha256=zYBuOKW7poeXPcU-OLQF5PBbdbrw2aZbWgtPuge0x7Y,12
|
|
27
|
+
chunknorris-0.0.1.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
chunknorris
|