chunknorris 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,4 @@
1
+ from .chunkers import *
2
+ from .custom_chunkers import *
3
+ from .exceptions import *
4
+ from .types import *
@@ -0,0 +1,4 @@
1
+ from .chunkers import *
2
+ from .custom_chunkers import *
3
+ from .exceptions import *
4
+ from .types import *
@@ -0,0 +1,2 @@
1
+ from .html_chunknorris import HTMLChunkNorris
2
+ from .markdown_chunknorris import MarkdownChunkNorris
@@ -0,0 +1,2 @@
1
+ from .html_chunknorris import HTMLChunkNorris as HTMLChunkNorris
2
+ from .markdown_chunknorris import MarkdownChunkNorris as MarkdownChunkNorris
@@ -0,0 +1,25 @@
1
+ from markdownify import markdownify
2
+
3
+ from .markdown_chunknorris import MarkdownChunkNorris
4
+
5
+
6
class HTMLChunkNorris(MarkdownChunkNorris):
    """Chunker for HTML input.

    The HTML is converted to markdown with ``markdownify`` and then chunked
    by the inherited markdown chunker.
    """

    def __call__(self, html_text: str, **kwargs) -> list[str]:
        """Gets chunks from an HTML string

        Args:
            html_text (str): the HTML string
            **kwargs: forwarded unchanged to ``MarkdownChunkNorris.__call__``

        Returns:
            list[str]: the list of chunk's texts, as returned by the
                markdown chunker
        """
        text = HTMLChunkNorris.apply_markdownify(html_text)

        return super().__call__(text, **kwargs)

    @staticmethod
    def apply_markdownify(html_text: str) -> str:
        """Applies markdownify to the html text

        ``figure`` and ``img`` tags are stripped during the conversion.

        Args:
            html_text (str): the text of the html file

        Returns:
            str: the markdownified string
        """
        return markdownify(html_text, strip=["figure", "img"], bullets="-*+")
@@ -0,0 +1,6 @@
1
+ from .markdown_chunknorris import MarkdownChunkNorris as MarkdownChunkNorris
2
+
3
# Type stub for the HTML chunker (bodies intentionally elided).
class HTMLChunkNorris(MarkdownChunkNorris):
    # Converts the HTML to markdown, then delegates to the parent chunker.
    def __call__(self, html_text: str, **kwargs) -> list[str]: ...
    # Converts an HTML string to a markdown string.
    @staticmethod
    def apply_markdownify(html_text: str) -> str: ...
@@ -0,0 +1,543 @@
1
+ import copy
2
+ import json
3
+ import re
4
+ import tiktoken
5
+ from unicodedata import normalize
6
+
7
+ from ..exceptions.exceptions import ChunkSizeExceeded
8
+ from ..types.types import Chunk, Chunks, TocTree
9
+
10
+
11
+ class MarkdownChunkNorris:
12
+ def __init__(self):
13
+ self.tokenizer = tiktoken.get_encoding("cl100k_base")
14
+
15
+ def __call__(self, md_text: str, **kwargs) -> Chunks:
16
+ """Gets chunks from markdown string
17
+
18
+ Args:
19
+ md_text (str): the markdown string
20
+
21
+ Returns:
22
+ Chunks: the list of chunk's texts
23
+ """
24
+ toc_tree = self.get_toc_tree(md_text, **kwargs)
25
+ chunks = self.get_chunks(toc_tree, **kwargs)
26
+
27
+ return chunks
28
+
29
+ @staticmethod
30
+ def _check_string_argument_is_valid(argname: str, argvalue: str):
31
+ """Checks that an argument has a valid value
32
+
33
+ Args:
34
+ argname (str): the name of the argument
35
+ argvalue (str): the value of the argument
36
+ """
37
+ allowed_values_dict = {
38
+ "header_style": ["setext", "atx"],
39
+ "max_title_level_to_use": ["h1", "h2", "h3", "h4", "h5"],
40
+ "link_placement": [
41
+ "remove",
42
+ "end_of_chunk",
43
+ "in_sentence",
44
+ "leave_as_markdown",
45
+ ],
46
+ "chunk_tokens_exceeded_handling": ["raise_error", "split"],
47
+ }
48
+ allowed_values = allowed_values_dict[argname]
49
+ assert argvalue in allowed_values, ValueError(
50
+ f"Argument '{argname}' should be one of {allowed_values}. Got '{argvalue}'"
51
+ )
52
+
53
+ def _get_header_regex_patterns(self, header_style: str = "setext") -> dict:
54
+ """Get the header regex patterns depending on the header_style
55
+
56
+ Args:
57
+ header_style (str, optional): the markdown header style. Must be 'atx' or 'setext'.
58
+ Defaults to "setext".
59
+
60
+ Returns:
61
+ (dict) : a mapping between header name and regex patterns
62
+ """
63
+ self._check_string_argument_is_valid("header_style", header_style)
64
+ patterns = {
65
+ "h1": re.compile(r"(.+?)\n={3,}", re.MULTILINE),
66
+ "h2": re.compile(r"(.+?)\n-{3,}", re.MULTILINE),
67
+ "h3": re.compile(r"^(?:- )?(#{3} .+)", re.MULTILINE),
68
+ "h4": re.compile(r"^(?:- )?(#{4} .+)", re.MULTILINE),
69
+ "h5": re.compile(r"^(?:- )?(#{5} .+)", re.MULTILINE),
70
+ }
71
+ if header_style == "atx":
72
+ patterns["h1"] = re.compile(r"^(?:- )?(#{1} .+)", re.MULTILINE)
73
+ patterns["h2"] = re.compile(r"^(?:- )?(#{2} .+)", re.MULTILINE)
74
+
75
+ return patterns
76
+
77
+ def _convert_setext_to_atx(self, markdown_string: str) -> str:
78
+ """Converts headers from setext style to markdown style
79
+
80
+ Args:
81
+ markdown_string (str): the markdown string
82
+
83
+ Return:
84
+ str: the string with formatted headers
85
+ """
86
+ regex_patterns = self._get_header_regex_patterns("setext")
87
+ for match in re.finditer(regex_patterns["h1"], markdown_string):
88
+ markdown_string = markdown_string.replace(match[0], f"# {match[1]}")
89
+ for match in re.finditer(regex_patterns["h2"], markdown_string):
90
+ markdown_string = markdown_string.replace(match[0], f"## {match[1]}")
91
+
92
+ return markdown_string
93
+
94
+ def get_toc_tree(
95
+ self,
96
+ markdown_string: str,
97
+ max_title_level_to_use: str = "h4",
98
+ header_style="setext",
99
+ **kwargs,
100
+ ) -> TocTree:
101
+ """Builds the table of content tree based on header
102
+
103
+ Args:
104
+ markdown_string (str): the string, as markdown
105
+ max_title_level_to_use (str, optional): the maximum title level to use. Headers with lower
106
+ level than this won't be considered as headers. Defaults to "h4".
107
+ header_style (str, optional): the type of header format. Defaults to "setext".
108
+
109
+ Returns:
110
+ TocTree: the table of content
111
+ """
112
+ MarkdownChunkNorris._check_string_argument_is_valid(
113
+ "header_style", header_style
114
+ )
115
+ MarkdownChunkNorris._check_string_argument_is_valid(
116
+ "max_title_level_to_use", max_title_level_to_use
117
+ )
118
+ if header_style == "setext":
119
+ markdown_string = self._convert_setext_to_atx(markdown_string)
120
+ title_types_to_consider = [
121
+ f"h{i}" for i in range(1, int(max_title_level_to_use[1]) + 1)
122
+ ]
123
+ regex_patterns = self._get_header_regex_patterns("atx")
124
+ lines = markdown_string.splitlines()
125
+ tree = {
126
+ "title": "",
127
+ "children": [],
128
+ "content": "",
129
+ "id": -1,
130
+ "line_index": -1,
131
+ "level": -1,
132
+ "parent": {},
133
+ }
134
+ current_level = -1
135
+ current_node = tree
136
+ id_cntr = 0
137
+ for line_idx, line in enumerate(lines):
138
+ for level, title_type in enumerate(title_types_to_consider):
139
+ match = re.match(regex_patterns[title_type], line)
140
+ if match:
141
+ title = match.group(1)
142
+ while level <= current_level:
143
+ current_node = current_node["parent"]
144
+ current_level = current_node["level"]
145
+ new_node = {
146
+ "title": title,
147
+ "children": [],
148
+ "content": "",
149
+ "id": id_cntr,
150
+ "line_index": line_idx,
151
+ "level": level,
152
+ "parent": current_node,
153
+ }
154
+ current_node["children"].append(new_node)
155
+ current_node = new_node
156
+ current_level = level
157
+ id_cntr += 1
158
+ break
159
+ if not match:
160
+ current_node["content"] += line + "\n"
161
+
162
+ MarkdownChunkNorris._cleanup_tree_texts(tree)
163
+
164
+ return tree
165
+
166
+ @staticmethod
167
+ def save_toc_tree(toc_tree: TocTree, output_path: str = "toc_tree.json"):
168
+ """Save the toc tree as json
169
+
170
+ Args:
171
+ toc_tree (TocTree): the toc tree to save
172
+ output_path (str, optional): the output path. Defaults to "toc_tree.json".
173
+ """
174
+ dumpable_dict = copy.deepcopy(toc_tree)
175
+ MarkdownChunkNorris._remove_circular_refs(dumpable_dict)
176
+ with open(output_path, "w", encoding="UTF-8") as f:
177
+ json.dump(dumpable_dict, f)
178
+
179
+ @staticmethod
180
+ def _remove_circular_refs(toc_tree_element: TocTree):
181
+ """Recursively removes the circular ref in dict
182
+ (used to save as json).
183
+
184
+ Args:
185
+ toc_tree_element (TocTree): the element of toc tree
186
+ """
187
+ if "parent" in toc_tree_element:
188
+ del toc_tree_element["parent"]
189
+ if "children" in toc_tree_element:
190
+ for child in toc_tree_element["children"]:
191
+ MarkdownChunkNorris._remove_circular_refs(child)
192
+
193
+ @staticmethod
194
+ def _cleanup_tree_texts(tree: TocTree):
195
+ """Cleans up the texts at each 'content' key of the toc tree
196
+ Uses recursion to go through each child
197
+ Args:
198
+ tree (TocTree): the toc tree
199
+ """
200
+ text = tree["content"]
201
+ text = normalize("NFKD", text)
202
+ # remove special characters
203
+ special_chars = ["**", r"\*"]
204
+ for char in special_chars:
205
+ text = text.replace(char, "")
206
+ # remove white spaces and newlines
207
+ text = re.sub(r"\n\s*\n", "\n", text)
208
+ text = text.rstrip().lstrip()
209
+ tree["content"] = text
210
+ if tree["children"]:
211
+ for child in tree["children"]:
212
+ tree = MarkdownChunkNorris._cleanup_tree_texts(child)
213
+
214
+ @staticmethod
215
+ def get_title_by_id(toc_tree: TocTree, id: int) -> TocTree:
216
+ """Gets a toc tree using its id
217
+
218
+ Args:
219
+ toc_tree (TocTree): the toc tree
220
+ id (int): the id of the title we are looking for. Defaults to int.
221
+
222
+ Returns:
223
+ TocTree: the element of title we were looking for
224
+ """
225
+ if toc_tree.get("id") == id:
226
+ return toc_tree
227
+ for child in toc_tree.get("children", []):
228
+ target = MarkdownChunkNorris.get_title_by_id(child, id)
229
+ if target:
230
+ return target
231
+
232
+ def get_chunks(self, toc_tree: TocTree, **kwargs) -> Chunks:
233
+ """Wrapper that build the chunk's texts, check
234
+ that they fit in size, replace links formatting.
235
+
236
+ Args:
237
+ toc_tree (TocTree): the toc tree of the document
238
+
239
+ Returns:
240
+ Chunks: the chunks text, formatted
241
+ """
242
+ chunks = MarkdownChunkNorris.get_chunks_texts(toc_tree, **kwargs)
243
+ chunks = [MarkdownChunkNorris.change_links_format(c, **kwargs) for c in chunks]
244
+ chunks = MarkdownChunkNorris.remove_small_chunks(chunks, **kwargs)
245
+ chunks = self.remove_small_chunks(chunks, **kwargs)
246
+
247
+ return chunks
248
+
249
+ @staticmethod
250
+ def get_chunks_texts(
251
+ toc_tree_element: TocTree, already_ok_chunks: Chunks | None = None, **kwargs
252
+ ) -> Chunks:
253
+ """Uses the toc tree to build the chunks. Uses recursion.
254
+ Method :
255
+ - build the chunk (= titles from sections above + section content + content of subsections)
256
+ - if the chunk is too big:
257
+ - save the section as title + content (if section has content)
258
+ - subdivide section recursively using subsections
259
+ - else save it as is
260
+
261
+ Args:
262
+ toc_tree_element (TocTree): _description_
263
+ already_ok_chunks (Chunks, optional): the chunks already built.
264
+ Used for recursion. Defaults to None.
265
+
266
+ Returns:
267
+ Chunks: list of chunk's texts
268
+ """
269
+ if not already_ok_chunks:
270
+ already_ok_chunks = []
271
+
272
+ current_chunk = MarkdownChunkNorris.build_chunk_text(toc_tree_element)
273
+
274
+ if MarkdownChunkNorris._chunk_is_too_big(current_chunk, **kwargs):
275
+ if toc_tree_element["content"]:
276
+ parents = MarkdownChunkNorris.get_parents_headers(toc_tree_element)
277
+ current_chunk = "\n".join(
278
+ parents + [toc_tree_element["title"], toc_tree_element["content"]]
279
+ )
280
+ already_ok_chunks.append(current_chunk)
281
+ for child in toc_tree_element["children"]:
282
+ already_ok_chunks = MarkdownChunkNorris.get_chunks_texts(
283
+ child, already_ok_chunks, **kwargs
284
+ )
285
+ else:
286
+ already_ok_chunks.append(current_chunk)
287
+
288
+ return already_ok_chunks
289
+
290
+ @staticmethod
291
+ def build_chunk_text(toc_tree_element: TocTree) -> Chunk:
292
+ """Builds a chunk by apposing the text of headers
293
+ and recursively getting the content of children
294
+
295
+ Args:
296
+ toc_tree_element (TocTree): the toc tree element
297
+
298
+ Returns:
299
+ str: the chunk content. parent's headers + content
300
+ """
301
+ parents = MarkdownChunkNorris.get_parents_headers(toc_tree_element)
302
+ content = MarkdownChunkNorris._build_chunk_content(toc_tree_element)
303
+
304
+ return "\n".join(parents) + "\n" + content
305
+
306
+ @staticmethod
307
+ def _build_chunk_content(toc_tree_element: TocTree) -> str:
308
+ """Builds a chunk content (i.e without headers above)
309
+ from a toc tree. It uses the toc tree's content, and recursively
310
+ adds the header and content of its children
311
+
312
+ Args:
313
+ toc_tree_element (TocTree): the toc tree (or element of toc tree)
314
+
315
+ Returns:
316
+ str: the text of the chunk (without the headers of parents)
317
+ """
318
+ text = toc_tree_element.get("title") + "\n" + toc_tree_element.get("content")
319
+ if toc_tree_element.get("children"):
320
+ for child in toc_tree_element.get("children"):
321
+ text += "\n" + MarkdownChunkNorris._build_chunk_content(child)
322
+
323
+ return text
324
+
325
+ @staticmethod
326
+ def get_parents_headers(toc_tree_element: TocTree) -> list[str]:
327
+ """Gets a list of the titles that are parent
328
+ of the provided toc tree element. The list
329
+ is ordered in descending order in terms of header level.
330
+
331
+ Args:
332
+ toc_tree_element (TocTree): the toc tree element
333
+
334
+ Returns:
335
+ list[str]: the list of header's text
336
+ """
337
+ headers = []
338
+ while toc_tree_element.get("parent"):
339
+ toc_tree_element = toc_tree_element.get("parent")
340
+ headers.append(toc_tree_element.get("title"))
341
+ # remove empty string header, such as root header
342
+ headers = [h for h in headers if h]
343
+
344
+ return list(reversed(headers))
345
+
346
+ @staticmethod
347
+ def _chunk_is_too_big(
348
+ chunk: str, max_chunk_word_length: int = 200, **kwargs
349
+ ) -> bool:
350
+ """Returns True if the chunk is bigger than the value
351
+ specified as max_chunk_length in terms of word count
352
+
353
+ Args:
354
+ chunk (str): a chunk of text
355
+ max_chunk_length (int): the max size of the chunk in words. Default to 200.
356
+
357
+ Returns:
358
+ bool: True if the chunk is too big, else false
359
+ """
360
+ return len(chunk.split()) > max_chunk_word_length
361
+
362
+ @staticmethod
363
+ def _header_has_children(toc_tree_element: TocTree) -> bool:
364
+ """Mostly used for code comprehension.
365
+ Returns True if the title has children.
366
+
367
+ Args:
368
+ toc_tree_element (TocTree): the header to check for children
369
+
370
+ Returns:
371
+ bool: True if the header has children
372
+ """
373
+ return bool(toc_tree_element.get("children"))
374
+
375
+ @staticmethod
376
+ def change_links_format(text, link_placement: str = "in_sentence", **kwargs) -> str:
377
+ """Removes the markdown format of the links in the text.
378
+ The links are treated as specified by 'link_position':
379
+ - remove : links are removed
380
+ - in_sentence : the link is placed in the sentence, between parenthesis
381
+ - end_of_chunk : all links are added at the end of the text
382
+ - leave_as_markdown: leave links as markdown format
383
+
384
+ Args:
385
+ text (str): the text to find the links in
386
+ link_placement (str, optional): How the links should be handled. Defaults to end_of_chunk.
387
+
388
+ Returns:
389
+ str: the formated text
390
+ """
391
+ MarkdownChunkNorris._check_string_argument_is_valid(
392
+ "link_placement", link_placement
393
+ )
394
+ if link_placement == "leave_as_markdown":
395
+ return text
396
+
397
+ patterns = {
398
+ "image_as_link": r"\[!\[(.*?)\]\(.*?\)\]\((.*?)\)",
399
+ "image": r"!\[(.*?)\]\((https?:[^\s'()]+).*?\)",
400
+ "link": r"\[(.+?)\]\((https?:.+?)\)",
401
+ }
402
+ for pattern in patterns.values():
403
+ matches = re.finditer(pattern, text)
404
+ replacements = [(m[0], m[1], m[2]) for m in matches]
405
+ if replacements is not None:
406
+ text = MarkdownChunkNorris._handle_link_replacements(
407
+ text, replacements, link_placement=link_placement
408
+ )
409
+
410
+ return text
411
+
412
+ @staticmethod
413
+ def _handle_link_replacements(
414
+ text: str,
415
+ replacements: list[tuple[str, str, str]],
416
+ link_placement: str = "in_sentence",
417
+ ) -> str:
418
+ """Handles the replacement of links in the text
419
+ according to specified format
420
+
421
+ Args:
422
+ text (str): the text to replace the links ind
423
+ replacements (list[tuple[str, str, str]]): the replacements. Is is a list of
424
+ [(entire regex match, link's text, link's url)]
425
+ link_placement (str, optional): _description_. Defaults to "in_sentence".
426
+
427
+ Returns:
428
+ str: the text with links modified
429
+ """
430
+ for i, m in enumerate(replacements):
431
+ match link_placement:
432
+ case "remove":
433
+ text = text.replace(m[0], m[1])
434
+ case "end_of_chunk":
435
+ if i == 0:
436
+ text += "\nPour plus d'informations:\n"
437
+ text = text.replace(m[0], m[1])
438
+ text += f"- {m[1]}: {m[2]}"
439
+ case "in_sentence":
440
+ text = text.replace(m[0], f"{m[1]} (lien : {m[2]})")
441
+
442
+ return text
443
+
444
+ @staticmethod
445
+ def remove_small_chunks(
446
+ chunks: Chunks, min_chunk_word_count: int = 15, **kwargs
447
+ ) -> Chunks:
448
+ """Removes chunks that have less words than the specified limit
449
+
450
+ Args:
451
+ chunks (Chunks): _description_
452
+ min_chunk_tokens (int, optional): the minimum size a chunk is allowed to be,
453
+ in words. Chunks with less words than this are dicarded. Defaults to 15.
454
+ Returns:
455
+ Chunks: the chunks with more words than the specified threshold
456
+ """
457
+ return [c for c in chunks if len(c.split()) >= min_chunk_word_count]
458
+
459
+ def split_big_chunks(
460
+ self,
461
+ chunks: Chunks,
462
+ max_chunk_tokens: int = 8191,
463
+ chunk_tokens_exceeded_handling: str = "raise_error",
464
+ **kwargs,
465
+ ) -> Chunks:
466
+ """Checks that the chunks do not exceed the token limit, considered as a hard limit
467
+ If chunk_tokens_exceeded_handling is:
468
+ - "raise_error" -> it will raise an error in case a chunk to big is found
469
+ for it to be investigated.
470
+ - "split" -> Chunks exceeding the max size will be split to fit max_chunk_tokens
471
+
472
+ Args:
473
+ chunks (Chunks): The chunks obtained from the get_chunks() method
474
+ max_chunk_tokens (int, optional): the maximum size a chunk is allowed to be,
475
+ in tokens. Defaults to 8191.
476
+ chunk_tokens_exceeded_handling (str, optional): whether or not error should be raised if a big
477
+ chunk is encountered, or split. Defaults to raise_error.
478
+ """
479
+ MarkdownChunkNorris._check_string_argument_is_valid(
480
+ "chunk_tokens_exceeded_handling", chunk_tokens_exceeded_handling
481
+ )
482
+ splitted_chunks = []
483
+ for chunk in chunks:
484
+ if self.get_token_count(chunk) < max_chunk_tokens:
485
+ splitted_chunks.append(chunk)
486
+ else:
487
+ match chunk_tokens_exceeded_handling:
488
+ case "raise_error":
489
+ raise ChunkSizeExceeded(
490
+ (
491
+ f"Found chunk bigger than the specified token limit {max_chunk_tokens}:",
492
+ "You can disable this error and allow dummy splitting of this chunk by passing 'raise_error=False'",
493
+ f"The chunk : {chunk}",
494
+ )
495
+ )
496
+ case "split":
497
+ splitted_chunk = self._dummy_split_big_chunk(
498
+ chunk, max_chunk_tokens
499
+ )
500
+ splitted_chunks.extend(splitted_chunk)
501
+
502
+ return splitted_chunks
503
+
504
+ def get_token_count(self, text: Chunk) -> int:
505
+ """Get the token count of a chunk
506
+
507
+ Args:
508
+ text (Chunk): the text to get the token count from
509
+
510
+ Returns:
511
+ int: the token count
512
+ """
513
+ return len(self.tokenizer.encode(text))
514
+
515
+ def _dummy_split_big_chunk(
516
+ self,
517
+ chunk: Chunk,
518
+ max_chunk_tokens: int = 8191,
519
+ ) -> Chunks:
520
+ """Splits the chunk so that the subchunks fit un max_chunk_size
521
+
522
+ Args:
523
+ chunk (Chunk): _description_
524
+ max_chunk_tokens (int, optional): maximum chunk size. Defaults to 8191.
525
+
526
+ Returns:
527
+ Chunks: the subdivided chunks
528
+ """
529
+ token_count = self.get_token_count(chunk)
530
+ if token_count < max_chunk_tokens:
531
+ return [chunk]
532
+
533
+ split_count = (token_count // max_chunk_tokens) + 1
534
+ split_token_size = token_count // split_count
535
+ tokenized_text = self.tokenizer.encode(chunk)
536
+ splitted_text = [
537
+ self.tokenizer.decode(
538
+ tokenized_text[i * split_token_size : (i + 1) * split_token_size]
539
+ )
540
+ for i in range(split_count)
541
+ ]
542
+
543
+ return splitted_text
@@ -0,0 +1,26 @@
1
+ from ..exceptions.exceptions import ChunkSizeExceeded as ChunkSizeExceeded
2
+ from ..types.types import Chunk as Chunk, Chunks as Chunks, TocTree as TocTree
3
+ from _typeshed import Incomplete
4
+
5
# Type stub for the markdown chunker (bodies intentionally elided).
class MarkdownChunkNorris:
    # Set in __init__ from tiktoken.get_encoding("cl100k_base").
    tokenizer: Incomplete
    def __init__(self) -> None: ...
    # Builds the toc tree from the markdown string, then the chunks from the tree.
    def __call__(self, md_text: str, **kwargs) -> Chunks: ...
    def get_toc_tree(self, markdown_string: str, max_title_level_to_use: str = 'h4', header_style: str = 'setext', **kwargs) -> TocTree: ...
    # Dumps a copy of the tree, without its "parent" references, as JSON.
    @staticmethod
    def save_toc_tree(toc_tree: TocTree, output_path: str = 'toc_tree.json') -> None: ...
    @staticmethod
    def get_title_by_id(toc_tree: TocTree, id: int) -> TocTree: ...
    def get_chunks(self, toc_tree: TocTree, **kwargs) -> Chunks: ...
    @staticmethod
    def get_chunks_texts(toc_tree_element: TocTree, already_ok_chunks: Chunks | None = None, **kwargs) -> Chunks: ...
    @staticmethod
    def build_chunk_text(toc_tree_element: TocTree) -> Chunk: ...
    @staticmethod
    def get_parents_headers(toc_tree_element: TocTree) -> list[str]: ...
    # link_placement is one of "remove", "end_of_chunk", "in_sentence", "leave_as_markdown".
    @staticmethod
    def change_links_format(text, link_placement: str = 'in_sentence', **kwargs) -> str: ...
    @staticmethod
    def remove_small_chunks(chunks: Chunks, min_chunk_word_count: int = 15, **kwargs) -> Chunks: ...
    # chunk_tokens_exceeded_handling is "raise_error" or "split".
    def split_big_chunks(self, chunks: Chunks, max_chunk_tokens: int = 8191, chunk_tokens_exceeded_handling: str = 'raise_error', **kwargs) -> Chunks: ...
    def get_token_count(self, text: Chunk) -> int: ...
@@ -0,0 +1 @@
1
+ from .wikit_chunknorris import WikitChunkNorris
@@ -0,0 +1 @@
1
+ from .wikit_chunknorris import WikitChunkNorris as WikitChunkNorris
@@ -0,0 +1,178 @@
1
+ import glob
2
+ import os
3
+ import json
4
+ from argparse import ArgumentParser
5
+ from ..chunkers.html_chunknorris import HTMLChunkNorris
6
+ from ..exceptions.exceptions import ChunkNorrisException
7
+ from ..types.types import Chunks
8
+
9
+
10
class WikitChunkNorris(HTMLChunkNorris):
    """HTML chunker working on files.

    JSON inputs are expected to hold their HTML text under
    ``["hasPart"][0]["text"]``; the chunks are written back under the
    ``"hasPart"`` key of a copy of the input JSON.
    """

    @staticmethod
    def read_file(filepath: str, return_full_content: bool = False) -> str | dict:
        """Reads a html or json file

        Args:
            filepath (str): path to a file. must end with .json or .html
            return_full_content (bool): Only applies to JSON files. Indicates whether or not
                the entire content of the file is returned or just the text content.

        Returns:
            str | dict: the raw text of a HTML file; for a JSON file, the value
                at ["hasPart"][0]["text"], or the whole parsed content when
                ``return_full_content`` is True

        Raises:
            ChunkNorrisException: if the file can't be opened or parsed, or
                has another extension
        """
        try:
            with open(filepath, "r", encoding="UTF-8") as f:
                if filepath.endswith(".json"):
                    content = json.load(f)
                    if not return_full_content:
                        content = content["hasPart"][0]["text"]
                elif filepath.endswith(".html"):
                    content = f.read()
                else:
                    raise ValueError("Can only open JSON or HTML files")
        except Exception as e:
            raise ChunkNorrisException(f"Can't open file '{filepath}': {e}") from e

        return content

    def chunk_file(self, filepath: str, output_filepath: str | None = None, **kwargs):
        """Chunks a json file and save the chunked file

        Missing output folders are created.

        Args:
            filepath (str): path to the file
            output_filepath (str, optional): path of the output file. Defaults
                to the same file name inside "<folder of filepath>-chunked".
            **kwargs: chunking options, forwarded to the chunker call
        """
        filepath = os.path.normpath(filepath)
        html_text = WikitChunkNorris.read_file(filepath)
        chunks = self(html_text, **kwargs)
        file_content = WikitChunkNorris.format_output(filepath, chunks)

        if not output_filepath:
            output_dir = os.path.dirname(filepath) + "-chunked"
            output_filepath = os.path.join(output_dir, os.path.basename(filepath))
        else:
            output_dir = os.path.dirname(output_filepath)
        # output_dir is "" for a bare file name: nothing to create then
        # (os.makedirs("") would raise).
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        with open(output_filepath, "w", encoding="UTF-8") as f:
            json.dump(file_content, f, ensure_ascii=False)

    def chunk_directory(self, input_dir: str, output_dir: str | None = None, **kwargs) -> None:
        """Chunks the json files of entire directory, recursively

        The sub-folder layout of ``input_dir`` is reproduced in ``output_dir``.

        Args:
            input_dir (str): the path to directory
            output_dir (str, optional): the directory where chunks will be saved.
                Defaults to "<input_dir>-chunked".
            **kwargs: chunking options, forwarded to chunk_file()
        """
        input_dir = os.path.normpath(input_dir)
        if not output_dir:
            output_dir = f"{input_dir}-chunked"

        filepaths = [
            json_path
            for dirpath, _, _ in os.walk(input_dir)
            # escape the folder so "[", "*" or "?" in its name are literal
            for json_path in glob.glob(os.path.join(glob.escape(dirpath), "*.json"))
        ]
        for fp in filepaths:
            # Re-root the path relative to input_dir. A str.replace would
            # also rewrite any later occurrence of input_dir's text.
            relative_path = os.path.relpath(fp, input_dir)
            self.chunk_file(fp, os.path.join(output_dir, relative_path), **kwargs)

    @staticmethod
    def format_output(filepath: str, chunks: Chunks) -> dict:
        """Formats the chunks according to the input json file
        i.e places the chunks inside the key [hasPart]

        Args:
            filepath (str): path toward the json file
            chunks (Chunks): the chunks

        Returns:
            dict: the content of the input file, with "hasPart" replaced by
                one {"@type", "text", "position"} entry per chunk
        """
        file_content = WikitChunkNorris.read_file(filepath, return_full_content=True)
        file_content["hasPart"] = [
            {
                "@type": "DocumentChunk",
                "text": c,
                "position": i,
            }
            for i, c in enumerate(chunks)
        ]

        return file_content
106
+
107
+
108
def parse_arguments():
    """Parse the given command-line arguments.

    Returns:
        argparse.Namespace: the parsed options. Every attribute name matches
            a keyword argument name passed on to the chunker.
    """

    parser = ArgumentParser(
        description="Chunking a folder of json files containing a HTML test under the key ['hasPart'][0]['text']"
    )

    parser.add_argument(
        "--input_dir",
        type=str,
        required=True,
        help="Path to folder containing the files to chunk",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default=None,
        help="Path to output folder where the chunked files will be stored",
    )
    parser.add_argument(
        "--max_title_level_to_use",
        type=str,
        choices=["h1", "h2", "h3", "h4", "h5"],
        default="h4",
        help="The maximum level of titles to use for chunking",
    )
    parser.add_argument(
        "--max_chunk_tokens",
        type=int,
        default=8191,
        help="Hard limit of the token size of a chunk. Chunks bigger than that will be handled according to chunk_tokens_exceeded_handling",
    )
    parser.add_argument(
        "--chunk_tokens_exceeded_handling",
        type=str,
        choices=["split", "raise_error"],
        default="raise_error",
        help="Whether a big chunk with not headers should be split or error should be raised",
    )
    parser.add_argument(
        "--link_placement",
        type=str,
        # Must stay in sync with the values the chunker accepts: it has no
        # "end_of_sentence" mode (that value was rejected at chunking time)
        # but it does support "leave_as_markdown".
        choices=["remove", "end_of_chunk", "in_sentence", "leave_as_markdown"],
        default="end_of_chunk",
        help="Where the links are placed in the chunks",
    )
    parser.add_argument(
        "--max_chunk_word_count",
        type=int,
        default=250,
        help="Soft limit of chunk size. Chunks bigger than this limit will be subdivided with lower level headers.",
    )
    parser.add_argument(
        "--min_chunk_word_count",
        type=int,
        default=15,
        help="Minium amount a word a chunk must have. If lower, the chunk is discarded.",
    )

    return parser.parse_args()
168
+
169
+
170
def main():
    """Command-line entry point.

    Parses the command-line options, then chunks the input directory with
    every parsed option passed as a keyword argument.
    """
    cli_options = vars(parse_arguments())

    chunker = WikitChunkNorris()
    chunker.chunk_directory(**cli_options)
175
+
176
+
177
+ if __name__ == "__main__":
178
+ main()
@@ -0,0 +1,14 @@
1
+ from ..chunkers.html_chunknorris import HTMLChunkNorris as HTMLChunkNorris
2
+ from ..exceptions.exceptions import ChunkNorrisException as ChunkNorrisException
3
+ from ..types.types import Chunks as Chunks
4
+
5
# Type stub for the file-based chunker and its command-line helpers.
class WikitChunkNorris(HTMLChunkNorris):
    # Reads a .json or .html file; other extensions are rejected.
    @staticmethod
    def read_file(filepath: str, return_full_content: bool = False) -> str: ...
    def chunk_file(self, filepath: str, output_filepath: str | None = None, **kwargs): ...
    def chunk_directory(self, input_dir: str, output_dir: str | None = None, **kwargs) -> None: ...
    # Places the chunks under the "hasPart" key of the input file's content.
    @staticmethod
    def format_output(filepath: str, chunks: Chunks) -> dict: ...

def parse_arguments(): ...
def main() -> None: ...
@@ -0,0 +1 @@
1
+ from .exceptions import *
@@ -0,0 +1 @@
1
+ from .exceptions import *
@@ -0,0 +1,8 @@
1
class ChunkNorrisException(Exception):
    """Base class of the errors raised by the package.

    Args:
        message: description of the error, also available as ``message``
    """

    def __init__(self, message):
        # Initialise the base class explicitly instead of relying on the
        # arguments captured at object creation.
        super().__init__(message)
        self.message = message


class ChunkSizeExceeded(ChunkNorrisException):
    """Raised when a chunk is bigger than the allowed size.

    Args:
        message: description of the error, also available as ``message``
    """

    def __init__(self, message):
        super().__init__(message)
@@ -0,0 +1,5 @@
1
# Type stubs for the package's exceptions; both take a single message.
class ChunkNorrisException(Exception):
    def __init__(self, message) -> None: ...

class ChunkSizeExceeded(Exception):
    def __init__(self, message) -> None: ...
@@ -0,0 +1,12 @@
1
# Text of a single chunk.
Chunk = str

# List of chunk texts.
Chunks = list[Chunk]

# Annotation-only description of a table-of-content node.
# NOTE(review): the nodes built by MarkdownChunkNorris.get_toc_tree are plain
# dicts whose keys are "title", "children", "content", "id", "line_index",
# "level" and "parent" - i.e. "title"/"parent" rather than the "text"/"parents"
# declared here. Confirm which naming is intended before relying on this class.
class TocTree():
    id: int
    text: str
    level: int
    line_index: int
    content: str
    parents: dict
    children: list[dict]
@@ -0,0 +1,11 @@
1
# Type stub: a chunk is a string, chunks are a list of strings.
Chunk = str
Chunks = list[Chunk]

# Annotation-only description of a table-of-content node.
class TocTree:
    id: int
    text: str
    level: int
    line_index: int
    content: str
    parents: dict
    children: list[dict]
File without changes
File without changes
@@ -0,0 +1,64 @@
1
+ import re
2
+
3
# Regex fragments used by split_into_sentences(). They are raw strings so
# that "\s" is a regex class and not an invalid string escape sequence.
alphabets = r"([A-Za-z])"
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = r"[.](com|net|org|io|gov|edu|me)"
digits = r"([0-9])"
multiple_dots = r"\.{2,}"


def split_into_sentences(text: str) -> list[str]:
    """
    Split the text into sentences.

    Periods that do not end a sentence (titles, websites, decimals,
    initials, acronyms, company suffixes) are first protected with the
    "<prd>" marker; the remaining ".", "?" and "!" are then followed by a
    "<stop>" marker on which the text is split. Newlines are treated as
    spaces.

    If the text contains substrings "<prd>" or "<stop>", they would lead
    to incorrect splitting because they are used as markers for splitting.

    :param text: text to be split into sentences
    :type text: str

    :return: list of sentences, stripped of surrounding whitespace
    :rtype: list[str]
    """
    text = " " + text + " "
    text = text.replace("\n", " ")
    # Protect the periods that are not sentence boundaries.
    text = re.sub(prefixes, r"\1<prd>", text)
    text = re.sub(websites, r"<prd>\1", text)
    text = re.sub(digits + "[.]" + digits, r"\1<prd>\2", text)
    # An ellipsis keeps all its dots and ends the sentence.
    text = re.sub(
        multiple_dots, lambda match: "<prd>" * len(match.group(0)) + "<stop>", text
    )
    if "Ph.D" in text:
        text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + alphabets + "[.] ", r" \1<prd> ", text)
    text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
    text = re.sub(
        alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]",
        r"\1<prd>\2<prd>\3<prd>",
        text,
    )
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, r" \1<stop> \2", text)
    text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", r" \1<prd>", text)
    # Move the punctuation after a closing quote so the quote stays with
    # its sentence.
    if "”" in text:
        text = text.replace(".”", "”.")
    if '"' in text:
        text = text.replace('."', '".')
    if "!" in text:
        text = text.replace('!"', '"!')
    if "?" in text:
        text = text.replace('?"', '"?')
    # Every remaining end mark is a sentence boundary.
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")
    sentences = [s.strip() for s in text.split("<stop>")]
    # The padding added at the start leaves an empty last item.
    if sentences and not sentences[-1]:
        sentences = sentences[:-1]

    return sentences
@@ -0,0 +1,10 @@
1
# Type stub: regex fragments (plain strings) used by split_into_sentences.
alphabets: str
prefixes: str
suffixes: str
starters: str
acronyms: str
websites: str
digits: str
multiple_dots: str

# Splits a text into a list of stripped sentences.
def split_into_sentences(text: str) -> list[str]: ...
@@ -0,0 +1,15 @@
1
+ ChunkNorris - A package for reliable chunking of documents
2
+ Copyright (C) 2024 Wikit.ai
3
+
4
+ This program is free software: you can redistribute it and/or modify
5
+ it under the terms of the GNU Affero General Public License as
6
+ published by the Free Software Foundation, either version 3 of the
7
+ License, or (at your option) any later version.
8
+
9
+ This program is distributed in the hope that it will be useful,
10
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
11
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
+ GNU Affero General Public License for more details.
13
+
14
+ You should have received a copy of the GNU Affero General Public License
15
+ along with this program. If not, see <https://www.gnu.org/licenses/>
@@ -0,0 +1,142 @@
1
+ Metadata-Version: 2.1
2
+ Name: chunknorris
3
+ Version: 0.0.1
4
+ Summary: A package for chunking documents from various formats
5
+ Author-email: Wikit <dev@wikit.ai>
6
+ Project-URL: Homepage, https://gitlab.com/wikit/research-and-development/chunk-norris
7
+ Project-URL: Issues, https://gitlab.com/wikit/research-and-development/chunk-norris/-/issues
8
+ Keywords: chunk,document,split,html,markdown,pdf,header
9
+ Classifier: Natural Language :: English
10
+ Classifier: Programming Language :: Python :: 3.10
11
+ Classifier: Framework :: Pytest
12
+ Classifier: License :: OSI Approved :: GNU Affero General Public License v3
13
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
14
+ Classifier: Topic :: Text Processing :: Markup :: HTML
15
+ Classifier: Operating System :: OS Independent
16
+ Requires-Python: >=3.10
17
+ Description-Content-Type: text/markdown
18
+ License-File: LICENCE
19
+ Requires-Dist: markdownify >=0.11.6
20
+ Requires-Dist: tiktoken >=0.5.2
21
+ Requires-Dist: PyMuPDF >=1.23.16
22
+
23
+ # Chunk Norris
24
+
25
+ ## Goal
26
+
27
+ This project aims at improving the method of chunking documents from various sources (HTML, PDFs, ...)
28
+ An optimized chunking method might lead to smaller chunks, meaning :
29
+ - **Better relevancy of chunks** (and thus easier identification of useful chunks through embedding cosine similarity)
30
+ - **Fewer errors** because of chunks exceeding the API limit in terms of number of tokens
31
+ - **Fewer hallucinations** of generation models because of superfluous information in the prompt
32
+ - **Reduced cost** as the prompt would have reduced size
33
+
34
+ ## Installation
35
+
36
+ Using PyPI, just run the following command :
37
+ ```pip install chunknorris```
38
+
39
+ ## Chunkers
40
+
41
+ The package features multiple ***chunkers*** that can be used independently depending on the type of document needed.
42
+
43
+ All chunkers follow a similar logic :
44
+ - Extract table of contents (= headers)
45
+ - Build chunks using the text content of a part, and put the titles of the parts it belongs to on top
46
+
47
+ ![](images/chunk_method.png)
48
+
49
+ ### MarkdownChunkNorris
50
+
51
+ This chunker is meant to be used **on markdown-formatted text**.
52
+
53
+ Note: When calling the chunker, **you need to specify the header style** of your markdown text ([ATX or Setext](https://golem.ph.utexas.edu/~distler/maruku/markdown_syntax.html#header)). By default it will consider "Setext" heading style.
54
+
55
+ #### Usage
56
+
57
+ ```py
58
+ from chunknorris.chunkers import MarkdownChunkNorris
59
+
60
+ text = """
61
+ # This is a header
62
+ This is a text
63
+ ## This is another header
64
+ And another text
65
+ ## With this final header
66
+ And this last text
67
+ """
68
+ chunker = MarkdownChunkNorris()
69
+ header_style = "atx" # or "setext" depending on headers in your text
70
+ chunks = chunker(text, header_style=header_style)
71
+ ```
72
+
73
+ ### HTMLChunkNorris
74
+
75
+ This chunker is meant to be used **on html-formatted text**. Behind the scenes, it uses markdownify to transform the text to markdown with "setext"-style headers and uses MarkdownChunkNorris to process it.
76
+
77
+ #### Usage
78
+
79
+ ```py
80
+ from chunknorris.chunkers import HTMLChunkNorris
81
+
82
+ text = """
83
+ <h1>This is 1st level heading</h1>
84
+ <p>This is a test paragraph.</p>
85
+ <h2>This is 2nd level heading</h2>
86
+ <p>This is a test paragraph.</p>
87
+ <h2>This is another level heading</h2>
88
+ <p>This is another test paragraph.</p>
89
+ """
90
+ hcn = HTMLChunkNorris()
91
+ chunks = hcn(text)
92
+ ```
93
+
94
+ ### Advanced usage of chunkers
95
+
96
+ Additionally, the chunkers accept a number of arguments that modify their behavior:
97
+
98
+ ```py
99
+ from chunknorris.chunkers import MarkdownChunkNorris
100
+
101
+ mystring = "# header\nThis is a markdown string"
102
+
103
+ chunker = MarkdownChunkNorris() # or any other chunker
104
+ chunks = chunker(
105
+ mystring,
106
+ max_title_level_to_use="h3",
107
+ max_chunk_word_length=200,
108
+ link_placement="in_sentence",
109
+ max_chunk_tokens=8191,
110
+ chunk_tokens_exceeded_handling="split",
111
+ min_chunk_wordcount=15,
112
+ )
113
+ ```
114
+
115
+ ***max_title_level_to_use***
116
+ (str): The maximum (inclusive) level of headers to take into account for chunking. For example, if "h3" is set, then "h4" and "h5" titles won't be used. Must be a string of type "hx" with x being the title level. Defaults to "h4".
117
+
118
+ ***max_chunk_word_length***
119
+ (int): The maximum size (soft limit, in words) a chunk can be. Chunks bigger than this size will be chunked using lower level headers, until no lower level headers are available. Defaults to 200.
120
+
121
+ ***link_placement***
122
+ (str): How the links should be handled. Defaults to in_sentence.
123
+ Options :
124
+ - "remove" : text is kept but links are removed
125
+ - "end_of_chunk" : adds a paragraph at the end of the chunk containing all the links
126
+ - "in_sentence" : the link is added in parentheses inside the sentence
127
+
128
+ ***max_chunk_tokens***
129
+ (int): The hard maximum number of tokens a chunk can contain. Chunks bigger than this limit will be handled according to chunk_tokens_exceeded_handling. Defaults to 8191.
130
+
131
+ ***chunk_tokens_exceeded_handling***
132
+ (str): How chunks bigger than the limit specified by max_chunk_tokens should be handled. Defaults to "raise_error".
133
+ Options:
134
+ - "raise_error": raises an error, indicating that the chunk could not be split according to headers
135
+ - "split": split the chunks arbitrarily so that each chunk has a size lower than max_chunk_tokens
136
+
137
+ ***min_chunk_wordcount***
138
+ (int): Minimum number of words to consider keeping the chunks. Chunks with fewer words will be discarded. Defaults to 15.
139
+
140
+ ### PDFChunkNorris
141
+
142
+ #TODO:
@@ -0,0 +1,27 @@
1
+ chunknorris/__init__.py,sha256=GMddZWF-AhcCso9CLeuZzDYLH1wF_mWB_Z1UALPS6i4,102
2
+ chunknorris/__init__.pyi,sha256=GMddZWF-AhcCso9CLeuZzDYLH1wF_mWB_Z1UALPS6i4,102
3
+ chunknorris/chunkers/__init__.py,sha256=pm6GiSXWpn09xJP1xlKbnU7bJglY4l_3g-CBfr0adVE,100
4
+ chunknorris/chunkers/__init__.pyi,sha256=Pn5uEk6TwEE_dMb1yzwPZeIcbwMB8zRPVdhKO8ntKg0,142
5
+ chunknorris/chunkers/html_chunknorris.py,sha256=X9LYqd2maej_7bFUa0pY_CAGWdanM_7_tUY6yoyRUR0,663
6
+ chunknorris/chunkers/html_chunknorris.pyi,sha256=M2xQe5xvoysJCHB1HtGpQ7jv4sj-JFB7vpOJegw0LI4,250
7
+ chunknorris/chunkers/markdown_chunknorris.py,sha256=ZdB1NpZASTDiEMfv3CBR8FRvrGl8Rd6r-GZcmBqCRBk,20268
8
+ chunknorris/chunkers/markdown_chunknorris.pyi,sha256=kk8I3La__qcvCyRVtwxD-5NuxFn721LnZRZhUVDx2WM,1486
9
+ chunknorris/custom_chunkers/__init__.py,sha256=qMlMdmT4KHJnyRr42ZWT0g-RnhDdwDCENdJ9BztLF00,48
10
+ chunknorris/custom_chunkers/__init__.pyi,sha256=sley_LOReM3qVfDtSqKa_13soCe32eDBHGZUgAoiASI,68
11
+ chunknorris/custom_chunkers/wikit_chunknorris.py,sha256=IA-7Xb0ogXmxgqp8ITJM-P7n4clvVaSRAV_DRtT92nE,5887
12
+ chunknorris/custom_chunkers/wikit_chunknorris.pyi,sha256=YlZTYx8iAVqNHbjGw8kbSQZ_kw550qPzYjUNNTuPdok,652
13
+ chunknorris/exceptions/__init__.py,sha256=nWnJwEphtAjFsYg_dIYrrbWdkIB0G2Nu89d7d6DJGKc,25
14
+ chunknorris/exceptions/__init__.pyi,sha256=y6AJu3xWHud92ZK_pfU3WzDj8gLIYvXfFNJ-phZmjJo,26
15
+ chunknorris/exceptions/exceptions.py,sha256=OUj1qxRcQU0RdYTJMs3L2abOarD16nhjkAPTMRhr9V0,169
16
+ chunknorris/exceptions/exceptions.pyi,sha256=HZeW31MY2y1BijHfSlHElqpTkn1jxawO6LvdMa1Ft-s,166
17
+ chunknorris/types/types.py,sha256=ii8PICCROmuRSZSH2lM_iAr4PYBMBEI5ePm1S9eG8Cw,173
18
+ chunknorris/types/types.pyi,sha256=iW3elE5TLioEZLL6HUSPP9LH8izJCWE0JY-73eqAe1I,170
19
+ chunknorris/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
+ chunknorris/utils/__init__.pyi,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
+ chunknorris/utils/utils.py,sha256=nA-qCm22dAYRtynCJwWa7S9AK8SCYoDzG6Xh296uX1g,2288
22
+ chunknorris/utils/utils.pyi,sha256=ApwdRg48ZViJhdr-g91E6F76Qdj5EBfqVVKHQ6qtSJI,171
23
+ chunknorris-0.0.1.dist-info/LICENCE,sha256=Z4Dj4xyHOkzx-Ggiig2yGWeOy-gnR7OF_hkJ6kCcCNw,720
24
+ chunknorris-0.0.1.dist-info/METADATA,sha256=hGylMSJp_O-uaIRAWXGybjRILsPxdeGF14a59eMpEZk,5200
25
+ chunknorris-0.0.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
26
+ chunknorris-0.0.1.dist-info/top_level.txt,sha256=zYBuOKW7poeXPcU-OLQF5PBbdbrw2aZbWgtPuge0x7Y,12
27
+ chunknorris-0.0.1.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: bdist_wheel (0.43.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ chunknorris