chunknorris 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chunknorris-0.0.1/LICENCE +15 -0
- chunknorris-0.0.1/PKG-INFO +142 -0
- chunknorris-0.0.1/README.md +120 -0
- chunknorris-0.0.1/chunknorris.egg-info/PKG-INFO +142 -0
- chunknorris-0.0.1/chunknorris.egg-info/SOURCES.txt +30 -0
- chunknorris-0.0.1/chunknorris.egg-info/dependency_links.txt +1 -0
- chunknorris-0.0.1/chunknorris.egg-info/requires.txt +3 -0
- chunknorris-0.0.1/chunknorris.egg-info/top_level.txt +1 -0
- chunknorris-0.0.1/pyproject.toml +38 -0
- chunknorris-0.0.1/setup.cfg +4 -0
- chunknorris-0.0.1/src/chunknorris/__init__.py +4 -0
- chunknorris-0.0.1/src/chunknorris/__init__.pyi +4 -0
- chunknorris-0.0.1/src/chunknorris/chunkers/__init__.py +2 -0
- chunknorris-0.0.1/src/chunknorris/chunkers/__init__.pyi +2 -0
- chunknorris-0.0.1/src/chunknorris/chunkers/html_chunknorris.py +25 -0
- chunknorris-0.0.1/src/chunknorris/chunkers/html_chunknorris.pyi +6 -0
- chunknorris-0.0.1/src/chunknorris/chunkers/markdown_chunknorris.py +543 -0
- chunknorris-0.0.1/src/chunknorris/chunkers/markdown_chunknorris.pyi +26 -0
- chunknorris-0.0.1/src/chunknorris/custom_chunkers/__init__.py +1 -0
- chunknorris-0.0.1/src/chunknorris/custom_chunkers/__init__.pyi +1 -0
- chunknorris-0.0.1/src/chunknorris/custom_chunkers/wikit_chunknorris.py +178 -0
- chunknorris-0.0.1/src/chunknorris/custom_chunkers/wikit_chunknorris.pyi +14 -0
- chunknorris-0.0.1/src/chunknorris/exceptions/__init__.py +1 -0
- chunknorris-0.0.1/src/chunknorris/exceptions/__init__.pyi +1 -0
- chunknorris-0.0.1/src/chunknorris/exceptions/exceptions.py +8 -0
- chunknorris-0.0.1/src/chunknorris/exceptions/exceptions.pyi +5 -0
- chunknorris-0.0.1/src/chunknorris/types/types.py +12 -0
- chunknorris-0.0.1/src/chunknorris/types/types.pyi +11 -0
- chunknorris-0.0.1/src/chunknorris/utils/__init__.py +0 -0
- chunknorris-0.0.1/src/chunknorris/utils/__init__.pyi +0 -0
- chunknorris-0.0.1/src/chunknorris/utils/utils.py +64 -0
- chunknorris-0.0.1/src/chunknorris/utils/utils.pyi +10 -0
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
ChunkNorris - A package for reliable chunking of documents
|
|
2
|
+
Copyright (C) 2024 Wikit.ai
|
|
3
|
+
|
|
4
|
+
This program is free software: you can redistribute it and/or modify
|
|
5
|
+
it under the terms of the GNU Affero General Public License as
|
|
6
|
+
published by the Free Software Foundation, either version 3 of the
|
|
7
|
+
License, or (at your option) any later version.
|
|
8
|
+
|
|
9
|
+
This program is distributed in the hope that it will be useful,
|
|
10
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
11
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
12
|
+
GNU Affero General Public License for more details.
|
|
13
|
+
|
|
14
|
+
You should have received a copy of the GNU Affero General Public License
|
|
15
|
+
along with this program. If not, see <https://www.gnu.org/licenses/>
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: chunknorris
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: A package for chunking documents from various formats
|
|
5
|
+
Author-email: Wikit <dev@wikit.ai>
|
|
6
|
+
Project-URL: Homepage, https://gitlab.com/wikit/research-and-development/chunk-norris
|
|
7
|
+
Project-URL: Issues, https://gitlab.com/wikit/research-and-development/chunk-norris/-/issues
|
|
8
|
+
Keywords: chunk,document,split,html,markdown,pdf,header
|
|
9
|
+
Classifier: Natural Language :: English
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
11
|
+
Classifier: Framework :: Pytest
|
|
12
|
+
Classifier: License :: OSI Approved :: GNU Affero General Public License v3
|
|
13
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
14
|
+
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Requires-Python: >=3.10
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
License-File: LICENCE
|
|
19
|
+
Requires-Dist: markdownify>=0.11.6
|
|
20
|
+
Requires-Dist: tiktoken>=0.5.2
|
|
21
|
+
Requires-Dist: PyMuPDF>=1.23.16
|
|
22
|
+
|
|
23
|
+
# Chunk Norris
|
|
24
|
+
|
|
25
|
+
## Goal
|
|
26
|
+
|
|
27
|
+
This project aims at improving the method of chunking documents from various sources (HTML, PDFs, ...)
|
|
28
|
+
An optimized chunking method might lead to smaller chunks, meaning :
|
|
29
|
+
- **Better relevancy of chunks** (and thus easier identification of useful chunks through embedding cosine similarity)
|
|
30
|
+
- **Fewer errors** because of chunks exceeding the API limit in terms of number of tokens
|
|
31
|
+
- **Fewer hallucinations** of generation models because of superfluous information in the prompt
|
|
32
|
+
- **Reduced cost** as the prompt would have reduced size
|
|
33
|
+
|
|
34
|
+
## Installation
|
|
35
|
+
|
|
36
|
+
Using PyPI, just run the following command:
|
|
37
|
+
```pip install chunknorris```
|
|
38
|
+
|
|
39
|
+
## Chunkers
|
|
40
|
+
|
|
41
|
+
The package features multiple ***chunkers*** that can be used independently depending on the type of document needed.
|
|
42
|
+
|
|
43
|
+
All chunkers follow a similar logic :
|
|
44
|
+
- Extract table of contents (= headers)
|
|
45
|
+
- Build chunks using the text content of a part, and put the titles of the parts it belongs to on top
|
|
46
|
+
|
|
47
|
+

|
|
48
|
+
|
|
49
|
+
### MarkdownChunkNorris
|
|
50
|
+
|
|
51
|
+
This chunker is meant to be used **on markdown-formatted text**.
|
|
52
|
+
|
|
53
|
+
Note: When calling the chunker, **you need to specify the header style** of your markdown text ([ATX or Setext](https://golem.ph.utexas.edu/~distler/maruku/markdown_syntax.html#header)). By default it will consider "Setext" heading style.
|
|
54
|
+
|
|
55
|
+
#### Usage
|
|
56
|
+
|
|
57
|
+
```py
|
|
58
|
+
from chunkers import MarkdownChunkNorris
|
|
59
|
+
|
|
60
|
+
text = """
|
|
61
|
+
# This is a header
|
|
62
|
+
This is a text
|
|
63
|
+
## This is another header
|
|
64
|
+
And another text
|
|
65
|
+
## With this final header
|
|
66
|
+
And this last text
|
|
67
|
+
"""
|
|
68
|
+
chunker = MarkdownChunkNorris()
|
|
69
|
+
header_style = "atx" # or "setext" depending on headers in your text
|
|
70
|
+
chunks = chunker(text, header_style=header_style)
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
### HTMLChunkNorris
|
|
74
|
+
|
|
75
|
+
This chunker is meant to be used **on html-formatted text**. Behind the scenes, it uses markdownify to transform the text to markdown with "setext"-style headers and uses MarkdownChunkNorris to process it.
|
|
76
|
+
|
|
77
|
+
#### Usage
|
|
78
|
+
|
|
79
|
+
```py
|
|
80
|
+
from chunkers import HTMLChunkNorris
|
|
81
|
+
|
|
82
|
+
text = """
|
|
83
|
+
<h1>This is 1st level heading</h1>
|
|
84
|
+
<p>This is a test paragraph.</p>
|
|
85
|
+
<h2>This is 2nd level heading</h2>
|
|
86
|
+
<p>This is a test paragraph.</p>
|
|
87
|
+
<h2>This is another level heading</h2>
|
|
88
|
+
<p>This is another test paragraph.</p>
|
|
89
|
+
"""
|
|
90
|
+
hcn = HTMLChunkNorris()
|
|
91
|
+
chunks = hcn(text)
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
### Advanced usage of chunkers
|
|
95
|
+
|
|
96
|
+
Additionally, the chunkers accept a number of arguments that modify their behavior:
|
|
97
|
+
|
|
98
|
+
```py
|
|
99
|
+
from chunkers import MarkdownChunkNorris
|
|
100
|
+
|
|
101
|
+
mystring = "# header\nThis is a markdown string"
|
|
102
|
+
|
|
103
|
+
chunker = MarkdownChunkNorris() # or any other chunker
|
|
104
|
+
chunks = chunker(
|
|
105
|
+
mystring,
|
|
106
|
+
max_title_level_to_use="h3",
|
|
107
|
+
max_chunk_word_length=200,
|
|
108
|
+
link_placement="in_sentence",
|
|
109
|
+
max_chunk_tokens=8191,
|
|
110
|
+
chunk_tokens_exceeded_handling="split",
|
|
111
|
+
min_chunk_wordcount=15,
|
|
112
|
+
)
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
***max_title_level_to_use***
|
|
116
|
+
(str): The maximum (included) level of headers to take into account for chunking. For example, if "h3" is set, then "h4" and "h5" titles won't be used. Must be a string of type "hx" with x being the title level. Defaults to "h4".
|
|
117
|
+
|
|
118
|
+
***max_chunk_word_length***
|
|
119
|
+
(int): The maximum size (soft limit, in words) a chunk can be. Chunks bigger than this size will be chunked using lower level headers, until no lower level headers are available. Defaults to 200.
|
|
120
|
+
|
|
121
|
+
***link_placement***
|
|
122
|
+
(str): How the links should be handled. Defaults to in_sentence.
|
|
123
|
+
Options :
|
|
124
|
+
- "remove" : text is kept but links are removed
|
|
125
|
+
- "end_of_chunk" : adds a paragraph at the end of the chunk containing all the links
|
|
126
|
+
- "in_sentence" : the link is added in parentheses inside the sentence
|
|
127
|
+
|
|
128
|
+
***max_chunk_tokens***
|
|
129
|
+
(int): The hard maximum number of tokens a chunk can contain. Chunks exceeding this limit will be handled according to chunk_tokens_exceeded_handling. Defaults to 8191.
|
|
130
|
+
|
|
131
|
+
***chunk_tokens_exceeded_handling***
|
|
132
|
+
(str): How chunks bigger than the limit specified by max_chunk_tokens should be handled. Defaults to "raise_error".
|
|
133
|
+
Options:
|
|
134
|
+
- "raise_error": raises an error, indicating the chunk could not be split according to headers
|
|
135
|
+
- "split": split the chunks arbitrarily so that each chunk has a size lower than max_chunk_tokens
|
|
136
|
+
|
|
137
|
+
***min_chunk_wordcount***
|
|
138
|
+
(int): Minimum number of words to consider keeping the chunks. Chunks with less words will be discarded. Defaults to 15.
|
|
139
|
+
|
|
140
|
+
### PDFChunkNorris
|
|
141
|
+
|
|
142
|
+
#TODO:
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
# Chunk Norris
|
|
2
|
+
|
|
3
|
+
## Goal
|
|
4
|
+
|
|
5
|
+
This project aims at improving the method of chunking documents from various sources (HTML, PDFs, ...)
|
|
6
|
+
An optimized chunking method might lead to smaller chunks, meaning :
|
|
7
|
+
- **Better relevancy of chunks** (and thus easier identification of useful chunks through embedding cosine similarity)
|
|
8
|
+
- **Fewer errors** because of chunks exceeding the API limit in terms of number of tokens
|
|
9
|
+
- **Fewer hallucinations** of generation models because of superfluous information in the prompt
|
|
10
|
+
- **Reduced cost** as the prompt would have reduced size
|
|
11
|
+
|
|
12
|
+
## Installation
|
|
13
|
+
|
|
14
|
+
Using PyPI, just run the following command:
|
|
15
|
+
```pip install chunknorris```
|
|
16
|
+
|
|
17
|
+
## Chunkers
|
|
18
|
+
|
|
19
|
+
The package features multiple ***chunkers*** that can be used independently depending on the type of document needed.
|
|
20
|
+
|
|
21
|
+
All chunkers follow a similar logic :
|
|
22
|
+
- Extract table of contents (= headers)
|
|
23
|
+
- Build chunks using the text content of a part, and put the titles of the parts it belongs to on top
|
|
24
|
+
|
|
25
|
+

|
|
26
|
+
|
|
27
|
+
### MarkdownChunkNorris
|
|
28
|
+
|
|
29
|
+
This chunker is meant to be used **on markdown-formatted text**.
|
|
30
|
+
|
|
31
|
+
Note: When calling the chunker, **you need to specify the header style** of your markdown text ([ATX or Setext](https://golem.ph.utexas.edu/~distler/maruku/markdown_syntax.html#header)). By default it will consider "Setext" heading style.
|
|
32
|
+
|
|
33
|
+
#### Usage
|
|
34
|
+
|
|
35
|
+
```py
|
|
36
|
+
from chunkers import MarkdownChunkNorris
|
|
37
|
+
|
|
38
|
+
text = """
|
|
39
|
+
# This is a header
|
|
40
|
+
This is a text
|
|
41
|
+
## This is another header
|
|
42
|
+
And another text
|
|
43
|
+
## With this final header
|
|
44
|
+
And this last text
|
|
45
|
+
"""
|
|
46
|
+
chunker = MarkdownChunkNorris()
|
|
47
|
+
header_style = "atx" # or "setext" depending on headers in your text
|
|
48
|
+
chunks = chunker(text, header_style=header_style)
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
### HTMLChunkNorris
|
|
52
|
+
|
|
53
|
+
This chunker is meant to be used **on html-formatted text**. Behind the scenes, it uses markdownify to transform the text to markdown with "setext"-style headers and uses MarkdownChunkNorris to process it.
|
|
54
|
+
|
|
55
|
+
#### Usage
|
|
56
|
+
|
|
57
|
+
```py
|
|
58
|
+
from chunkers import HTMLChunkNorris
|
|
59
|
+
|
|
60
|
+
text = """
|
|
61
|
+
<h1>This is 1st level heading</h1>
|
|
62
|
+
<p>This is a test paragraph.</p>
|
|
63
|
+
<h2>This is 2nd level heading</h2>
|
|
64
|
+
<p>This is a test paragraph.</p>
|
|
65
|
+
<h2>This is another level heading</h2>
|
|
66
|
+
<p>This is another test paragraph.</p>
|
|
67
|
+
"""
|
|
68
|
+
hcn = HTMLChunkNorris()
|
|
69
|
+
chunks = hcn(text)
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### Advanced usage of chunkers
|
|
73
|
+
|
|
74
|
+
Additionally, the chunkers accept a number of arguments that modify their behavior:
|
|
75
|
+
|
|
76
|
+
```py
|
|
77
|
+
from chunkers import MarkdownChunkNorris
|
|
78
|
+
|
|
79
|
+
mystring = "# header\nThis is a markdown string"
|
|
80
|
+
|
|
81
|
+
chunker = MarkdownChunkNorris() # or any other chunker
|
|
82
|
+
chunks = chunker(
|
|
83
|
+
mystring,
|
|
84
|
+
max_title_level_to_use="h3",
|
|
85
|
+
max_chunk_word_length=200,
|
|
86
|
+
link_placement="in_sentence",
|
|
87
|
+
max_chunk_tokens=8191,
|
|
88
|
+
chunk_tokens_exceeded_handling="split",
|
|
89
|
+
min_chunk_wordcount=15,
|
|
90
|
+
)
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
***max_title_level_to_use***
|
|
94
|
+
(str): The maximum (included) level of headers to take into account for chunking. For example, if "h3" is set, then "h4" and "h5" titles won't be used. Must be a string of type "hx" with x being the title level. Defaults to "h4".
|
|
95
|
+
|
|
96
|
+
***max_chunk_word_length***
|
|
97
|
+
(int): The maximum size (soft limit, in words) a chunk can be. Chunks bigger than this size will be chunked using lower level headers, until no lower level headers are available. Defaults to 200.
|
|
98
|
+
|
|
99
|
+
***link_placement***
|
|
100
|
+
(str): How the links should be handled. Defaults to in_sentence.
|
|
101
|
+
Options :
|
|
102
|
+
- "remove" : text is kept but links are removed
|
|
103
|
+
- "end_of_chunk" : adds a paragraph at the end of the chunk containing all the links
|
|
104
|
+
- "in_sentence" : the link is added in parentheses inside the sentence
|
|
105
|
+
|
|
106
|
+
***max_chunk_tokens***
|
|
107
|
+
(int): The hard maximum number of tokens a chunk can contain. Chunks exceeding this limit will be handled according to chunk_tokens_exceeded_handling. Defaults to 8191.
|
|
108
|
+
|
|
109
|
+
***chunk_tokens_exceeded_handling***
|
|
110
|
+
(str): How chunks bigger than the limit specified by max_chunk_tokens should be handled. Defaults to "raise_error".
|
|
111
|
+
Options:
|
|
112
|
+
- "raise_error": raises an error, indicating the chunk could not be split according to headers
|
|
113
|
+
- "split": split the chunks arbitrarily so that each chunk has a size lower than max_chunk_tokens
|
|
114
|
+
|
|
115
|
+
***min_chunk_wordcount***
|
|
116
|
+
(int): Minimum number of words to consider keeping the chunks. Chunks with less words will be discarded. Defaults to 15.
|
|
117
|
+
|
|
118
|
+
### PDFChunkNorris
|
|
119
|
+
|
|
120
|
+
#TODO:
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: chunknorris
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: A package for chunking documents from various formats
|
|
5
|
+
Author-email: Wikit <dev@wikit.ai>
|
|
6
|
+
Project-URL: Homepage, https://gitlab.com/wikit/research-and-development/chunk-norris
|
|
7
|
+
Project-URL: Issues, https://gitlab.com/wikit/research-and-development/chunk-norris/-/issues
|
|
8
|
+
Keywords: chunk,document,split,html,markdown,pdf,header
|
|
9
|
+
Classifier: Natural Language :: English
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
11
|
+
Classifier: Framework :: Pytest
|
|
12
|
+
Classifier: License :: OSI Approved :: GNU Affero General Public License v3
|
|
13
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
14
|
+
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Requires-Python: >=3.10
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
License-File: LICENCE
|
|
19
|
+
Requires-Dist: markdownify>=0.11.6
|
|
20
|
+
Requires-Dist: tiktoken>=0.5.2
|
|
21
|
+
Requires-Dist: PyMuPDF>=1.23.16
|
|
22
|
+
|
|
23
|
+
# Chunk Norris
|
|
24
|
+
|
|
25
|
+
## Goal
|
|
26
|
+
|
|
27
|
+
This project aims at improving the method of chunking documents from various sources (HTML, PDFs, ...)
|
|
28
|
+
An optimized chunking method might lead to smaller chunks, meaning :
|
|
29
|
+
- **Better relevancy of chunks** (and thus easier identification of useful chunks through embedding cosine similarity)
|
|
30
|
+
- **Fewer errors** because of chunks exceeding the API limit in terms of number of tokens
|
|
31
|
+
- **Fewer hallucinations** of generation models because of superfluous information in the prompt
|
|
32
|
+
- **Reduced cost** as the prompt would have reduced size
|
|
33
|
+
|
|
34
|
+
## Installation
|
|
35
|
+
|
|
36
|
+
Using PyPI, just run the following command:
|
|
37
|
+
```pip install chunknorris```
|
|
38
|
+
|
|
39
|
+
## Chunkers
|
|
40
|
+
|
|
41
|
+
The package features multiple ***chunkers*** that can be used independently depending on the type of document needed.
|
|
42
|
+
|
|
43
|
+
All chunkers follow a similar logic :
|
|
44
|
+
- Extract table of contents (= headers)
|
|
45
|
+
- Build chunks using the text content of a part, and put the titles of the parts it belongs to on top
|
|
46
|
+
|
|
47
|
+

|
|
48
|
+
|
|
49
|
+
### MarkdownChunkNorris
|
|
50
|
+
|
|
51
|
+
This chunker is meant to be used **on markdown-formatted text**.
|
|
52
|
+
|
|
53
|
+
Note: When calling the chunker, **you need to specify the header style** of your markdown text ([ATX or Setext](https://golem.ph.utexas.edu/~distler/maruku/markdown_syntax.html#header)). By default it will consider "Setext" heading style.
|
|
54
|
+
|
|
55
|
+
#### Usage
|
|
56
|
+
|
|
57
|
+
```py
|
|
58
|
+
from chunkers import MarkdownChunkNorris
|
|
59
|
+
|
|
60
|
+
text = """
|
|
61
|
+
# This is a header
|
|
62
|
+
This is a text
|
|
63
|
+
## This is another header
|
|
64
|
+
And another text
|
|
65
|
+
## With this final header
|
|
66
|
+
And this last text
|
|
67
|
+
"""
|
|
68
|
+
chunker = MarkdownChunkNorris()
|
|
69
|
+
header_style = "atx" # or "setext" depending on headers in your text
|
|
70
|
+
chunks = chunker(text, header_style=header_style)
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
### HTMLChunkNorris
|
|
74
|
+
|
|
75
|
+
This chunker is meant to be used **on html-formatted text**. Behind the scenes, it uses markdownify to transform the text to markdown with "setext"-style headers and uses MarkdownChunkNorris to process it.
|
|
76
|
+
|
|
77
|
+
#### Usage
|
|
78
|
+
|
|
79
|
+
```py
|
|
80
|
+
from chunkers import HTMLChunkNorris
|
|
81
|
+
|
|
82
|
+
text = """
|
|
83
|
+
<h1>This is 1st level heading</h1>
|
|
84
|
+
<p>This is a test paragraph.</p>
|
|
85
|
+
<h2>This is 2nd level heading</h2>
|
|
86
|
+
<p>This is a test paragraph.</p>
|
|
87
|
+
<h2>This is another level heading</h2>
|
|
88
|
+
<p>This is another test paragraph.</p>
|
|
89
|
+
"""
|
|
90
|
+
hcn = HTMLChunkNorris()
|
|
91
|
+
chunks = hcn(text)
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
### Advanced usage of chunkers
|
|
95
|
+
|
|
96
|
+
Additionally, the chunkers accept a number of arguments that modify their behavior:
|
|
97
|
+
|
|
98
|
+
```py
|
|
99
|
+
from chunkers import MarkdownChunkNorris
|
|
100
|
+
|
|
101
|
+
mystring = "# header\nThis is a markdown string"
|
|
102
|
+
|
|
103
|
+
chunker = MarkdownChunkNorris() # or any other chunker
|
|
104
|
+
chunks = chunker(
|
|
105
|
+
mystring,
|
|
106
|
+
max_title_level_to_use="h3",
|
|
107
|
+
max_chunk_word_length=200,
|
|
108
|
+
link_placement="in_sentence",
|
|
109
|
+
max_chunk_tokens=8191,
|
|
110
|
+
chunk_tokens_exceeded_handling="split",
|
|
111
|
+
min_chunk_wordcount=15,
|
|
112
|
+
)
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
***max_title_level_to_use***
|
|
116
|
+
(str): The maximum (included) level of headers to take into account for chunking. For example, if "h3" is set, then "h4" and "h5" titles won't be used. Must be a string of type "hx" with x being the title level. Defaults to "h4".
|
|
117
|
+
|
|
118
|
+
***max_chunk_word_length***
|
|
119
|
+
(int): The maximum size (soft limit, in words) a chunk can be. Chunks bigger than this size will be chunked using lower level headers, until no lower level headers are available. Defaults to 200.
|
|
120
|
+
|
|
121
|
+
***link_placement***
|
|
122
|
+
(str): How the links should be handled. Defaults to in_sentence.
|
|
123
|
+
Options :
|
|
124
|
+
- "remove" : text is kept but links are removed
|
|
125
|
+
- "end_of_chunk" : adds a paragraph at the end of the chunk containing all the links
|
|
126
|
+
- "in_sentence" : the link is added in parentheses inside the sentence
|
|
127
|
+
|
|
128
|
+
***max_chunk_tokens***
|
|
129
|
+
(int): The hard maximum number of tokens a chunk can contain. Chunks exceeding this limit will be handled according to chunk_tokens_exceeded_handling. Defaults to 8191.
|
|
130
|
+
|
|
131
|
+
***chunk_tokens_exceeded_handling***
|
|
132
|
+
(str): How chunks bigger than the limit specified by max_chunk_tokens should be handled. Defaults to "raise_error".
|
|
133
|
+
Options:
|
|
134
|
+
- "raise_error": raises an error, indicating the chunk could not be split according to headers
|
|
135
|
+
- "split": split the chunks arbitrarily so that each chunk has a size lower than max_chunk_tokens
|
|
136
|
+
|
|
137
|
+
***min_chunk_wordcount***
|
|
138
|
+
(int): Minimum number of words to consider keeping the chunks. Chunks with less words will be discarded. Defaults to 15.
|
|
139
|
+
|
|
140
|
+
### PDFChunkNorris
|
|
141
|
+
|
|
142
|
+
#TODO:
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
LICENCE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
chunknorris.egg-info/PKG-INFO
|
|
5
|
+
chunknorris.egg-info/SOURCES.txt
|
|
6
|
+
chunknorris.egg-info/dependency_links.txt
|
|
7
|
+
chunknorris.egg-info/requires.txt
|
|
8
|
+
chunknorris.egg-info/top_level.txt
|
|
9
|
+
src/chunknorris/__init__.py
|
|
10
|
+
src/chunknorris/__init__.pyi
|
|
11
|
+
src/chunknorris/chunkers/__init__.py
|
|
12
|
+
src/chunknorris/chunkers/__init__.pyi
|
|
13
|
+
src/chunknorris/chunkers/html_chunknorris.py
|
|
14
|
+
src/chunknorris/chunkers/html_chunknorris.pyi
|
|
15
|
+
src/chunknorris/chunkers/markdown_chunknorris.py
|
|
16
|
+
src/chunknorris/chunkers/markdown_chunknorris.pyi
|
|
17
|
+
src/chunknorris/custom_chunkers/__init__.py
|
|
18
|
+
src/chunknorris/custom_chunkers/__init__.pyi
|
|
19
|
+
src/chunknorris/custom_chunkers/wikit_chunknorris.py
|
|
20
|
+
src/chunknorris/custom_chunkers/wikit_chunknorris.pyi
|
|
21
|
+
src/chunknorris/exceptions/__init__.py
|
|
22
|
+
src/chunknorris/exceptions/__init__.pyi
|
|
23
|
+
src/chunknorris/exceptions/exceptions.py
|
|
24
|
+
src/chunknorris/exceptions/exceptions.pyi
|
|
25
|
+
src/chunknorris/types/types.py
|
|
26
|
+
src/chunknorris/types/types.pyi
|
|
27
|
+
src/chunknorris/utils/__init__.py
|
|
28
|
+
src/chunknorris/utils/__init__.pyi
|
|
29
|
+
src/chunknorris/utils/utils.py
|
|
30
|
+
src/chunknorris/utils/utils.pyi
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
chunknorris
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=69.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[tool.setuptools.packages.find]
|
|
6
|
+
where = ["src", "images"]
|
|
7
|
+
include = ["*"]
|
|
8
|
+
exclude = ["__pycache__"]
|
|
9
|
+
namespaces = true # true by default
|
|
10
|
+
|
|
11
|
+
[project]
|
|
12
|
+
name = "chunknorris"
|
|
13
|
+
version = "0.0.1"
|
|
14
|
+
authors = [
|
|
15
|
+
{ name="Wikit", email="dev@wikit.ai" },
|
|
16
|
+
]
|
|
17
|
+
description = "A package for chunking documents from various formats"
|
|
18
|
+
keywords = ["chunk", "document", "split", "html", "markdown", "pdf", "header"]
|
|
19
|
+
readme = {file = "README.md", content-type = "text/markdown"}
|
|
20
|
+
requires-python = ">=3.10"
|
|
21
|
+
classifiers = [
|
|
22
|
+
"Natural Language :: English",
|
|
23
|
+
"Programming Language :: Python :: 3.10",
|
|
24
|
+
"Framework :: Pytest",
|
|
25
|
+
"License :: OSI Approved :: GNU Affero General Public License v3",
|
|
26
|
+
"Topic :: Text Processing :: Markup :: Markdown",
|
|
27
|
+
"Topic :: Text Processing :: Markup :: HTML",
|
|
28
|
+
"Operating System :: OS Independent"
|
|
29
|
+
]
|
|
30
|
+
dependencies = [
|
|
31
|
+
"markdownify>=0.11.6",
|
|
32
|
+
"tiktoken>=0.5.2",
|
|
33
|
+
"PyMuPDF>=1.23.16"
|
|
34
|
+
]
|
|
35
|
+
|
|
36
|
+
[project.urls]
|
|
37
|
+
Homepage = "https://gitlab.com/wikit/research-and-development/chunk-norris"
|
|
38
|
+
Issues = "https://gitlab.com/wikit/research-and-development/chunk-norris/-/issues"
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from markdownify import markdownify
|
|
2
|
+
|
|
3
|
+
from .markdown_chunknorris import MarkdownChunkNorris
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class HTMLChunkNorris(MarkdownChunkNorris):
|
|
7
|
+
|
|
8
|
+
def __call__(self, html_text: str, **kwargs) -> str:
|
|
9
|
+
text = HTMLChunkNorris.apply_markdownify(html_text)
|
|
10
|
+
|
|
11
|
+
return super().__call__(text, **kwargs)
|
|
12
|
+
|
|
13
|
+
@staticmethod
|
|
14
|
+
def apply_markdownify(html_text) -> str:
|
|
15
|
+
"""Applies markdownify to the html text
|
|
16
|
+
|
|
17
|
+
Args:
|
|
18
|
+
html_text (str): the text of the html file
|
|
19
|
+
|
|
20
|
+
Returns:
|
|
21
|
+
str: the markdownified string
|
|
22
|
+
"""
|
|
23
|
+
md_text = markdownify(html_text, strip=["figure", "img"], bullets="-*+")
|
|
24
|
+
|
|
25
|
+
return md_text
|