chunknorris 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. chunknorris-0.0.1/LICENCE +15 -0
  2. chunknorris-0.0.1/PKG-INFO +142 -0
  3. chunknorris-0.0.1/README.md +120 -0
  4. chunknorris-0.0.1/chunknorris.egg-info/PKG-INFO +142 -0
  5. chunknorris-0.0.1/chunknorris.egg-info/SOURCES.txt +30 -0
  6. chunknorris-0.0.1/chunknorris.egg-info/dependency_links.txt +1 -0
  7. chunknorris-0.0.1/chunknorris.egg-info/requires.txt +3 -0
  8. chunknorris-0.0.1/chunknorris.egg-info/top_level.txt +1 -0
  9. chunknorris-0.0.1/pyproject.toml +38 -0
  10. chunknorris-0.0.1/setup.cfg +4 -0
  11. chunknorris-0.0.1/src/chunknorris/__init__.py +4 -0
  12. chunknorris-0.0.1/src/chunknorris/__init__.pyi +4 -0
  13. chunknorris-0.0.1/src/chunknorris/chunkers/__init__.py +2 -0
  14. chunknorris-0.0.1/src/chunknorris/chunkers/__init__.pyi +2 -0
  15. chunknorris-0.0.1/src/chunknorris/chunkers/html_chunknorris.py +25 -0
  16. chunknorris-0.0.1/src/chunknorris/chunkers/html_chunknorris.pyi +6 -0
  17. chunknorris-0.0.1/src/chunknorris/chunkers/markdown_chunknorris.py +543 -0
  18. chunknorris-0.0.1/src/chunknorris/chunkers/markdown_chunknorris.pyi +26 -0
  19. chunknorris-0.0.1/src/chunknorris/custom_chunkers/__init__.py +1 -0
  20. chunknorris-0.0.1/src/chunknorris/custom_chunkers/__init__.pyi +1 -0
  21. chunknorris-0.0.1/src/chunknorris/custom_chunkers/wikit_chunknorris.py +178 -0
  22. chunknorris-0.0.1/src/chunknorris/custom_chunkers/wikit_chunknorris.pyi +14 -0
  23. chunknorris-0.0.1/src/chunknorris/exceptions/__init__.py +1 -0
  24. chunknorris-0.0.1/src/chunknorris/exceptions/__init__.pyi +1 -0
  25. chunknorris-0.0.1/src/chunknorris/exceptions/exceptions.py +8 -0
  26. chunknorris-0.0.1/src/chunknorris/exceptions/exceptions.pyi +5 -0
  27. chunknorris-0.0.1/src/chunknorris/types/types.py +12 -0
  28. chunknorris-0.0.1/src/chunknorris/types/types.pyi +11 -0
  29. chunknorris-0.0.1/src/chunknorris/utils/__init__.py +0 -0
  30. chunknorris-0.0.1/src/chunknorris/utils/__init__.pyi +0 -0
  31. chunknorris-0.0.1/src/chunknorris/utils/utils.py +64 -0
  32. chunknorris-0.0.1/src/chunknorris/utils/utils.pyi +10 -0
@@ -0,0 +1,15 @@
1
+ ChunkNorris - A package for reliable chunking of documents
2
+ Copyright (C) 2024 Wikit.ai
3
+
4
+ This program is free software: you can redistribute it and/or modify
5
+ it under the terms of the GNU Affero General Public License as
6
+ published by the Free Software Foundation, either version 3 of the
7
+ License, or (at your option) any later version.
8
+
9
+ This program is distributed in the hope that it will be useful,
10
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
11
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
+ GNU Affero General Public License for more details.
13
+
14
+ You should have received a copy of the GNU Affero General Public License
15
+ along with this program. If not, see <https://www.gnu.org/licenses/>
@@ -0,0 +1,142 @@
1
+ Metadata-Version: 2.1
2
+ Name: chunknorris
3
+ Version: 0.0.1
4
+ Summary: A package for chunking documents from various formats
5
+ Author-email: Wikit <dev@wikit.ai>
6
+ Project-URL: Homepage, https://gitlab.com/wikit/research-and-development/chunk-norris
7
+ Project-URL: Issues, https://gitlab.com/wikit/research-and-development/chunk-norris/-/issues
8
+ Keywords: chunk,document,split,html,markdown,pdf,header
9
+ Classifier: Natural Language :: English
10
+ Classifier: Programming Language :: Python :: 3.10
11
+ Classifier: Framework :: Pytest
12
+ Classifier: License :: OSI Approved :: GNU Affero General Public License v3
13
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
14
+ Classifier: Topic :: Text Processing :: Markup :: HTML
15
+ Classifier: Operating System :: OS Independent
16
+ Requires-Python: >=3.10
17
+ Description-Content-Type: text/markdown
18
+ License-File: LICENCE
19
+ Requires-Dist: markdownify>=0.11.6
20
+ Requires-Dist: tiktoken>=0.5.2
21
+ Requires-Dist: PyMuPDF>=1.23.16
22
+
23
+ # Chunk Norris
24
+
25
+ ## Goal
26
+
27
+ This project aims at improving the method of chunking documents from various sources (HTML, PDFs, ...)
28
+ An optimized chunking method might lead to smaller chunks, meaning :
29
+ - **Better relevancy of chunks** (and thus easier identification of useful chunks through embedding cosine similarity)
30
+ - **Fewer errors** because of chunks exceeding the API limit in terms of number of tokens
31
+ - **Fewer hallucinations** of generation models because of superfluous information in the prompt
32
+ - **Reduced cost** as the prompt would have reduced size
33
+
34
+ ## Installation
35
+
36
+ Using PyPI, just run the following command :
37
+ ```pip install chunknorris```
38
+
39
+ ## Chunkers
40
+
41
+ The package features multiple ***chunkers*** that can be used independently depending on the type of document needed.
42
+
43
+ All chunkers follow a similar logic :
44
+ - Extract table of contents (= headers)
45
+ - Build chunks using the text content of a part, and put the titles of the parts it belongs to on top
46
+
47
+ ![](images/chunk_method.png)
48
+
49
+ ### MarkdownChunkNorris
50
+
51
+ This chunker is meant to be used **on markdown-formatted text**.
52
+
53
+ Note: When calling the chunker, **you need to specify the header style** of your markdown text ([ATX or Setext](https://golem.ph.utexas.edu/~distler/maruku/markdown_syntax.html#header)). By default it will consider "Setext" heading style.
54
+
55
+ #### Usage
56
+
57
+ ```py
58
+ from chunknorris.chunkers import MarkdownChunkNorris
59
+
60
+ text = """
61
+ # This is a header
62
+ This is a text
63
+ ## This is another header
64
+ And another text
65
+ ## With this final header
66
+ And this last text
67
+ """
68
+ chunker = MarkdownChunkNorris()
69
+ header_style = "atx" # or "setext" depending on headers in your text
70
+ chunks = chunker(text, header_style=header_style)
71
+ ```
72
+
73
+ ### HTMLChunkNorris
74
+
75
+ This chunker is meant to be used **on html-formatted text**. Behind the scenes, it uses markdownify to transform the text to markdown with "setext"-style headers and uses MarkdownChunkNorris to process it.
76
+
77
+ #### Usage
78
+
79
+ ```py
80
+ from chunknorris.chunkers import HTMLChunkNorris
81
+
82
+ text = """
83
+ <h1>This is 1st level heading</h1>
84
+ <p>This is a test paragraph.</p>
85
+ <h2>This is 2nd level heading</h2>
86
+ <p>This is a test paragraph.</p>
87
+ <h2>This is another level heading</h2>
88
+ <p>This is another test paragraph.</p>
89
+ """
90
+ hcn = HTMLChunkNorris()
91
+ chunks = hcn(text)
92
+ ```
93
+
94
+ ### Advanced usage of chunkers
95
+
96
+ Additionally, the chunkers accept a number of arguments that modify their behavior:
97
+
98
+ ```py
99
+ from chunknorris.chunkers import MarkdownChunkNorris
100
+
101
+ mystring = "# header\nThis is a markdown string"
102
+
103
+ chunker = MarkdownChunkNorris() # or any other chunker
104
+ chunks = chunker(
105
+ mystring,
106
+ max_title_level_to_use="h3",
107
+ max_chunk_word_length=200,
108
+ link_placement="in_sentence",
109
+ max_chunk_tokens=8191,
110
+ chunk_tokens_exceeded_handling="split",
111
+ min_chunk_wordcount=15,
112
+ )
113
+ ```
114
+
115
+ ***max_title_level_to_use***
116
+ (str): The maximum (inclusive) header level to take into account for chunking. For example, if "h3" is set, then "h4" and "h5" titles won't be used. Must be a string of the form "hx", with x being the title level. Defaults to "h4".
117
+
118
+ ***max_chunk_word_length***
119
+ (int): The maximum size (soft limit, in words) a chunk can be. Chunks bigger than this size will be chunked using lower level headers, until no lower level headers are available. Defaults to 200.
120
+
121
+ ***link_placement***
122
+ (str): How the links should be handled. Defaults to in_sentence.
123
+ Options :
124
+ - "remove" : text is kept but links are removed
125
+ - "end_of_chunk" : adds a paragraph at the end of the chunk containing all the links
126
+ - "in_sentence" : the link is added in parentheses inside the sentence
127
+
128
+ ***max_chunk_tokens***
129
+ (int): The hard maximum number of tokens a chunk can contain. Chunks exceeding this limit will be handled according to chunk_tokens_exceeded_handling. Defaults to 8191.
130
+
131
+ ***chunk_tokens_exceeded_handling***
132
+ (str): How chunks bigger than the size specified by max_chunk_tokens should be handled. Defaults to "raise_error".
133
+ Options:
134
+ - "raise_error": raises an error, indicating that the chunk could not be split according to headers
135
+ - "split": split the chunks arbitrarily so that each chunk has a size lower than max_chunk_tokens
136
+
137
+ ***min_chunk_wordcount***
138
+ (int): Minimum number of words required to keep a chunk. Chunks with fewer words will be discarded. Defaults to 15.
139
+
140
+ ### PDFChunkNorris
141
+
142
+ #TODO:
@@ -0,0 +1,120 @@
1
+ # Chunk Norris
2
+
3
+ ## Goal
4
+
5
+ This project aims at improving the method of chunking documents from various sources (HTML, PDFs, ...)
6
+ An optimized chunking method might lead to smaller chunks, meaning :
7
+ - **Better relevancy of chunks** (and thus easier identification of useful chunks through embedding cosine similarity)
8
+ - **Fewer errors** because of chunks exceeding the API limit in terms of number of tokens
9
+ - **Fewer hallucinations** of generation models because of superfluous information in the prompt
10
+ - **Reduced cost** as the prompt would have reduced size
11
+
12
+ ## Installation
13
+
14
+ Using PyPI, just run the following command :
15
+ ```pip install chunknorris```
16
+
17
+ ## Chunkers
18
+
19
+ The package features multiple ***chunkers*** that can be used independently depending on the type of document needed.
20
+
21
+ All chunkers follow a similar logic :
22
+ - Extract table of contents (= headers)
23
+ - Build chunks using the text content of a part, and put the titles of the parts it belongs to on top
24
+
25
+ ![](images/chunk_method.png)
26
+
27
+ ### MarkdownChunkNorris
28
+
29
+ This chunker is meant to be used **on markdown-formatted text**.
30
+
31
+ Note: When calling the chunker, **you need to specify the header style** of your markdown text ([ATX or Setext](https://golem.ph.utexas.edu/~distler/maruku/markdown_syntax.html#header)). By default it will consider "Setext" heading style.
32
+
33
+ #### Usage
34
+
35
+ ```py
36
+ from chunknorris.chunkers import MarkdownChunkNorris
37
+
38
+ text = """
39
+ # This is a header
40
+ This is a text
41
+ ## This is another header
42
+ And another text
43
+ ## With this final header
44
+ And this last text
45
+ """
46
+ chunker = MarkdownChunkNorris()
47
+ header_style = "atx" # or "setext" depending on headers in your text
48
+ chunks = chunker(text, header_style=header_style)
49
+ ```
50
+
51
+ ### HTMLChunkNorris
52
+
53
+ This chunker is meant to be used **on html-formatted text**. Behind the scenes, it uses markdownify to transform the text to markdown with "setext"-style headers and uses MarkdownChunkNorris to process it.
54
+
55
+ #### Usage
56
+
57
+ ```py
58
+ from chunknorris.chunkers import HTMLChunkNorris
59
+
60
+ text = """
61
+ <h1>This is 1st level heading</h1>
62
+ <p>This is a test paragraph.</p>
63
+ <h2>This is 2nd level heading</h2>
64
+ <p>This is a test paragraph.</p>
65
+ <h2>This is another level heading</h2>
66
+ <p>This is another test paragraph.</p>
67
+ """
68
+ hcn = HTMLChunkNorris()
69
+ chunks = hcn(text)
70
+ ```
71
+
72
+ ### Advanced usage of chunkers
73
+
74
+ Additionally, the chunkers accept a number of arguments that modify their behavior:
75
+
76
+ ```py
77
+ from chunknorris.chunkers import MarkdownChunkNorris
78
+
79
+ mystring = "# header\nThis is a markdown string"
80
+
81
+ chunker = MarkdownChunkNorris() # or any other chunker
82
+ chunks = chunker(
83
+ mystring,
84
+ max_title_level_to_use="h3",
85
+ max_chunk_word_length=200,
86
+ link_placement="in_sentence",
87
+ max_chunk_tokens=8191,
88
+ chunk_tokens_exceeded_handling="split",
89
+ min_chunk_wordcount=15,
90
+ )
91
+ ```
92
+
93
+ ***max_title_level_to_use***
94
+ (str): The maximum (inclusive) header level to take into account for chunking. For example, if "h3" is set, then "h4" and "h5" titles won't be used. Must be a string of the form "hx", with x being the title level. Defaults to "h4".
95
+
96
+ ***max_chunk_word_length***
97
+ (int): The maximum size (soft limit, in words) a chunk can be. Chunks bigger than this size will be chunked using lower level headers, until no lower level headers are available. Defaults to 200.
98
+
99
+ ***link_placement***
100
+ (str): How the links should be handled. Defaults to in_sentence.
101
+ Options :
102
+ - "remove" : text is kept but links are removed
103
+ - "end_of_chunk" : adds a paragraph at the end of the chunk containing all the links
104
+ - "in_sentence" : the link is added in parentheses inside the sentence
105
+
106
+ ***max_chunk_tokens***
107
+ (int): The hard maximum number of tokens a chunk can contain. Chunks exceeding this limit will be handled according to chunk_tokens_exceeded_handling. Defaults to 8191.
108
+
109
+ ***chunk_tokens_exceeded_handling***
110
+ (str): How chunks bigger than the size specified by max_chunk_tokens should be handled. Defaults to "raise_error".
111
+ Options:
112
+ - "raise_error": raises an error, indicating that the chunk could not be split according to headers
113
+ - "split": split the chunks arbitrarily so that each chunk has a size lower than max_chunk_tokens
114
+
115
+ ***min_chunk_wordcount***
116
+ (int): Minimum number of words required to keep a chunk. Chunks with fewer words will be discarded. Defaults to 15.
117
+
118
+ ### PDFChunkNorris
119
+
120
+ #TODO:
@@ -0,0 +1,142 @@
1
+ Metadata-Version: 2.1
2
+ Name: chunknorris
3
+ Version: 0.0.1
4
+ Summary: A package for chunking documents from various formats
5
+ Author-email: Wikit <dev@wikit.ai>
6
+ Project-URL: Homepage, https://gitlab.com/wikit/research-and-development/chunk-norris
7
+ Project-URL: Issues, https://gitlab.com/wikit/research-and-development/chunk-norris/-/issues
8
+ Keywords: chunk,document,split,html,markdown,pdf,header
9
+ Classifier: Natural Language :: English
10
+ Classifier: Programming Language :: Python :: 3.10
11
+ Classifier: Framework :: Pytest
12
+ Classifier: License :: OSI Approved :: GNU Affero General Public License v3
13
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
14
+ Classifier: Topic :: Text Processing :: Markup :: HTML
15
+ Classifier: Operating System :: OS Independent
16
+ Requires-Python: >=3.10
17
+ Description-Content-Type: text/markdown
18
+ License-File: LICENCE
19
+ Requires-Dist: markdownify>=0.11.6
20
+ Requires-Dist: tiktoken>=0.5.2
21
+ Requires-Dist: PyMuPDF>=1.23.16
22
+
23
+ # Chunk Norris
24
+
25
+ ## Goal
26
+
27
+ This project aims at improving the method of chunking documents from various sources (HTML, PDFs, ...)
28
+ An optimized chunking method might lead to smaller chunks, meaning :
29
+ - **Better relevancy of chunks** (and thus easier identification of useful chunks through embedding cosine similarity)
30
+ - **Fewer errors** because of chunks exceeding the API limit in terms of number of tokens
31
+ - **Fewer hallucinations** of generation models because of superfluous information in the prompt
32
+ - **Reduced cost** as the prompt would have reduced size
33
+
34
+ ## Installation
35
+
36
+ Using PyPI, just run the following command :
37
+ ```pip install chunknorris```
38
+
39
+ ## Chunkers
40
+
41
+ The package features multiple ***chunkers*** that can be used independently depending on the type of document needed.
42
+
43
+ All chunkers follow a similar logic :
44
+ - Extract table of contents (= headers)
45
+ - Build chunks using the text content of a part, and put the titles of the parts it belongs to on top
46
+
47
+ ![](images/chunk_method.png)
48
+
49
+ ### MarkdownChunkNorris
50
+
51
+ This chunker is meant to be used **on markdown-formatted text**.
52
+
53
+ Note: When calling the chunker, **you need to specify the header style** of your markdown text ([ATX or Setext](https://golem.ph.utexas.edu/~distler/maruku/markdown_syntax.html#header)). By default it will consider "Setext" heading style.
54
+
55
+ #### Usage
56
+
57
+ ```py
58
+ from chunknorris.chunkers import MarkdownChunkNorris
59
+
60
+ text = """
61
+ # This is a header
62
+ This is a text
63
+ ## This is another header
64
+ And another text
65
+ ## With this final header
66
+ And this last text
67
+ """
68
+ chunker = MarkdownChunkNorris()
69
+ header_style = "atx" # or "setext" depending on headers in your text
70
+ chunks = chunker(text, header_style=header_style)
71
+ ```
72
+
73
+ ### HTMLChunkNorris
74
+
75
+ This chunker is meant to be used **on html-formatted text**. Behind the scenes, it uses markdownify to transform the text to markdown with "setext"-style headers and uses MarkdownChunkNorris to process it.
76
+
77
+ #### Usage
78
+
79
+ ```py
80
+ from chunknorris.chunkers import HTMLChunkNorris
81
+
82
+ text = """
83
+ <h1>This is 1st level heading</h1>
84
+ <p>This is a test paragraph.</p>
85
+ <h2>This is 2nd level heading</h2>
86
+ <p>This is a test paragraph.</p>
87
+ <h2>This is another level heading</h2>
88
+ <p>This is another test paragraph.</p>
89
+ """
90
+ hcn = HTMLChunkNorris()
91
+ chunks = hcn(text)
92
+ ```
93
+
94
+ ### Advanced usage of chunkers
95
+
96
+ Additionally, the chunkers accept a number of arguments that modify their behavior:
97
+
98
+ ```py
99
+ from chunknorris.chunkers import MarkdownChunkNorris
100
+
101
+ mystring = "# header\nThis is a markdown string"
102
+
103
+ chunker = MarkdownChunkNorris() # or any other chunker
104
+ chunks = chunker(
105
+ mystring,
106
+ max_title_level_to_use="h3",
107
+ max_chunk_word_length=200,
108
+ link_placement="in_sentence",
109
+ max_chunk_tokens=8191,
110
+ chunk_tokens_exceeded_handling="split",
111
+ min_chunk_wordcount=15,
112
+ )
113
+ ```
114
+
115
+ ***max_title_level_to_use***
116
+ (str): The maximum (inclusive) header level to take into account for chunking. For example, if "h3" is set, then "h4" and "h5" titles won't be used. Must be a string of the form "hx", with x being the title level. Defaults to "h4".
117
+
118
+ ***max_chunk_word_length***
119
+ (int): The maximum size (soft limit, in words) a chunk can be. Chunks bigger than this size will be chunked using lower level headers, until no lower level headers are available. Defaults to 200.
120
+
121
+ ***link_placement***
122
+ (str): How the links should be handled. Defaults to in_sentence.
123
+ Options :
124
+ - "remove" : text is kept but links are removed
125
+ - "end_of_chunk" : adds a paragraph at the end of the chunk containing all the links
126
+ - "in_sentence" : the link is added in parentheses inside the sentence
127
+
128
+ ***max_chunk_tokens***
129
+ (int): The hard maximum number of tokens a chunk can contain. Chunks exceeding this limit will be handled according to chunk_tokens_exceeded_handling. Defaults to 8191.
130
+
131
+ ***chunk_tokens_exceeded_handling***
132
+ (str): How chunks bigger than the size specified by max_chunk_tokens should be handled. Defaults to "raise_error".
133
+ Options:
134
+ - "raise_error": raises an error, indicating that the chunk could not be split according to headers
135
+ - "split": split the chunks arbitrarily so that each chunk has a size lower than max_chunk_tokens
136
+
137
+ ***min_chunk_wordcount***
138
+ (int): Minimum number of words required to keep a chunk. Chunks with fewer words will be discarded. Defaults to 15.
139
+
140
+ ### PDFChunkNorris
141
+
142
+ #TODO:
@@ -0,0 +1,30 @@
1
+ LICENCE
2
+ README.md
3
+ pyproject.toml
4
+ chunknorris.egg-info/PKG-INFO
5
+ chunknorris.egg-info/SOURCES.txt
6
+ chunknorris.egg-info/dependency_links.txt
7
+ chunknorris.egg-info/requires.txt
8
+ chunknorris.egg-info/top_level.txt
9
+ src/chunknorris/__init__.py
10
+ src/chunknorris/__init__.pyi
11
+ src/chunknorris/chunkers/__init__.py
12
+ src/chunknorris/chunkers/__init__.pyi
13
+ src/chunknorris/chunkers/html_chunknorris.py
14
+ src/chunknorris/chunkers/html_chunknorris.pyi
15
+ src/chunknorris/chunkers/markdown_chunknorris.py
16
+ src/chunknorris/chunkers/markdown_chunknorris.pyi
17
+ src/chunknorris/custom_chunkers/__init__.py
18
+ src/chunknorris/custom_chunkers/__init__.pyi
19
+ src/chunknorris/custom_chunkers/wikit_chunknorris.py
20
+ src/chunknorris/custom_chunkers/wikit_chunknorris.pyi
21
+ src/chunknorris/exceptions/__init__.py
22
+ src/chunknorris/exceptions/__init__.pyi
23
+ src/chunknorris/exceptions/exceptions.py
24
+ src/chunknorris/exceptions/exceptions.pyi
25
+ src/chunknorris/types/types.py
26
+ src/chunknorris/types/types.pyi
27
+ src/chunknorris/utils/__init__.py
28
+ src/chunknorris/utils/__init__.pyi
29
+ src/chunknorris/utils/utils.py
30
+ src/chunknorris/utils/utils.pyi
@@ -0,0 +1,3 @@
1
+ markdownify>=0.11.6
2
+ tiktoken>=0.5.2
3
+ PyMuPDF>=1.23.16
@@ -0,0 +1 @@
1
+ chunknorris
@@ -0,0 +1,38 @@
1
+ [build-system]
2
+ requires = ["setuptools>=69.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [tool.setuptools.packages.find]
6
+ where = ["src", "images"]
7
+ include = ["*"]
8
+ exclude = ["__pycache__"]
9
+ namespaces = true # true by default
10
+
11
+ [project]
12
+ name = "chunknorris"
13
+ version = "0.0.1"
14
+ authors = [
15
+ { name="Wikit", email="dev@wikit.ai" },
16
+ ]
17
+ description = "A package for chunking documents from various formats"
18
+ keywords = ["chunk", "document", "split", "html", "markdown", "pdf", "header"]
19
+ readme = {file = "README.md", content-type = "text/markdown"}
20
+ requires-python = ">=3.10"
21
+ classifiers = [
22
+ "Natural Language :: English",
23
+ "Programming Language :: Python :: 3.10",
24
+ "Framework :: Pytest",
25
+ "License :: OSI Approved :: GNU Affero General Public License v3",
26
+ "Topic :: Text Processing :: Markup :: Markdown",
27
+ "Topic :: Text Processing :: Markup :: HTML",
28
+ "Operating System :: OS Independent"
29
+ ]
30
+ dependencies = [
31
+ "markdownify>=0.11.6",
32
+ "tiktoken>=0.5.2",
33
+ "PyMuPDF>=1.23.16"
34
+ ]
35
+
36
+ [project.urls]
37
+ Homepage = "https://gitlab.com/wikit/research-and-development/chunk-norris"
38
+ Issues = "https://gitlab.com/wikit/research-and-development/chunk-norris/-/issues"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,4 @@
1
+ from .chunkers import *
2
+ from .custom_chunkers import *
3
+ from .exceptions import *
4
+ from .types import *
@@ -0,0 +1,4 @@
1
+ from .chunkers import *
2
+ from .custom_chunkers import *
3
+ from .exceptions import *
4
+ from .types import *
@@ -0,0 +1,2 @@
1
+ from .html_chunknorris import HTMLChunkNorris
2
+ from .markdown_chunknorris import MarkdownChunkNorris
@@ -0,0 +1,2 @@
1
+ from .html_chunknorris import HTMLChunkNorris as HTMLChunkNorris
2
+ from .markdown_chunknorris import MarkdownChunkNorris as MarkdownChunkNorris
@@ -0,0 +1,25 @@
1
+ from markdownify import markdownify
2
+
3
+ from .markdown_chunknorris import MarkdownChunkNorris
4
+
5
+
6
+ class HTMLChunkNorris(MarkdownChunkNorris):
7
+
8
+ def __call__(self, html_text: str, **kwargs) -> str:
9
+ text = HTMLChunkNorris.apply_markdownify(html_text)
10
+
11
+ return super().__call__(text, **kwargs)
12
+
13
+ @staticmethod
14
+ def apply_markdownify(html_text) -> str:
15
+ """Applies markdownify to the html text
16
+
17
+ Args:
18
+ html_text (str): the text of the html file
19
+
20
+ Returns:
21
+ str: the markdownified string
22
+ """
23
+ md_text = markdownify(html_text, strip=["figure", "img"], bullets="-*+")
24
+
25
+ return md_text
@@ -0,0 +1,6 @@
1
+ from .markdown_chunknorris import MarkdownChunkNorris as MarkdownChunkNorris
2
+
3
+ class HTMLChunkNorris(MarkdownChunkNorris):
4
+ def __call__(self, html_text: str, **kwargs) -> str: ...
5
+ @staticmethod
6
+ def apply_markdownify(html_text) -> str: ...