PyPI - chunknorris - Versions diffs - 0.0.1__tar.gz - Mend

chunknorris 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

chunknorris-0.0.1/LICENCE ADDED Viewed

@@ -0,0 +1,15 @@
+ChunkNorris - A package for reliable chunking of documents
+Copyright (C) 2024 Wikit.ai
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU Affero General Public License for more details.
+You should have received a copy of the GNU Affero General Public License
+along with this program.  If not, see <https://www.gnu.org/licenses/>

chunknorris-0.0.1/PKG-INFO ADDED Viewed

@@ -0,0 +1,142 @@
+Metadata-Version: 2.1
+Name: chunknorris
+Version: 0.0.1
+Summary: A package for chunking documents from various formats
+Author-email: Wikit <dev@wikit.ai>
+Project-URL: Homepage, https://gitlab.com/wikit/research-and-development/chunk-norris
+Project-URL: Issues, https://gitlab.com/wikit/research-and-development/chunk-norris/-/issues
+Keywords: chunk,document,split,html,markdown,pdf,header
+Classifier: Natural Language :: English
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Framework :: Pytest
+Classifier: License :: OSI Approved :: GNU Affero General Public License v3
+Classifier: Topic :: Text Processing :: Markup :: Markdown
+Classifier: Topic :: Text Processing :: Markup :: HTML
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+License-File: LICENCE
+Requires-Dist: markdownify>=0.11.6
+Requires-Dist: tiktoken>=0.5.2
+Requires-Dist: PyMuPDF>=1.23.16
+# Chunk Norris
+## Goal
+This project aims at improving the method of chunking documents from various sources (HTML, PDFs, ...)
+An optimized chunking method might lead to smaller chunks, meaning :
+- **Better relevancy of chunks** (and thus easier identification of useful chunks through embedding cosine similarity)
+- **Fewer errors** caused by chunks exceeding the API limit on the number of tokens
+- **Fewer hallucinations** from generation models caused by superfluous information in the prompt
+- **Reduced cost** as the prompt would have reduced size
+## Installation
+Using Pypi, just run the following command :
+```pip install chunknorris```
+## Chunkers
+The package features multiple ***chunkers*** that can be used independently depending on the type of document needed.
+All chunkers follow a similar logic :
+- Extract table of contents (= headers)
+- Build chunks using the text content of a part, and put the titles of the parts it belongs to on top
+![](images/chunk_method.png)
+### MarkdownChunkNorris
+This chunker is meant to be used **on markdown-formatted text**.
+Note: When calling the chunker, **you need to specify the header style** of your markdown text ([ATX or Setext](https://golem.ph.utexas.edu/~distler/maruku/markdown_syntax.html#header)). By default it will consider "Setext" heading style.
+#### Usage
+```py
+from chunknorris.chunkers import MarkdownChunkNorris
+text = """
+# This is a header
+This is a text
+## This is another header
+And another text
+## With this final header
+And this last text
+"""
+chunker = MarkdownChunkNorris()
+header_style = "atx" # or "setext" depending on headers in your text
+chunks = chunker(text, header_style=header_style)
+```
+### HTMLChunkNorris
+This chunker is meant to be used **on html-formatted text**. Behind the scenes, it uses markdownify to transform the text to markdown with "setext"-style headers and uses MarkdownChunkNorris to process it.
+#### Usage
+```py
+from chunknorris.chunkers import HTMLChunkNorris
+text = """
+<h1>This is 1st level heading</h1>
+<p>This is a test paragraph.</p>
+<h2>This is 2nd level heading</h2>
+<p>This is a test paragraph.</p>
+<h2>This is another level heading</h2>
+<p>This is another test paragraph.</p>
+"""
+hcn = HTMLChunkNorris()
+chunks = hcn(text)
+```
+### Advanced usage of chunkers
+Additionally, the chunkers accept a number of arguments that modify their behavior:
+```py
+from chunknorris.chunkers import MarkdownChunkNorris
+mystring = "# header\nThis is a markdown string"
+chunker = MarkdownChunkNorris() # or any other chunker
+chunks = chunker(
+    mystring,
+    max_title_level_to_use="h3",
+    max_chunk_word_length=200,
+    link_placement="in_sentence",
+    max_chunk_tokens=8191,
+    chunk_tokens_exceeded_handling="split",
+    min_chunk_wordcount=15,
+    )
+```
+***max_title_level_to_use***
+(str): The maximum (included) level of headers to take into account for chunking. For example, if "h3" is set, then "h4" and "h5" titles won't be used. Must be a string of type "hx" with x being the title level. Defaults to "h4".
+***max_chunk_word_length***
+(int): The maximum size (soft limit, in words) a chunk can be. Chunks bigger than this size will be chunked using lower level headers, until no lower level headers are available. Defaults to 200.
+***link_placement***
+(str): How the links should be handled. Defaults to in_sentence.
+Options :
+- "remove" : text is kept but links are removed
+- "end_of_chunk" : adds a paragraph at the end of the chunk containing all the links
+- "in_sentence" : the link is added in parentheses inside the sentence
+***max_chunk_tokens***
+(int): The hard maximum number of tokens a chunk can contain. Chunks exceeding this limit will be handled according to chunk_tokens_exceeded_handling. Defaults to 8191.
+***chunk_tokens_exceeded_handling***
+(str): How chunks bigger than the size specified by max_chunk_tokens should be handled. Defaults to "raise_error".
+Options:
+- "raise_error": raises an error, indicating that the chunk could not be split according to headers
+- "split": splits the chunks arbitrarily so that each chunk has a size lower than max_chunk_tokens
+***min_chunk_wordcount***
+(int): Minimum number of words a chunk must have to be kept. Chunks with fewer words will be discarded. Defaults to 15.
+### PDFChunkNorris
+#TODO:

chunknorris-0.0.1/README.md ADDED Viewed

@@ -0,0 +1,120 @@
+# Chunk Norris
+## Goal
+This project aims at improving the method of chunking documents from various sources (HTML, PDFs, ...)
+An optimized chunking method might lead to smaller chunks, meaning :
+- **Better relevancy of chunks** (and thus easier identification of useful chunks through embedding cosine similarity)
+- **Fewer errors** caused by chunks exceeding the API limit on the number of tokens
+- **Fewer hallucinations** from generation models caused by superfluous information in the prompt
+- **Reduced cost** as the prompt would have reduced size
+## Installation
+Using Pypi, just run the following command :
+```pip install chunknorris```
+## Chunkers
+The package features multiple ***chunkers*** that can be used independently depending on the type of document needed.
+All chunkers follow a similar logic :
+- Extract table of contents (= headers)
+- Build chunks using the text content of a part, and put the titles of the parts it belongs to on top
+![](images/chunk_method.png)
+### MarkdownChunkNorris
+This chunker is meant to be used **on markdown-formatted text**.
+Note: When calling the chunker, **you need to specify the header style** of your markdown text ([ATX or Setext](https://golem.ph.utexas.edu/~distler/maruku/markdown_syntax.html#header)). By default it will consider "Setext" heading style.
+#### Usage
+```py
+from chunknorris.chunkers import MarkdownChunkNorris
+text = """
+# This is a header
+This is a text
+## This is another header
+And another text
+## With this final header
+And this last text
+"""
+chunker = MarkdownChunkNorris()
+header_style = "atx" # or "setext" depending on headers in your text
+chunks = chunker(text, header_style=header_style)
+```
+### HTMLChunkNorris
+This chunker is meant to be used **on html-formatted text**. Behind the scenes, it uses markdownify to transform the text to markdown with "setext"-style headers and uses MarkdownChunkNorris to process it.
+#### Usage
+```py
+from chunknorris.chunkers import HTMLChunkNorris
+text = """
+<h1>This is 1st level heading</h1>
+<p>This is a test paragraph.</p>
+<h2>This is 2nd level heading</h2>
+<p>This is a test paragraph.</p>
+<h2>This is another level heading</h2>
+<p>This is another test paragraph.</p>
+"""
+hcn = HTMLChunkNorris()
+chunks = hcn(text)
+```
+### Advanced usage of chunkers
+Additionally, the chunkers accept a number of arguments that modify their behavior:
+```py
+from chunknorris.chunkers import MarkdownChunkNorris
+mystring = "# header\nThis is a markdown string"
+chunker = MarkdownChunkNorris() # or any other chunker
+chunks = chunker(
+    mystring,
+    max_title_level_to_use="h3",
+    max_chunk_word_length=200,
+    link_placement="in_sentence",
+    max_chunk_tokens=8191,
+    chunk_tokens_exceeded_handling="split",
+    min_chunk_wordcount=15,
+    )
+```
+***max_title_level_to_use***
+(str): The maximum (included) level of headers to take into account for chunking. For example, if "h3" is set, then "h4" and "h5" titles won't be used. Must be a string of type "hx" with x being the title level. Defaults to "h4".
+***max_chunk_word_length***
+(int): The maximum size (soft limit, in words) a chunk can be. Chunks bigger than this size will be chunked using lower level headers, until no lower level headers are available. Defaults to 200.
+***link_placement***
+(str): How the links should be handled. Defaults to in_sentence.
+Options :
+- "remove" : text is kept but links are removed
+- "end_of_chunk" : adds a paragraph at the end of the chunk containing all the links
+- "in_sentence" : the link is added in parentheses inside the sentence
+***max_chunk_tokens***
+(int): The hard maximum number of tokens a chunk can contain. Chunks exceeding this limit will be handled according to chunk_tokens_exceeded_handling. Defaults to 8191.
+***chunk_tokens_exceeded_handling***
+(str): How chunks bigger than the size specified by max_chunk_tokens should be handled. Defaults to "raise_error".
+Options:
+- "raise_error": raises an error, indicating that the chunk could not be split according to headers
+- "split": splits the chunks arbitrarily so that each chunk has a size lower than max_chunk_tokens
+***min_chunk_wordcount***
+(int): Minimum number of words a chunk must have to be kept. Chunks with fewer words will be discarded. Defaults to 15.
+### PDFChunkNorris
+#TODO:

chunknorris-0.0.1/chunknorris.egg-info/PKG-INFO ADDED Viewed

@@ -0,0 +1,142 @@
+Metadata-Version: 2.1
+Name: chunknorris
+Version: 0.0.1
+Summary: A package for chunking documents from various formats
+Author-email: Wikit <dev@wikit.ai>
+Project-URL: Homepage, https://gitlab.com/wikit/research-and-development/chunk-norris
+Project-URL: Issues, https://gitlab.com/wikit/research-and-development/chunk-norris/-/issues
+Keywords: chunk,document,split,html,markdown,pdf,header
+Classifier: Natural Language :: English
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Framework :: Pytest
+Classifier: License :: OSI Approved :: GNU Affero General Public License v3
+Classifier: Topic :: Text Processing :: Markup :: Markdown
+Classifier: Topic :: Text Processing :: Markup :: HTML
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+License-File: LICENCE
+Requires-Dist: markdownify>=0.11.6
+Requires-Dist: tiktoken>=0.5.2
+Requires-Dist: PyMuPDF>=1.23.16
+# Chunk Norris
+## Goal
+This project aims at improving the method of chunking documents from various sources (HTML, PDFs, ...)
+An optimized chunking method might lead to smaller chunks, meaning :
+- **Better relevancy of chunks** (and thus easier identification of useful chunks through embedding cosine similarity)
+- **Fewer errors** caused by chunks exceeding the API limit on the number of tokens
+- **Fewer hallucinations** from generation models caused by superfluous information in the prompt
+- **Reduced cost** as the prompt would have reduced size
+## Installation
+Using Pypi, just run the following command :
+```pip install chunknorris```
+## Chunkers
+The package features multiple ***chunkers*** that can be used independently depending on the type of document needed.
+All chunkers follow a similar logic :
+- Extract table of contents (= headers)
+- Build chunks using the text content of a part, and put the titles of the parts it belongs to on top
+![](images/chunk_method.png)
+### MarkdownChunkNorris
+This chunker is meant to be used **on markdown-formatted text**.
+Note: When calling the chunker, **you need to specify the header style** of your markdown text ([ATX or Setext](https://golem.ph.utexas.edu/~distler/maruku/markdown_syntax.html#header)). By default it will consider "Setext" heading style.
+#### Usage
+```py
+from chunknorris.chunkers import MarkdownChunkNorris
+text = """
+# This is a header
+This is a text
+## This is another header
+And another text
+## With this final header
+And this last text
+"""
+chunker = MarkdownChunkNorris()
+header_style = "atx" # or "setext" depending on headers in your text
+chunks = chunker(text, header_style=header_style)
+```
+### HTMLChunkNorris
+This chunker is meant to be used **on html-formatted text**. Behind the scenes, it uses markdownify to transform the text to markdown with "setext"-style headers and uses MarkdownChunkNorris to process it.
+#### Usage
+```py
+from chunknorris.chunkers import HTMLChunkNorris
+text = """
+<h1>This is 1st level heading</h1>
+<p>This is a test paragraph.</p>
+<h2>This is 2nd level heading</h2>
+<p>This is a test paragraph.</p>
+<h2>This is another level heading</h2>
+<p>This is another test paragraph.</p>
+"""
+hcn = HTMLChunkNorris()
+chunks = hcn(text)
+```
+### Advanced usage of chunkers
+Additionally, the chunkers accept a number of arguments that modify their behavior:
+```py
+from chunknorris.chunkers import MarkdownChunkNorris
+mystring = "# header\nThis is a markdown string"
+chunker = MarkdownChunkNorris() # or any other chunker
+chunks = chunker(
+    mystring,
+    max_title_level_to_use="h3",
+    max_chunk_word_length=200,
+    link_placement="in_sentence",
+    max_chunk_tokens=8191,
+    chunk_tokens_exceeded_handling="split",
+    min_chunk_wordcount=15,
+    )
+```
+***max_title_level_to_use***
+(str): The maximum (included) level of headers to take into account for chunking. For example, if "h3" is set, then "h4" and "h5" titles won't be used. Must be a string of type "hx" with x being the title level. Defaults to "h4".
+***max_chunk_word_length***
+(int): The maximum size (soft limit, in words) a chunk can be. Chunks bigger than this size will be chunked using lower level headers, until no lower level headers are available. Defaults to 200.
+***link_placement***
+(str): How the links should be handled. Defaults to in_sentence.
+Options :
+- "remove" : text is kept but links are removed
+- "end_of_chunk" : adds a paragraph at the end of the chunk containing all the links
+- "in_sentence" : the link is added in parentheses inside the sentence
+***max_chunk_tokens***
+(int): The hard maximum number of tokens a chunk can contain. Chunks exceeding this limit will be handled according to chunk_tokens_exceeded_handling. Defaults to 8191.
+***chunk_tokens_exceeded_handling***
+(str): How chunks bigger than the size specified by max_chunk_tokens should be handled. Defaults to "raise_error".
+Options:
+- "raise_error": raises an error, indicating that the chunk could not be split according to headers
+- "split": splits the chunks arbitrarily so that each chunk has a size lower than max_chunk_tokens
+***min_chunk_wordcount***
+(int): Minimum number of words a chunk must have to be kept. Chunks with fewer words will be discarded. Defaults to 15.
+### PDFChunkNorris
+#TODO:

chunknorris-0.0.1/chunknorris.egg-info/SOURCES.txt ADDED Viewed

@@ -0,0 +1,30 @@
+LICENCE
+README.md
+pyproject.toml
+chunknorris.egg-info/PKG-INFO
+chunknorris.egg-info/SOURCES.txt
+chunknorris.egg-info/dependency_links.txt
+chunknorris.egg-info/requires.txt
+chunknorris.egg-info/top_level.txt
+src/chunknorris/__init__.py
+src/chunknorris/__init__.pyi
+src/chunknorris/chunkers/__init__.py
+src/chunknorris/chunkers/__init__.pyi
+src/chunknorris/chunkers/html_chunknorris.py
+src/chunknorris/chunkers/html_chunknorris.pyi
+src/chunknorris/chunkers/markdown_chunknorris.py
+src/chunknorris/chunkers/markdown_chunknorris.pyi
+src/chunknorris/custom_chunkers/__init__.py
+src/chunknorris/custom_chunkers/__init__.pyi
+src/chunknorris/custom_chunkers/wikit_chunknorris.py
+src/chunknorris/custom_chunkers/wikit_chunknorris.pyi
+src/chunknorris/exceptions/__init__.py
+src/chunknorris/exceptions/__init__.pyi
+src/chunknorris/exceptions/exceptions.py
+src/chunknorris/exceptions/exceptions.pyi
+src/chunknorris/types/types.py
+src/chunknorris/types/types.pyi
+src/chunknorris/utils/__init__.py
+src/chunknorris/utils/__init__.pyi
+src/chunknorris/utils/utils.py
+src/chunknorris/utils/utils.pyi

chunknorris-0.0.1/chunknorris.egg-info/dependency_links.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+

chunknorris-0.0.1/chunknorris.egg-info/requires.txt ADDED Viewed

@@ -0,0 +1,3 @@
+markdownify>=0.11.6
+tiktoken>=0.5.2
+PyMuPDF>=1.23.16

chunknorris-0.0.1/chunknorris.egg-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ chunknorris

chunknorris-0.0.1/pyproject.toml ADDED Viewed

@@ -0,0 +1,38 @@
+[build-system]
+requires = ["setuptools>=69.0"]
+build-backend = "setuptools.build_meta"
+[tool.setuptools.packages.find]
+where = ["src", "images"]
+include = ["*"]
+exclude = ["__pycache__"]
+namespaces = true  # true by default
+[project]
+name = "chunknorris"
+version = "0.0.1"
+authors = [
+  { name="Wikit", email="dev@wikit.ai" },
+]
+description = "A package for chunking documents from various formats"
+keywords = ["chunk", "document", "split", "html", "markdown", "pdf", "header"]
+readme = {file = "README.md", content-type = "text/markdown"}
+requires-python = ">=3.10"
+classifiers = [
+    "Natural Language :: English",
+    "Programming Language :: Python :: 3.10",
+    "Framework :: Pytest",
+    "License :: OSI Approved :: GNU Affero General Public License v3",
+    "Topic :: Text Processing :: Markup :: Markdown",
+    "Topic :: Text Processing :: Markup :: HTML",
+    "Operating System :: OS Independent"
+]
+dependencies = [
+  "markdownify>=0.11.6",
+  "tiktoken>=0.5.2",
+  "PyMuPDF>=1.23.16"
+]
+[project.urls]
+Homepage = "https://gitlab.com/wikit/research-and-development/chunk-norris"
+Issues = "https://gitlab.com/wikit/research-and-development/chunk-norris/-/issues"

chunknorris-0.0.1/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

chunknorris-0.0.1/src/chunknorris/__init__.py ADDED Viewed

@@ -0,0 +1,4 @@
+from .chunkers import *
+from .custom_chunkers import *
+from .exceptions import *
+from .types import *

chunknorris-0.0.1/src/chunknorris/__init__.pyi ADDED Viewed

@@ -0,0 +1,4 @@
+from .chunkers import *
+from .custom_chunkers import *
+from .exceptions import *
+from .types import *

chunknorris-0.0.1/src/chunknorris/chunkers/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ from .html_chunknorris import HTMLChunkNorris
2	+ from .markdown_chunknorris import MarkdownChunkNorris

chunknorris-0.0.1/src/chunknorris/chunkers/__init__.pyi ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ from .html_chunknorris import HTMLChunkNorris as HTMLChunkNorris
2	+ from .markdown_chunknorris import MarkdownChunkNorris as MarkdownChunkNorris

chunknorris-0.0.1/src/chunknorris/chunkers/html_chunknorris.py ADDED Viewed

@@ -0,0 +1,25 @@
+from markdownify import markdownify
+from .markdown_chunknorris import MarkdownChunkNorris
+class HTMLChunkNorris(MarkdownChunkNorris):
+    def __call__(self, html_text: str, **kwargs) -> str:
+        text = HTMLChunkNorris.apply_markdownify(html_text)
+        return super().__call__(text, **kwargs)
+    @staticmethod
+    def apply_markdownify(html_text) -> str:
+        """Applies markdownify to the html text
+        Args:
+            html_text (str): the text of the html file
+        Returns:
+            str: the markdownified string
+        """
+        md_text = markdownify(html_text, strip=["figure", "img"], bullets="-*+")
+        return md_text

chunknorris-0.0.1/src/chunknorris/chunkers/html_chunknorris.pyi ADDED Viewed

@@ -0,0 +1,6 @@
+from .markdown_chunknorris import MarkdownChunkNorris as MarkdownChunkNorris
+class HTMLChunkNorris(MarkdownChunkNorris):
+    def __call__(self, html_text: str, **kwargs) -> str: ...
+    @staticmethod
+    def apply_markdownify(html_text) -> str: ...