MarkdownHeaderTextSplitter 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- markdownheadertextsplitter-0.1.0/LICENSE +21 -0
- markdownheadertextsplitter-0.1.0/MarkdownHeaderTextSplitter.egg-info/PKG-INFO +74 -0
- markdownheadertextsplitter-0.1.0/MarkdownHeaderTextSplitter.egg-info/SOURCES.txt +13 -0
- markdownheadertextsplitter-0.1.0/MarkdownHeaderTextSplitter.egg-info/dependency_links.txt +1 -0
- markdownheadertextsplitter-0.1.0/MarkdownHeaderTextSplitter.egg-info/entry_points.txt +2 -0
- markdownheadertextsplitter-0.1.0/MarkdownHeaderTextSplitter.egg-info/requires.txt +1 -0
- markdownheadertextsplitter-0.1.0/MarkdownHeaderTextSplitter.egg-info/top_level.txt +1 -0
- markdownheadertextsplitter-0.1.0/PKG-INFO +74 -0
- markdownheadertextsplitter-0.1.0/README.md +48 -0
- markdownheadertextsplitter-0.1.0/markdownheadertextsplitter/__init__.py +17 -0
- markdownheadertextsplitter-0.1.0/markdownheadertextsplitter/app.py +68 -0
- markdownheadertextsplitter-0.1.0/markdownheadertextsplitter/cli.py +17 -0
- markdownheadertextsplitter-0.1.0/markdownheadertextsplitter/core.py +258 -0
- markdownheadertextsplitter-0.1.0/pyproject.toml +36 -0
- markdownheadertextsplitter-0.1.0/setup.cfg +4 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Wenxi Wang
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: MarkdownHeaderTextSplitter
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A heading-aware Markdown splitter with a simple local UI
|
|
5
|
+
Author: Wenxi Wang
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://example.com/MarkdownHeaderTextSplitter
|
|
8
|
+
Project-URL: Repository, https://example.com/MarkdownHeaderTextSplitter
|
|
9
|
+
Keywords: markdown,splitter,rag,chunking,ui
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
20
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
21
|
+
Requires-Python: >=3.9
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
Requires-Dist: streamlit>=1.32
|
|
25
|
+
Dynamic: license-file
|
|
26
|
+
|
|
27
|
+
# MarkdownHeaderTextSplitter
|
|
28
|
+
|
|
29
|
+
A small, original, heading-aware Markdown splitter with a simple local UI for preparing Markdown files for RAG or inspection.
|
|
30
|
+
|
|
31
|
+
## What it does
|
|
32
|
+
|
|
33
|
+
- Accepts a `.md` file in a local UI
|
|
34
|
+
- Respects Markdown heading structure like `#`, `##`, and `###`
|
|
35
|
+
- Lets you choose how many chunks you want
|
|
36
|
+
- Outputs the split Markdown chunks as separate `.md` files inside a downloadable `.zip`
|
|
37
|
+
- Keeps the implementation lightweight and original instead of copying third-party splitter code
|
|
38
|
+
|
|
39
|
+
## Install
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
pip install MarkdownHeaderTextSplitter
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## Run
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
MarkdownHeaderTextSplitter
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
This launches a local Streamlit app in your browser.
|
|
52
|
+
|
|
53
|
+
## Notes
|
|
54
|
+
|
|
55
|
+
- This package does **not** vendor or copy code from LangChain or other companies.
|
|
56
|
+
- The idea of heading-aware splitting is common, but you should still do your own name, trademark, licensing, and patent review before publishing publicly.
|
|
57
|
+
- PyPI package-name availability can change over time, so confirm the final project name before upload.
|
|
58
|
+
|
|
59
|
+
## File structure
|
|
60
|
+
|
|
61
|
+
```text
|
|
62
|
+
MarkdownHeaderTextSplitter_pypi/
|
|
63
|
+
README.md
|
|
64
|
+
LICENSE
|
|
65
|
+
pyproject.toml
|
|
66
|
+
markdownheadertextsplitter/
|
|
67
|
+
__init__.py
|
|
68
|
+
app.py
|
|
69
|
+
cli.py
|
|
70
|
+
core.py
|
|
71
|
+
dist/
|
|
72
|
+
MarkdownHeaderTextSplitter-0.1.0.tar.gz
|
|
73
|
+
markdownheadertextsplitter-0.1.0-py3-none-any.whl
|
|
74
|
+
```
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
MarkdownHeaderTextSplitter.egg-info/PKG-INFO
|
|
5
|
+
MarkdownHeaderTextSplitter.egg-info/SOURCES.txt
|
|
6
|
+
MarkdownHeaderTextSplitter.egg-info/dependency_links.txt
|
|
7
|
+
MarkdownHeaderTextSplitter.egg-info/entry_points.txt
|
|
8
|
+
MarkdownHeaderTextSplitter.egg-info/requires.txt
|
|
9
|
+
MarkdownHeaderTextSplitter.egg-info/top_level.txt
|
|
10
|
+
markdownheadertextsplitter/__init__.py
|
|
11
|
+
markdownheadertextsplitter/app.py
|
|
12
|
+
markdownheadertextsplitter/cli.py
|
|
13
|
+
markdownheadertextsplitter/core.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
streamlit>=1.32
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
markdownheadertextsplitter
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: MarkdownHeaderTextSplitter
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A heading-aware Markdown splitter with a simple local UI
|
|
5
|
+
Author: Wenxi Wang
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://example.com/MarkdownHeaderTextSplitter
|
|
8
|
+
Project-URL: Repository, https://example.com/MarkdownHeaderTextSplitter
|
|
9
|
+
Keywords: markdown,splitter,rag,chunking,ui
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
20
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
21
|
+
Requires-Python: >=3.9
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
Requires-Dist: streamlit>=1.32
|
|
25
|
+
Dynamic: license-file
|
|
26
|
+
|
|
27
|
+
# MarkdownHeaderTextSplitter
|
|
28
|
+
|
|
29
|
+
A small, original, heading-aware Markdown splitter with a simple local UI for preparing Markdown files for RAG or inspection.
|
|
30
|
+
|
|
31
|
+
## What it does
|
|
32
|
+
|
|
33
|
+
- Accepts a `.md` file in a local UI
|
|
34
|
+
- Respects Markdown heading structure like `#`, `##`, and `###`
|
|
35
|
+
- Lets you choose how many chunks you want
|
|
36
|
+
- Outputs the split Markdown chunks as separate `.md` files inside a downloadable `.zip`
|
|
37
|
+
- Keeps the implementation lightweight and original instead of copying third-party splitter code
|
|
38
|
+
|
|
39
|
+
## Install
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
pip install MarkdownHeaderTextSplitter
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## Run
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
MarkdownHeaderTextSplitter
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
This launches a local Streamlit app in your browser.
|
|
52
|
+
|
|
53
|
+
## Notes
|
|
54
|
+
|
|
55
|
+
- This package does **not** vendor or copy code from LangChain or other companies.
|
|
56
|
+
- The idea of heading-aware splitting is common, but you should still do your own name, trademark, licensing, and patent review before publishing publicly.
|
|
57
|
+
- PyPI package-name availability can change over time, so confirm the final project name before upload.
|
|
58
|
+
|
|
59
|
+
## File structure
|
|
60
|
+
|
|
61
|
+
```text
|
|
62
|
+
MarkdownHeaderTextSplitter_pypi/
|
|
63
|
+
README.md
|
|
64
|
+
LICENSE
|
|
65
|
+
pyproject.toml
|
|
66
|
+
markdownheadertextsplitter/
|
|
67
|
+
__init__.py
|
|
68
|
+
app.py
|
|
69
|
+
cli.py
|
|
70
|
+
core.py
|
|
71
|
+
dist/
|
|
72
|
+
MarkdownHeaderTextSplitter-0.1.0.tar.gz
|
|
73
|
+
markdownheadertextsplitter-0.1.0-py3-none-any.whl
|
|
74
|
+
```
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# MarkdownHeaderTextSplitter
|
|
2
|
+
|
|
3
|
+
A small, original, heading-aware Markdown splitter with a simple local UI for preparing Markdown files for RAG or inspection.
|
|
4
|
+
|
|
5
|
+
## What it does
|
|
6
|
+
|
|
7
|
+
- Accepts a `.md` file in a local UI
|
|
8
|
+
- Respects Markdown heading structure like `#`, `##`, and `###`
|
|
9
|
+
- Lets you choose a target number of chunks (very short documents may yield fewer; the optional size cap may yield more)
|
|
10
|
+
- Outputs the split Markdown chunks as separate `.md` files inside a downloadable `.zip`
|
|
11
|
+
- Keeps the implementation lightweight and original instead of copying third-party splitter code
|
|
12
|
+
|
|
13
|
+
## Install
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
pip install MarkdownHeaderTextSplitter
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Run
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
MarkdownHeaderTextSplitter
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
This launches a local Streamlit app in your browser.
|
|
26
|
+
|
|
27
|
+
## Notes
|
|
28
|
+
|
|
29
|
+
- This package does **not** vendor or copy code from LangChain or other companies.
|
|
30
|
+
- The idea of heading-aware splitting is common, but you should still do your own name, trademark, licensing, and patent review before publishing publicly.
|
|
31
|
+
- PyPI package-name availability can change over time, so confirm the final project name before upload.
|
|
32
|
+
|
|
33
|
+
## File structure
|
|
34
|
+
|
|
35
|
+
```text
|
|
36
|
+
MarkdownHeaderTextSplitter_pypi/
|
|
37
|
+
README.md
|
|
38
|
+
LICENSE
|
|
39
|
+
pyproject.toml
|
|
40
|
+
markdownheadertextsplitter/
|
|
41
|
+
__init__.py
|
|
42
|
+
app.py
|
|
43
|
+
cli.py
|
|
44
|
+
core.py
|
|
45
|
+
dist/
|
|
46
|
+
MarkdownHeaderTextSplitter-0.1.0.tar.gz
|
|
47
|
+
markdownheadertextsplitter-0.1.0-py3-none-any.whl
|
|
48
|
+
```
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""MarkdownHeaderTextSplitter package."""
|
|
2
|
+
|
|
3
|
+
__all__ = [
|
|
4
|
+
"parse_markdown_sections",
|
|
5
|
+
"split_markdown_into_chunks",
|
|
6
|
+
"chunk_manifest",
|
|
7
|
+
"build_zip_bytes",
|
|
8
|
+
]
|
|
9
|
+
|
|
10
|
+
__version__ = "0.1.0"
|
|
11
|
+
|
|
12
|
+
from .core import ( # noqa: E402
|
|
13
|
+
build_zip_bytes,
|
|
14
|
+
chunk_manifest,
|
|
15
|
+
parse_markdown_sections,
|
|
16
|
+
split_markdown_into_chunks,
|
|
17
|
+
)
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import streamlit as st
|
|
4
|
+
|
|
5
|
+
from markdownheadertextsplitter.core import build_zip_bytes, chunk_manifest, split_markdown_into_chunks
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# Streamlit page: upload one Markdown file, split it into heading-aware chunks,
# show a manifest plus per-chunk preview, and offer the result as a .zip download.
st.set_page_config(
    page_title="MarkdownHeaderTextSplitter",
    page_icon="🧩",
    layout="wide",
)

st.title("MarkdownHeaderTextSplitter")
st.caption("Split Markdown into smaller heading-aware chunks for RAG, review, or preprocessing.")

# Sidebar widgets supply the keyword arguments passed to split_markdown_into_chunks.
with st.sidebar:
    st.header("Settings")
    target_chunks = st.slider("Target number of chunks", min_value=1, max_value=40, value=6)
    max_heading_level = st.select_slider("Respect headings through level", options=[1, 2, 3, 4, 5, 6], value=3)
    include_preamble = st.checkbox("Keep preamble before first heading", value=True)
    hard_max_chars = st.number_input(
        "Safety cap: max characters per chunk (0 = disabled)",
        min_value=0,
        max_value=50000,
        value=0,
        step=100,
    )

uploaded = st.file_uploader("Drag a Markdown file here", type=["md"])

# Nothing to process until a file is provided; stop this script run here.
if uploaded is None:
    st.info("Upload a .md file to preview and export split chunks.")
    st.stop()

# getvalue() returns the whole upload regardless of the current stream position.
raw_bytes = uploaded.getvalue()
# "utf-8-sig" drops a leading byte-order mark; with plain "utf-8" the BOM stays
# glued to the first line, so a heading on line 1 would not be recognised.
text = raw_bytes.decode("utf-8-sig", errors="ignore")

chunks = split_markdown_into_chunks(
    text,
    target_chunks=target_chunks,
    max_heading_level=max_heading_level,
    include_preamble=include_preamble,
    hard_max_chars=hard_max_chars,
)
manifest = chunk_manifest(chunks, uploaded.name)
zip_bytes = build_zip_bytes(chunks, uploaded.name)

col1, col2 = st.columns([1, 2])
with col1:
    st.metric("Output chunks", len(chunks))
    st.metric("Input characters", len(text))
    # max(1, ...) avoids dividing by zero when no chunks were produced.
    st.metric("Avg chunk characters", int(sum(len(c) for c in chunks) / max(1, len(chunks))))
    st.download_button(
        label="Download split .zip",
        data=zip_bytes,
        file_name=f"{uploaded.name.rsplit('.', 1)[0]}_chunks.zip",
        mime="application/zip",
    )

with col2:
    st.subheader("Chunk manifest")
    st.dataframe(manifest, use_container_width=True)

st.subheader("Chunk preview")
# Manifest rows are generated one per chunk, in order, so zip pairs them up.
for item, chunk in zip(manifest, chunks):
    with st.expander(f"{item['file_name']} — {item['char_count']} chars"):
        st.code(chunk, language="markdown")
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
import sys
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def main() -> None:
    """Console entry point: run the bundled ``app.py`` through Streamlit's CLI.

    Raises:
        SystemExit: always. Carries an explanatory message when Streamlit
            cannot be imported, otherwise the value returned by Streamlit's
            CLI entry point.
    """
    try:
        from streamlit.web import cli as stcli
    except Exception as exc:  # pragma: no cover
        raise SystemExit(
            "Streamlit is required to launch the UI. Install the package dependencies first."
        ) from exc

    # app.py lives in the same directory as this module.
    script = Path(__file__).parent / "app.py"
    # Streamlit's CLI reads its arguments from sys.argv, so fake a
    # "streamlit run <app.py>" command line before handing over.
    sys.argv = ["streamlit", "run", str(script)]
    exit_status = stcli.main()
    raise SystemExit(exit_status)
|
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from io import BytesIO
|
|
5
|
+
import json
|
|
6
|
+
import math
|
|
7
|
+
import re
|
|
8
|
+
import zipfile
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Iterable, List
|
|
11
|
+
|
|
12
|
+
# Matches a heading line: an optional leading backslash, one to six "#"
# characters, whitespace, then the title (captured without trailing whitespace).
HEADING_RE = re.compile(r"^\\?(#{1,6})\s+(.*\S)\s*$")

# Matches a code-fence marker: up to three spaces of indentation followed by a
# run of at least three backticks or at least three tildes (the run is captured).
_FENCE_RE = re.compile(r"^ {0,3}(`{3,}|~{3,})")


@dataclass
class Section:
    """A contiguous slice of a Markdown document."""

    # 1-6 for a section opened by a heading; parse_markdown_sections uses 0 for
    # preamble text and for the whole-document fallback.
    heading_level: int
    # Heading title without the "#" markers ("Preamble" / "Document" for level 0).
    heading_text: str
    # Section text; parse_markdown_sections keeps the heading line inside it.
    content: str

    @property
    def char_count(self) -> int:
        """Return the length of ``content`` in characters."""
        return len(self.content)


def _normalize_newlines(text: str) -> str:
    """Convert Windows (CRLF) and bare CR line endings to LF."""
    return text.replace("\r\n", "\n").replace("\r", "\n")


def _section_from_lines(level: int, title: str, lines: List[str]) -> Section:
    """Build a Section whose content is *lines* joined, stripped and newline-terminated."""
    return Section(
        heading_level=level,
        heading_text=title,
        content="\n".join(lines).strip() + "\n",
    )


def parse_markdown_sections(
    text: str,
    *,
    max_heading_level: int = 3,
    include_preamble: bool = True,
) -> List[Section]:
    """Split a markdown document into ordered sections based on headings.

    - Headings up to ``max_heading_level`` start a new section; deeper
      headings stay inside the section they appear in.
    - The heading line is kept inside the section content.
    - Lines inside fenced code blocks (``` or ~~~) are never treated as
      headings, so ``# comment`` lines in code samples do not split a section.
    - Optional preamble text before the first heading becomes its own section
      (level 0, title ``"Preamble"``).

    Args:
        text: Markdown source; any newline convention is accepted.
        max_heading_level: Deepest heading level (1-6) that opens a section.
        include_preamble: When False, text before the first section heading
            is dropped.

    Returns:
        Sections in document order. An empty list for empty input. If nothing
        qualifies as a section (for example no headings and
        ``include_preamble=False``), a single level-0 ``"Document"`` section
        holding the whole text.
    """
    text = _normalize_newlines(text).strip("\n")
    if not text:
        return []

    sections: List[Section] = []
    current_lines: List[str] = []
    current_level = 0
    current_title = "Preamble"
    saw_heading = False
    open_fence = ""  # marker run of the fence we are inside; "" when outside

    for line in text.split("\n"):
        fence_match = _FENCE_RE.match(line)

        if open_fence:
            # Inside a fenced block: only a closing fence matters. It must use
            # the same character, be at least as long as the opener, and have
            # nothing but whitespace after it.
            if fence_match:
                marker = fence_match.group(1)
                trailing = line[fence_match.end():]
                if (
                    marker[0] == open_fence[0]
                    and len(marker) >= len(open_fence)
                    and not trailing.strip()
                ):
                    open_fence = ""
            current_lines.append(line)
            continue

        if fence_match:
            marker = fence_match.group(1)
            # A backtick fence cannot carry backticks in its info string;
            # such a line is inline code, not a fence opener.
            if marker[0] != "`" or "`" not in line[fence_match.end():]:
                open_fence = marker
                current_lines.append(line)
                continue

        match = HEADING_RE.match(line)
        if match and len(match.group(1)) <= max_heading_level:
            # Flush what was collected so far; preamble is only kept on request.
            if current_lines and (include_preamble or saw_heading):
                sections.append(
                    _section_from_lines(current_level, current_title, current_lines)
                )
            current_lines = [line]
            current_level = len(match.group(1))
            current_title = match.group(2).strip()
            saw_heading = True
            continue

        current_lines.append(line)

    if current_lines and (include_preamble or saw_heading):
        sections.append(
            _section_from_lines(current_level, current_title, current_lines)
        )

    if not sections:
        return [Section(heading_level=0, heading_text="Document", content=text + "\n")]
    return sections
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _split_text_by_paragraphs(text: str) -> List[str]:
    """Split *text* on blank lines into paragraphs, each ending in ``"\\n\\n"``.

    Whitespace-only paragraphs are dropped. If nothing remains, the original
    text is returned as the only element.
    """
    parts = [part.strip() for part in re.split(r"\n\s*\n", text) if part.strip()]
    return [part + "\n\n" for part in parts] or [text]


def _split_text_by_size(text: str, target_parts: int) -> List[str]:
    """Split *text* into roughly equal parts, preferring paragraph boundaries.

    Args:
        text: Text to split.
        target_parts: Number of parts wanted.

    Returns:
        ``[text]`` unchanged when ``target_parts <= 1`` or the text is shorter
        than two characters. Otherwise a list of stripped, newline-terminated
        parts: exactly ``target_parts`` groups of whole paragraphs when there
        are at least that many paragraphs, else fixed-width character slices.
    """
    if target_parts <= 1 or len(text) < 2:
        return [text]

    paragraphs = _split_text_by_paragraphs(text)
    if len(paragraphs) >= target_parts:
        total_chars = sum(len(p) for p in paragraphs)
        desired = max(1, math.ceil(total_chars / target_parts))
        out: List[str] = []
        bucket: List[str] = []
        bucket_size = 0

        # Iterate by index and never mutate the list being iterated: popping
        # from it inside the loop makes the iterator skip every other paragraph.
        for index, para in enumerate(paragraphs):
            bucket.append(para)
            bucket_size += len(para)

            paragraphs_left = len(paragraphs) - index - 1
            parts_after_this = target_parts - len(out) - 1
            if parts_after_this <= 0:
                # Last part: it absorbs every remaining paragraph.
                continue
            # Close when the bucket is big enough, or when waiting any longer
            # would leave fewer paragraphs than parts still to be filled.
            if bucket_size >= desired or paragraphs_left <= parts_after_this:
                out.append("".join(bucket).strip() + "\n")
                bucket = []
                bucket_size = 0

        if bucket:
            out.append("".join(bucket).strip() + "\n")
        return [chunk for chunk in out if chunk.strip()]

    # Fallback: raw char slicing if there are not enough paragraphs.
    text = text.strip()
    step = max(1, math.ceil(len(text) / target_parts))
    slices = (text[start : start + step] for start in range(0, len(text), step))
    return [piece.strip() + "\n" for piece in slices if piece.strip()]
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def _merge_sections_to_target(sections: List[Section], target_chunks: int) -> List[str]:
    """Merge consecutive sections into at most ``target_chunks`` chunks.

    Sections keep their order and are never split. Within a chunk, section
    bodies are stripped and separated by a blank line; every chunk ends with
    a single newline.

    Args:
        sections: Parsed sections; only ``content`` and ``char_count`` are read.
        target_chunks: Number of chunks wanted.

    Returns:
        One chunk when ``target_chunks <= 1`` or there is at most one section.
        Otherwise ``min(target_chunks, len(sections))`` chunks, minus any that
        turn out to be whitespace-only.
    """
    if target_chunks <= 1 or len(sections) <= 1:
        # Same blank-line separator as the multi-chunk path below.
        return ["\n\n".join(section.content.strip() for section in sections if section.content.strip()) + "\n"]

    total_chars = sum(section.char_count for section in sections)
    desired = max(1, math.ceil(total_chars / target_chunks))

    chunks: List[str] = []
    bucket: List[str] = []
    bucket_chars = 0

    for index, section in enumerate(sections):
        bucket.append(section.content.strip())
        bucket_chars += section.char_count

        sections_left = len(sections) - index - 1
        buckets_after_this = target_chunks - len(chunks) - 1
        if buckets_after_this <= 0:
            # Final bucket: keep collecting so the total never exceeds the target.
            continue
        # Close when the bucket reached its share, or when holding on would
        # leave fewer sections than buckets still to fill.
        if bucket_chars >= desired or sections_left <= buckets_after_this:
            chunks.append("\n\n".join(bucket).strip() + "\n")
            bucket = []
            bucket_chars = 0

    if bucket:
        chunks.append("\n\n".join(bucket).strip() + "\n")

    return [chunk for chunk in chunks if chunk.strip()]
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def _expand_sections_to_target(sections: List[Section], target_chunks: int) -> List[str]:
    """Grow the chunk list towards ``target_chunks`` by halving the largest chunk.

    Each non-blank section starts as one stripped, newline-terminated chunk.
    While there are fewer chunks than requested, the longest chunk (the first
    one on ties) is split in two in place. Expansion stops early when the
    longest chunk is under 400 characters or cannot be split.

    Returns:
        The chunks in document order; an empty list if every section is blank.
    """
    pieces: List[str] = []
    for section in sections:
        body = section.content.strip()
        if body:
            pieces.append(body + "\n")
    if not pieces:
        return []

    while len(pieces) < target_chunks:
        largest_at, largest = max(enumerate(pieces), key=lambda pair: len(pair[1]))
        if len(largest) < 400:
            # Chunks this small are left whole rather than fragmented further.
            break
        halves = _split_text_by_size(largest, 2)
        if len(halves) <= 1:
            break
        # Swap the oversized chunk for its parts, keeping document order.
        pieces[largest_at : largest_at + 1] = halves

    return pieces
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def _apply_hard_max_chars(chunks: Iterable[str], hard_max_chars: int) -> List[str]:
    """Break up chunks longer than ``hard_max_chars`` and normalise the rest.

    A cap of zero or less disables splitting. Every returned chunk is stripped
    and ends with exactly one newline. An oversized chunk is split once into
    ``max(2, ceil(len / cap))`` parts; the parts are not re-measured against
    the cap afterwards.
    """
    capped: List[str] = []
    cap_enabled = hard_max_chars > 0

    for chunk in chunks:
        if cap_enabled and len(chunk) > hard_max_chars:
            wanted_parts = max(2, math.ceil(len(chunk) / hard_max_chars))
            for piece in _split_text_by_size(chunk, wanted_parts):
                trimmed = piece.strip()
                if trimmed:
                    capped.append(trimmed + "\n")
        else:
            capped.append(chunk.strip() + "\n")

    return capped
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def split_markdown_into_chunks(
    text: str,
    *,
    target_chunks: int = 6,
    max_heading_level: int = 3,
    include_preamble: bool = True,
    hard_max_chars: int = 0,
) -> List[str]:
    """Split Markdown *text* into heading-aware chunks.

    The text is parsed into sections, which are then combined or subdivided
    to approach ``target_chunks``, and finally passed through the optional
    per-chunk character cap.

    Args:
        text: Markdown source.
        target_chunks: Desired number of chunks; values of 1 or less produce
            a single chunk.
        max_heading_level: Deepest heading level that starts a section.
        include_preamble: Whether text before the first heading is kept.
        hard_max_chars: Split chunks longer than this; 0 or less disables it.

    Returns:
        Non-blank chunks in document order; an empty list for empty input.
    """
    parsed = parse_markdown_sections(
        text,
        max_heading_level=max_heading_level,
        include_preamble=include_preamble,
    )
    if not parsed:
        return []

    section_count = len(parsed)
    if target_chunks <= 1:
        # One chunk requested: glue every section together with blank lines.
        bodies = [section.content.strip() for section in parsed]
        drafted = ["\n\n".join(bodies).strip() + "\n"]
    elif section_count > target_chunks:
        drafted = _merge_sections_to_target(parsed, target_chunks)
    elif section_count < target_chunks:
        drafted = _expand_sections_to_target(parsed, target_chunks)
    else:
        # Exactly one section per requested chunk.
        drafted = [section.content.strip() + "\n" for section in parsed]

    capped = _apply_hard_max_chars(drafted, hard_max_chars)
    return list(filter(str.strip, capped))
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def chunk_manifest(chunks: Iterable[str], source_name: str) -> list[dict]:
    """Describe each chunk with a file name, size figures and a short preview.

    Args:
        chunks: Chunk texts, in output order.
        source_name: Name of the source file; its stem prefixes every chunk
            file name (``"markdown"`` is used when the stem is empty).

    Returns:
        One dict per chunk with the keys ``chunk_index`` (1-based),
        ``file_name``, ``char_count``, ``line_count`` and ``preview`` (the
        first 180 characters with newlines replaced by spaces).
    """
    # Same fallback as build_zip_bytes, so names never start with "_chunk_".
    stem = Path(source_name).stem or "markdown"

    manifest = []
    for index, chunk in enumerate(chunks, start=1):
        # A trailing newline terminates the last line; it does not start
        # another one, so only count an extra line for unterminated text.
        line_count = chunk.count("\n")
        if chunk and not chunk.endswith("\n"):
            line_count += 1
        manifest.append(
            {
                "chunk_index": index,
                "file_name": f"{stem}_chunk_{index:03d}.md",
                "char_count": len(chunk),
                "line_count": line_count,
                "preview": chunk[:180].replace("\n", " ").strip(),
            }
        )
    return manifest
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def build_zip_bytes(chunks: Iterable[str], source_name: str) -> bytes:
    """Pack the chunks and a JSON manifest into an in-memory zip archive.

    Each chunk is stored under the file name ``chunk_manifest`` assigns to it;
    the manifest is stored as ``<stem>_manifest.json`` (``markdown`` when the
    source name has no stem).

    Returns:
        The complete deflate-compressed archive as bytes.
    """
    # Materialise once: the chunks are consumed by the manifest and by the writer.
    materialized = list(chunks)
    entries = chunk_manifest(materialized, source_name)
    manifest_name = f"{Path(source_name).stem or 'markdown'}_manifest.json"
    manifest_json = json.dumps(entries, indent=2, ensure_ascii=False)

    archive_buffer = BytesIO()
    with zipfile.ZipFile(archive_buffer, mode="w", compression=zipfile.ZIP_DEFLATED) as archive:
        for position, entry in enumerate(entries):
            archive.writestr(entry["file_name"], materialized[position])
        archive.writestr(manifest_name, manifest_json)
    return archive_buffer.getvalue()
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "MarkdownHeaderTextSplitter"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "A heading-aware Markdown splitter with a simple local UI"
|
|
9
|
+
authors = [{name="Wenxi Wang"}]
|
|
10
|
+
readme = "README.md"
|
|
11
|
+
requires-python = ">=3.9"
|
|
12
|
+
license = {text = "MIT"}
|
|
13
|
+
dependencies = [
|
|
14
|
+
"streamlit>=1.32",
|
|
15
|
+
]
|
|
16
|
+
keywords = ["markdown", "splitter", "rag", "chunking", "ui"]
|
|
17
|
+
classifiers = [
|
|
18
|
+
"Development Status :: 3 - Alpha",
|
|
19
|
+
"Intended Audience :: Developers",
|
|
20
|
+
"License :: OSI Approved :: MIT License",
|
|
21
|
+
"Operating System :: OS Independent",
|
|
22
|
+
"Programming Language :: Python :: 3",
|
|
23
|
+
"Programming Language :: Python :: 3.9",
|
|
24
|
+
"Programming Language :: Python :: 3.10",
|
|
25
|
+
"Programming Language :: Python :: 3.11",
|
|
26
|
+
"Programming Language :: Python :: 3.12",
|
|
27
|
+
"Topic :: Text Processing :: Markup :: Markdown",
|
|
28
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
[project.urls]
|
|
32
|
+
Homepage = "https://example.com/MarkdownHeaderTextSplitter"
|
|
33
|
+
Repository = "https://example.com/MarkdownHeaderTextSplitter"
|
|
34
|
+
|
|
35
|
+
[project.scripts]
|
|
36
|
+
MarkdownHeaderTextSplitter = "markdownheadertextsplitter.cli:main"
|