semantic-chunker-langchain 0.1.0 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- semantic_chunker_langchain/__init__.py +10 -0
- semantic_chunker_langchain/chunker.py +181 -0
- semantic_chunker_langchain/extractors/pdf.py +10 -0
- semantic_chunker_langchain/outputs/formatter.py +23 -0
- semantic_chunker_langchain/utils.py +8 -0
- semantic_chunker_langchain-0.1.0.dist-info/LICENSE +21 -0
- semantic_chunker_langchain-0.1.0.dist-info/METADATA +110 -0
- semantic_chunker_langchain-0.1.0.dist-info/RECORD +10 -0
- semantic_chunker_langchain-0.1.0.dist-info/WHEEL +4 -0
- semantic_chunker_langchain-0.1.0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,10 @@
+from .chunker import SemanticChunker, SimpleSemanticChunker
+from .utils import estimate_token_count
+from .extractors.pdf import extract_pdf
+from .outputs.formatter import write_to_txt, write_to_json
+
+__all__ = [
+    "SemanticChunker", "SimpleSemanticChunker",
+    "estimate_token_count", "extract_pdf",
+    "write_to_txt", "write_to_json"
+]
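Because `__init__.py` re-exports the whole public API, callers never need the nested module paths; a minimal sketch:

```python
# Flat imports enabled by the re-exports above.
from semantic_chunker_langchain import SemanticChunker, extract_pdf, write_to_json
```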
@@ -0,0 +1,181 @@
+# === semantic_chunker_langchain/chunker.py ===
+from typing import Optional
+
+from langchain_core.documents import Document
+from langchain_text_splitters import TextSplitter
+
+from .utils import estimate_token_count
+import re
+
+
+class SemanticChunker(TextSplitter):
+    def __init__(self, max_tokens: Optional[int] = None, overlap: int = 200, model_name: str = "gpt-3.5-turbo", chunking_type: str = "text"):
+        self.model_name = model_name
+        self.max_tokens = max_tokens or self._default_tokens_for_model(model_name)
+        self.overlap = overlap
+        self.chunking_type = chunking_type
+        # Initialize the TextSplitter base class so inherited helpers see
+        # the same size settings.
+        super().__init__(chunk_size=self.max_tokens, chunk_overlap=overlap)
+
+    def _default_tokens_for_model(self, model_name: str) -> int:
+        if "claude" in model_name:
+            return 8000
+        elif "gpt-4" in model_name:
+            return 4000
+        else:
+            return 1500
+
+    def score_chunk(self, text: str) -> float:
+        return estimate_token_count(text, model_name=self.model_name)
+
+    def _split_paragraphs(self, text: str) -> list[str]:
+        return [p.strip() for p in text.split("\n\n") if p.strip()]
+
+    def _split_markdown(self, text: str) -> list[str]:
+        # Split at headings (e.g., ## or ###)
+        return re.split(r"(?=^#{1,6}\s)", text, flags=re.MULTILINE)
+
+    def _split_code(self, text: str) -> list[str]:
+        return text.split("\n\n")  # Simple fallback chunker
+
+    def split_documents(self, documents: list[Document]) -> list[Document]:
+        chunks = []
+
+        for doc in documents:
+            text = doc.page_content
+            metadata = doc.metadata.copy()
+
+            if self.chunking_type == "markdown":
+                blocks = self._split_markdown(text)
+            elif self.chunking_type == "code":
+                blocks = self._split_code(text)
+            else:
+                blocks = self._split_paragraphs(text)
+
+            current_chunk = []
+            token_count = 0
+
+            for block in blocks:
+                block_tokens = estimate_token_count(block, model_name=self.model_name)
+
+                # Commit the running chunk once the budget would be exceeded;
+                # the current_chunk guard prevents committing an empty chunk
+                # when a single oversized block arrives first.
+                if token_count + block_tokens > self.max_tokens and current_chunk:
+                    chunk_text = "\n\n".join(current_chunk)
+                    chunk_metadata = metadata.copy()
+                    chunk_metadata["score"] = self.score_chunk(chunk_text)
+                    chunks.append(Document(page_content=chunk_text, metadata=chunk_metadata))
+
+                    if self.overlap:
+                        # Carry the last block into the next chunk as overlap.
+                        overlap_text = current_chunk[-1]
+                        overlap_tokens = estimate_token_count(overlap_text, model_name=self.model_name)
+                        current_chunk = [overlap_text]
+                        token_count = overlap_tokens
+                    else:
+                        current_chunk = []
+                        token_count = 0
+
+                current_chunk.append(block)
+                token_count += block_tokens
+
+            if current_chunk:
+                chunk_text = "\n\n".join(current_chunk)
+                chunk_metadata = metadata.copy()
+                chunk_metadata["score"] = self.score_chunk(chunk_text)
+                chunks.append(Document(page_content=chunk_text, metadata=chunk_metadata))
+
+        # Second pass: merge undersized chunks (< 300 tokens) into their successor.
+        merged_chunks = []
+        i = 0
+        while i < len(chunks):
+            chunk = chunks[i]
+            token_count = estimate_token_count(chunk.page_content, model_name=self.model_name)
+            if token_count < 300 and i + 1 < len(chunks):
+                next_chunk = chunks[i + 1]
+                merged_text = chunk.page_content + "\n\n" + next_chunk.page_content
+                merged_meta = chunk.metadata.copy()
+                merged_meta.update(next_chunk.metadata)
+                merged_meta["score"] = self.score_chunk(merged_text)
+                merged_chunks.append(Document(page_content=merged_text, metadata=merged_meta))
+                i += 2
+            else:
+                merged_chunks.append(chunk)
+                i += 1
+
+        return merged_chunks
+
+    def split_text(self, text: str) -> list[str]:
+        return self._split_paragraphs(text)
+
+    def to_retriever(self, chunks: list[Document], embedding) -> object:
+        from langchain_community.vectorstores import FAISS
+        return FAISS.from_documents(chunks, embedding=embedding).as_retriever()
+
+
+class SimpleSemanticChunker(SemanticChunker):
+    def split_text(self, text):
+        return text.split("\n\n")
+
+# (The remainder of the released file, lines 113-181, is an earlier draft of
+# SemanticChunker kept commented out: a paragraph-only split_documents with
+# no chunking_type routing, scoring, or merging.)
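In short, `split_documents` routes each document through the splitter selected by `chunking_type`, packs blocks up to the model's token budget while carrying one block of overlap forward, and then merges undersized chunks into their successors. A minimal sketch of the markdown path (hypothetical input, not part of the wheel):

```python
from langchain_core.documents import Document
from semantic_chunker_langchain import SemanticChunker

text = "# Intro\n\nShort opening paragraph.\n\n## Details\n\nMore content here."
chunker = SemanticChunker(model_name="gpt-3.5-turbo", chunking_type="markdown")

# Each emitted chunk carries a "score" metadata entry with its token estimate.
for chunk in chunker.split_documents([Document(page_content=text)]):
    print(chunk.metadata["score"], repr(chunk.page_content[:40]))
```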
@@ -0,0 +1,10 @@
+from langchain_core.documents import Document
+import pdfplumber
+
+def extract_pdf(path: str) -> list[Document]:
+    # One Document per page that has extractable text; the walrus binding
+    # calls page.extract_text() only once per page.
+    with pdfplumber.open(path) as pdf:
+        return [
+            Document(page_content=text, metadata={"page_number": i + 1})
+            for i, page in enumerate(pdf.pages)
+            if (text := page.extract_text())
+        ]
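A short usage sketch (assuming a local `sample.pdf`; pages with no extractable text are silently dropped):

```python
from semantic_chunker_langchain import extract_pdf

docs = extract_pdf("sample.pdf")  # hypothetical input file
for doc in docs:
    print(doc.metadata["page_number"], len(doc.page_content))
```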
@@ -0,0 +1,23 @@
+import json
+
+def write_to_txt(docs, path="output.txt"):
+    with open(path, "w", encoding="utf-8") as f:
+        for i, doc in enumerate(docs):
+            chunk_header = f"# Chunk {i+1} | Source: {doc.metadata.get('source', 'N/A')} | Page: {doc.metadata.get('page_number', 'N/A')}\n"
+
+            # Format content for markdown: spacing and bullets
+            content = doc.page_content.strip()
+            content = content.replace("•", "-")
+            content = content.replace("\n", "\n\n")  # Add blank line between lines
+
+            # Write formatted chunk
+            f.write(f"\n\n{chunk_header}{content}\n")
+
+
+def write_to_json(docs, path="output.json"):
+    formatted = [
+        {"chunk": i + 1, "content": doc.page_content.strip(), "metadata": doc.metadata}
+        for i, doc in enumerate(docs)
+    ]
+    with open(path, "w", encoding="utf-8") as f:
+        json.dump(formatted, f, indent=2, ensure_ascii=False)
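`write_to_json` numbers chunks from 1 and passes metadata through untouched; a sketch of the resulting shape (hypothetical one-document input):

```python
from langchain_core.documents import Document
from semantic_chunker_langchain import write_to_json

write_to_json([Document(page_content="Hello world", metadata={"source": "demo"})], "out.json")
# out.json now holds (pretty-printed):
# [{"chunk": 1, "content": "Hello world", "metadata": {"source": "demo"}}]
```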
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 Prajwal Mandale, Sudhnwa Ghorpade
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
@@ -0,0 +1,110 @@
+Metadata-Version: 2.3
+Name: semantic-chunker-langchain
+Version: 0.1.0
+Summary: Token-aware, LangChain-compatible semantic chunker with PDF and layout support
+License: MIT
+Author: Prajwal Shivaji Mandale
+Author-email: prajwal.mandale333@gmail.com
+Requires-Python: >=3.9,<3.13
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Requires-Dist: faiss-cpu (>=1.11.0,<2.0.0)
+Requires-Dist: langchain (>=0.3.25,<0.4.0)
+Requires-Dist: langchain-community (>=0.3.26,<0.4.0)
+Requires-Dist: openai (>=1.84.0,<2.0.0)
+Requires-Dist: pdfplumber (>=0.11.6,<0.12.0)
+Requires-Dist: tiktoken (>=0.9.0,<0.10.0)
+Description-Content-Type: text/markdown
+
+# Semantic Chunker for LangChain
+
+A **token-aware**, **LangChain-compatible** chunker that splits text (from PDF, markdown, or plain text) into semantically coherent chunks while respecting model token limits.
+
+---
+
+## Features
+
+* **Model-Aware Token Limits**: Automatically adjusts the chunk token budget for GPT-3.5, GPT-4, Claude, and others (defaults sketched just below this list).
+* **Multi-format Input Support**:
+
+  * PDF via `pdfplumber`
+  * Plain `.txt`
+  * Markdown
+  * (Extendable to `.docx` and `.html`)
+* **Overlapping Chunks**: Carries the last paragraph of each full chunk into the next one to preserve context.
+* **Smart Merging**: Merges any chunk smaller than 300 tokens into the chunk that follows it.
+* **Retriever-Ready**: Direct integration with LangChain retrievers via FAISS.
+* **CLI Support**: Run from the terminal with one command.
+
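The default budgets come from `_default_tokens_for_model` in `chunker.py`; a small sketch (the model names here are illustrative):

```python
from semantic_chunker_langchain import SemanticChunker

# Defaults applied when max_tokens is not given:
print(SemanticChunker(model_name="claude-3-opus").max_tokens)   # 8000
print(SemanticChunker(model_name="gpt-4").max_tokens)           # 4000
print(SemanticChunker(model_name="gpt-3.5-turbo").max_tokens)   # 1500 (fallback)
```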
+---
+
+## Installation
+
+```bash
+pip install semantic-chunker-langchain
+```
+
+> Requires Python 3.9 - 3.12
+
+---
+
+## Usage
+
+### Chunk a PDF and Save to JSON/TXT
+
+```bash
+semantic-chunker sample.pdf --txt chunks.txt --json chunks.json
+```
+
+### From Code
+
+```python
+from semantic_chunker_langchain.chunker import SemanticChunker, SimpleSemanticChunker
+from semantic_chunker_langchain.extractors.pdf import extract_pdf
+from semantic_chunker_langchain.outputs.formatter import write_to_txt
+
+# Extract
+docs = extract_pdf("sample.pdf")
+chunker = SemanticChunker(model_name="gpt-3.5-turbo")
+chunks = chunker.split_documents(docs)
+
+# Save to file
+write_to_txt(chunks, "output.txt")
+
+# Using SimpleSemanticChunker
+simple_chunker = SimpleSemanticChunker(model_name="gpt-3.5-turbo")
+simple_chunks = simple_chunker.split_documents(docs)
+```
+
+### Convert to Retriever
+
+```python
+from langchain_community.embeddings import OpenAIEmbeddings
+retriever = chunker.to_retriever(chunks, embedding=OpenAIEmbeddings())
+```
+
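The returned object is a standard LangChain retriever, so it can be queried directly; a hypothetical follow-up (requires an OpenAI API key for the embeddings):

```python
results = retriever.invoke("What does the document say about chunking?")
print(results[0].page_content)
```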
+---
+
+## Testing
+
+```bash
+poetry run pytest tests/
+```
+
+---
+
+## Authors
+
+* Prajwal Shivaji Mandale
+* Sudhnwa Ghorpade
+
+---
+
+## License
+
+This project is licensed under the MIT License.
+
@@ -0,0 +1,10 @@
+semantic_chunker_langchain/__init__.py,sha256=fg5lxVkyosKw8alIAQHzQFkysAcIdjUfQpFwnhlgqAQ,351
+semantic_chunker_langchain/chunker.py,sha256=KtdY3dHlJKCtux6G75GtinqoOCVzhVXfCCSmD11kOUM,7253
+semantic_chunker_langchain/extractors/pdf.py,sha256=8jRWBCMeIK3M_WgOyDqxxadEHQw678CzD5ryAJ0tvAA,356
+semantic_chunker_langchain/outputs/formatter.py,sha256=tYShwikgwIleV6Nz1ohmtGX6nQRVnY41NOkOT6v43Qk,964
+semantic_chunker_langchain/utils.py,sha256=E0Ajj2IBa6EFJJkGYZ8pyWUEKEAjiL9_Uof8KPnM8ew,288
+semantic_chunker_langchain-0.1.0.dist-info/entry_points.txt,sha256=Kve0GJQ5uzNSMBidDihM9sFuoUY90OeP5THfJWQLDVQ,45
+semantic_chunker_langchain-0.1.0.dist-info/LICENSE,sha256=vfqlCGc0OOjpze243uuSsBAAq1OFEoCLbmElHpljFWM,1111
+semantic_chunker_langchain-0.1.0.dist-info/METADATA,sha256=KnJVAnDvroFTjUlq6-J6ER2lGmGwI6WX1PwbIca1y2o,2894
+semantic_chunker_langchain-0.1.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+semantic_chunker_langchain-0.1.0.dist-info/RECORD,,