semantic-chunker-langchain 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,10 @@
1
+ from .chunker import SemanticChunker, SimpleSemanticChunker
2
+ from .utils import estimate_token_count
3
+ from .extractors.pdf import extract_pdf
4
+ from .outputs.formatter import write_to_txt, write_to_json
5
+
6
# Public API surface of semantic_chunker_langchain.
__all__ = [
    "SemanticChunker",
    "SimpleSemanticChunker",
    "estimate_token_count",
    "extract_pdf",
    "write_to_txt",
    "write_to_json",
]
@@ -0,0 +1,181 @@
1
+ # === langchain_semantic_chunker/chunker.py ===
2
+ from langchain_core.documents import Document
3
+ from langchain_text_splitters import TextSplitter
4
+ from langchain_semantic_chunker.utils import estimate_token_count
5
+ import re
6
+
7
class SemanticChunker(TextSplitter):
    """Token-aware chunker that groups paragraph/heading/code blocks into
    chunks bounded by a model-specific token budget.

    Chunks smaller than ``min_tokens`` are merged with their successor, and
    an optional one-block overlap preserves context across chunk borders.
    """

    def __init__(
        self,
        max_tokens: int = None,
        overlap: int = 200,
        model_name: str = "gpt-3.5-turbo",
        chunking_type: str = "text",
        min_tokens: int = 300,
    ):
        """
        Args:
            max_tokens: Maximum tokens per chunk; defaults per model family.
            overlap: If truthy, repeat the last block of a committed chunk
                at the start of the next chunk.
            model_name: Model name used for token estimation (tiktoken).
            chunking_type: One of "text", "markdown", or "code".
            min_tokens: Chunks below this size are merged forward
                (previously a hard-coded 300).
        """
        self.model_name = model_name
        self.max_tokens = max_tokens or self._default_tokens_for_model(model_name)
        self.overlap = overlap
        self.chunking_type = chunking_type
        self.min_tokens = min_tokens
        # Initialize the LangChain TextSplitter base class (the original
        # skipped this, leaving inherited helpers unconfigured). Clamp the
        # overlap so the base-class chunk_overlap <= chunk_size check passes.
        super().__init__(
            chunk_size=self.max_tokens,
            chunk_overlap=min(self.overlap, self.max_tokens),
        )

    def _default_tokens_for_model(self, model_name: str) -> int:
        """Return a conservative per-chunk token budget for the model family."""
        if "claude" in model_name:
            return 8000
        if "gpt-4" in model_name:
            return 4000
        return 1500

    def score_chunk(self, text: str) -> float:
        """Score a chunk by its estimated token count."""
        return estimate_token_count(text, model_name=self.model_name)

    def _split_paragraphs(self, text: str) -> list[str]:
        """Split on blank lines, dropping empty paragraphs."""
        return [p.strip() for p in text.split("\n\n") if p.strip()]

    def _split_markdown(self, text: str) -> list[str]:
        """Split immediately before ATX headings (# through ######).

        Empty fragments produced by re.split (e.g. a leading heading) are
        dropped; the original kept them, which could yield empty chunks.
        """
        blocks = re.split(r"(?=^#{1,6}\s)", text, flags=re.MULTILINE)
        return [b for b in blocks if b.strip()]

    def _split_code(self, text: str) -> list[str]:
        """Naive fallback chunker for code: split on blank lines."""
        return text.split("\n\n")

    def split_documents(self, documents: list[Document]) -> list[Document]:
        """Split documents into token-bounded chunks, then merge runts."""
        chunks: list[Document] = []
        for doc in documents:
            chunks.extend(self._chunk_document(doc))
        return self._merge_small_chunks(chunks)

    def _chunk_document(self, doc: Document) -> list[Document]:
        """Chunk a single document according to ``chunking_type``."""
        text = doc.page_content
        metadata = doc.metadata.copy()

        if self.chunking_type == "markdown":
            blocks = self._split_markdown(text)
        elif self.chunking_type == "code":
            blocks = self._split_code(text)
        else:
            blocks = self._split_paragraphs(text)

        chunks: list[Document] = []
        current_chunk: list[str] = []
        token_count = 0

        for block in blocks:
            block_tokens = estimate_token_count(block, model_name=self.model_name)

            # Commit the running chunk before it would exceed the budget.
            # The `current_chunk` guard fixes an original bug where an
            # oversized first block committed an empty chunk.
            if current_chunk and token_count + block_tokens > self.max_tokens:
                chunks.append(self._make_chunk(current_chunk, metadata))
                if self.overlap:
                    # Carry the last block forward for context continuity.
                    overlap_text = current_chunk[-1]
                    current_chunk = [overlap_text]
                    token_count = estimate_token_count(
                        overlap_text, model_name=self.model_name
                    )
                else:
                    current_chunk = []
                    token_count = 0

            current_chunk.append(block)
            token_count += block_tokens

        if current_chunk:
            chunks.append(self._make_chunk(current_chunk, metadata))
        return chunks

    def _make_chunk(self, blocks: list[str], metadata: dict) -> Document:
        """Join blocks into a Document, attaching a token-count score."""
        chunk_text = "\n\n".join(blocks)
        chunk_metadata = metadata.copy()
        chunk_metadata["score"] = self.score_chunk(chunk_text)
        return Document(page_content=chunk_text, metadata=chunk_metadata)

    def _merge_small_chunks(self, chunks: list[Document]) -> list[Document]:
        """Merge chunks below ``min_tokens`` with their immediate successor."""
        merged: list[Document] = []
        i = 0
        while i < len(chunks):
            chunk = chunks[i]
            token_count = estimate_token_count(
                chunk.page_content, model_name=self.model_name
            )
            if token_count < self.min_tokens and i + 1 < len(chunks):
                next_chunk = chunks[i + 1]
                merged_text = chunk.page_content + "\n\n" + next_chunk.page_content
                merged_meta = chunk.metadata.copy()
                merged_meta.update(next_chunk.metadata)
                merged_meta["score"] = self.score_chunk(merged_text)
                merged.append(
                    Document(page_content=merged_text, metadata=merged_meta)
                )
                i += 2
            else:
                merged.append(chunk)
                i += 1
        return merged

    def split_text(self, text: str) -> list[str]:
        """Satisfy TextSplitter's abstract interface via a paragraph split."""
        return self._split_paragraphs(text)

    def to_retriever(self, chunks: list[Document], embedding) -> object:
        """Build a FAISS-backed retriever over the chunks (lazy import)."""
        from langchain_community.vectorstores import FAISS
        return FAISS.from_documents(chunks, embedding=embedding).as_retriever()
104
+
105
+
106
class SimpleSemanticChunker(SemanticChunker):
    """Minimal variant: splits purely on blank lines, with no stripping,
    filtering, or token budgeting."""

    def split_text(self, text):
        # Raw paragraph split; empty segments are deliberately retained.
        return text.split("\n\n")
109
+
110
+
111
+
112
+
113
+ # from langchain_core.documents import Document
114
+ # from langchain_text_splitters import TextSplitter
115
+ # from langchain_semantic_chunker.utils import estimate_token_count
116
+
117
+
118
+ # class SemanticChunker(TextSplitter):
119
+ # def __init__(self, max_tokens: int = 1500, overlap: int = 200, model_name: str = "gpt-3.5-turbo"):
120
+ # """
121
+ # Token-aware document chunker for LangChain.
122
+
123
+ # Args:
124
+ # max_tokens (int): Maximum tokens per chunk
125
+ # overlap (int): Optional overlap in tokens between chunks
126
+ # model_name (str): The model name for token estimation (used with tiktoken)
127
+ # """
128
+ # self.max_tokens = max_tokens
129
+ # self.overlap = overlap
130
+ # self.model_name = model_name
131
+
132
+ # def split_documents(self, documents: list[Document]) -> list[Document]:
133
+ # chunks = []
134
+
135
+ # for doc in documents:
136
+ # text = doc.page_content
137
+ # metadata = doc.metadata.copy()
138
+
139
+ # paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
140
+ # current_chunk = []
141
+ # token_count = 0
142
+
143
+ # for para in paragraphs:
144
+ # para_tokens = estimate_token_count(para, model_name=self.model_name)
145
+
146
+ # if token_count + para_tokens > self.max_tokens:
147
+ # # Commit current chunk
148
+ # chunk_text = "\n\n".join(current_chunk)
149
+ # chunks.append(Document(page_content=chunk_text, metadata=metadata))
150
+
151
+ # # Start new chunk with overlap (if defined)
152
+ # if self.overlap and len(current_chunk) > 0:
153
+ # overlap_text = current_chunk[-1]
154
+ # overlap_tokens = estimate_token_count(overlap_text, model_name=self.model_name)
155
+ # current_chunk = [overlap_text]
156
+ # token_count = overlap_tokens
157
+ # else:
158
+ # current_chunk = []
159
+ # token_count = 0
160
+
161
+ # current_chunk.append(para)
162
+ # token_count += para_tokens
163
+
164
+ # if current_chunk:
165
+ # chunk_text = "\n\n".join(current_chunk)
166
+ # chunks.append(Document(page_content=chunk_text, metadata=metadata))
167
+
168
+ # return chunks
169
+
170
+ # def split_text(self, text: str) -> list[str]:
171
+ # """
172
+ # Dummy method to satisfy LangChain's abstract base class requirement.
173
+ # """
174
+ # return text.split('\n\n')
175
+
176
+
177
+
178
+ # class SimpleSemanticChunker(SemanticChunker):
179
+ # def split_text(self, text):
180
+ # # Dummy implementation: split by paragraphs
181
+ # return text.split('\n\n')
@@ -0,0 +1,10 @@
1
+ from langchain_core.documents import Document
2
+ import pdfplumber
3
+
4
def extract_pdf(path: str) -> list[Document]:
    """Extract a PDF into one Document per non-empty page.

    Args:
        path: Filesystem path to the PDF file.

    Returns:
        Documents whose metadata carries a 1-based ``page_number``.
        Pages with no extractable text are skipped.
    """
    docs = []
    with pdfplumber.open(path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            # Extract once per page: the original called extract_text()
            # twice, doubling the (expensive) text-extraction work.
            text = page.extract_text()
            if text:
                docs.append(
                    Document(
                        page_content=text,
                        metadata={"page_number": page_number},
                    )
                )
    return docs
@@ -0,0 +1,23 @@
1
+ import json
2
+
3
def write_to_txt(docs, path="output.txt"):
    """Write chunks to a markdown-flavored text file.

    Each chunk gets a header line with its index, source, and page number,
    followed by its content with blank lines inserted between lines.

    Args:
        docs: Iterable of Document-like objects (``page_content``, ``metadata``).
        path: Destination file path (UTF-8).
    """
    with open(path, "w", encoding="utf-8") as f:
        for i, doc in enumerate(docs):
            chunk_header = (
                f"# Chunk {i+1}"
                f" | Source: {doc.metadata.get('source', 'N/A')}"
                f" | Page: {doc.metadata.get('page_number', 'N/A')}\n"
            )

            content = doc.page_content.strip()
            # Normalize bullet characters to markdown dashes. The original
            # literal was the mojibake sequence for U+2022 and never matched.
            content = content.replace("\u2022", "-")
            content = content.replace("\n", "\n\n")  # blank line between lines

            f.write(f"\n\n{chunk_header}{content}\n")
15
+
16
+
17
def write_to_json(docs, path="output.json"):
    """Serialize chunks to a JSON array of {chunk, content, metadata} records.

    Args:
        docs: Iterable of Document-like objects (``page_content``, ``metadata``).
        path: Destination file path (UTF-8, pretty-printed, non-ASCII kept).
    """
    records = []
    for index, doc in enumerate(docs, start=1):
        records.append(
            {
                "chunk": index,
                "content": doc.page_content.strip(),
                "metadata": doc.metadata,
            }
        )
    with open(path, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)
@@ -0,0 +1,8 @@
1
+ import tiktoken
2
+
3
def estimate_token_count(text: str, model_name: str = "gpt-3.5-turbo") -> int:
    """Estimate the token count of *text* for the given model.

    Falls back to the ``cl100k_base`` encoding when tiktoken does not
    recognize the model name.
    """
    try:
        encoder = tiktoken.encoding_for_model(model_name)
    except Exception:
        # Unknown model names raise; use the generic base encoding instead.
        encoder = tiktoken.get_encoding("cl100k_base")
    return len(encoder.encode(text))
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Prajwal Mandale, Sudhnwa Ghorpade
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,110 @@
1
+ Metadata-Version: 2.3
2
+ Name: semantic-chunker-langchain
3
+ Version: 0.1.0
4
+ Summary: Token-aware, LangChain-compatible semantic chunker with PDF and layout support
5
+ License: MIT
6
+ Author: Prajwal Shivaji Mandale
7
+ Author-email: prajwal.mandale333@gmail.com
8
+ Requires-Python: >=3.9,<3.13
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.9
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Requires-Dist: faiss-cpu (>=1.11.0,<2.0.0)
16
+ Requires-Dist: langchain (>=0.3.25,<0.4.0)
17
+ Requires-Dist: langchain-community (>=0.3.26,<0.4.0)
18
+ Requires-Dist: openai (>=1.84.0,<2.0.0)
19
+ Requires-Dist: pdfplumber (>=0.11.6,<0.12.0)
20
+ Requires-Dist: tiktoken (>=0.9.0,<0.10.0)
21
+ Description-Content-Type: text/markdown
22
+
23
+ # Semantic Chunker for LangChain
24
+
25
+ A **token-aware**, **LangChain-compatible** chunker that splits text (from PDF, markdown, or plain text) into semantically coherent chunks while respecting model token limits.
26
+
27
+ ---
28
+
29
+ ## 🚀 Features
30
+
31
+ * ๐Ÿ” **Model-Aware Token Limits**: Automatically adjusts chunking size for GPT-3.5, GPT-4, Claude, and others.
32
+ * ๐Ÿ“„ **Multi-format Input Support**:
33
+
34
+ * PDF via `pdfplumber`
35
+ * Plain `.txt`
36
+ * Markdown
37
+ * (Extendable to `.docx` and `.html`)
38
+ * ๐Ÿ” **Overlapping Chunks**: Smart overlap between paragraphs to preserve context.
39
+ * ๐Ÿง  **Smart Merging**: Merges chunks smaller than 300 tokens.
40
+ * ๐Ÿงฉ **Retriever-Ready**: Direct integration with `LangChain` retrievers via FAISS.
41
+ * ๐Ÿ”ง **CLI Support**: Run from terminal with one command.
42
+
43
+ ---
44
+
45
+ ## ๐Ÿ“ฆ Installation
46
+
47
+ ```bash
48
+ pip install semantic-chunker-langchain
49
+ ```
50
+
51
+ > Requires Python 3.9 - 3.12
52
+
53
+ ---
54
+
55
+ ## ๐Ÿ› ๏ธ Usage
56
+
57
+ ### ๐Ÿ”ธ Chunk a PDF and Save to JSON/TXT
58
+
59
+ ```bash
60
+ semantic-chunker sample.pdf --txt chunks.txt --json chunks.json
61
+ ```
62
+
63
+ ### ๐Ÿ”ธ From Code
64
+
65
+ ```python
66
+ from langchain_semantic_chunker.chunker import SemanticChunker, SimpleSemanticChunker
67
+ from langchain_semantic_chunker.extractors.pdf import extract_pdf
68
+ from langchain_semantic_chunker.outputs.formatter import write_to_txt
69
+
70
+ # Extract
71
+ docs = extract_pdf("sample.pdf")
72
+ chunker = SemanticChunker(model_name="gpt-3.5-turbo")
73
+ chunks = chunker.split_documents(docs)
74
+
75
+ # Save to file
76
+ write_to_txt(chunks, "output.txt")
77
+
78
+ # Using SimpleSemanticChunker
79
+ simple_chunker = SimpleSemanticChunker(model_name="gpt-3.5-turbo")
80
+ simple_chunks = simple_chunker.split_documents(docs)
81
+ ```
82
+
83
+ ### ๐Ÿ”ธ Convert to Retriever
84
+
85
+ ```python
86
+ from langchain_community.embeddings import OpenAIEmbeddings
87
+ retriever = chunker.to_retriever(chunks, embedding=OpenAIEmbeddings())
88
+ ```
89
+
90
+ ---
91
+
92
+ ## ๐Ÿงช Testing
93
+
94
+ ```bash
95
+ poetry run pytest tests/
96
+ ```
97
+
98
+ ---
99
+
100
+ ## ๐Ÿ‘จโ€๐Ÿ’ป Authors
101
+
102
+ * Prajwal Shivaji Mandale
103
+ * Sudhnwa Ghorpade
104
+
105
+ ---
106
+
107
+ ## ๐Ÿ“œ License
108
+
109
+ This project is licensed under the MIT License.
110
+
@@ -0,0 +1,10 @@
1
+ semantic_chunker_langchain/__init__.py,sha256=fg5lxVkyosKw8alIAQHzQFkysAcIdjUfQpFwnhlgqAQ,351
2
+ semantic_chunker_langchain/chunker.py,sha256=KtdY3dHlJKCtux6G75GtinqoOCVzhVXfCCSmD11kOUM,7253
3
+ semantic_chunker_langchain/extractors/pdf.py,sha256=8jRWBCMeIK3M_WgOyDqxxadEHQw678CzD5ryAJ0tvAA,356
4
+ semantic_chunker_langchain/outputs/formatter.py,sha256=tYShwikgwIleV6Nz1ohmtGX6nQRVnY41NOkOT6v43Qk,964
5
+ semantic_chunker_langchain/utils.py,sha256=E0Ajj2IBa6EFJJkGYZ8pyWUEKEAjiL9_Uof8KPnM8ew,288
6
+ semantic_chunker_langchain-0.1.0.dist-info/entry_points.txt,sha256=Kve0GJQ5uzNSMBidDihM9sFuoUY90OeP5THfJWQLDVQ,45
7
+ semantic_chunker_langchain-0.1.0.dist-info/LICENSE,sha256=vfqlCGc0OOjpze243uuSsBAAq1OFEoCLbmElHpljFWM,1111
8
+ semantic_chunker_langchain-0.1.0.dist-info/METADATA,sha256=KnJVAnDvroFTjUlq6-J6ER2lGmGwI6WX1PwbIca1y2o,2894
9
+ semantic_chunker_langchain-0.1.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
10
+ semantic_chunker_langchain-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: poetry-core 2.1.3
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ semantic-chunker=cli:main
3
+