semantic-chunker-langchain 0.1.1__tar.gz → 0.1.3__tar.gz

This diff shows the changes between two publicly released versions of this package, as published to a supported registry. It is provided for informational purposes only.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: semantic-chunker-langchain
- Version: 0.1.1
+ Version: 0.1.3
  Summary: Token-aware, LangChain-compatible semantic chunker with PDF and layout support
  License: MIT
  Author: Prajwal Shivaji Mandale
@@ -42,7 +42,7 @@ A **token-aware**, **LangChain-compatible** chunker that splits text (from PDF,
  
  ---
  
- ## 📦 Installation
+ ## 📆 Installation
  
  ```bash
  pip install semantic-chunker-langchain
@@ -63,12 +63,14 @@ semantic-chunker sample.pdf --txt chunks.txt --json chunks.json
  ### 🔸 From Code
  
  ```python
- from langchain_semantic_chunker.chunker import SemanticChunker, SimpleSemanticChunker
- from langchain_semantic_chunker.extractors.pdf import extract_pdf
- from langchain_semantic_chunker.outputs.formatter import write_to_txt
+ from semantic_chunker_langchain.chunker import SemanticChunker, SimpleSemanticChunker
+ from semantic_chunker_langchain.extractors.pdf import extract_pdf
+ from semantic_chunker_langchain.outputs.formatter import write_to_txt
  
  # Extract
  docs = extract_pdf("sample.pdf")
+
+ # Using SemanticChunker
  chunker = SemanticChunker(model_name="gpt-3.5-turbo")
  chunks = chunker.split_documents(docs)
  
@@ -89,7 +91,7 @@ retriever = chunker.to_retriever(chunks, embedding=OpenAIEmbeddings())
  
  ---
  
- ## 🧪 Testing
+ ## 📊 Testing
  
  ```bash
  poetry run pytest tests/
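
The substantive change in 0.1.3 is the import path: the project is published as semantic-chunker-langchain, so the importable package is `semantic_chunker_langchain`, and the `langchain_semantic_chunker` paths in the 0.1.1 docs did not match the installed package. The identical hunks repeat below for README.md, since the sdist also embeds the README as the long description in its PKG-INFO metadata. A minimal end-to-end sketch with the corrected paths; the `write_to_txt(chunks, path)` call signature is an assumption, everything else is taken from the diff:

```python
from semantic_chunker_langchain.chunker import SemanticChunker
from semantic_chunker_langchain.extractors.pdf import extract_pdf
from semantic_chunker_langchain.outputs.formatter import write_to_txt

# Extract per-page Documents from a PDF (API as shown in the README diff).
docs = extract_pdf("sample.pdf")

# Token-aware chunking; model_name selects the tokenizer used for counting.
chunker = SemanticChunker(model_name="gpt-3.5-turbo")
chunks = chunker.split_documents(docs)

# Persist the chunks; the exact signature of write_to_txt is assumed here.
write_to_txt(chunks, "chunks.txt")
```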
@@ -20,7 +20,7 @@ A **token-aware**, **LangChain-compatible** chunker that splits text (from PDF,
  
  ---
  
- ## 📦 Installation
+ ## 📆 Installation
  
  ```bash
  pip install semantic-chunker-langchain
@@ -41,12 +41,14 @@ semantic-chunker sample.pdf --txt chunks.txt --json chunks.json
  ### 🔸 From Code
  
  ```python
- from langchain_semantic_chunker.chunker import SemanticChunker, SimpleSemanticChunker
- from langchain_semantic_chunker.extractors.pdf import extract_pdf
- from langchain_semantic_chunker.outputs.formatter import write_to_txt
+ from semantic_chunker_langchain.chunker import SemanticChunker, SimpleSemanticChunker
+ from semantic_chunker_langchain.extractors.pdf import extract_pdf
+ from semantic_chunker_langchain.outputs.formatter import write_to_txt
  
  # Extract
  docs = extract_pdf("sample.pdf")
+
+ # Using SemanticChunker
  chunker = SemanticChunker(model_name="gpt-3.5-turbo")
  chunks = chunker.split_documents(docs)
  
@@ -67,7 +69,7 @@ retriever = chunker.to_retriever(chunks, embedding=OpenAIEmbeddings())
  
  ---
  
- ## 🧪 Testing
+ ## 📊 Testing
  
  ```bash
  poetry run pytest tests/
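
The hunk context above also shows the package's `to_retriever` helper, which turns the chunks into a LangChain retriever. A hedged sketch of that last step: the `to_retriever(chunks, embedding=...)` call is taken from the diff context, while the `langchain_openai` import location and the `invoke` query are assumptions based on current LangChain conventions, not on this diff:

```python
from langchain_openai import OpenAIEmbeddings

from semantic_chunker_langchain.chunker import SemanticChunker
from semantic_chunker_langchain.extractors.pdf import extract_pdf

docs = extract_pdf("sample.pdf")
chunker = SemanticChunker(model_name="gpt-3.5-turbo")
chunks = chunker.split_documents(docs)

# Build a retriever over the chunks (embedding backend assumed to be OpenAI).
retriever = chunker.to_retriever(chunks, embedding=OpenAIEmbeddings())
results = retriever.invoke("What does the document say about chunking?")
```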
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "semantic-chunker-langchain"
- version = "0.1.1"
+ version = "0.1.3"
  description = "Token-aware, LangChain-compatible semantic chunker with PDF and layout support"
  authors = ["Prajwal Shivaji Mandale <prajwal.mandale333@gmail.com>","Sudhnwa Ghorpade <sudhnwa.ghorpade@gmail.com>"]
  license = "MIT"
@@ -107,75 +107,3 @@ class SimpleSemanticChunker(SemanticChunker):
      def split_text(self, text):
          return text.split('\n\n')
  
-
-
-
- # from langchain_core.documents import Document
- # from langchain_text_splitters import TextSplitter
- # from langchain_semantic_chunker.utils import estimate_token_count
-
-
- # class SemanticChunker(TextSplitter):
- #     def __init__(self, max_tokens: int = 1500, overlap: int = 200, model_name: str = "gpt-3.5-turbo"):
- #         """
- #         Token-aware document chunker for LangChain.
-
- #         Args:
- #             max_tokens (int): Maximum tokens per chunk
- #             overlap (int): Optional overlap in tokens between chunks
- #             model_name (str): The model name for token estimation (used with tiktoken)
- #         """
- #         self.max_tokens = max_tokens
- #         self.overlap = overlap
- #         self.model_name = model_name
-
- #     def split_documents(self, documents: list[Document]) -> list[Document]:
- #         chunks = []
-
- #         for doc in documents:
- #             text = doc.page_content
- #             metadata = doc.metadata.copy()
-
- #             paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
- #             current_chunk = []
- #             token_count = 0
-
- #             for para in paragraphs:
- #                 para_tokens = estimate_token_count(para, model_name=self.model_name)
-
- #                 if token_count + para_tokens > self.max_tokens:
- #                     # Commit current chunk
- #                     chunk_text = "\n\n".join(current_chunk)
- #                     chunks.append(Document(page_content=chunk_text, metadata=metadata))
-
- #                     # Start new chunk with overlap (if defined)
- #                     if self.overlap and len(current_chunk) > 0:
- #                         overlap_text = current_chunk[-1]
- #                         overlap_tokens = estimate_token_count(overlap_text, model_name=self.model_name)
- #                         current_chunk = [overlap_text]
- #                         token_count = overlap_tokens
- #                     else:
- #                         current_chunk = []
- #                         token_count = 0
-
- #                 current_chunk.append(para)
- #                 token_count += para_tokens
-
- #             if current_chunk:
- #                 chunk_text = "\n\n".join(current_chunk)
- #                 chunks.append(Document(page_content=chunk_text, metadata=metadata))
-
- #         return chunks
-
- #     def split_text(self, text: str) -> list[str]:
- #         """
- #         Dummy method to satisfy LangChain's abstract base class requirement.
- #         """
- #         return text.split('\n\n')
-
-
-
- # class SimpleSemanticChunker(SemanticChunker):
- #     def split_text(self, text):
- #         # Dummy implementation: split by paragraphs
- #         return text.split('\n\n')
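
The block deleted above is an earlier, fully commented-out draft of the same chunker that had shipped inside chunker.py. For reference, here is a minimal standalone sketch of that draft's paragraph-wise, token-budget logic; the tiktoken-based counter stands in for the package-internal `estimate_token_count` helper (an assumption), and a guard is added for the empty-first-chunk edge case the draft left open:

```python
import tiktoken


def count_tokens(text: str, model_name: str = "gpt-3.5-turbo") -> int:
    # Count tokens with the model's tokenizer (assumed stand-in for the
    # package's estimate_token_count helper).
    return len(tiktoken.encoding_for_model(model_name).encode(text))


def chunk_paragraphs(text: str, max_tokens: int = 1500, overlap: bool = True) -> list[str]:
    # Split on blank lines, then pack paragraphs into chunks that stay under
    # the token budget, optionally carrying the last paragraph forward.
    paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
    chunks: list[str] = []
    current: list[str] = []
    tokens = 0

    for para in paragraphs:
        para_tokens = count_tokens(para)
        if current and tokens + para_tokens > max_tokens:
            # Commit the current chunk before starting a new one.
            chunks.append("\n\n".join(current))
            if overlap:
                # Seed the next chunk with the previous chunk's last paragraph,
                # mirroring the draft's single-paragraph overlap.
                current = [current[-1]]
                tokens = count_tokens(current[0])
            else:
                current, tokens = [], 0
        current.append(para)
        tokens += para_tokens

    if current:
        chunks.append("\n\n".join(current))
    return chunks
```

The draft's `overlap` parameter was an integer token budget but was only ever tested for truthiness, so the sketch reduces it to a flag.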