semantic-chunker-langchain 0.1.0__tar.gz → 0.1.2__tar.gz

This diff shows the contents of publicly available package versions as released to a supported public registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: semantic-chunker-langchain
-Version: 0.1.0
+Version: 0.1.2
 Summary: Token-aware, LangChain-compatible semantic chunker with PDF and layout support
 License: MIT
 Author: Prajwal Shivaji Mandale
@@ -62,13 +62,14 @@ semantic-chunker sample.pdf --txt chunks.txt --json chunks.json
 
 ### 🔸 From Code
 
-```python
-from langchain_semantic_chunker.chunker import SemanticChunker, SimpleSemanticChunker
-from langchain_semantic_chunker.extractors.pdf import extract_pdf
-from langchain_semantic_chunker.outputs.formatter import write_to_txt
+from semantic_chunker_langchain.chunker import SemanticChunker, SimpleSemanticChunker
+from semantic_chunker_langchain.extractors.pdf import extract_pdf
+from semantic_chunker_langchain.outputs.formatter import write_to_txt
 
 # Extract
 docs = extract_pdf("sample.pdf")
+
+# Using SemanticChunker
 chunker = SemanticChunker(model_name="gpt-3.5-turbo")
 chunks = chunker.split_documents(docs)
 
@@ -78,7 +79,7 @@ write_to_txt(chunks, "output.txt")
 # Using SimpleSemanticChunker
 simple_chunker = SimpleSemanticChunker(model_name="gpt-3.5-turbo")
 simple_chunks = simple_chunker.split_documents(docs)
-```
+
 
 ### 🔸 Convert to Retriever
 
@@ -40,13 +40,14 @@ semantic-chunker sample.pdf --txt chunks.txt --json chunks.json
 
 ### 🔸 From Code
 
-```python
-from langchain_semantic_chunker.chunker import SemanticChunker, SimpleSemanticChunker
-from langchain_semantic_chunker.extractors.pdf import extract_pdf
-from langchain_semantic_chunker.outputs.formatter import write_to_txt
+from semantic_chunker_langchain.chunker import SemanticChunker, SimpleSemanticChunker
+from semantic_chunker_langchain.extractors.pdf import extract_pdf
+from semantic_chunker_langchain.outputs.formatter import write_to_txt
 
 # Extract
 docs = extract_pdf("sample.pdf")
+
+# Using SemanticChunker
 chunker = SemanticChunker(model_name="gpt-3.5-turbo")
 chunks = chunker.split_documents(docs)
 
@@ -56,7 +57,7 @@ write_to_txt(chunks, "output.txt")
 # Using SimpleSemanticChunker
 simple_chunker = SimpleSemanticChunker(model_name="gpt-3.5-turbo")
 simple_chunks = simple_chunker.split_documents(docs)
-```
+
 
 ### 🔸 Convert to Retriever
 
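The README change is the same in both PKG-INFO and README.md: the example's imports move from the old `langchain_semantic_chunker` module path to `semantic_chunker_langchain`, matching the installed package name. Because the example is split across two hunks, here it is assembled in one place as it reads in 0.1.2; this sketch only restates the lines visible in the hunks above and adds nothing beyond them.

```python
# Assembled from the README lines changed in 0.1.2 (new import path).
from semantic_chunker_langchain.chunker import SemanticChunker, SimpleSemanticChunker
from semantic_chunker_langchain.extractors.pdf import extract_pdf
from semantic_chunker_langchain.outputs.formatter import write_to_txt

# Extract
docs = extract_pdf("sample.pdf")

# Using SemanticChunker
chunker = SemanticChunker(model_name="gpt-3.5-turbo")
chunks = chunker.split_documents(docs)
write_to_txt(chunks, "output.txt")

# Using SimpleSemanticChunker
simple_chunker = SimpleSemanticChunker(model_name="gpt-3.5-turbo")
simple_chunks = simple_chunker.split_documents(docs)
```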
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "semantic-chunker-langchain"
3
- version = "0.1.0"
3
+ version = "0.1.2"
4
4
  description = "Token-aware, LangChain-compatible semantic chunker with PDF and layout support"
5
5
  authors = ["Prajwal Shivaji Mandale <prajwal.mandale333@gmail.com>","Sudhnwa Ghorpade <sudhnwa.ghorpade@gmail.com>"]
6
6
  license = "MIT"
@@ -1,7 +1,7 @@
1
1
  # === langchain_semantic_chunker/chunker.py ===
2
2
  from langchain_core.documents import Document
3
3
  from langchain_text_splitters import TextSplitter
4
- from langchain_semantic_chunker.utils import estimate_token_count
4
+ from semantic_chunker_langchain.utils import estimate_token_count
5
5
  import re
6
6
 
7
7
  class SemanticChunker(TextSplitter):
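The only change in chunker.py's header is the import path for `estimate_token_count`, which now resolves against the installed package name `semantic_chunker_langchain.utils`. That utils module is not part of this diff, so its implementation is not shown; the docstring removed in the next hunk describes tiktoken-based token estimation keyed by `model_name`, and a helper along those lines might look like the following (an illustrative sketch under that assumption, not the package's actual code):

```python
# Hypothetical sketch of a tiktoken-based token counter; the real
# semantic_chunker_langchain.utils.estimate_token_count is not shown in this diff.
import tiktoken

def estimate_token_count(text: str, model_name: str = "gpt-3.5-turbo") -> int:
    try:
        encoding = tiktoken.encoding_for_model(model_name)
    except KeyError:
        # Unknown model name: fall back to a generic encoding
        encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(text))
```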
@@ -107,75 +107,3 @@ class SimpleSemanticChunker(SemanticChunker):
     def split_text(self, text):
         return text.split('\n\n')
 
-
-
-
-# from langchain_core.documents import Document
-# from langchain_text_splitters import TextSplitter
-# from langchain_semantic_chunker.utils import estimate_token_count
-
-
-# class SemanticChunker(TextSplitter):
-#     def __init__(self, max_tokens: int = 1500, overlap: int = 200, model_name: str = "gpt-3.5-turbo"):
-#         """
-#         Token-aware document chunker for LangChain.
-
-#         Args:
-#             max_tokens (int): Maximum tokens per chunk
-#             overlap (int): Optional overlap in tokens between chunks
-#             model_name (str): The model name for token estimation (used with tiktoken)
-#         """
-#         self.max_tokens = max_tokens
-#         self.overlap = overlap
-#         self.model_name = model_name
-
-#     def split_documents(self, documents: list[Document]) -> list[Document]:
-#         chunks = []
-
-#         for doc in documents:
-#             text = doc.page_content
-#             metadata = doc.metadata.copy()
-
-#             paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
-#             current_chunk = []
-#             token_count = 0
-
-#             for para in paragraphs:
-#                 para_tokens = estimate_token_count(para, model_name=self.model_name)
-
-#                 if token_count + para_tokens > self.max_tokens:
-#                     # Commit current chunk
-#                     chunk_text = "\n\n".join(current_chunk)
-#                     chunks.append(Document(page_content=chunk_text, metadata=metadata))
-
-#                     # Start new chunk with overlap (if defined)
-#                     if self.overlap and len(current_chunk) > 0:
-#                         overlap_text = current_chunk[-1]
-#                         overlap_tokens = estimate_token_count(overlap_text, model_name=self.model_name)
-#                         current_chunk = [overlap_text]
-#                         token_count = overlap_tokens
-#                     else:
-#                         current_chunk = []
-#                         token_count = 0
-
-#                 current_chunk.append(para)
-#                 token_count += para_tokens
-
-#             if current_chunk:
-#                 chunk_text = "\n\n".join(current_chunk)
-#                 chunks.append(Document(page_content=chunk_text, metadata=metadata))
-
-#         return chunks
-
-#     def split_text(self, text: str) -> list[str]:
-#         """
-#         Dummy method to satisfy LangChain's abstract base class requirement.
-#         """
-#         return text.split('\n\n')
-
-
-
-# class SimpleSemanticChunker(SemanticChunker):
-#     def split_text(self, text):
-#         # Dummy implementation: split by paragraphs
-#         return text.split('\n\n')
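The block removed here is the earlier, fully commented-out draft of the chunker that had been left at the bottom of the module. It is worth reading because it documents the intended algorithm: accumulate paragraphs until `max_tokens` is reached, commit the chunk, and carry the last paragraph forward as overlap for the next one. A quick, hypothetical way to observe that behavior against the live class after upgrading (the constructor arguments mirror the removed draft's docstring; the live signature is not visible in this diff, and the sample text and values are arbitrary):

```python
from langchain_core.documents import Document
from semantic_chunker_langchain.chunker import SemanticChunker

# Three short paragraphs; with a small token budget the chunker should
# commit a chunk once the budget is exceeded and, per the removed draft,
# reuse the last paragraph as overlap at the start of the next chunk.
doc = Document(page_content="Paragraph one.\n\nParagraph two.\n\nParagraph three.")

# max_tokens/overlap/model_name follow the removed draft's docstring (assumption).
chunker = SemanticChunker(max_tokens=10, overlap=5, model_name="gpt-3.5-turbo")
for i, chunk in enumerate(chunker.split_documents([doc])):
    print(i, repr(chunk.page_content))
```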