langroid 0.58.2__py3-none-any.whl → 0.59.0b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langroid/agent/base.py +39 -17
- langroid/agent/base.py-e +2216 -0
- langroid/agent/callbacks/chainlit.py +2 -1
- langroid/agent/chat_agent.py +73 -55
- langroid/agent/chat_agent.py-e +2086 -0
- langroid/agent/chat_document.py +7 -7
- langroid/agent/chat_document.py-e +513 -0
- langroid/agent/openai_assistant.py +9 -9
- langroid/agent/openai_assistant.py-e +882 -0
- langroid/agent/special/arangodb/arangodb_agent.py +10 -18
- langroid/agent/special/arangodb/arangodb_agent.py-e +648 -0
- langroid/agent/special/arangodb/tools.py +3 -3
- langroid/agent/special/doc_chat_agent.py +16 -14
- langroid/agent/special/lance_rag/critic_agent.py +2 -2
- langroid/agent/special/lance_rag/query_planner_agent.py +4 -4
- langroid/agent/special/lance_tools.py +6 -5
- langroid/agent/special/lance_tools.py-e +61 -0
- langroid/agent/special/neo4j/neo4j_chat_agent.py +3 -7
- langroid/agent/special/neo4j/neo4j_chat_agent.py-e +430 -0
- langroid/agent/special/relevance_extractor_agent.py +1 -1
- langroid/agent/special/sql/sql_chat_agent.py +11 -3
- langroid/agent/task.py +9 -87
- langroid/agent/task.py-e +2418 -0
- langroid/agent/tool_message.py +33 -17
- langroid/agent/tool_message.py-e +400 -0
- langroid/agent/tools/file_tools.py +4 -2
- langroid/agent/tools/file_tools.py-e +234 -0
- langroid/agent/tools/mcp/fastmcp_client.py +19 -6
- langroid/agent/tools/mcp/fastmcp_client.py-e +584 -0
- langroid/agent/tools/orchestration.py +22 -17
- langroid/agent/tools/orchestration.py-e +301 -0
- langroid/agent/tools/recipient_tool.py +3 -3
- langroid/agent/tools/task_tool.py +22 -16
- langroid/agent/tools/task_tool.py-e +249 -0
- langroid/agent/xml_tool_message.py +90 -35
- langroid/agent/xml_tool_message.py-e +392 -0
- langroid/cachedb/base.py +1 -1
- langroid/embedding_models/base.py +2 -2
- langroid/embedding_models/models.py +3 -7
- langroid/embedding_models/models.py-e +563 -0
- langroid/exceptions.py +4 -1
- langroid/language_models/azure_openai.py +2 -2
- langroid/language_models/azure_openai.py-e +134 -0
- langroid/language_models/base.py +6 -4
- langroid/language_models/base.py-e +812 -0
- langroid/language_models/client_cache.py +64 -0
- langroid/language_models/config.py +2 -4
- langroid/language_models/config.py-e +18 -0
- langroid/language_models/model_info.py +9 -1
- langroid/language_models/model_info.py-e +483 -0
- langroid/language_models/openai_gpt.py +119 -20
- langroid/language_models/openai_gpt.py-e +2280 -0
- langroid/language_models/provider_params.py +3 -22
- langroid/language_models/provider_params.py-e +153 -0
- langroid/mytypes.py +11 -4
- langroid/mytypes.py-e +132 -0
- langroid/parsing/code_parser.py +1 -1
- langroid/parsing/file_attachment.py +1 -1
- langroid/parsing/file_attachment.py-e +246 -0
- langroid/parsing/md_parser.py +14 -4
- langroid/parsing/md_parser.py-e +574 -0
- langroid/parsing/parser.py +22 -7
- langroid/parsing/parser.py-e +410 -0
- langroid/parsing/repo_loader.py +3 -1
- langroid/parsing/repo_loader.py-e +812 -0
- langroid/parsing/search.py +1 -1
- langroid/parsing/url_loader.py +17 -51
- langroid/parsing/url_loader.py-e +683 -0
- langroid/parsing/urls.py +5 -4
- langroid/parsing/urls.py-e +279 -0
- langroid/prompts/prompts_config.py +1 -1
- langroid/pydantic_v1/__init__.py +45 -6
- langroid/pydantic_v1/__init__.py-e +36 -0
- langroid/pydantic_v1/main.py +11 -4
- langroid/pydantic_v1/main.py-e +11 -0
- langroid/utils/configuration.py +13 -11
- langroid/utils/configuration.py-e +141 -0
- langroid/utils/constants.py +1 -1
- langroid/utils/constants.py-e +32 -0
- langroid/utils/globals.py +21 -5
- langroid/utils/globals.py-e +49 -0
- langroid/utils/html_logger.py +2 -1
- langroid/utils/html_logger.py-e +825 -0
- langroid/utils/object_registry.py +1 -1
- langroid/utils/object_registry.py-e +66 -0
- langroid/utils/pydantic_utils.py +55 -28
- langroid/utils/pydantic_utils.py-e +602 -0
- langroid/utils/types.py +2 -2
- langroid/utils/types.py-e +113 -0
- langroid/vector_store/base.py +3 -3
- langroid/vector_store/lancedb.py +5 -5
- langroid/vector_store/lancedb.py-e +404 -0
- langroid/vector_store/meilisearch.py +2 -2
- langroid/vector_store/pineconedb.py +4 -4
- langroid/vector_store/pineconedb.py-e +427 -0
- langroid/vector_store/postgres.py +1 -1
- langroid/vector_store/qdrantdb.py +3 -3
- langroid/vector_store/weaviatedb.py +1 -1
- {langroid-0.58.2.dist-info → langroid-0.59.0b1.dist-info}/METADATA +3 -2
- langroid-0.59.0b1.dist-info/RECORD +181 -0
- langroid/agent/special/doc_chat_task.py +0 -0
- langroid/mcp/__init__.py +0 -1
- langroid/mcp/server/__init__.py +0 -1
- langroid-0.58.2.dist-info/RECORD +0 -145
- {langroid-0.58.2.dist-info → langroid-0.59.0b1.dist-info}/WHEEL +0 -0
- {langroid-0.58.2.dist-info → langroid-0.59.0b1.dist-info}/licenses/LICENSE +0 -0
langroid/parsing/parser.py-e
ADDED
@@ -0,0 +1,410 @@
+import logging
+import re
+from enum import Enum
+from typing import Any, Dict, List, Literal, Optional
+
+import tiktoken
+
+from langroid.mytypes import Document
+from langroid.parsing.md_parser import (
+    MarkdownChunkConfig,
+    chunk_markdown,
+    count_words,
+)
+from langroid.parsing.para_sentence_split import create_chunks, remove_extra_whitespace
+from langroid.pydantic_v1 import model_validator
+from pydantic_settings import BaseSettings
+from pydantic import ConfigDict
+from langroid.utils.object_registry import ObjectRegistry
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.WARNING)
+
+
+class Splitter(str, Enum):
+    TOKENS = "tokens"
+    PARA_SENTENCE = "para_sentence"
+    SIMPLE = "simple"
+    # "structure-aware" splitting with chunks enriched by header info
+    MARKDOWN = "markdown"
+
+
+class BaseParsingConfig(BaseSettings):
+    """Base class for document parsing configurations."""
+
+    library: str
+
+    model_config = ConfigDict(extra="ignore")  # Ignore unknown settings
+
+class LLMPdfParserConfig(BaseSettings):
+    """Configuration for LLM-based parsing."""
+
+    model_name: str = "gemini/gemini-2.0-flash"  # Default model
+    max_tokens: Optional[int] = None
+    split_on_page: Optional[bool] = True
+    requests_per_minute: Optional[int] = 5
+    timeout: int = 60
+    prompt: str = ""  # override with a domain-specific prompt
+    system_prompt: str = ""  # override with a domain-specific system prompt
+
+
+class MarkerConfig(BaseSettings):
+    """Configuration for Markitdown-based parsing."""
+
+    config_dict: Dict[str, Any] = {}
+
+
+class PdfParsingConfig(BaseParsingConfig):
+    library: Literal[
+        "fitz",
+        "pymupdf4llm",
+        "docling",
+        "pypdf",
+        "unstructured",
+        "pdf2image",
+        "markitdown",
+        "llm-pdf-parser",
+        "marker",
+    ] = "pymupdf4llm"
+    llm_parser_config: Optional[LLMPdfParserConfig] = None
+    marker_config: Optional[MarkerConfig] = None
+
+    @model_validator(mode='before')
+    @classmethod
+    def enable_configs(cls, values: Dict[str, Any]) -> Dict[str, Any]:
+        """Ensure correct config is set based on library selection."""
+        library = values.get("library")
+
+        if library == "llm-pdf-parser":
+            values.setdefault("llm_parser_config", LLMPdfParserConfig())
+        else:
+            values["llm_parser_config"] = None
+
+        if library == "marker":
+            values.setdefault("marker_config", MarkerConfig())
+        else:
+            values["marker_config"] = None
+
+        return values
+
+
+class DocxParsingConfig(BaseSettings):
+    library: Literal["python-docx", "unstructured", "markitdown-docx"] = "unstructured"
+
+
+class DocParsingConfig(BaseSettings):
+    library: Literal["unstructured"] = "unstructured"
+
+
+class MarkitdownPPTXParsingConfig(BaseSettings):
+    library: Literal["markitdown"] = "markitdown"
+
+
+class MarkitdownXLSXParsingConfig(BaseSettings):
+    library: Literal["markitdown"] = "markitdown"
+
+
+class MarkitdownXLSParsingConfig(BaseSettings):
+    library: Literal["markitdown"] = "markitdown"
+
+
+class ParsingConfig(BaseSettings):
+    splitter: str = Splitter.MARKDOWN
+    chunk_by_page: bool = False  # split by page?
+    chunk_size: int = 200  # aim for this many tokens per chunk
+    chunk_size_variation: float = 0.30  # max variation from chunk_size
+    overlap: int = 50  # overlap between chunks
+    max_chunks: int = 10_000
+    # offset to subtract from page numbers:
+    # e.g. if physical page 12 is displayed as page 1, set page_number_offset = 11
+    page_number_offset: int = 0
+    # aim to have at least this many chars per chunk when truncating due to punctuation
+    min_chunk_chars: int = 350
+    discard_chunk_chars: int = 5  # discard chunks with fewer than this many chars
+    n_similar_docs: Optional[int] = None  # deprecated
+    n_neighbor_ids: int = 5  # window size to store around each chunk
+    separators: List[str] = ["\n\n", "\n", " ", ""]
+    token_encoding_model: str = "text-embedding-3-small"
+    pdf: PdfParsingConfig = PdfParsingConfig()
+    docx: DocxParsingConfig = DocxParsingConfig()
+    doc: DocParsingConfig = DocParsingConfig()
+    pptx: MarkitdownPPTXParsingConfig = MarkitdownPPTXParsingConfig()
+    xls: MarkitdownXLSParsingConfig = MarkitdownXLSParsingConfig()
+    xlsx: MarkitdownXLSXParsingConfig = MarkitdownXLSXParsingConfig()
+
+
+class Parser:
+    def __init__(self, config: ParsingConfig):
+        self.config = config
+        try:
+            self.tokenizer = tiktoken.encoding_for_model(config.token_encoding_model)
+        except Exception:
+            self.tokenizer = tiktoken.encoding_for_model("text-embedding-3-small")
+
+    def num_tokens(self, text: str) -> int:
+        if self.config.splitter == Splitter.MARKDOWN:
+            return count_words(text)  # simple count based on whitespace-split
+        tokens = self.tokenizer.encode(text, allowed_special={"<|endoftext|>"})
+        return len(tokens)
+
+    def truncate_tokens(self, text: str, max_tokens: int) -> str:
+        tokens = self.tokenizer.encode(text)
+        if len(tokens) <= max_tokens:
+            return text
+        return self.tokenizer.decode(tokens[:max_tokens])
+
+    def add_window_ids(self, chunks: List[Document]) -> None:
+        """Chunks may belong to multiple docs, but for each doc,
+        they appear consecutively. Add window_ids in metadata"""
+
+        # discard empty chunks
+        chunks = [c for c in chunks if c.content.strip() != ""]
+        if len(chunks) == 0:
+            return
+        # The original metadata.id (if any) is ignored since it will be same for all
+        # chunks and is useless. We want a distinct id for each chunk.
+        # ASSUMPTION: all chunks c of a doc have same c.metadata.id !
+        orig_ids = [c.metadata.id for c in chunks]
+        ids = [ObjectRegistry.new_id() for c in chunks]
+        id2chunk = {id: c for id, c in zip(ids, chunks)}
+
+        # group the ids by orig_id
+        # (each distinct orig_id refers to a different document)
+        orig_id_to_ids: Dict[str, List[str]] = {}
+        for orig_id, id in zip(orig_ids, ids):
+            if orig_id not in orig_id_to_ids:
+                orig_id_to_ids[orig_id] = []
+            orig_id_to_ids[orig_id].append(id)
+
+        # now each orig_id maps to a sequence of ids within a single doc
+
+        k = self.config.n_neighbor_ids
+        for orig, ids in orig_id_to_ids.items():
+            # ids are consecutive chunks in a single doc
+            n = len(ids)
+            window_ids = [ids[max(0, i - k) : min(n, i + k + 1)] for i in range(n)]
+            for i, _ in enumerate(ids):
+                c = id2chunk[ids[i]]
+                c.metadata.window_ids = window_ids[i]
+                c.metadata.id = ids[i]
+                c.metadata.is_chunk = True
+
+    def split_simple(self, docs: List[Document]) -> List[Document]:
+        if len(self.config.separators) == 0:
+            raise ValueError("Must have at least one separator")
+        final_docs = []
+
+        for d in docs:
+            if d.content.strip() == "":
+                continue
+            chunks = remove_extra_whitespace(d.content).split(self.config.separators[0])
+            # note we are ensuring we COPY the document metadata into each chunk,
+            # which ensures all chunks of a given doc have same metadata
+            # (and in particular same metadata.id, which is important later for
+            # add_window_ids)
+            chunk_docs = [
+                Document(
+                    content=c, metadata=d.metadata.model_copy(update=dict(is_chunk=True))
+                )
+                for c in chunks
+                if c.strip() != ""
+            ]
+            self.add_window_ids(chunk_docs)
+            final_docs += chunk_docs
+        return final_docs
+
+    def split_para_sentence(self, docs: List[Document]) -> List[Document]:
+        chunks = docs
+        while True:
+            un_splittables = 0
+            split_chunks = []
+            for c in chunks:
+                if c.content.strip() == "":
+                    continue
+                if self.num_tokens(c.content) <= 1.3 * self.config.chunk_size:
+                    # small chunk: no need to split
+                    split_chunks.append(c)
+                    continue
+                splits = self._split_para_sentence_once([c])
+                un_splittables += len(splits) == 1
+                split_chunks += splits
+            if len(split_chunks) == len(chunks):
+                if un_splittables > 0:
+                    max_len = max([self.num_tokens(p.content) for p in chunks])
+                    logger.warning(
+                        f"""
+                        Unable to split {un_splittables} chunks
+                        using chunk_size = {self.config.chunk_size}.
+                        Max chunk size is {max_len} tokens.
+                        """
+                    )
+                break  # we won't be able to shorten them with current settings
+            chunks = split_chunks.model_copy()
+
+        self.add_window_ids(chunks)
+        return chunks
+
+    def _split_para_sentence_once(self, docs: List[Document]) -> List[Document]:
+        final_chunks = []
+        for d in docs:
+            if d.content.strip() == "":
+                continue
+            chunks = create_chunks(d.content, self.config.chunk_size, self.num_tokens)
+            # note we are ensuring we COPY the document metadata into each chunk,
+            # which ensures all chunks of a given doc have same metadata
+            # (and in particular same metadata.id, which is important later for
+            # add_window_ids)
+            chunk_docs = [
+                Document(
+                    content=c, metadata=d.metadata.model_copy(update=dict(is_chunk=True))
+                )
+                for c in chunks
+                if c.strip() != ""
+            ]
+            final_chunks += chunk_docs
+
+        return final_chunks
+
+    def split_chunk_tokens(self, docs: List[Document]) -> List[Document]:
+        final_docs = []
+        for d in docs:
+            if self.config.splitter == Splitter.MARKDOWN:
+                chunks = chunk_markdown(
+                    d.content,
+                    MarkdownChunkConfig(
+                        # apply rough adjustment factor to convert from tokens to words,
+                        # which is what the markdown chunker uses
+                        chunk_size=int(self.config.chunk_size * 0.75),
+                        overlap_tokens=int(self.config.overlap * 0.75),
+                        variation_percent=self.config.chunk_size_variation,
+                        rollup=True,
+                    ),
+                )
+            else:
+                chunks = self.chunk_tokens(d.content)
+            # note we are ensuring we COPY the document metadata into each chunk,
+            # which ensures all chunks of a given doc have same metadata
+            # (and in particular same metadata.id, which is important later for
+            # add_window_ids)
+            chunk_docs = [
+                Document(
+                    content=c, metadata=d.metadata.model_copy(update=dict(is_chunk=True))
+                )
+                for c in chunks
+                if c.strip() != ""
+            ]
+            self.add_window_ids(chunk_docs)
+            final_docs += chunk_docs
+        return final_docs
+
+    def chunk_tokens(
+        self,
+        text: str,
+    ) -> List[str]:
+        """
+        Split a text into chunks of ~CHUNK_SIZE tokens,
+        based on punctuation and newline boundaries.
+        Adapted from
+        https://github.com/openai/chatgpt-retrieval-plugin/blob/main/services/chunks.py
+
+        Args:
+            text: The text to split into chunks.
+
+        Returns:
+            A list of text chunks, each of which is a string of tokens
+            roughly self.config.chunk_size tokens long.
+        """
+        # Return an empty list if the text is empty or whitespace
+        if not text or text.isspace():
+            return []
+
+        # Tokenize the text
+        tokens = self.tokenizer.encode(text, disallowed_special=())
+
+        # Initialize an empty list of chunks
+        chunks = []
+
+        # Initialize a counter for the number of chunks
+        num_chunks = 0
+
+        # Loop until all tokens are consumed
+        while tokens and num_chunks < self.config.max_chunks:
+            # Take the first chunk_size tokens as a chunk
+            chunk = tokens[: self.config.chunk_size]
+
+            # Decode the chunk into text
+            chunk_text = self.tokenizer.decode(chunk)
+
+            # Skip the chunk if it is empty or whitespace
+            if not chunk_text or chunk_text.isspace():
+                # Remove the tokens corresponding to the chunk text
+                # from remaining tokens
+                tokens = tokens[len(chunk) :]
+                # Continue to the next iteration of the loop
+                continue
+
+            # Find the last period or punctuation mark in the chunk
+            punctuation_matches = [
+                (m.start(), m.group())
+                for m in re.finditer(r"(?:[.!?][\s\n]|\n)", chunk_text)
+            ]
+
+            last_punctuation = max([pos for pos, _ in punctuation_matches] + [-1])
+
+            # If there is a punctuation mark, and the last punctuation index is
+            # after MIN_CHUNK_SIZE_CHARS
+            if (
+                last_punctuation != -1
+                and last_punctuation > self.config.min_chunk_chars
+            ):
+                # Truncate the chunk text at the punctuation mark
+                chunk_text = chunk_text[: last_punctuation + 1]
+
+            # Replace redundant (3 or more) newlines with 2 newlines to preserve
+            # paragraph separation!
+            # But do NOT strip leading/trailing whitespace, to preserve formatting
+            # (e.g. code blocks, or in case we want to stitch chunks back together)
+            chunk_text_to_append = re.sub(r"\n{3,}", "\n\n", chunk_text)
+
+            if len(chunk_text_to_append) > self.config.discard_chunk_chars:
+                # Append the chunk text to the list of chunks
+                chunks.append(chunk_text_to_append)
+
+            # Remove the tokens corresponding to the chunk text
+            # from the remaining tokens
+            tokens = tokens[
+                len(self.tokenizer.encode(chunk_text, disallowed_special=())) :
+            ]
+
+            # Increment the number of chunks
+            num_chunks += 1
+
+        # There may be remaining tokens, but we discard them
+        # since we have already reached the maximum number of chunks
+
+        return chunks
+
+    def split(self, docs: List[Document]) -> List[Document]:
+        if len(docs) == 0:
+            return []
+        # create ids in metadata of docs if absent:
+        # we need this to distinguish docs later in add_window_ids
+        for d in docs:
+            if d.metadata.id in [None, ""]:
+                d.metadata.id = ObjectRegistry.new_id()
+        # some docs are already splits, so don't split them further!
+        chunked_docs = [d for d in docs if d.metadata.is_chunk]
+        big_docs = [d for d in docs if not d.metadata.is_chunk]
+        if len(big_docs) == 0:
+            return chunked_docs
+        match self.config.splitter:
+            case Splitter.MARKDOWN | Splitter.TOKENS:
+                big_doc_chunks = self.split_chunk_tokens(big_docs)
+            case Splitter.PARA_SENTENCE:
+                big_doc_chunks = self.split_para_sentence(big_docs)
+            case Splitter.SIMPLE:
+                big_doc_chunks = self.split_simple(big_docs)
+            case _:
+                raise ValueError(f"Unknown splitter: {self.config.splitter}")
+
+        return chunked_docs + big_doc_chunks
langroid/parsing/repo_loader.py
CHANGED
@@ -18,10 +18,12 @@ if TYPE_CHECKING:
     from github.Label import Label
     from github.Repository import Repository
 
+from pydantic import BaseModel, Field
+from pydantic_settings import BaseSettings
+
 from langroid.mytypes import DocMetaData, Document
 from langroid.parsing.document_parser import DocumentParser, DocumentType
 from langroid.parsing.parser import Parser, ParsingConfig
-from langroid.pydantic_v1 import BaseModel, BaseSettings, Field
 
 logger = logging.getLogger(__name__)
 
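The `repo_loader.py` hunk above is representative of the Pydantic v2 migration running through this release: imports move off the `langroid.pydantic_v1` shim onto `pydantic` and `pydantic_settings` directly, and v1 method names such as `.copy()` give way to their v2 equivalents such as `.model_copy()`, which appears throughout `parser.py` above. A rough sketch of the before/after pattern, using a hypothetical `RepoSettings` class that is not part of the diff:

```python
# Before (0.58.2 style): everything came from the langroid.pydantic_v1 shim.
# from langroid.pydantic_v1 import BaseModel, BaseSettings, Field

# After (0.59.0b1 style): import from pydantic / pydantic_settings directly.
from pydantic import ConfigDict, Field
from pydantic_settings import BaseSettings


class RepoSettings(BaseSettings):  # hypothetical example, not from the diff
    # v2-style config dict, mirroring BaseParsingConfig in parser.py above
    model_config = ConfigDict(extra="ignore")

    url: str = Field("", description="URL of the GitHub repo to load")
    max_files: int = 100


settings = RepoSettings(url="https://github.com/langroid/langroid")
smaller = settings.model_copy(update={"max_files": 10})  # v2 rename of .copy()
```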