langroid 0.58.2__py3-none-any.whl → 0.59.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. langroid/agent/base.py +39 -17
  2. langroid/agent/base.py-e +2216 -0
  3. langroid/agent/callbacks/chainlit.py +2 -1
  4. langroid/agent/chat_agent.py +73 -55
  5. langroid/agent/chat_agent.py-e +2086 -0
  6. langroid/agent/chat_document.py +7 -7
  7. langroid/agent/chat_document.py-e +513 -0
  8. langroid/agent/openai_assistant.py +9 -9
  9. langroid/agent/openai_assistant.py-e +882 -0
  10. langroid/agent/special/arangodb/arangodb_agent.py +10 -18
  11. langroid/agent/special/arangodb/arangodb_agent.py-e +648 -0
  12. langroid/agent/special/arangodb/tools.py +3 -3
  13. langroid/agent/special/doc_chat_agent.py +16 -14
  14. langroid/agent/special/lance_rag/critic_agent.py +2 -2
  15. langroid/agent/special/lance_rag/query_planner_agent.py +4 -4
  16. langroid/agent/special/lance_tools.py +6 -5
  17. langroid/agent/special/lance_tools.py-e +61 -0
  18. langroid/agent/special/neo4j/neo4j_chat_agent.py +3 -7
  19. langroid/agent/special/neo4j/neo4j_chat_agent.py-e +430 -0
  20. langroid/agent/special/relevance_extractor_agent.py +1 -1
  21. langroid/agent/special/sql/sql_chat_agent.py +11 -3
  22. langroid/agent/task.py +9 -87
  23. langroid/agent/task.py-e +2418 -0
  24. langroid/agent/tool_message.py +33 -17
  25. langroid/agent/tool_message.py-e +400 -0
  26. langroid/agent/tools/file_tools.py +4 -2
  27. langroid/agent/tools/file_tools.py-e +234 -0
  28. langroid/agent/tools/mcp/fastmcp_client.py +19 -6
  29. langroid/agent/tools/mcp/fastmcp_client.py-e +584 -0
  30. langroid/agent/tools/orchestration.py +22 -17
  31. langroid/agent/tools/orchestration.py-e +301 -0
  32. langroid/agent/tools/recipient_tool.py +3 -3
  33. langroid/agent/tools/task_tool.py +22 -16
  34. langroid/agent/tools/task_tool.py-e +249 -0
  35. langroid/agent/xml_tool_message.py +90 -35
  36. langroid/agent/xml_tool_message.py-e +392 -0
  37. langroid/cachedb/base.py +1 -1
  38. langroid/embedding_models/base.py +2 -2
  39. langroid/embedding_models/models.py +3 -7
  40. langroid/embedding_models/models.py-e +563 -0
  41. langroid/exceptions.py +4 -1
  42. langroid/language_models/azure_openai.py +2 -2
  43. langroid/language_models/azure_openai.py-e +134 -0
  44. langroid/language_models/base.py +6 -4
  45. langroid/language_models/base.py-e +812 -0
  46. langroid/language_models/client_cache.py +64 -0
  47. langroid/language_models/config.py +2 -4
  48. langroid/language_models/config.py-e +18 -0
  49. langroid/language_models/model_info.py +9 -1
  50. langroid/language_models/model_info.py-e +483 -0
  51. langroid/language_models/openai_gpt.py +119 -20
  52. langroid/language_models/openai_gpt.py-e +2280 -0
  53. langroid/language_models/provider_params.py +3 -22
  54. langroid/language_models/provider_params.py-e +153 -0
  55. langroid/mytypes.py +11 -4
  56. langroid/mytypes.py-e +132 -0
  57. langroid/parsing/code_parser.py +1 -1
  58. langroid/parsing/file_attachment.py +1 -1
  59. langroid/parsing/file_attachment.py-e +246 -0
  60. langroid/parsing/md_parser.py +14 -4
  61. langroid/parsing/md_parser.py-e +574 -0
  62. langroid/parsing/parser.py +22 -7
  63. langroid/parsing/parser.py-e +410 -0
  64. langroid/parsing/repo_loader.py +3 -1
  65. langroid/parsing/repo_loader.py-e +812 -0
  66. langroid/parsing/search.py +1 -1
  67. langroid/parsing/url_loader.py +17 -51
  68. langroid/parsing/url_loader.py-e +683 -0
  69. langroid/parsing/urls.py +5 -4
  70. langroid/parsing/urls.py-e +279 -0
  71. langroid/prompts/prompts_config.py +1 -1
  72. langroid/pydantic_v1/__init__.py +45 -6
  73. langroid/pydantic_v1/__init__.py-e +36 -0
  74. langroid/pydantic_v1/main.py +11 -4
  75. langroid/pydantic_v1/main.py-e +11 -0
  76. langroid/utils/configuration.py +13 -11
  77. langroid/utils/configuration.py-e +141 -0
  78. langroid/utils/constants.py +1 -1
  79. langroid/utils/constants.py-e +32 -0
  80. langroid/utils/globals.py +21 -5
  81. langroid/utils/globals.py-e +49 -0
  82. langroid/utils/html_logger.py +2 -1
  83. langroid/utils/html_logger.py-e +825 -0
  84. langroid/utils/object_registry.py +1 -1
  85. langroid/utils/object_registry.py-e +66 -0
  86. langroid/utils/pydantic_utils.py +55 -28
  87. langroid/utils/pydantic_utils.py-e +602 -0
  88. langroid/utils/types.py +2 -2
  89. langroid/utils/types.py-e +113 -0
  90. langroid/vector_store/base.py +3 -3
  91. langroid/vector_store/lancedb.py +5 -5
  92. langroid/vector_store/lancedb.py-e +404 -0
  93. langroid/vector_store/meilisearch.py +2 -2
  94. langroid/vector_store/pineconedb.py +4 -4
  95. langroid/vector_store/pineconedb.py-e +427 -0
  96. langroid/vector_store/postgres.py +1 -1
  97. langroid/vector_store/qdrantdb.py +3 -3
  98. langroid/vector_store/weaviatedb.py +1 -1
  99. {langroid-0.58.2.dist-info → langroid-0.59.0b1.dist-info}/METADATA +3 -2
  100. langroid-0.59.0b1.dist-info/RECORD +181 -0
  101. langroid/agent/special/doc_chat_task.py +0 -0
  102. langroid/mcp/__init__.py +0 -1
  103. langroid/mcp/server/__init__.py +0 -1
  104. langroid-0.58.2.dist-info/RECORD +0 -145
  105. {langroid-0.58.2.dist-info → langroid-0.59.0b1.dist-info}/WHEEL +0 -0
  106. {langroid-0.58.2.dist-info → langroid-0.59.0b1.dist-info}/licenses/LICENSE +0 -0
langroid/parsing/parser.py-e
@@ -0,0 +1,410 @@
+ import logging
+ import re
+ from enum import Enum
+ from typing import Any, Dict, List, Literal, Optional
+
+ import tiktoken
+
+ from langroid.mytypes import Document
+ from langroid.parsing.md_parser import (
+     MarkdownChunkConfig,
+     chunk_markdown,
+     count_words,
+ )
+ from langroid.parsing.para_sentence_split import create_chunks, remove_extra_whitespace
+ from langroid.pydantic_v1 import model_validator
+ from pydantic_settings import BaseSettings
+ from pydantic import ConfigDict
+ from langroid.utils.object_registry import ObjectRegistry
+
+ logger = logging.getLogger(__name__)
+ logger.setLevel(logging.WARNING)
+
+
+ class Splitter(str, Enum):
+     TOKENS = "tokens"
+     PARA_SENTENCE = "para_sentence"
+     SIMPLE = "simple"
+     # "structure-aware" splitting with chunks enriched by header info
+     MARKDOWN = "markdown"
+
+
+ class BaseParsingConfig(BaseSettings):
+     """Base class for document parsing configurations."""
+
+     library: str
+
+     model_config = ConfigDict(extra="ignore")  # Ignore unknown settings
+
+ class LLMPdfParserConfig(BaseSettings):
+     """Configuration for LLM-based parsing."""
+
+     model_name: str = "gemini/gemini-2.0-flash"  # Default model
+     max_tokens: Optional[int] = None
+     split_on_page: Optional[bool] = True
+     requests_per_minute: Optional[int] = 5
+     timeout: int = 60
+     prompt: str = ""  # override with a domain-specific prompt
+     system_prompt: str = ""  # override with a domain-specific system prompt
+
+
+ class MarkerConfig(BaseSettings):
+     """Configuration for Marker-based parsing."""
+
+     config_dict: Dict[str, Any] = {}
+
+
+ class PdfParsingConfig(BaseParsingConfig):
+     library: Literal[
+         "fitz",
+         "pymupdf4llm",
+         "docling",
+         "pypdf",
+         "unstructured",
+         "pdf2image",
+         "markitdown",
+         "llm-pdf-parser",
+         "marker",
+     ] = "pymupdf4llm"
+     llm_parser_config: Optional[LLMPdfParserConfig] = None
+     marker_config: Optional[MarkerConfig] = None
+
+     @model_validator(mode="before")
+     @classmethod
+     def enable_configs(cls, values: Dict[str, Any]) -> Dict[str, Any]:
+         """Ensure correct config is set based on library selection."""
+         library = values.get("library")
+
+         if library == "llm-pdf-parser":
+             values.setdefault("llm_parser_config", LLMPdfParserConfig())
+         else:
+             values["llm_parser_config"] = None
+
+         if library == "marker":
+             values.setdefault("marker_config", MarkerConfig())
+         else:
+             values["marker_config"] = None
+
+         return values
+
+
+ class DocxParsingConfig(BaseSettings):
+     library: Literal["python-docx", "unstructured", "markitdown-docx"] = "unstructured"
+
+
+ class DocParsingConfig(BaseSettings):
+     library: Literal["unstructured"] = "unstructured"
+
+
+ class MarkitdownPPTXParsingConfig(BaseSettings):
+     library: Literal["markitdown"] = "markitdown"
+
+
+ class MarkitdownXLSXParsingConfig(BaseSettings):
+     library: Literal["markitdown"] = "markitdown"
+
+
+ class MarkitdownXLSParsingConfig(BaseSettings):
+     library: Literal["markitdown"] = "markitdown"
+
+
+ class ParsingConfig(BaseSettings):
+     splitter: str = Splitter.MARKDOWN
+     chunk_by_page: bool = False  # split by page?
+     chunk_size: int = 200  # aim for this many tokens per chunk
+     chunk_size_variation: float = 0.30  # max variation from chunk_size
+     overlap: int = 50  # overlap between chunks
+     max_chunks: int = 10_000
+     # offset to subtract from page numbers:
+     # e.g. if physical page 12 is displayed as page 1, set page_number_offset = 11
+     page_number_offset: int = 0
+     # aim to have at least this many chars per chunk when truncating due to punctuation
+     min_chunk_chars: int = 350
+     discard_chunk_chars: int = 5  # discard chunks with fewer than this many chars
+     n_similar_docs: Optional[int] = None  # deprecated
+     n_neighbor_ids: int = 5  # window size to store around each chunk
+     separators: List[str] = ["\n\n", "\n", " ", ""]
+     token_encoding_model: str = "text-embedding-3-small"
+     pdf: PdfParsingConfig = PdfParsingConfig()
+     docx: DocxParsingConfig = DocxParsingConfig()
+     doc: DocParsingConfig = DocParsingConfig()
+     pptx: MarkitdownPPTXParsingConfig = MarkitdownPPTXParsingConfig()
+     xls: MarkitdownXLSParsingConfig = MarkitdownXLSParsingConfig()
+     xlsx: MarkitdownXLSXParsingConfig = MarkitdownXLSXParsingConfig()
+
+
+ class Parser:
+     def __init__(self, config: ParsingConfig):
+         self.config = config
+         try:
+             self.tokenizer = tiktoken.encoding_for_model(config.token_encoding_model)
+         except Exception:
+             self.tokenizer = tiktoken.encoding_for_model("text-embedding-3-small")
+
+     def num_tokens(self, text: str) -> int:
+         if self.config.splitter == Splitter.MARKDOWN:
+             return count_words(text)  # simple count based on whitespace-split
+         tokens = self.tokenizer.encode(text, allowed_special={"<|endoftext|>"})
+         return len(tokens)
+
+     def truncate_tokens(self, text: str, max_tokens: int) -> str:
+         tokens = self.tokenizer.encode(text)
+         if len(tokens) <= max_tokens:
+             return text
+         return self.tokenizer.decode(tokens[:max_tokens])
+
+     def add_window_ids(self, chunks: List[Document]) -> None:
+         """Chunks may belong to multiple docs, but for each doc,
+         they appear consecutively. Add window_ids in metadata"""
+
+         # discard empty chunks
+         chunks = [c for c in chunks if c.content.strip() != ""]
+         if len(chunks) == 0:
+             return
+         # The original metadata.id (if any) is ignored since it will be same for all
+         # chunks and is useless. We want a distinct id for each chunk.
+         # ASSUMPTION: all chunks c of a doc have same c.metadata.id !
+         orig_ids = [c.metadata.id for c in chunks]
+         ids = [ObjectRegistry.new_id() for c in chunks]
+         id2chunk = {id: c for id, c in zip(ids, chunks)}
+
+         # group the ids by orig_id
+         # (each distinct orig_id refers to a different document)
+         orig_id_to_ids: Dict[str, List[str]] = {}
+         for orig_id, id in zip(orig_ids, ids):
+             if orig_id not in orig_id_to_ids:
+                 orig_id_to_ids[orig_id] = []
+             orig_id_to_ids[orig_id].append(id)
+
+         # now each orig_id maps to a sequence of ids within a single doc
+
+         k = self.config.n_neighbor_ids
+         for orig, ids in orig_id_to_ids.items():
+             # ids are consecutive chunks in a single doc
+             n = len(ids)
+             window_ids = [ids[max(0, i - k) : min(n, i + k + 1)] for i in range(n)]
+             for i, _ in enumerate(ids):
+                 c = id2chunk[ids[i]]
+                 c.metadata.window_ids = window_ids[i]
+                 c.metadata.id = ids[i]
+                 c.metadata.is_chunk = True
+
+     def split_simple(self, docs: List[Document]) -> List[Document]:
+         if len(self.config.separators) == 0:
+             raise ValueError("Must have at least one separator")
+         final_docs = []
+
+         for d in docs:
+             if d.content.strip() == "":
+                 continue
+             chunks = remove_extra_whitespace(d.content).split(self.config.separators[0])
+             # note we are ensuring we COPY the document metadata into each chunk,
+             # which ensures all chunks of a given doc have same metadata
+             # (and in particular same metadata.id, which is important later for
+             # add_window_ids)
+             chunk_docs = [
+                 Document(
+                     content=c, metadata=d.metadata.model_copy(update=dict(is_chunk=True))
+                 )
+                 for c in chunks
+                 if c.strip() != ""
+             ]
+             self.add_window_ids(chunk_docs)
+             final_docs += chunk_docs
+         return final_docs
+
+     def split_para_sentence(self, docs: List[Document]) -> List[Document]:
+         chunks = docs
+         while True:
+             un_splittables = 0
+             split_chunks = []
+             for c in chunks:
+                 if c.content.strip() == "":
+                     continue
+                 if self.num_tokens(c.content) <= 1.3 * self.config.chunk_size:
+                     # small chunk: no need to split
+                     split_chunks.append(c)
+                     continue
+                 splits = self._split_para_sentence_once([c])
+                 un_splittables += len(splits) == 1
+                 split_chunks += splits
+             if len(split_chunks) == len(chunks):
+                 if un_splittables > 0:
+                     max_len = max([self.num_tokens(p.content) for p in chunks])
+                     logger.warning(
+                         f"""
+                         Unable to split {un_splittables} chunks
+                         using chunk_size = {self.config.chunk_size}.
+                         Max chunk size is {max_len} tokens.
+                         """
+                     )
+                 break  # we won't be able to shorten them with current settings
+             chunks = split_chunks.copy()
+
+         self.add_window_ids(chunks)
+         return chunks
+
+     def _split_para_sentence_once(self, docs: List[Document]) -> List[Document]:
+         final_chunks = []
+         for d in docs:
+             if d.content.strip() == "":
+                 continue
+             chunks = create_chunks(d.content, self.config.chunk_size, self.num_tokens)
+             # note we are ensuring we COPY the document metadata into each chunk,
+             # which ensures all chunks of a given doc have same metadata
+             # (and in particular same metadata.id, which is important later for
+             # add_window_ids)
+             chunk_docs = [
+                 Document(
+                     content=c, metadata=d.metadata.model_copy(update=dict(is_chunk=True))
+                 )
+                 for c in chunks
+                 if c.strip() != ""
+             ]
+             final_chunks += chunk_docs
+
+         return final_chunks
+
+     def split_chunk_tokens(self, docs: List[Document]) -> List[Document]:
+         final_docs = []
+         for d in docs:
+             if self.config.splitter == Splitter.MARKDOWN:
+                 chunks = chunk_markdown(
+                     d.content,
+                     MarkdownChunkConfig(
+                         # apply rough adjustment factor to convert from tokens to words,
+                         # which is what the markdown chunker uses
+                         chunk_size=int(self.config.chunk_size * 0.75),
+                         overlap_tokens=int(self.config.overlap * 0.75),
+                         variation_percent=self.config.chunk_size_variation,
+                         rollup=True,
+                     ),
+                 )
+             else:
+                 chunks = self.chunk_tokens(d.content)
+             # note we are ensuring we COPY the document metadata into each chunk,
+             # which ensures all chunks of a given doc have same metadata
+             # (and in particular same metadata.id, which is important later for
+             # add_window_ids)
+             chunk_docs = [
+                 Document(
+                     content=c, metadata=d.metadata.model_copy(update=dict(is_chunk=True))
+                 )
+                 for c in chunks
+                 if c.strip() != ""
+             ]
+             self.add_window_ids(chunk_docs)
+             final_docs += chunk_docs
+         return final_docs
+
+     def chunk_tokens(
+         self,
+         text: str,
+     ) -> List[str]:
+         """
+         Split a text into chunks of ~CHUNK_SIZE tokens,
+         based on punctuation and newline boundaries.
+         Adapted from
+         https://github.com/openai/chatgpt-retrieval-plugin/blob/main/services/chunks.py
+
+         Args:
+             text: The text to split into chunks.
+
+         Returns:
+             A list of text chunks, each of which is a string of tokens
+             roughly self.config.chunk_size tokens long.
+         """
+         # Return an empty list if the text is empty or whitespace
+         if not text or text.isspace():
+             return []
+
+         # Tokenize the text
+         tokens = self.tokenizer.encode(text, disallowed_special=())
+
+         # Initialize an empty list of chunks
+         chunks = []
+
+         # Initialize a counter for the number of chunks
+         num_chunks = 0
+
+         # Loop until all tokens are consumed
+         while tokens and num_chunks < self.config.max_chunks:
+             # Take the first chunk_size tokens as a chunk
+             chunk = tokens[: self.config.chunk_size]
+
+             # Decode the chunk into text
+             chunk_text = self.tokenizer.decode(chunk)
+
+             # Skip the chunk if it is empty or whitespace
+             if not chunk_text or chunk_text.isspace():
+                 # Remove the tokens corresponding to the chunk text
+                 # from remaining tokens
+                 tokens = tokens[len(chunk) :]
+                 # Continue to the next iteration of the loop
+                 continue
+
+             # Find the last period or punctuation mark in the chunk
+             punctuation_matches = [
+                 (m.start(), m.group())
+                 for m in re.finditer(r"(?:[.!?][\s\n]|\n)", chunk_text)
+             ]
+
+             last_punctuation = max([pos for pos, _ in punctuation_matches] + [-1])
+
+             # If there is a punctuation mark, and the last punctuation index is
+             # after MIN_CHUNK_SIZE_CHARS
+             if (
+                 last_punctuation != -1
+                 and last_punctuation > self.config.min_chunk_chars
+             ):
+                 # Truncate the chunk text at the punctuation mark
+                 chunk_text = chunk_text[: last_punctuation + 1]
+
+             # Replace redundant (3 or more) newlines with 2 newlines to preserve
+             # paragraph separation!
+             # But do NOT strip leading/trailing whitespace, to preserve formatting
+             # (e.g. code blocks, or in case we want to stitch chunks back together)
+             chunk_text_to_append = re.sub(r"\n{3,}", "\n\n", chunk_text)
+
+             if len(chunk_text_to_append) > self.config.discard_chunk_chars:
+                 # Append the chunk text to the list of chunks
+                 chunks.append(chunk_text_to_append)
+
+             # Remove the tokens corresponding to the chunk text
+             # from the remaining tokens
+             tokens = tokens[
+                 len(self.tokenizer.encode(chunk_text, disallowed_special=())) :
+             ]
+
+             # Increment the number of chunks
+             num_chunks += 1
+
+         # There may be remaining tokens, but we discard them
+         # since we have already reached the maximum number of chunks
+
+         return chunks
+
+     def split(self, docs: List[Document]) -> List[Document]:
+         if len(docs) == 0:
+             return []
+         # create ids in metadata of docs if absent:
+         # we need this to distinguish docs later in add_window_ids
+         for d in docs:
+             if d.metadata.id in [None, ""]:
+                 d.metadata.id = ObjectRegistry.new_id()
+         # some docs are already splits, so don't split them further!
+         chunked_docs = [d for d in docs if d.metadata.is_chunk]
+         big_docs = [d for d in docs if not d.metadata.is_chunk]
+         if len(big_docs) == 0:
+             return chunked_docs
+         match self.config.splitter:
+             case Splitter.MARKDOWN | Splitter.TOKENS:
+                 big_doc_chunks = self.split_chunk_tokens(big_docs)
+             case Splitter.PARA_SENTENCE:
+                 big_doc_chunks = self.split_para_sentence(big_docs)
+             case Splitter.SIMPLE:
+                 big_doc_chunks = self.split_simple(big_docs)
+             case _:
+                 raise ValueError(f"Unknown splitter: {self.config.splitter}")
+
+         return chunked_docs + big_doc_chunks
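For orientation, here is a minimal usage sketch (not part of the diff) of the ParsingConfig / Parser API shown in the hunk above. The Document / DocMetaData construction follows langroid.mytypes; the specific field values and the sample text are illustrative assumptions, not values taken from the package.

from langroid.mytypes import DocMetaData, Document
from langroid.parsing.parser import Parser, ParsingConfig, Splitter

# structure-aware markdown chunking, ~200 tokens per chunk
config = ParsingConfig(
    splitter=Splitter.MARKDOWN,
    chunk_size=200,
    overlap=50,
    n_neighbor_ids=2,  # keep a small window of neighboring chunk ids
)
parser = Parser(config)

docs = [
    Document(
        content="# Title\n\nSome long markdown text ...",
        metadata=DocMetaData(source="example.md"),
    )
]
chunks = parser.split(docs)
for c in chunks:
    # each chunk gets a fresh metadata.id plus window_ids of its neighbors
    print(c.metadata.id, c.metadata.window_ids, len(c.content))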
langroid/parsing/repo_loader.py
@@ -18,10 +18,12 @@ if TYPE_CHECKING:
      from github.Label import Label
      from github.Repository import Repository
 
+ from pydantic import BaseModel, Field
+ from pydantic_settings import BaseSettings
+
  from langroid.mytypes import DocMetaData, Document
  from langroid.parsing.document_parser import DocumentParser, DocumentType
  from langroid.parsing.parser import Parser, ParsingConfig
- from langroid.pydantic_v1 import BaseModel, BaseSettings, Field
 
  logger = logging.getLogger(__name__)
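The repo_loader.py hunk above reflects the pydantic-v2 migration that runs through this release: imports move from the langroid.pydantic_v1 shim to pydantic and pydantic-settings directly. A minimal sketch of the resulting pattern follows; the class and field names are hypothetical and chosen only to illustrate the import change.

from pydantic import BaseModel, Field
from pydantic_settings import BaseSettings


class RepoSettings(BaseSettings):  # hypothetical settings class
    api_token: str = Field("", description="API token for the code host")


class FileInfo(BaseModel):  # hypothetical data model
    path: str
    size_bytes: int = 0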