langroid 0.58.2__py3-none-any.whl → 0.59.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langroid/agent/base.py +39 -17
- langroid/agent/callbacks/chainlit.py +2 -1
- langroid/agent/chat_agent.py +73 -55
- langroid/agent/chat_document.py +7 -7
- langroid/agent/done_sequence_parser.py +46 -11
- langroid/agent/openai_assistant.py +9 -9
- langroid/agent/special/arangodb/arangodb_agent.py +10 -18
- langroid/agent/special/arangodb/tools.py +3 -3
- langroid/agent/special/doc_chat_agent.py +16 -14
- langroid/agent/special/lance_rag/critic_agent.py +2 -2
- langroid/agent/special/lance_rag/query_planner_agent.py +4 -4
- langroid/agent/special/lance_tools.py +6 -5
- langroid/agent/special/neo4j/neo4j_chat_agent.py +3 -7
- langroid/agent/special/relevance_extractor_agent.py +1 -1
- langroid/agent/special/sql/sql_chat_agent.py +11 -3
- langroid/agent/task.py +53 -94
- langroid/agent/tool_message.py +33 -17
- langroid/agent/tools/file_tools.py +4 -2
- langroid/agent/tools/mcp/fastmcp_client.py +19 -6
- langroid/agent/tools/orchestration.py +22 -17
- langroid/agent/tools/recipient_tool.py +3 -3
- langroid/agent/tools/task_tool.py +22 -16
- langroid/agent/xml_tool_message.py +90 -35
- langroid/cachedb/base.py +1 -1
- langroid/embedding_models/base.py +2 -2
- langroid/embedding_models/models.py +3 -7
- langroid/exceptions.py +4 -1
- langroid/language_models/azure_openai.py +2 -2
- langroid/language_models/base.py +6 -4
- langroid/language_models/client_cache.py +64 -0
- langroid/language_models/config.py +2 -4
- langroid/language_models/model_info.py +9 -1
- langroid/language_models/openai_gpt.py +119 -20
- langroid/language_models/provider_params.py +3 -22
- langroid/mytypes.py +11 -4
- langroid/parsing/code_parser.py +1 -1
- langroid/parsing/file_attachment.py +1 -1
- langroid/parsing/md_parser.py +14 -4
- langroid/parsing/parser.py +22 -7
- langroid/parsing/repo_loader.py +3 -1
- langroid/parsing/search.py +1 -1
- langroid/parsing/url_loader.py +17 -51
- langroid/parsing/urls.py +5 -4
- langroid/prompts/prompts_config.py +1 -1
- langroid/pydantic_v1/__init__.py +61 -4
- langroid/pydantic_v1/main.py +10 -4
- langroid/utils/configuration.py +13 -11
- langroid/utils/constants.py +1 -1
- langroid/utils/globals.py +21 -5
- langroid/utils/html_logger.py +2 -1
- langroid/utils/object_registry.py +1 -1
- langroid/utils/pydantic_utils.py +55 -28
- langroid/utils/types.py +2 -2
- langroid/vector_store/base.py +3 -3
- langroid/vector_store/lancedb.py +5 -5
- langroid/vector_store/meilisearch.py +2 -2
- langroid/vector_store/pineconedb.py +4 -4
- langroid/vector_store/postgres.py +1 -1
- langroid/vector_store/qdrantdb.py +3 -3
- langroid/vector_store/weaviatedb.py +1 -1
- {langroid-0.58.2.dist-info → langroid-0.59.0.dist-info}/METADATA +3 -2
- {langroid-0.58.2.dist-info → langroid-0.59.0.dist-info}/RECORD +64 -64
- {langroid-0.58.2.dist-info → langroid-0.59.0.dist-info}/WHEEL +0 -0
- {langroid-0.58.2.dist-info → langroid-0.59.0.dist-info}/licenses/LICENSE +0 -0
@@ -12,6 +12,7 @@ from typing import (
|
|
12
12
|
Callable,
|
13
13
|
Dict,
|
14
14
|
List,
|
15
|
+
Mapping,
|
15
16
|
Optional,
|
16
17
|
Tuple,
|
17
18
|
Type,
|
@@ -24,6 +25,8 @@ from cerebras.cloud.sdk import AsyncCerebras, Cerebras
|
|
24
25
|
from groq import AsyncGroq, Groq
|
25
26
|
from httpx import Timeout
|
26
27
|
from openai import AsyncOpenAI, OpenAI
|
28
|
+
from pydantic import BaseModel
|
29
|
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
27
30
|
from rich import print
|
28
31
|
from rich.markup import escape
|
29
32
|
|
@@ -78,7 +81,6 @@ from langroid.language_models.utils import (
|
|
78
81
|
retry_with_exponential_backoff,
|
79
82
|
)
|
80
83
|
from langroid.parsing.parse_json import parse_imperfect_json
|
81
|
-
from langroid.pydantic_v1 import BaseModel, BaseSettings
|
82
84
|
from langroid.utils.configuration import settings
|
83
85
|
from langroid.utils.constants import Colors
|
84
86
|
from langroid.utils.system import friendly_error
|
@@ -220,7 +222,7 @@ class OpenAICallParams(BaseModel):
|
|
220
222
|
extra_body: Dict[str, Any] | None = None # additional params for API request body
|
221
223
|
|
222
224
|
def to_dict_exclude_none(self) -> Dict[str, Any]:
|
223
|
-
return {k: v for k, v in self.
|
225
|
+
return {k: v for k, v in self.model_dump().items() if v is not None}
|
224
226
|
|
225
227
|
|
226
228
|
class LiteLLMProxyConfig(BaseSettings):
|
@@ -229,8 +231,7 @@ class LiteLLMProxyConfig(BaseSettings):
|
|
229
231
|
api_key: str = "" # read from env var LITELLM_API_KEY if set
|
230
232
|
api_base: str = "" # read from env var LITELLM_API_BASE if set
|
231
233
|
|
232
|
-
|
233
|
-
env_prefix = "LITELLM_"
|
234
|
+
model_config = SettingsConfigDict(env_prefix="LITELLM_")
|
234
235
|
|
235
236
|
|
236
237
|
class OpenAIGPTConfig(LLMConfig):
|
@@ -259,7 +260,7 @@ class OpenAIGPTConfig(LLMConfig):
|
|
259
260
|
litellm_proxy: LiteLLMProxyConfig = LiteLLMProxyConfig()
|
260
261
|
ollama: bool = False # use ollama's OpenAI-compatible endpoint?
|
261
262
|
min_output_tokens: int = 1
|
262
|
-
use_chat_for_completion = True # do not change this, for OpenAI models!
|
263
|
+
use_chat_for_completion: bool = True # do not change this, for OpenAI models!
|
263
264
|
timeout: int = 20
|
264
265
|
temperature: float = 0.2
|
265
266
|
seed: int | None = 42
|
@@ -287,6 +288,9 @@ class OpenAIGPTConfig(LLMConfig):
|
|
287
288
|
langdb_params: LangDBParams = LangDBParams()
|
288
289
|
portkey_params: PortkeyParams = PortkeyParams()
|
289
290
|
headers: Dict[str, str] = {}
|
291
|
+
http_client_factory: Optional[Callable[[], Any]] = None # Factory for httpx.Client
|
292
|
+
http_verify_ssl: bool = True # Simple flag for SSL verification
|
293
|
+
http_client_config: Optional[Dict[str, Any]] = None # Config dict for httpx.Client
|
290
294
|
|
291
295
|
def __init__(self, **kwargs) -> None: # type: ignore
|
292
296
|
local_model = "api_base" in kwargs and kwargs["api_base"] is not None
|
@@ -313,8 +317,43 @@ class OpenAIGPTConfig(LLMConfig):
|
|
313
317
|
|
314
318
|
super().__init__(**kwargs)
|
315
319
|
|
316
|
-
|
317
|
-
|
320
|
+
model_config = SettingsConfigDict(env_prefix="OPENAI_")
|
321
|
+
|
322
|
+
def model_copy(
|
323
|
+
self, *, update: Mapping[str, Any] | None = None, deep: bool = False
|
324
|
+
) -> "OpenAIGPTConfig":
|
325
|
+
"""
|
326
|
+
Override model_copy to handle unpicklable fields properly.
|
327
|
+
|
328
|
+
This preserves fields like http_client_factory during normal copying
|
329
|
+
while still allowing exclusion for pickling operations.
|
330
|
+
"""
|
331
|
+
# Save references to unpicklable fields
|
332
|
+
http_client_factory = self.http_client_factory
|
333
|
+
streamer = self.streamer
|
334
|
+
streamer_async = self.streamer_async
|
335
|
+
|
336
|
+
# Get the current model data, excluding problematic fields
|
337
|
+
data = self.model_dump(
|
338
|
+
exclude={"http_client_factory", "streamer", "streamer_async"}
|
339
|
+
)
|
340
|
+
|
341
|
+
# Apply any updates
|
342
|
+
if update:
|
343
|
+
data.update(update)
|
344
|
+
|
345
|
+
# Create a new instance with the copied data
|
346
|
+
new_instance = self.__class__(**data)
|
347
|
+
|
348
|
+
# Restore the unpicklable fields if they weren't overridden by update
|
349
|
+
if "http_client_factory" not in (update or {}):
|
350
|
+
new_instance.http_client_factory = http_client_factory
|
351
|
+
if "streamer" not in (update or {}):
|
352
|
+
new_instance.streamer = streamer
|
353
|
+
if "streamer_async" not in (update or {}):
|
354
|
+
new_instance.streamer_async = streamer_async
|
355
|
+
|
356
|
+
return new_instance
|
318
357
|
|
319
358
|
def _validate_litellm(self) -> None:
|
320
359
|
"""
|
@@ -327,12 +366,12 @@ class OpenAIGPTConfig(LLMConfig):
|
|
327
366
|
import litellm
|
328
367
|
except ImportError:
|
329
368
|
raise LangroidImportError("litellm", "litellm")
|
369
|
+
|
330
370
|
litellm.telemetry = False
|
331
371
|
litellm.drop_params = True # drop un-supported params without crashing
|
332
|
-
# modify params to fit the model expectations, and avoid crashing
|
333
|
-
# (e.g. anthropic doesn't like first msg to be system msg)
|
334
372
|
litellm.modify_params = True
|
335
373
|
self.seed = None # some local mdls don't support seed
|
374
|
+
|
336
375
|
if self.api_key == DUMMY_API_KEY:
|
337
376
|
keys_dict = litellm.utils.validate_environment(self.chat_model)
|
338
377
|
missing_keys = keys_dict.get("missing_keys", [])
|
@@ -362,8 +401,7 @@ class OpenAIGPTConfig(LLMConfig):
|
|
362
401
|
class DynamicConfig(OpenAIGPTConfig):
|
363
402
|
pass
|
364
403
|
|
365
|
-
DynamicConfig.
|
366
|
-
|
404
|
+
DynamicConfig.model_config = SettingsConfigDict(env_prefix=prefix.upper() + "_")
|
367
405
|
return DynamicConfig
|
368
406
|
|
369
407
|
|
@@ -404,7 +442,7 @@ class OpenAIGPT(LanguageModel):
|
|
404
442
|
config: configuration for openai-gpt model
|
405
443
|
"""
|
406
444
|
# copy the config to avoid modifying the original
|
407
|
-
config = config.
|
445
|
+
config = config.model_copy()
|
408
446
|
super().__init__(config)
|
409
447
|
self.config: OpenAIGPTConfig = config
|
410
448
|
# save original model name such as `provider/model` before
|
@@ -631,6 +669,32 @@ class OpenAIGPT(LanguageModel):
|
|
631
669
|
# Add Portkey-specific headers
|
632
670
|
self.config.headers.update(self.config.portkey_params.get_headers())
|
633
671
|
|
672
|
+
# Create http_client if needed - Priority order:
|
673
|
+
# 1. http_client_factory (most flexibility, not cacheable)
|
674
|
+
# 2. http_client_config (cacheable, moderate flexibility)
|
675
|
+
# 3. http_verify_ssl=False (cacheable, simple SSL bypass)
|
676
|
+
http_client = None
|
677
|
+
async_http_client = None
|
678
|
+
http_client_config_used = None
|
679
|
+
|
680
|
+
if self.config.http_client_factory is not None:
|
681
|
+
# Use the factory to create http_client (not cacheable)
|
682
|
+
http_client = self.config.http_client_factory()
|
683
|
+
# Don't set async_http_client from sync client - create separately
|
684
|
+
# This avoids type mismatch issues
|
685
|
+
async_http_client = None
|
686
|
+
elif self.config.http_client_config is not None:
|
687
|
+
# Use config dict (cacheable)
|
688
|
+
http_client_config_used = self.config.http_client_config
|
689
|
+
elif not self.config.http_verify_ssl:
|
690
|
+
# Simple SSL bypass (cacheable)
|
691
|
+
http_client_config_used = {"verify": False}
|
692
|
+
logging.warning(
|
693
|
+
"SSL verification has been disabled. This is insecure and "
|
694
|
+
"should only be used in trusted environments (e.g., "
|
695
|
+
"corporate networks with self-signed certificates)."
|
696
|
+
)
|
697
|
+
|
634
698
|
if self.config.use_cached_client:
|
635
699
|
self.client = get_openai_client(
|
636
700
|
api_key=self.api_key,
|
@@ -638,6 +702,8 @@ class OpenAIGPT(LanguageModel):
|
|
638
702
|
organization=self.config.organization,
|
639
703
|
timeout=Timeout(self.config.timeout),
|
640
704
|
default_headers=self.config.headers,
|
705
|
+
http_client=http_client,
|
706
|
+
http_client_config=http_client_config_used,
|
641
707
|
)
|
642
708
|
self.async_client = get_async_openai_client(
|
643
709
|
api_key=self.api_key,
|
@@ -645,23 +711,56 @@ class OpenAIGPT(LanguageModel):
|
|
645
711
|
organization=self.config.organization,
|
646
712
|
timeout=Timeout(self.config.timeout),
|
647
713
|
default_headers=self.config.headers,
|
714
|
+
http_client=async_http_client,
|
715
|
+
http_client_config=http_client_config_used,
|
648
716
|
)
|
649
717
|
else:
|
650
718
|
# Create new clients without caching
|
651
|
-
|
719
|
+
client_kwargs: Dict[str, Any] = dict(
|
652
720
|
api_key=self.api_key,
|
653
721
|
base_url=self.api_base,
|
654
722
|
organization=self.config.organization,
|
655
723
|
timeout=Timeout(self.config.timeout),
|
656
724
|
default_headers=self.config.headers,
|
657
725
|
)
|
658
|
-
|
726
|
+
if http_client is not None:
|
727
|
+
client_kwargs["http_client"] = http_client
|
728
|
+
elif http_client_config_used is not None:
|
729
|
+
# Create http_client from config for non-cached scenario
|
730
|
+
try:
|
731
|
+
from httpx import Client
|
732
|
+
|
733
|
+
client_kwargs["http_client"] = Client(**http_client_config_used)
|
734
|
+
except ImportError:
|
735
|
+
raise ValueError(
|
736
|
+
"httpx is required to use http_client_config. "
|
737
|
+
"Install it with: pip install httpx"
|
738
|
+
)
|
739
|
+
self.client = OpenAI(**client_kwargs)
|
740
|
+
|
741
|
+
async_client_kwargs: Dict[str, Any] = dict(
|
659
742
|
api_key=self.api_key,
|
660
743
|
base_url=self.api_base,
|
661
744
|
organization=self.config.organization,
|
662
745
|
timeout=Timeout(self.config.timeout),
|
663
746
|
default_headers=self.config.headers,
|
664
747
|
)
|
748
|
+
if async_http_client is not None:
|
749
|
+
async_client_kwargs["http_client"] = async_http_client
|
750
|
+
elif http_client_config_used is not None:
|
751
|
+
# Create async http_client from config for non-cached scenario
|
752
|
+
try:
|
753
|
+
from httpx import AsyncClient
|
754
|
+
|
755
|
+
async_client_kwargs["http_client"] = AsyncClient(
|
756
|
+
**http_client_config_used
|
757
|
+
)
|
758
|
+
except ImportError:
|
759
|
+
raise ValueError(
|
760
|
+
"httpx is required to use http_client_config. "
|
761
|
+
"Install it with: pip install httpx"
|
762
|
+
)
|
763
|
+
self.async_client = AsyncOpenAI(**async_client_kwargs)
|
665
764
|
|
666
765
|
self.cache: CacheDB | None = None
|
667
766
|
use_cache = self.config.cache_config is not None
|
@@ -1413,7 +1512,7 @@ class OpenAIGPT(LanguageModel):
|
|
1413
1512
|
|
1414
1513
|
if has_function:
|
1415
1514
|
function_call = LLMFunctionCall(name=function_name)
|
1416
|
-
function_call_dict = function_call.
|
1515
|
+
function_call_dict = function_call.model_dump()
|
1417
1516
|
if function_args == "":
|
1418
1517
|
function_call.arguments = None
|
1419
1518
|
else:
|
@@ -1465,7 +1564,7 @@ class OpenAIGPT(LanguageModel):
|
|
1465
1564
|
),
|
1466
1565
|
),
|
1467
1566
|
),
|
1468
|
-
openai_response.
|
1567
|
+
openai_response.model_dump(),
|
1469
1568
|
)
|
1470
1569
|
|
1471
1570
|
def _cache_store(self, k: str, v: Any) -> None:
|
@@ -1616,7 +1715,7 @@ class OpenAIGPT(LanguageModel):
|
|
1616
1715
|
cached, hashed_key, response = completions_with_backoff(**args)
|
1617
1716
|
# assume response is an actual response rather than a streaming event
|
1618
1717
|
if not isinstance(response, dict):
|
1619
|
-
response = response.
|
1718
|
+
response = response.model_dump()
|
1620
1719
|
if "message" in response["choices"][0]:
|
1621
1720
|
msg = response["choices"][0]["message"]["content"].strip()
|
1622
1721
|
else:
|
@@ -1694,7 +1793,7 @@ class OpenAIGPT(LanguageModel):
|
|
1694
1793
|
)
|
1695
1794
|
# assume response is an actual response rather than a streaming event
|
1696
1795
|
if not isinstance(response, dict):
|
1697
|
-
response = response.
|
1796
|
+
response = response.model_dump()
|
1698
1797
|
if "message" in response["choices"][0]:
|
1699
1798
|
msg = response["choices"][0]["message"]["content"].strip()
|
1700
1799
|
else:
|
@@ -1992,7 +2091,7 @@ class OpenAIGPT(LanguageModel):
|
|
1992
2091
|
if functions is not None:
|
1993
2092
|
args.update(
|
1994
2093
|
dict(
|
1995
|
-
functions=[f.
|
2094
|
+
functions=[f.model_dump() for f in functions],
|
1996
2095
|
function_call=function_call,
|
1997
2096
|
)
|
1998
2097
|
)
|
@@ -2010,7 +2109,7 @@ class OpenAIGPT(LanguageModel):
|
|
2010
2109
|
tools=[
|
2011
2110
|
dict(
|
2012
2111
|
type="function",
|
2013
|
-
function=t.function.
|
2112
|
+
function=t.function.model_dump()
|
2014
2113
|
| ({"strict": t.strict} if t.strict is not None else {}),
|
2015
2114
|
)
|
2016
2115
|
for t in tools
|
@@ -4,7 +4,7 @@ Provider-specific parameter configurations for various LLM providers.
|
|
4
4
|
|
5
5
|
from typing import Any, Dict, Optional
|
6
6
|
|
7
|
-
from
|
7
|
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
8
8
|
|
9
9
|
# Constants
|
10
10
|
LANGDB_BASE_URL = "https://api.us-east-1.langdb.ai"
|
@@ -24,10 +24,7 @@ class LangDBParams(BaseSettings):
|
|
24
24
|
thread_id: Optional[str] = None
|
25
25
|
base_url: str = LANGDB_BASE_URL
|
26
26
|
|
27
|
-
|
28
|
-
# allow setting of fields via env vars,
|
29
|
-
# e.g. LANGDB_PROJECT_ID=1234
|
30
|
-
env_prefix = "LANGDB_"
|
27
|
+
model_config = SettingsConfigDict(env_prefix="LANGDB_")
|
31
28
|
|
32
29
|
|
33
30
|
class PortkeyParams(BaseSettings):
|
@@ -61,10 +58,7 @@ class PortkeyParams(BaseSettings):
|
|
61
58
|
custom_headers: Optional[Dict[str, str]] = None # Optional: additional headers
|
62
59
|
base_url: str = PORTKEY_BASE_URL
|
63
60
|
|
64
|
-
|
65
|
-
# allow setting of fields via env vars,
|
66
|
-
# e.g. PORTKEY_API_KEY=xxx, PORTKEY_PROVIDER=anthropic
|
67
|
-
env_prefix = "PORTKEY_"
|
61
|
+
model_config = SettingsConfigDict(env_prefix="PORTKEY_")
|
68
62
|
|
69
63
|
def get_headers(self) -> Dict[str, str]:
|
70
64
|
"""Generate Portkey-specific headers from parameters."""
|
@@ -73,7 +67,6 @@ class PortkeyParams(BaseSettings):
|
|
73
67
|
|
74
68
|
headers = {}
|
75
69
|
|
76
|
-
# API key - from params or environment
|
77
70
|
if self.api_key and self.api_key != DUMMY_API_KEY:
|
78
71
|
headers["x-portkey-api-key"] = self.api_key
|
79
72
|
else:
|
@@ -81,45 +74,35 @@ class PortkeyParams(BaseSettings):
|
|
81
74
|
if portkey_key:
|
82
75
|
headers["x-portkey-api-key"] = portkey_key
|
83
76
|
|
84
|
-
# Provider
|
85
77
|
if self.provider:
|
86
78
|
headers["x-portkey-provider"] = self.provider
|
87
79
|
|
88
|
-
# Virtual key
|
89
80
|
if self.virtual_key:
|
90
81
|
headers["x-portkey-virtual-key"] = self.virtual_key
|
91
82
|
|
92
|
-
# Trace ID
|
93
83
|
if self.trace_id:
|
94
84
|
headers["x-portkey-trace-id"] = self.trace_id
|
95
85
|
|
96
|
-
# Metadata
|
97
86
|
if self.metadata:
|
98
87
|
headers["x-portkey-metadata"] = json.dumps(self.metadata)
|
99
88
|
|
100
|
-
# Retry configuration
|
101
89
|
if self.retry:
|
102
90
|
headers["x-portkey-retry"] = json.dumps(self.retry)
|
103
91
|
|
104
|
-
# Cache configuration
|
105
92
|
if self.cache:
|
106
93
|
headers["x-portkey-cache"] = json.dumps(self.cache)
|
107
94
|
|
108
|
-
# Cache force refresh
|
109
95
|
if self.cache_force_refresh is not None:
|
110
96
|
headers["x-portkey-cache-force-refresh"] = str(
|
111
97
|
self.cache_force_refresh
|
112
98
|
).lower()
|
113
99
|
|
114
|
-
# User identifier
|
115
100
|
if self.user:
|
116
101
|
headers["x-portkey-user"] = self.user
|
117
102
|
|
118
|
-
# Organization identifier
|
119
103
|
if self.organization:
|
120
104
|
headers["x-portkey-organization"] = self.organization
|
121
105
|
|
122
|
-
# Add any custom headers
|
123
106
|
if self.custom_headers:
|
124
107
|
headers.update(self.custom_headers)
|
125
108
|
|
@@ -138,7 +121,6 @@ class PortkeyParams(BaseSettings):
|
|
138
121
|
_, provider, model = parts
|
139
122
|
return provider, model
|
140
123
|
else:
|
141
|
-
# Fallback: just remove "portkey/" prefix and return empty provider
|
142
124
|
model = model_string.replace("portkey/", "")
|
143
125
|
return "", model
|
144
126
|
|
@@ -157,7 +139,6 @@ class PortkeyParams(BaseSettings):
|
|
157
139
|
"""
|
158
140
|
import os
|
159
141
|
|
160
|
-
# Common environment variable patterns for different providers
|
161
142
|
env_patterns = [
|
162
143
|
f"{provider.upper()}_API_KEY",
|
163
144
|
f"{provider.upper()}_KEY",
|
langroid/mytypes.py
CHANGED
@@ -3,7 +3,7 @@ from textwrap import dedent
|
|
3
3
|
from typing import Any, Callable, Dict, List, Union
|
4
4
|
from uuid import uuid4
|
5
5
|
|
6
|
-
from
|
6
|
+
from pydantic import BaseModel, ConfigDict, Field, field_validator
|
7
7
|
|
8
8
|
Number = Union[int, float]
|
9
9
|
Embedding = List[Number]
|
@@ -51,13 +51,21 @@ class DocMetaData(BaseModel):
|
|
51
51
|
id: str = Field(default_factory=lambda: str(uuid4()))
|
52
52
|
window_ids: List[str] = [] # for RAG: ids of chunks around this one
|
53
53
|
|
54
|
+
@field_validator("id", mode="before")
|
55
|
+
@classmethod
|
56
|
+
def convert_id_to_string(cls, v: Any) -> str:
|
57
|
+
"""Convert id to string if it's not already."""
|
58
|
+
if v is None:
|
59
|
+
return str(uuid4())
|
60
|
+
return str(v)
|
61
|
+
|
54
62
|
def dict_bool_int(self, *args: Any, **kwargs: Any) -> Dict[str, Any]:
|
55
63
|
"""
|
56
64
|
Special dict method to convert bool fields to int, to appease some
|
57
65
|
downstream libraries, e.g. Chroma which complains about bool fields in
|
58
66
|
metadata.
|
59
67
|
"""
|
60
|
-
original_dict = super().
|
68
|
+
original_dict = super().model_dump(*args, **kwargs)
|
61
69
|
|
62
70
|
for key, value in original_dict.items():
|
63
71
|
if isinstance(value, bool):
|
@@ -92,8 +100,7 @@ class DocMetaData(BaseModel):
|
|
92
100
|
)
|
93
101
|
return ", ".join(components)
|
94
102
|
|
95
|
-
|
96
|
-
extra = Extra.allow
|
103
|
+
model_config = ConfigDict(extra="allow")
|
97
104
|
|
98
105
|
|
99
106
|
class Document(BaseModel):
|
langroid/parsing/code_parser.py
CHANGED
@@ -2,12 +2,12 @@ from functools import reduce
|
|
2
2
|
from typing import Callable, List
|
3
3
|
|
4
4
|
import tiktoken
|
5
|
+
from pydantic_settings import BaseSettings
|
5
6
|
from pygments import lex
|
6
7
|
from pygments.lexers import get_lexer_by_name
|
7
8
|
from pygments.token import Token
|
8
9
|
|
9
10
|
from langroid.mytypes import Document
|
10
|
-
from langroid.pydantic_v1 import BaseSettings
|
11
11
|
|
12
12
|
|
13
13
|
def chunk_code(
|
langroid/parsing/md_parser.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
import re
|
2
|
-
from typing import List
|
2
|
+
from typing import Any, List
|
3
3
|
|
4
|
-
from
|
4
|
+
from pydantic import BaseModel, Field, field_validator
|
5
5
|
|
6
6
|
HEADER_CONTEXT_SEP = "\n...\n"
|
7
7
|
|
@@ -24,8 +24,8 @@ class Node(BaseModel):
|
|
24
24
|
# Forward references will be resolved with the update_forward_refs call below.
|
25
25
|
|
26
26
|
|
27
|
-
# Resolve forward references for Node (required for recursive models in Pydantic
|
28
|
-
Node.
|
27
|
+
# Resolve forward references for Node (required for recursive models in Pydantic)
|
28
|
+
Node.model_rebuild()
|
29
29
|
|
30
30
|
|
31
31
|
def _cleanup_text(text: str) -> str:
|
@@ -180,6 +180,16 @@ class MarkdownChunkConfig(BaseModel):
|
|
180
180
|
rollup: bool = True # whether to roll up chunks
|
181
181
|
header_context_sep: str = HEADER_CONTEXT_SEP # separator for header context
|
182
182
|
|
183
|
+
@field_validator("chunk_size", mode="before")
|
184
|
+
@classmethod
|
185
|
+
def convert_chunk_size_to_int(cls, v: Any) -> int:
|
186
|
+
"""Convert chunk_size to int, maintaining backward compatibility
|
187
|
+
with Pydantic V1.
|
188
|
+
"""
|
189
|
+
if isinstance(v, float):
|
190
|
+
return int(v)
|
191
|
+
return int(v)
|
192
|
+
|
183
193
|
|
184
194
|
# A simple tokenizer that counts tokens as whitespace-separated words.
|
185
195
|
def count_words(text: str) -> int:
|
langroid/parsing/parser.py
CHANGED
@@ -4,6 +4,8 @@ from enum import Enum
|
|
4
4
|
from typing import Any, Dict, List, Literal, Optional
|
5
5
|
|
6
6
|
import tiktoken
|
7
|
+
from pydantic import field_validator, model_validator
|
8
|
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
7
9
|
|
8
10
|
from langroid.mytypes import Document
|
9
11
|
from langroid.parsing.md_parser import (
|
@@ -12,7 +14,6 @@ from langroid.parsing.md_parser import (
|
|
12
14
|
count_words,
|
13
15
|
)
|
14
16
|
from langroid.parsing.para_sentence_split import create_chunks, remove_extra_whitespace
|
15
|
-
from langroid.pydantic_v1 import BaseSettings, root_validator
|
16
17
|
from langroid.utils.object_registry import ObjectRegistry
|
17
18
|
|
18
19
|
logger = logging.getLogger(__name__)
|
@@ -32,8 +33,7 @@ class BaseParsingConfig(BaseSettings):
|
|
32
33
|
|
33
34
|
library: str
|
34
35
|
|
35
|
-
|
36
|
-
extra = "ignore" # Ignore unknown settings
|
36
|
+
model_config = SettingsConfigDict(extra="ignore") # Ignore unknown settings
|
37
37
|
|
38
38
|
|
39
39
|
class LLMPdfParserConfig(BaseSettings):
|
@@ -69,7 +69,8 @@ class PdfParsingConfig(BaseParsingConfig):
|
|
69
69
|
llm_parser_config: Optional[LLMPdfParserConfig] = None
|
70
70
|
marker_config: Optional[MarkerConfig] = None
|
71
71
|
|
72
|
-
@
|
72
|
+
@model_validator(mode="before")
|
73
|
+
@classmethod
|
73
74
|
def enable_configs(cls, values: Dict[str, Any]) -> Dict[str, Any]:
|
74
75
|
"""Ensure correct config is set based on library selection."""
|
75
76
|
library = values.get("library")
|
@@ -114,6 +115,17 @@ class ParsingConfig(BaseSettings):
|
|
114
115
|
chunk_size_variation: float = 0.30 # max variation from chunk_size
|
115
116
|
overlap: int = 50 # overlap between chunks
|
116
117
|
max_chunks: int = 10_000
|
118
|
+
|
119
|
+
@field_validator("chunk_size", mode="before")
|
120
|
+
@classmethod
|
121
|
+
def convert_chunk_size_to_int(cls, v: Any) -> int:
|
122
|
+
"""Convert chunk_size to int, maintaining backward compatibility
|
123
|
+
with Pydantic V1.
|
124
|
+
"""
|
125
|
+
if isinstance(v, float):
|
126
|
+
return int(v)
|
127
|
+
return int(v)
|
128
|
+
|
117
129
|
# offset to subtract from page numbers:
|
118
130
|
# e.g. if physical page 12 is displayed as page 1, set page_number_offset = 11
|
119
131
|
page_number_offset: int = 0
|
@@ -203,7 +215,8 @@ class Parser:
|
|
203
215
|
# add_window_ids)
|
204
216
|
chunk_docs = [
|
205
217
|
Document(
|
206
|
-
content=c,
|
218
|
+
content=c,
|
219
|
+
metadata=d.metadata.model_copy(update=dict(is_chunk=True)),
|
207
220
|
)
|
208
221
|
for c in chunks
|
209
222
|
if c.strip() != ""
|
@@ -255,7 +268,8 @@ class Parser:
|
|
255
268
|
# add_window_ids)
|
256
269
|
chunk_docs = [
|
257
270
|
Document(
|
258
|
-
content=c,
|
271
|
+
content=c,
|
272
|
+
metadata=d.metadata.model_copy(update=dict(is_chunk=True)),
|
259
273
|
)
|
260
274
|
for c in chunks
|
261
275
|
if c.strip() != ""
|
@@ -287,7 +301,8 @@ class Parser:
|
|
287
301
|
# add_window_ids)
|
288
302
|
chunk_docs = [
|
289
303
|
Document(
|
290
|
-
content=c,
|
304
|
+
content=c,
|
305
|
+
metadata=d.metadata.model_copy(update=dict(is_chunk=True)),
|
291
306
|
)
|
292
307
|
for c in chunks
|
293
308
|
if c.strip() != ""
|
langroid/parsing/repo_loader.py
CHANGED
@@ -18,10 +18,12 @@ if TYPE_CHECKING:
|
|
18
18
|
from github.Label import Label
|
19
19
|
from github.Repository import Repository
|
20
20
|
|
21
|
+
from pydantic import BaseModel, Field
|
22
|
+
from pydantic_settings import BaseSettings
|
23
|
+
|
21
24
|
from langroid.mytypes import DocMetaData, Document
|
22
25
|
from langroid.parsing.document_parser import DocumentParser, DocumentType
|
23
26
|
from langroid.parsing.parser import Parser, ParsingConfig
|
24
|
-
from langroid.pydantic_v1 import BaseModel, BaseSettings, Field
|
25
27
|
|
26
28
|
logger = logging.getLogger(__name__)
|
27
29
|
|
langroid/parsing/search.py
CHANGED
@@ -64,7 +64,7 @@ def find_fuzzy_matches_in_docs(
|
|
64
64
|
return orig_doc_matches
|
65
65
|
if len(orig_doc_matches) == 0:
|
66
66
|
return []
|
67
|
-
if set(orig_doc_matches[0][0].
|
67
|
+
if set(orig_doc_matches[0][0].model_fields) != {"content", "metadata"}:
|
68
68
|
# If there are fields beyond just content and metadata,
|
69
69
|
# we do NOT want to create new document objects with content fields
|
70
70
|
# based on words_before and words_after, since we don't know how to
|