langroid 0.58.2__py3-none-any.whl → 0.59.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. langroid/agent/base.py +39 -17
  2. langroid/agent/callbacks/chainlit.py +2 -1
  3. langroid/agent/chat_agent.py +73 -55
  4. langroid/agent/chat_document.py +7 -7
  5. langroid/agent/done_sequence_parser.py +46 -11
  6. langroid/agent/openai_assistant.py +9 -9
  7. langroid/agent/special/arangodb/arangodb_agent.py +10 -18
  8. langroid/agent/special/arangodb/tools.py +3 -3
  9. langroid/agent/special/doc_chat_agent.py +16 -14
  10. langroid/agent/special/lance_rag/critic_agent.py +2 -2
  11. langroid/agent/special/lance_rag/query_planner_agent.py +4 -4
  12. langroid/agent/special/lance_tools.py +6 -5
  13. langroid/agent/special/neo4j/neo4j_chat_agent.py +3 -7
  14. langroid/agent/special/relevance_extractor_agent.py +1 -1
  15. langroid/agent/special/sql/sql_chat_agent.py +11 -3
  16. langroid/agent/task.py +53 -94
  17. langroid/agent/tool_message.py +33 -17
  18. langroid/agent/tools/file_tools.py +4 -2
  19. langroid/agent/tools/mcp/fastmcp_client.py +19 -6
  20. langroid/agent/tools/orchestration.py +22 -17
  21. langroid/agent/tools/recipient_tool.py +3 -3
  22. langroid/agent/tools/task_tool.py +22 -16
  23. langroid/agent/xml_tool_message.py +90 -35
  24. langroid/cachedb/base.py +1 -1
  25. langroid/embedding_models/base.py +2 -2
  26. langroid/embedding_models/models.py +3 -7
  27. langroid/exceptions.py +4 -1
  28. langroid/language_models/azure_openai.py +2 -2
  29. langroid/language_models/base.py +6 -4
  30. langroid/language_models/client_cache.py +64 -0
  31. langroid/language_models/config.py +2 -4
  32. langroid/language_models/model_info.py +9 -1
  33. langroid/language_models/openai_gpt.py +119 -20
  34. langroid/language_models/provider_params.py +3 -22
  35. langroid/mytypes.py +11 -4
  36. langroid/parsing/code_parser.py +1 -1
  37. langroid/parsing/file_attachment.py +1 -1
  38. langroid/parsing/md_parser.py +14 -4
  39. langroid/parsing/parser.py +22 -7
  40. langroid/parsing/repo_loader.py +3 -1
  41. langroid/parsing/search.py +1 -1
  42. langroid/parsing/url_loader.py +17 -51
  43. langroid/parsing/urls.py +5 -4
  44. langroid/prompts/prompts_config.py +1 -1
  45. langroid/pydantic_v1/__init__.py +61 -4
  46. langroid/pydantic_v1/main.py +10 -4
  47. langroid/utils/configuration.py +13 -11
  48. langroid/utils/constants.py +1 -1
  49. langroid/utils/globals.py +21 -5
  50. langroid/utils/html_logger.py +2 -1
  51. langroid/utils/object_registry.py +1 -1
  52. langroid/utils/pydantic_utils.py +55 -28
  53. langroid/utils/types.py +2 -2
  54. langroid/vector_store/base.py +3 -3
  55. langroid/vector_store/lancedb.py +5 -5
  56. langroid/vector_store/meilisearch.py +2 -2
  57. langroid/vector_store/pineconedb.py +4 -4
  58. langroid/vector_store/postgres.py +1 -1
  59. langroid/vector_store/qdrantdb.py +3 -3
  60. langroid/vector_store/weaviatedb.py +1 -1
  61. {langroid-0.58.2.dist-info → langroid-0.59.0.dist-info}/METADATA +3 -2
  62. {langroid-0.58.2.dist-info → langroid-0.59.0.dist-info}/RECORD +64 -64
  63. {langroid-0.58.2.dist-info → langroid-0.59.0.dist-info}/WHEEL +0 -0
  64. {langroid-0.58.2.dist-info → langroid-0.59.0.dist-info}/licenses/LICENSE +0 -0
langroid/language_models/openai_gpt.py CHANGED
@@ -12,6 +12,7 @@ from typing import (
     Callable,
     Dict,
     List,
+    Mapping,
     Optional,
     Tuple,
     Type,
@@ -24,6 +25,8 @@ from cerebras.cloud.sdk import AsyncCerebras, Cerebras
 from groq import AsyncGroq, Groq
 from httpx import Timeout
 from openai import AsyncOpenAI, OpenAI
+from pydantic import BaseModel
+from pydantic_settings import BaseSettings, SettingsConfigDict
 from rich import print
 from rich.markup import escape
 
@@ -78,7 +81,6 @@ from langroid.language_models.utils import (
     retry_with_exponential_backoff,
 )
 from langroid.parsing.parse_json import parse_imperfect_json
-from langroid.pydantic_v1 import BaseModel, BaseSettings
 from langroid.utils.configuration import settings
 from langroid.utils.constants import Colors
 from langroid.utils.system import friendly_error
@@ -220,7 +222,7 @@ class OpenAICallParams(BaseModel):
     extra_body: Dict[str, Any] | None = None  # additional params for API request body
 
     def to_dict_exclude_none(self) -> Dict[str, Any]:
-        return {k: v for k, v in self.dict().items() if v is not None}
+        return {k: v for k, v in self.model_dump().items() if v is not None}
 
 
 class LiteLLMProxyConfig(BaseSettings):
@@ -229,8 +231,7 @@ class LiteLLMProxyConfig(BaseSettings):
     api_key: str = ""  # read from env var LITELLM_API_KEY if set
     api_base: str = ""  # read from env var LITELLM_API_BASE if set
 
-    class Config:
-        env_prefix = "LITELLM_"
+    model_config = SettingsConfigDict(env_prefix="LITELLM_")
 
 
 class OpenAIGPTConfig(LLMConfig):
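This `class Config` → `model_config = SettingsConfigDict(...)` rewrite is the recurring Pydantic v2 / pydantic-settings migration pattern throughout this release. A minimal standalone sketch of the new style (illustrative only, not langroid code):

```python
import os

from pydantic_settings import BaseSettings, SettingsConfigDict


class ProxySettings(BaseSettings):
    # Fields are filled from env vars with the given prefix,
    # e.g. LITELLM_API_KEY -> api_key
    model_config = SettingsConfigDict(env_prefix="LITELLM_")

    api_key: str = ""
    api_base: str = ""


os.environ["LITELLM_API_KEY"] = "sk-test"
print(ProxySettings().api_key)  # -> sk-test
```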
@@ -259,7 +260,7 @@ class OpenAIGPTConfig(LLMConfig):
     litellm_proxy: LiteLLMProxyConfig = LiteLLMProxyConfig()
     ollama: bool = False  # use ollama's OpenAI-compatible endpoint?
     min_output_tokens: int = 1
-    use_chat_for_completion = True  # do not change this, for OpenAI models!
+    use_chat_for_completion: bool = True  # do not change this, for OpenAI models!
     timeout: int = 20
     temperature: float = 0.2
     seed: int | None = 42
@@ -287,6 +288,9 @@ class OpenAIGPTConfig(LLMConfig):
     langdb_params: LangDBParams = LangDBParams()
     portkey_params: PortkeyParams = PortkeyParams()
     headers: Dict[str, str] = {}
+    http_client_factory: Optional[Callable[[], Any]] = None  # Factory for httpx.Client
+    http_verify_ssl: bool = True  # Simple flag for SSL verification
+    http_client_config: Optional[Dict[str, Any]] = None  # Config dict for httpx.Client
 
     def __init__(self, **kwargs) -> None:  # type: ignore
         local_model = "api_base" in kwargs and kwargs["api_base"] is not None
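These three new fields are the user-facing knobs for customizing the HTTP transport of the OpenAI client. A hedged usage sketch (it assumes the usual `chat_model` field; model name and kwargs are placeholders):

```python
from langroid.language_models.openai_gpt import OpenAIGPTConfig

# Simplest knob: skip SSL verification (e.g. behind a corporate proxy with
# self-signed certificates); a security warning is logged at client creation.
cfg_insecure = OpenAIGPTConfig(chat_model="gpt-4o", http_verify_ssl=False)

# More control while staying cache-friendly: a plain dict of httpx.Client kwargs.
cfg_custom = OpenAIGPTConfig(
    chat_model="gpt-4o",
    http_client_config={"verify": False, "timeout": 30.0},
)
```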
@@ -313,8 +317,43 @@ class OpenAIGPTConfig(LLMConfig):
 
         super().__init__(**kwargs)
 
-    class Config:
-        env_prefix = "OPENAI_"
+    model_config = SettingsConfigDict(env_prefix="OPENAI_")
+
+    def model_copy(
+        self, *, update: Mapping[str, Any] | None = None, deep: bool = False
+    ) -> "OpenAIGPTConfig":
+        """
+        Override model_copy to handle unpicklable fields properly.
+
+        This preserves fields like http_client_factory during normal copying
+        while still allowing exclusion for pickling operations.
+        """
+        # Save references to unpicklable fields
+        http_client_factory = self.http_client_factory
+        streamer = self.streamer
+        streamer_async = self.streamer_async
+
+        # Get the current model data, excluding problematic fields
+        data = self.model_dump(
+            exclude={"http_client_factory", "streamer", "streamer_async"}
+        )
+
+        # Apply any updates
+        if update:
+            data.update(update)
+
+        # Create a new instance with the copied data
+        new_instance = self.__class__(**data)
+
+        # Restore the unpicklable fields if they weren't overridden by update
+        if "http_client_factory" not in (update or {}):
+            new_instance.http_client_factory = http_client_factory
+        if "streamer" not in (update or {}):
+            new_instance.streamer = streamer
+        if "streamer_async" not in (update or {}):
+            new_instance.streamer_async = streamer_async
+
+        return new_instance
 
     def _validate_litellm(self) -> None:
         """
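To see why this dump-and-restore dance is needed, here is a generic Pydantic sketch (not langroid code): a callable field that is excluded from `model_dump()` would otherwise silently disappear from copies, so the override re-attaches the saved reference afterwards.

```python
from typing import Any, Callable, Optional

from pydantic import BaseModel


class Cfg(BaseModel):
    name: str = "demo"
    # a live callable (like an httpx.Client factory) cannot be serialized
    factory: Optional[Callable[[], Any]] = None


cfg = Cfg(factory=lambda: "client")

data = cfg.model_dump(exclude={"factory"})  # dump without the callable
copy1 = Cfg(**data)
assert copy1.factory is None                # ...so the copy loses it

copy1.factory = cfg.factory                 # restore the saved reference
assert copy1.factory is cfg.factory
```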
@@ -327,12 +366,12 @@ class OpenAIGPTConfig(LLMConfig):
             import litellm
         except ImportError:
             raise LangroidImportError("litellm", "litellm")
+
         litellm.telemetry = False
         litellm.drop_params = True  # drop un-supported params without crashing
-        # modify params to fit the model expectations, and avoid crashing
-        # (e.g. anthropic doesn't like first msg to be system msg)
         litellm.modify_params = True
         self.seed = None  # some local mdls don't support seed
+
         if self.api_key == DUMMY_API_KEY:
             keys_dict = litellm.utils.validate_environment(self.chat_model)
             missing_keys = keys_dict.get("missing_keys", [])
@@ -362,8 +401,7 @@ class OpenAIGPTConfig(LLMConfig):
         class DynamicConfig(OpenAIGPTConfig):
             pass
 
-        DynamicConfig.Config.env_prefix = prefix.upper() + "_"
-
+        DynamicConfig.model_config = SettingsConfigDict(env_prefix=prefix.upper() + "_")
         return DynamicConfig
 
 
@@ -404,7 +442,7 @@ class OpenAIGPT(LanguageModel):
             config: configuration for openai-gpt model
         """
         # copy the config to avoid modifying the original
-        config = config.copy()
+        config = config.model_copy()
         super().__init__(config)
         self.config: OpenAIGPTConfig = config
         # save original model name such as `provider/model` before
@@ -631,6 +669,32 @@ class OpenAIGPT(LanguageModel):
         # Add Portkey-specific headers
         self.config.headers.update(self.config.portkey_params.get_headers())
 
+        # Create http_client if needed - Priority order:
+        # 1. http_client_factory (most flexibility, not cacheable)
+        # 2. http_client_config (cacheable, moderate flexibility)
+        # 3. http_verify_ssl=False (cacheable, simple SSL bypass)
+        http_client = None
+        async_http_client = None
+        http_client_config_used = None
+
+        if self.config.http_client_factory is not None:
+            # Use the factory to create http_client (not cacheable)
+            http_client = self.config.http_client_factory()
+            # Don't set async_http_client from sync client - create separately
+            # This avoids type mismatch issues
+            async_http_client = None
+        elif self.config.http_client_config is not None:
+            # Use config dict (cacheable)
+            http_client_config_used = self.config.http_client_config
+        elif not self.config.http_verify_ssl:
+            # Simple SSL bypass (cacheable)
+            http_client_config_used = {"verify": False}
+            logging.warning(
+                "SSL verification has been disabled. This is insecure and "
+                "should only be used in trusted environments (e.g., "
+                "corporate networks with self-signed certificates)."
+            )
+
         if self.config.use_cached_client:
             self.client = get_openai_client(
                 api_key=self.api_key,
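Given this precedence, a factory takes priority over `http_client_config`, which in turn takes priority over `http_verify_ssl=False`. A hedged sketch of the factory option (paths and timeouts are placeholders; since a live client object cannot serve as a cache key, this path bypasses the shared-client cache, per the comments above):

```python
import httpx

from langroid.language_models.openai_gpt import OpenAIGPTConfig


def make_client() -> httpx.Client:
    # full control over the transport: custom CA bundle, timeouts, etc.
    return httpx.Client(
        verify="/etc/ssl/certs/corp-ca.pem",  # placeholder CA bundle path
        timeout=30.0,
    )


cfg = OpenAIGPTConfig(chat_model="gpt-4o", http_client_factory=make_client)
```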
@@ -638,6 +702,8 @@ class OpenAIGPT(LanguageModel):
                 organization=self.config.organization,
                 timeout=Timeout(self.config.timeout),
                 default_headers=self.config.headers,
+                http_client=http_client,
+                http_client_config=http_client_config_used,
             )
             self.async_client = get_async_openai_client(
                 api_key=self.api_key,
@@ -645,23 +711,56 @@ class OpenAIGPT(LanguageModel):
                 organization=self.config.organization,
                 timeout=Timeout(self.config.timeout),
                 default_headers=self.config.headers,
+                http_client=async_http_client,
+                http_client_config=http_client_config_used,
             )
         else:
             # Create new clients without caching
-            self.client = OpenAI(
+            client_kwargs: Dict[str, Any] = dict(
                 api_key=self.api_key,
                 base_url=self.api_base,
                 organization=self.config.organization,
                 timeout=Timeout(self.config.timeout),
                 default_headers=self.config.headers,
             )
-            self.async_client = AsyncOpenAI(
+            if http_client is not None:
+                client_kwargs["http_client"] = http_client
+            elif http_client_config_used is not None:
+                # Create http_client from config for non-cached scenario
+                try:
+                    from httpx import Client
+
+                    client_kwargs["http_client"] = Client(**http_client_config_used)
+                except ImportError:
+                    raise ValueError(
+                        "httpx is required to use http_client_config. "
+                        "Install it with: pip install httpx"
+                    )
+            self.client = OpenAI(**client_kwargs)
+
+            async_client_kwargs: Dict[str, Any] = dict(
                 api_key=self.api_key,
                 base_url=self.api_base,
                 organization=self.config.organization,
                 timeout=Timeout(self.config.timeout),
                 default_headers=self.config.headers,
             )
+            if async_http_client is not None:
+                async_client_kwargs["http_client"] = async_http_client
+            elif http_client_config_used is not None:
+                # Create async http_client from config for non-cached scenario
+                try:
+                    from httpx import AsyncClient
+
+                    async_client_kwargs["http_client"] = AsyncClient(
+                        **http_client_config_used
+                    )
+                except ImportError:
+                    raise ValueError(
+                        "httpx is required to use http_client_config. "
+                        "Install it with: pip install httpx"
+                    )
+            self.async_client = AsyncOpenAI(**async_client_kwargs)
 
         self.cache: CacheDB | None = None
         use_cache = self.config.cache_config is not None
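The "cacheable vs. not cacheable" distinction in the comments comes down to key derivation: a plain dict of httpx kwargs can be serialized into a deterministic cache key, while a client object produced by a factory cannot. A rough illustration of the idea only, not the actual `client_cache` implementation:

```python
import json
from typing import Any, Dict, Optional


def client_cache_key(
    api_key: str,
    base_url: Optional[str],
    http_client_config: Optional[Dict[str, Any]],
) -> str:
    # a dict config serializes to a stable key, so the resulting client can be
    # shared and reused; a live httpx.Client instance has no such stable key
    return json.dumps(
        {"api_key": api_key, "base_url": base_url, "http": http_client_config},
        sort_keys=True,
    )


assert client_cache_key("k", None, {"verify": False}) == client_cache_key(
    "k", None, {"verify": False}
)
```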
@@ -1413,7 +1512,7 @@ class OpenAIGPT(LanguageModel):
 
         if has_function:
             function_call = LLMFunctionCall(name=function_name)
-            function_call_dict = function_call.dict()
+            function_call_dict = function_call.model_dump()
             if function_args == "":
                 function_call.arguments = None
             else:
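Most of the remaining changes in this file are the mechanical Pydantic v2 rename of `.dict()` to `.model_dump()`. A quick reminder of the difference (plain Pydantic v2, hypothetical model):

```python
from typing import Optional

from pydantic import BaseModel


class FunctionCall(BaseModel):
    name: str
    arguments: Optional[str] = None


fc = FunctionCall(name="get_weather")
print(fc.model_dump())  # {'name': 'get_weather', 'arguments': None}
# fc.dict() still works in Pydantic v2 but emits a deprecation warning.
```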
@@ -1465,7 +1564,7 @@ class OpenAIGPT(LanguageModel):
                     ),
                 ),
             ),
-            openai_response.dict(),
+            openai_response.model_dump(),
         )
 
     def _cache_store(self, k: str, v: Any) -> None:
@@ -1616,7 +1715,7 @@ class OpenAIGPT(LanguageModel):
         cached, hashed_key, response = completions_with_backoff(**args)
         # assume response is an actual response rather than a streaming event
         if not isinstance(response, dict):
-            response = response.dict()
+            response = response.model_dump()
         if "message" in response["choices"][0]:
             msg = response["choices"][0]["message"]["content"].strip()
         else:
@@ -1694,7 +1793,7 @@ class OpenAIGPT(LanguageModel):
         )
         # assume response is an actual response rather than a streaming event
         if not isinstance(response, dict):
-            response = response.dict()
+            response = response.model_dump()
         if "message" in response["choices"][0]:
             msg = response["choices"][0]["message"]["content"].strip()
         else:
@@ -1992,7 +2091,7 @@ class OpenAIGPT(LanguageModel):
         if functions is not None:
             args.update(
                 dict(
-                    functions=[f.dict() for f in functions],
+                    functions=[f.model_dump() for f in functions],
                     function_call=function_call,
                 )
             )
@@ -2010,7 +2109,7 @@ class OpenAIGPT(LanguageModel):
                 tools=[
                     dict(
                         type="function",
-                        function=t.function.dict()
+                        function=t.function.model_dump()
                         | ({"strict": t.strict} if t.strict is not None else {}),
                     )
                     for t in tools
langroid/language_models/provider_params.py CHANGED
@@ -4,7 +4,7 @@ Provider-specific parameter configurations for various LLM providers.
 
 from typing import Any, Dict, Optional
 
-from langroid.pydantic_v1 import BaseSettings
+from pydantic_settings import BaseSettings, SettingsConfigDict
 
 # Constants
 LANGDB_BASE_URL = "https://api.us-east-1.langdb.ai"
@@ -24,10 +24,7 @@ class LangDBParams(BaseSettings):
     thread_id: Optional[str] = None
     base_url: str = LANGDB_BASE_URL
 
-    class Config:
-        # allow setting of fields via env vars,
-        # e.g. LANGDB_PROJECT_ID=1234
-        env_prefix = "LANGDB_"
+    model_config = SettingsConfigDict(env_prefix="LANGDB_")
 
 
 class PortkeyParams(BaseSettings):
@@ -61,10 +58,7 @@ class PortkeyParams(BaseSettings):
     custom_headers: Optional[Dict[str, str]] = None  # Optional: additional headers
     base_url: str = PORTKEY_BASE_URL
 
-    class Config:
-        # allow setting of fields via env vars,
-        # e.g. PORTKEY_API_KEY=xxx, PORTKEY_PROVIDER=anthropic
-        env_prefix = "PORTKEY_"
+    model_config = SettingsConfigDict(env_prefix="PORTKEY_")
 
     def get_headers(self) -> Dict[str, str]:
         """Generate Portkey-specific headers from parameters."""
@@ -73,7 +67,6 @@ class PortkeyParams(BaseSettings):
 
         headers = {}
 
-        # API key - from params or environment
         if self.api_key and self.api_key != DUMMY_API_KEY:
             headers["x-portkey-api-key"] = self.api_key
         else:
@@ -81,45 +74,35 @@ class PortkeyParams(BaseSettings):
             if portkey_key:
                 headers["x-portkey-api-key"] = portkey_key
 
-        # Provider
         if self.provider:
             headers["x-portkey-provider"] = self.provider
 
-        # Virtual key
         if self.virtual_key:
             headers["x-portkey-virtual-key"] = self.virtual_key
 
-        # Trace ID
         if self.trace_id:
             headers["x-portkey-trace-id"] = self.trace_id
 
-        # Metadata
         if self.metadata:
             headers["x-portkey-metadata"] = json.dumps(self.metadata)
 
-        # Retry configuration
         if self.retry:
             headers["x-portkey-retry"] = json.dumps(self.retry)
 
-        # Cache configuration
         if self.cache:
             headers["x-portkey-cache"] = json.dumps(self.cache)
 
-        # Cache force refresh
         if self.cache_force_refresh is not None:
             headers["x-portkey-cache-force-refresh"] = str(
                 self.cache_force_refresh
             ).lower()
 
-        # User identifier
         if self.user:
             headers["x-portkey-user"] = self.user
 
-        # Organization identifier
         if self.organization:
             headers["x-portkey-organization"] = self.organization
 
-        # Add any custom headers
         if self.custom_headers:
             headers.update(self.custom_headers)
 
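Based on the branches above, a call along these lines should yield the Portkey routing headers (all values are placeholders):

```python
from langroid.language_models.provider_params import PortkeyParams

params = PortkeyParams(
    api_key="pk-xxx",  # placeholder
    provider="anthropic",
    trace_id="trace-123",
    metadata={"team": "search"},
)
print(params.get_headers())
# expected, per the branches above:
# {'x-portkey-api-key': 'pk-xxx', 'x-portkey-provider': 'anthropic',
#  'x-portkey-trace-id': 'trace-123', 'x-portkey-metadata': '{"team": "search"}'}
```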
@@ -138,7 +121,6 @@ class PortkeyParams(BaseSettings):
             _, provider, model = parts
             return provider, model
         else:
-            # Fallback: just remove "portkey/" prefix and return empty provider
             model = model_string.replace("portkey/", "")
             return "", model
 
@@ -157,7 +139,6 @@ class PortkeyParams(BaseSettings):
         """
         import os
 
-        # Common environment variable patterns for different providers
        env_patterns = [
            f"{provider.upper()}_API_KEY",
            f"{provider.upper()}_KEY",
langroid/mytypes.py CHANGED
@@ -3,7 +3,7 @@ from textwrap import dedent
 from typing import Any, Callable, Dict, List, Union
 from uuid import uuid4
 
-from langroid.pydantic_v1 import BaseModel, Extra, Field
+from pydantic import BaseModel, ConfigDict, Field, field_validator
 
 Number = Union[int, float]
 Embedding = List[Number]
@@ -51,13 +51,21 @@ class DocMetaData(BaseModel):
     id: str = Field(default_factory=lambda: str(uuid4()))
     window_ids: List[str] = []  # for RAG: ids of chunks around this one
 
+    @field_validator("id", mode="before")
+    @classmethod
+    def convert_id_to_string(cls, v: Any) -> str:
+        """Convert id to string if it's not already."""
+        if v is None:
+            return str(uuid4())
+        return str(v)
+
     def dict_bool_int(self, *args: Any, **kwargs: Any) -> Dict[str, Any]:
         """
         Special dict method to convert bool fields to int, to appease some
         downstream libraries, e.g. Chroma which complains about bool fields in
         metadata.
         """
-        original_dict = super().dict(*args, **kwargs)
+        original_dict = super().model_dump(*args, **kwargs)
 
         for key, value in original_dict.items():
             if isinstance(value, bool):
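With the `mode="before"` validator added above, non-string ids should now be coerced instead of rejected. A small sketch of the expected behavior (assuming langroid 0.59.0):

```python
from langroid.mytypes import DocMetaData

meta = DocMetaData(id=12345)   # int id coerced by the "before" validator
print(type(meta.id), meta.id)  # <class 'str'> 12345

meta2 = DocMetaData(id=None)   # None falls back to a fresh uuid4 string
print(len(meta2.id) > 0)       # True
```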
@@ -92,8 +100,7 @@ class DocMetaData(BaseModel):
             )
         return ", ".join(components)
 
-    class Config:
-        extra = Extra.allow
+    model_config = ConfigDict(extra="allow")
 
 
 class Document(BaseModel):
langroid/parsing/code_parser.py CHANGED
@@ -2,12 +2,12 @@ from functools import reduce
 from typing import Callable, List
 
 import tiktoken
+from pydantic_settings import BaseSettings
 from pygments import lex
 from pygments.lexers import get_lexer_by_name
 from pygments.token import Token
 
 from langroid.mytypes import Document
-from langroid.pydantic_v1 import BaseSettings
 
 
 def chunk_code(
langroid/parsing/file_attachment.py CHANGED
@@ -5,7 +5,7 @@ from pathlib import Path
 from typing import Any, BinaryIO, Dict, Optional, Union
 from urllib.parse import urlparse
 
-from langroid.pydantic_v1 import BaseModel
+from pydantic import BaseModel
 
 
 class FileAttachment(BaseModel):
langroid/parsing/md_parser.py CHANGED
@@ -1,7 +1,7 @@
 import re
-from typing import List
+from typing import Any, List
 
-from langroid.pydantic_v1 import BaseModel, Field
+from pydantic import BaseModel, Field, field_validator
 
 HEADER_CONTEXT_SEP = "\n...\n"
 
@@ -24,8 +24,8 @@ class Node(BaseModel):
     # Forward references will be resolved with the update_forward_refs call below.
 
 
-# Resolve forward references for Node (required for recursive models in Pydantic v1)
-Node.update_forward_refs()
+# Resolve forward references for Node (required for recursive models in Pydantic)
+Node.model_rebuild()
 
 
 def _cleanup_text(text: str) -> str:
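`model_rebuild()` is the Pydantic v2 replacement for `update_forward_refs()` when a model refers to itself. A minimal standalone sketch (not the actual Node definition):

```python
from typing import List

from pydantic import BaseModel


class TreeNode(BaseModel):
    content: str
    children: List["TreeNode"] = []  # forward reference to the class itself


# resolve the forward reference before the model is used (v2 API)
TreeNode.model_rebuild()

tree = TreeNode(content="root", children=[TreeNode(content="leaf")])
print(tree.children[0].content)  # leaf
```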
@@ -180,6 +180,16 @@ class MarkdownChunkConfig(BaseModel):
     rollup: bool = True  # whether to roll up chunks
     header_context_sep: str = HEADER_CONTEXT_SEP  # separator for header context
 
+    @field_validator("chunk_size", mode="before")
+    @classmethod
+    def convert_chunk_size_to_int(cls, v: Any) -> int:
+        """Convert chunk_size to int, maintaining backward compatibility
+        with Pydantic V1.
+        """
+        if isinstance(v, float):
+            return int(v)
+        return int(v)
+
 
 # A simple tokenizer that counts tokens as whitespace-separated words.
 def count_words(text: str) -> int:
langroid/parsing/parser.py CHANGED
@@ -4,6 +4,8 @@ from enum import Enum
 from typing import Any, Dict, List, Literal, Optional
 
 import tiktoken
+from pydantic import field_validator, model_validator
+from pydantic_settings import BaseSettings, SettingsConfigDict
 
 from langroid.mytypes import Document
 from langroid.parsing.md_parser import (
@@ -12,7 +14,6 @@ from langroid.parsing.md_parser import (
     count_words,
 )
 from langroid.parsing.para_sentence_split import create_chunks, remove_extra_whitespace
-from langroid.pydantic_v1 import BaseSettings, root_validator
 from langroid.utils.object_registry import ObjectRegistry
 
 logger = logging.getLogger(__name__)
@@ -32,8 +33,7 @@ class BaseParsingConfig(BaseSettings):
 
     library: str
 
-    class Config:
-        extra = "ignore"  # Ignore unknown settings
+    model_config = SettingsConfigDict(extra="ignore")  # Ignore unknown settings
 
 
 class LLMPdfParserConfig(BaseSettings):
@@ -69,7 +69,8 @@ class PdfParsingConfig(BaseParsingConfig):
     llm_parser_config: Optional[LLMPdfParserConfig] = None
     marker_config: Optional[MarkerConfig] = None
 
-    @root_validator(pre=True)
+    @model_validator(mode="before")
+    @classmethod
     def enable_configs(cls, values: Dict[str, Any]) -> Dict[str, Any]:
         """Ensure correct config is set based on library selection."""
         library = values.get("library")
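The `@root_validator(pre=True)` → `@model_validator(mode="before")` swap is the standard v2 migration for whole-model validators; the validator now also carries an explicit `@classmethod`. A generic sketch (not langroid code):

```python
from typing import Any, Dict

from pydantic import BaseModel, model_validator


class LibraryChoice(BaseModel):
    library: str
    options: Dict[str, Any] = {}

    @model_validator(mode="before")
    @classmethod
    def fill_defaults(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        # runs on the raw input dict, before field validation,
        # mirroring the old root_validator(pre=True) behavior
        values.setdefault("options", {"library": values.get("library")})
        return values


print(LibraryChoice(library="pymupdf4llm").options)  # {'library': 'pymupdf4llm'}
```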
@@ -114,6 +115,17 @@ class ParsingConfig(BaseSettings):
     chunk_size_variation: float = 0.30  # max variation from chunk_size
     overlap: int = 50  # overlap between chunks
     max_chunks: int = 10_000
+
+    @field_validator("chunk_size", mode="before")
+    @classmethod
+    def convert_chunk_size_to_int(cls, v: Any) -> int:
+        """Convert chunk_size to int, maintaining backward compatibility
+        with Pydantic V1.
+        """
+        if isinstance(v, float):
+            return int(v)
+        return int(v)
+
     # offset to subtract from page numbers:
     # e.g. if physical page 12 is displayed as page 1, set page_number_offset = 11
     page_number_offset: int = 0
@@ -203,7 +215,8 @@ class Parser:
         # add_window_ids)
         chunk_docs = [
             Document(
-                content=c, metadata=d.metadata.copy(update=dict(is_chunk=True))
+                content=c,
+                metadata=d.metadata.model_copy(update=dict(is_chunk=True)),
             )
             for c in chunks
             if c.strip() != ""
@@ -255,7 +268,8 @@ class Parser:
         # add_window_ids)
         chunk_docs = [
             Document(
-                content=c, metadata=d.metadata.copy(update=dict(is_chunk=True))
+                content=c,
+                metadata=d.metadata.model_copy(update=dict(is_chunk=True)),
            )
            for c in chunks
            if c.strip() != ""
@@ -287,7 +301,8 @@ class Parser:
         # add_window_ids)
         chunk_docs = [
             Document(
-                content=c, metadata=d.metadata.copy(update=dict(is_chunk=True))
+                content=c,
+                metadata=d.metadata.model_copy(update=dict(is_chunk=True)),
             )
             for c in chunks
             if c.strip() != ""
langroid/parsing/repo_loader.py CHANGED
@@ -18,10 +18,12 @@ if TYPE_CHECKING:
     from github.Label import Label
     from github.Repository import Repository
 
+from pydantic import BaseModel, Field
+from pydantic_settings import BaseSettings
+
 from langroid.mytypes import DocMetaData, Document
 from langroid.parsing.document_parser import DocumentParser, DocumentType
 from langroid.parsing.parser import Parser, ParsingConfig
-from langroid.pydantic_v1 import BaseModel, BaseSettings, Field
 
 logger = logging.getLogger(__name__)
 
langroid/parsing/search.py CHANGED
@@ -64,7 +64,7 @@ def find_fuzzy_matches_in_docs(
         return orig_doc_matches
     if len(orig_doc_matches) == 0:
         return []
-    if set(orig_doc_matches[0][0].__fields__) != {"content", "metadata"}:
+    if set(orig_doc_matches[0][0].model_fields) != {"content", "metadata"}:
         # If there are fields beyond just content and metadata,
         # we do NOT want to create new document objects with content fields
         # based on words_before and words_after, since we don't know how to
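`model_fields` is the Pydantic v2 counterpart of the v1 `__fields__` mapping; iterating over it yields field names, so the set comparison above behaves the same as before. A quick sketch:

```python
from typing import Dict

from pydantic import BaseModel


class Doc(BaseModel):
    content: str
    metadata: Dict[str, str] = {}


# model_fields maps field names to FieldInfo objects (Pydantic v2)
print(set(Doc.model_fields))  # {'content', 'metadata'}
```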