langroid 0.58.3__py3-none-any.whl → 0.59.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. langroid/agent/base.py +39 -17
  2. langroid/agent/callbacks/chainlit.py +2 -1
  3. langroid/agent/chat_agent.py +73 -55
  4. langroid/agent/chat_document.py +7 -7
  5. langroid/agent/done_sequence_parser.py +46 -11
  6. langroid/agent/openai_assistant.py +9 -9
  7. langroid/agent/special/arangodb/arangodb_agent.py +10 -18
  8. langroid/agent/special/arangodb/tools.py +3 -3
  9. langroid/agent/special/doc_chat_agent.py +16 -14
  10. langroid/agent/special/lance_rag/critic_agent.py +2 -2
  11. langroid/agent/special/lance_rag/query_planner_agent.py +4 -4
  12. langroid/agent/special/lance_tools.py +6 -5
  13. langroid/agent/special/neo4j/neo4j_chat_agent.py +3 -7
  14. langroid/agent/special/relevance_extractor_agent.py +1 -1
  15. langroid/agent/special/sql/sql_chat_agent.py +11 -3
  16. langroid/agent/task.py +53 -94
  17. langroid/agent/tool_message.py +33 -17
  18. langroid/agent/tools/file_tools.py +4 -2
  19. langroid/agent/tools/mcp/fastmcp_client.py +19 -6
  20. langroid/agent/tools/orchestration.py +22 -17
  21. langroid/agent/tools/recipient_tool.py +3 -3
  22. langroid/agent/tools/task_tool.py +22 -16
  23. langroid/agent/xml_tool_message.py +90 -35
  24. langroid/cachedb/base.py +1 -1
  25. langroid/embedding_models/base.py +2 -2
  26. langroid/embedding_models/models.py +3 -7
  27. langroid/exceptions.py +4 -1
  28. langroid/language_models/azure_openai.py +2 -2
  29. langroid/language_models/base.py +6 -4
  30. langroid/language_models/config.py +2 -4
  31. langroid/language_models/model_info.py +9 -1
  32. langroid/language_models/openai_gpt.py +53 -18
  33. langroid/language_models/provider_params.py +3 -22
  34. langroid/mytypes.py +11 -4
  35. langroid/parsing/code_parser.py +1 -1
  36. langroid/parsing/file_attachment.py +1 -1
  37. langroid/parsing/md_parser.py +14 -4
  38. langroid/parsing/parser.py +22 -7
  39. langroid/parsing/repo_loader.py +3 -1
  40. langroid/parsing/search.py +1 -1
  41. langroid/parsing/url_loader.py +17 -51
  42. langroid/parsing/urls.py +5 -4
  43. langroid/prompts/prompts_config.py +1 -1
  44. langroid/pydantic_v1/__init__.py +61 -4
  45. langroid/pydantic_v1/main.py +10 -4
  46. langroid/utils/configuration.py +13 -11
  47. langroid/utils/constants.py +1 -1
  48. langroid/utils/globals.py +21 -5
  49. langroid/utils/html_logger.py +2 -1
  50. langroid/utils/object_registry.py +1 -1
  51. langroid/utils/pydantic_utils.py +55 -28
  52. langroid/utils/types.py +2 -2
  53. langroid/vector_store/base.py +3 -3
  54. langroid/vector_store/lancedb.py +5 -5
  55. langroid/vector_store/meilisearch.py +2 -2
  56. langroid/vector_store/pineconedb.py +4 -4
  57. langroid/vector_store/postgres.py +1 -1
  58. langroid/vector_store/qdrantdb.py +3 -3
  59. langroid/vector_store/weaviatedb.py +1 -1
  60. {langroid-0.58.3.dist-info → langroid-0.59.0.dist-info}/METADATA +3 -2
  61. {langroid-0.58.3.dist-info → langroid-0.59.0.dist-info}/RECORD +63 -63
  62. {langroid-0.58.3.dist-info → langroid-0.59.0.dist-info}/WHEEL +0 -0
  63. {langroid-0.58.3.dist-info → langroid-0.59.0.dist-info}/licenses/LICENSE +0 -0
langroid/language_models/openai_gpt.py CHANGED
@@ -12,6 +12,7 @@ from typing import (
     Callable,
     Dict,
     List,
+    Mapping,
     Optional,
     Tuple,
     Type,
@@ -24,6 +25,8 @@ from cerebras.cloud.sdk import AsyncCerebras, Cerebras
 from groq import AsyncGroq, Groq
 from httpx import Timeout
 from openai import AsyncOpenAI, OpenAI
+from pydantic import BaseModel
+from pydantic_settings import BaseSettings, SettingsConfigDict
 from rich import print
 from rich.markup import escape
 
@@ -78,7 +81,6 @@ from langroid.language_models.utils import (
     retry_with_exponential_backoff,
 )
 from langroid.parsing.parse_json import parse_imperfect_json
-from langroid.pydantic_v1 import BaseModel, BaseSettings
 from langroid.utils.configuration import settings
 from langroid.utils.constants import Colors
 from langroid.utils.system import friendly_error
@@ -220,7 +222,7 @@ class OpenAICallParams(BaseModel):
     extra_body: Dict[str, Any] | None = None  # additional params for API request body
 
     def to_dict_exclude_none(self) -> Dict[str, Any]:
-        return {k: v for k, v in self.dict().items() if v is not None}
+        return {k: v for k, v in self.model_dump().items() if v is not None}
 
 
 class LiteLLMProxyConfig(BaseSettings):
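The `.dict()` → `.model_dump()` rename above is the most repeated change in this release. A minimal sketch of the pattern, using a hypothetical stand-in model rather than langroid's actual class:

from typing import Any, Dict, Optional

from pydantic import BaseModel


class CallParams(BaseModel):  # hypothetical stand-in for OpenAICallParams
    temperature: Optional[float] = None
    max_tokens: Optional[int] = None

    def to_dict_exclude_none(self) -> Dict[str, Any]:
        # model_dump() is Pydantic v2's replacement for v1's dict()
        return {k: v for k, v in self.model_dump().items() if v is not None}


print(CallParams(temperature=0.2).to_dict_exclude_none())  # {'temperature': 0.2}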
@@ -229,8 +231,7 @@ class LiteLLMProxyConfig(BaseSettings):
     api_key: str = ""  # read from env var LITELLM_API_KEY if set
     api_base: str = ""  # read from env var LITELLM_API_BASE if set
 
-    class Config:
-        env_prefix = "LITELLM_"
+    model_config = SettingsConfigDict(env_prefix="LITELLM_")
 
 
 class OpenAIGPTConfig(LLMConfig):
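Likewise, every inner `class Config` on a settings class becomes a `model_config = SettingsConfigDict(...)` attribute, with env-var loading now supplied by the separate pydantic-settings package. A small sketch of how the `LITELLM_` prefix behaves, assuming pydantic-settings v2 is installed:

import os

from pydantic_settings import BaseSettings, SettingsConfigDict


class ProxySettings(BaseSettings):  # hypothetical stand-in for LiteLLMProxyConfig
    api_key: str = ""
    api_base: str = ""

    model_config = SettingsConfigDict(env_prefix="LITELLM_")


os.environ["LITELLM_API_KEY"] = "sk-test"
print(ProxySettings().api_key)  # "sk-test", read from the prefixed env var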
@@ -259,7 +260,7 @@ class OpenAIGPTConfig(LLMConfig):
     litellm_proxy: LiteLLMProxyConfig = LiteLLMProxyConfig()
     ollama: bool = False  # use ollama's OpenAI-compatible endpoint?
     min_output_tokens: int = 1
-    use_chat_for_completion = True  # do not change this, for OpenAI models!
+    use_chat_for_completion: bool = True  # do not change this, for OpenAI models!
     timeout: int = 20
     temperature: float = 0.2
     seed: int | None = 42
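The added `: bool` annotation is not cosmetic: Pydantic v1 inferred a field from a bare default, while v2 raises `PydanticUserError` for non-annotated attributes. A two-line illustration:

from pydantic import BaseModel


class Ok(BaseModel):
    use_chat_for_completion: bool = True  # v2 requires the annotation


# class Bad(BaseModel):
#     use_chat_for_completion = True  # v2: PydanticUserError, non-annotated attribute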
@@ -316,8 +317,43 @@ class OpenAIGPTConfig(LLMConfig):
 
         super().__init__(**kwargs)
 
-    class Config:
-        env_prefix = "OPENAI_"
+    model_config = SettingsConfigDict(env_prefix="OPENAI_")
+
+    def model_copy(
+        self, *, update: Mapping[str, Any] | None = None, deep: bool = False
+    ) -> "OpenAIGPTConfig":
+        """
+        Override model_copy to handle unpicklable fields properly.
+
+        This preserves fields like http_client_factory during normal copying
+        while still allowing exclusion for pickling operations.
+        """
+        # Save references to unpicklable fields
+        http_client_factory = self.http_client_factory
+        streamer = self.streamer
+        streamer_async = self.streamer_async
+
+        # Get the current model data, excluding problematic fields
+        data = self.model_dump(
+            exclude={"http_client_factory", "streamer", "streamer_async"}
+        )
+
+        # Apply any updates
+        if update:
+            data.update(update)
+
+        # Create a new instance with the copied data
+        new_instance = self.__class__(**data)
+
+        # Restore the unpicklable fields if they weren't overridden by update
+        if "http_client_factory" not in (update or {}):
+            new_instance.http_client_factory = http_client_factory
+        if "streamer" not in (update or {}):
+            new_instance.streamer = streamer
+        if "streamer_async" not in (update or {}):
+            new_instance.streamer_async = streamer_async
+
+        return new_instance
 
     def _validate_litellm(self) -> None:
         """
@@ -330,12 +366,12 @@ class OpenAIGPTConfig(LLMConfig):
             import litellm
         except ImportError:
             raise LangroidImportError("litellm", "litellm")
+
         litellm.telemetry = False
         litellm.drop_params = True  # drop un-supported params without crashing
-        # modify params to fit the model expectations, and avoid crashing
-        # (e.g. anthropic doesn't like first msg to be system msg)
         litellm.modify_params = True
         self.seed = None  # some local mdls don't support seed
+
         if self.api_key == DUMMY_API_KEY:
             keys_dict = litellm.utils.validate_environment(self.chat_model)
             missing_keys = keys_dict.get("missing_keys", [])
@@ -365,8 +401,7 @@ class OpenAIGPTConfig(LLMConfig):
         class DynamicConfig(OpenAIGPTConfig):
             pass
 
-        DynamicConfig.Config.env_prefix = prefix.upper() + "_"
-
+        DynamicConfig.model_config = SettingsConfigDict(env_prefix=prefix.upper() + "_")
         return DynamicConfig
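Reassigning `model_config` on the dynamically created subclass works because pydantic-settings consults it at instantiation time. A self-contained sketch of the same trick, with a hypothetical `GROQ_` prefix:

import os

from pydantic_settings import BaseSettings, SettingsConfigDict


class LLMSettings(BaseSettings):  # hypothetical base, analogous to OpenAIGPTConfig
    api_key: str = ""


def settings_for(prefix: str) -> type[LLMSettings]:
    class DynamicConfig(LLMSettings):
        pass

    # swap the env prefix after class creation, as the diff does
    DynamicConfig.model_config = SettingsConfigDict(env_prefix=prefix.upper() + "_")
    return DynamicConfig


os.environ["GROQ_API_KEY"] = "gsk-test"
print(settings_for("groq")().api_key)  # "gsk-test"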
@@ -407,7 +442,7 @@ class OpenAIGPT(LanguageModel):
             config: configuration for openai-gpt model
         """
         # copy the config to avoid modifying the original
-        config = config.copy()
+        config = config.model_copy()
         super().__init__(config)
         self.config: OpenAIGPTConfig = config
         # save original model name such as `provider/model` before
@@ -1477,7 +1512,7 @@ class OpenAIGPT(LanguageModel):
 
         if has_function:
             function_call = LLMFunctionCall(name=function_name)
-            function_call_dict = function_call.dict()
+            function_call_dict = function_call.model_dump()
             if function_args == "":
                 function_call.arguments = None
             else:
@@ -1529,7 +1564,7 @@ class OpenAIGPT(LanguageModel):
                     ),
                 ),
             ),
-            openai_response.dict(),
+            openai_response.model_dump(),
         )
 
     def _cache_store(self, k: str, v: Any) -> None:
@@ -1680,7 +1715,7 @@ class OpenAIGPT(LanguageModel):
         cached, hashed_key, response = completions_with_backoff(**args)
         # assume response is an actual response rather than a streaming event
         if not isinstance(response, dict):
-            response = response.dict()
+            response = response.model_dump()
         if "message" in response["choices"][0]:
             msg = response["choices"][0]["message"]["content"].strip()
         else:
@@ -1758,7 +1793,7 @@ class OpenAIGPT(LanguageModel):
         )
         # assume response is an actual response rather than a streaming event
         if not isinstance(response, dict):
-            response = response.dict()
+            response = response.model_dump()
         if "message" in response["choices"][0]:
             msg = response["choices"][0]["message"]["content"].strip()
         else:
@@ -2056,7 +2091,7 @@ class OpenAIGPT(LanguageModel):
         if functions is not None:
             args.update(
                 dict(
-                    functions=[f.dict() for f in functions],
+                    functions=[f.model_dump() for f in functions],
                     function_call=function_call,
                 )
             )
@@ -2074,7 +2109,7 @@ class OpenAIGPT(LanguageModel):
                 tools=[
                     dict(
                         type="function",
-                        function=t.function.dict()
+                        function=t.function.model_dump()
                         | ({"strict": t.strict} if t.strict is not None else {}),
                     )
                     for t in tools
langroid/language_models/provider_params.py CHANGED
@@ -4,7 +4,7 @@ Provider-specific parameter configurations for various LLM providers.
 
 from typing import Any, Dict, Optional
 
-from langroid.pydantic_v1 import BaseSettings
+from pydantic_settings import BaseSettings, SettingsConfigDict
 
 # Constants
 LANGDB_BASE_URL = "https://api.us-east-1.langdb.ai"
@@ -24,10 +24,7 @@ class LangDBParams(BaseSettings):
     thread_id: Optional[str] = None
     base_url: str = LANGDB_BASE_URL
 
-    class Config:
-        # allow setting of fields via env vars,
-        # e.g. LANGDB_PROJECT_ID=1234
-        env_prefix = "LANGDB_"
+    model_config = SettingsConfigDict(env_prefix="LANGDB_")
 
 
 class PortkeyParams(BaseSettings):
@@ -61,10 +58,7 @@ class PortkeyParams(BaseSettings):
     custom_headers: Optional[Dict[str, str]] = None  # Optional: additional headers
     base_url: str = PORTKEY_BASE_URL
 
-    class Config:
-        # allow setting of fields via env vars,
-        # e.g. PORTKEY_API_KEY=xxx, PORTKEY_PROVIDER=anthropic
-        env_prefix = "PORTKEY_"
+    model_config = SettingsConfigDict(env_prefix="PORTKEY_")
 
     def get_headers(self) -> Dict[str, str]:
         """Generate Portkey-specific headers from parameters."""
@@ -73,7 +67,6 @@ class PortkeyParams(BaseSettings):
 
         headers = {}
 
-        # API key - from params or environment
         if self.api_key and self.api_key != DUMMY_API_KEY:
             headers["x-portkey-api-key"] = self.api_key
         else:
@@ -81,45 +74,35 @@ class PortkeyParams(BaseSettings):
             if portkey_key:
                 headers["x-portkey-api-key"] = portkey_key
 
-        # Provider
         if self.provider:
             headers["x-portkey-provider"] = self.provider
 
-        # Virtual key
         if self.virtual_key:
             headers["x-portkey-virtual-key"] = self.virtual_key
 
-        # Trace ID
         if self.trace_id:
             headers["x-portkey-trace-id"] = self.trace_id
 
-        # Metadata
         if self.metadata:
             headers["x-portkey-metadata"] = json.dumps(self.metadata)
 
-        # Retry configuration
         if self.retry:
             headers["x-portkey-retry"] = json.dumps(self.retry)
 
-        # Cache configuration
         if self.cache:
             headers["x-portkey-cache"] = json.dumps(self.cache)
 
-        # Cache force refresh
         if self.cache_force_refresh is not None:
             headers["x-portkey-cache-force-refresh"] = str(
                 self.cache_force_refresh
             ).lower()
 
-        # User identifier
         if self.user:
             headers["x-portkey-user"] = self.user
 
-        # Organization identifier
         if self.organization:
             headers["x-portkey-organization"] = self.organization
 
-        # Add any custom headers
         if self.custom_headers:
             headers.update(self.custom_headers)
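For orientation, a hedged sketch of what the assembled headers look like (field names are from the diff; values are illustrative):

from langroid.language_models.provider_params import PortkeyParams

params = PortkeyParams(api_key="pk-test", provider="anthropic", trace_id="t-1")
print(params.get_headers())
# expected, per the branches above:
# {'x-portkey-api-key': 'pk-test', 'x-portkey-provider': 'anthropic',
#  'x-portkey-trace-id': 't-1'}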
@@ -138,7 +121,6 @@ class PortkeyParams(BaseSettings):
             _, provider, model = parts
             return provider, model
         else:
-            # Fallback: just remove "portkey/" prefix and return empty provider
             model = model_string.replace("portkey/", "")
             return "", model
 
@@ -157,7 +139,6 @@ class PortkeyParams(BaseSettings):
         """
         import os
 
-        # Common environment variable patterns for different providers
         env_patterns = [
             f"{provider.upper()}_API_KEY",
             f"{provider.upper()}_KEY",
langroid/mytypes.py CHANGED
@@ -3,7 +3,7 @@ from textwrap import dedent
 from typing import Any, Callable, Dict, List, Union
 from uuid import uuid4
 
-from langroid.pydantic_v1 import BaseModel, Extra, Field
+from pydantic import BaseModel, ConfigDict, Field, field_validator
 
 Number = Union[int, float]
 Embedding = List[Number]
@@ -51,13 +51,21 @@ class DocMetaData(BaseModel):
     id: str = Field(default_factory=lambda: str(uuid4()))
     window_ids: List[str] = []  # for RAG: ids of chunks around this one
 
+    @field_validator("id", mode="before")
+    @classmethod
+    def convert_id_to_string(cls, v: Any) -> str:
+        """Convert id to string if it's not already."""
+        if v is None:
+            return str(uuid4())
+        return str(v)
+
     def dict_bool_int(self, *args: Any, **kwargs: Any) -> Dict[str, Any]:
         """
         Special dict method to convert bool fields to int, to appease some
         downstream libraries, e.g. Chroma which complains about bool fields in
         metadata.
         """
-        original_dict = super().dict(*args, **kwargs)
+        original_dict = super().model_dump(*args, **kwargs)
 
         for key, value in original_dict.items():
             if isinstance(value, bool):
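The new before-mode validator makes `DocMetaData` tolerant of non-string ids, which v2's stricter coercion would otherwise reject. A quick sketch, assuming the class as shown here:

from langroid.mytypes import DocMetaData

meta = DocMetaData(id=123)  # int coerced to "123" by the before-validator
assert meta.id == "123"

meta2 = DocMetaData(id=None)  # None replaced by a fresh uuid4 string
assert isinstance(meta2.id, str) and len(meta2.id) > 0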
@@ -92,8 +100,7 @@ class DocMetaData(BaseModel):
         )
         return ", ".join(components)
 
-    class Config:
-        extra = Extra.allow
+    model_config = ConfigDict(extra="allow")
 
 
 class Document(BaseModel):
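`Extra.allow` maps to the plain string "allow" inside `ConfigDict`; the effect is unchanged. A minimal stand-in:

from pydantic import BaseModel, ConfigDict


class Meta(BaseModel):  # minimal stand-in for DocMetaData's config
    model_config = ConfigDict(extra="allow")


m = Meta(anything="goes")  # extra fields are accepted and retained
assert m.anything == "goes"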
langroid/parsing/code_parser.py CHANGED
@@ -2,12 +2,12 @@ from functools import reduce
 from typing import Callable, List
 
 import tiktoken
+from pydantic_settings import BaseSettings
 from pygments import lex
 from pygments.lexers import get_lexer_by_name
 from pygments.token import Token
 
 from langroid.mytypes import Document
-from langroid.pydantic_v1 import BaseSettings
 
 
 def chunk_code(
langroid/parsing/file_attachment.py CHANGED
@@ -5,7 +5,7 @@ from pathlib import Path
 from typing import Any, BinaryIO, Dict, Optional, Union
 from urllib.parse import urlparse
 
-from langroid.pydantic_v1 import BaseModel
+from pydantic import BaseModel
 
 
 class FileAttachment(BaseModel):
langroid/parsing/md_parser.py CHANGED
@@ -1,7 +1,7 @@
 import re
-from typing import List
+from typing import Any, List
 
-from langroid.pydantic_v1 import BaseModel, Field
+from pydantic import BaseModel, Field, field_validator
 
 HEADER_CONTEXT_SEP = "\n...\n"
 
@@ -24,8 +24,8 @@ class Node(BaseModel):
     # Forward references will be resolved with the update_forward_refs call below.
 
 
-# Resolve forward references for Node (required for recursive models in Pydantic v1)
-Node.update_forward_refs()
+# Resolve forward references for Node (required for recursive models in Pydantic)
+Node.model_rebuild()
 
 
 def _cleanup_text(text: str) -> str:
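`update_forward_refs()` becomes `model_rebuild()` in v2; recursive models like `Node` still need the explicit call to resolve their self-references. A self-contained sketch:

from typing import List

from pydantic import BaseModel


class TreeNode(BaseModel):  # hypothetical recursive model, analogous to Node
    content: str
    children: List["TreeNode"] = []


TreeNode.model_rebuild()  # v2 replacement for v1's update_forward_refs()

root = TreeNode(content="root", children=[TreeNode(content="leaf")])
assert root.children[0].content == "leaf"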
@@ -180,6 +180,16 @@ class MarkdownChunkConfig(BaseModel):
     rollup: bool = True  # whether to roll up chunks
     header_context_sep: str = HEADER_CONTEXT_SEP  # separator for header context
 
+    @field_validator("chunk_size", mode="before")
+    @classmethod
+    def convert_chunk_size_to_int(cls, v: Any) -> int:
+        """Convert chunk_size to int, maintaining backward compatibility
+        with Pydantic V1.
+        """
+        if isinstance(v, float):
+            return int(v)
+        return int(v)
+
 
 # A simple tokenizer that counts tokens as whitespace-separated words.
 def count_words(text: str) -> int:
langroid/parsing/parser.py CHANGED
@@ -4,6 +4,8 @@ from enum import Enum
 from typing import Any, Dict, List, Literal, Optional
 
 import tiktoken
+from pydantic import field_validator, model_validator
+from pydantic_settings import BaseSettings, SettingsConfigDict
 
 from langroid.mytypes import Document
 from langroid.parsing.md_parser import (
@@ -12,7 +14,6 @@ from langroid.parsing.md_parser import (
     count_words,
 )
 from langroid.parsing.para_sentence_split import create_chunks, remove_extra_whitespace
-from langroid.pydantic_v1 import BaseSettings, root_validator
 from langroid.utils.object_registry import ObjectRegistry
 
 logger = logging.getLogger(__name__)
@@ -32,8 +33,7 @@ class BaseParsingConfig(BaseSettings):
 
     library: str
 
-    class Config:
-        extra = "ignore"  # Ignore unknown settings
+    model_config = SettingsConfigDict(extra="ignore")  # Ignore unknown settings
 
 
 class LLMPdfParserConfig(BaseSettings):
@@ -69,7 +69,8 @@ class PdfParsingConfig(BaseParsingConfig):
     llm_parser_config: Optional[LLMPdfParserConfig] = None
     marker_config: Optional[MarkerConfig] = None
 
-    @root_validator(pre=True)
+    @model_validator(mode="before")
+    @classmethod
     def enable_configs(cls, values: Dict[str, Any]) -> Dict[str, Any]:
         """Ensure correct config is set based on library selection."""
         library = values.get("library")
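`@root_validator(pre=True)` maps to `@model_validator(mode="before")`, and v2 additionally wants the explicit `@classmethod`. A minimal sketch of the before-mode pattern on a hypothetical config:

from typing import Any, Dict

from pydantic import BaseModel, model_validator


class LibConfig(BaseModel):  # hypothetical, analogous to PdfParsingConfig
    library: str = "pymupdf"
    use_marker: bool = False

    @model_validator(mode="before")
    @classmethod
    def enable_configs(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        # before-mode sees the raw input dict, as pre=True did in v1
        if values.get("library") == "marker":
            values["use_marker"] = True
        return values


assert LibConfig(library="marker").use_marker is True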
@@ -114,6 +115,17 @@ class ParsingConfig(BaseSettings):
     chunk_size_variation: float = 0.30  # max variation from chunk_size
     overlap: int = 50  # overlap between chunks
     max_chunks: int = 10_000
+
+    @field_validator("chunk_size", mode="before")
+    @classmethod
+    def convert_chunk_size_to_int(cls, v: Any) -> int:
+        """Convert chunk_size to int, maintaining backward compatibility
+        with Pydantic V1.
+        """
+        if isinstance(v, float):
+            return int(v)
+        return int(v)
+
     # offset to subtract from page numbers:
     # e.g. if physical page 12 is displayed as page 1, set page_number_offset = 11
     page_number_offset: int = 0
@@ -203,7 +215,8 @@ class Parser:
             # add_window_ids)
             chunk_docs = [
                 Document(
-                    content=c, metadata=d.metadata.copy(update=dict(is_chunk=True))
+                    content=c,
+                    metadata=d.metadata.model_copy(update=dict(is_chunk=True)),
                 )
                 for c in chunks
                 if c.strip() != ""
@@ -255,7 +268,8 @@ class Parser:
             # add_window_ids)
             chunk_docs = [
                 Document(
-                    content=c, metadata=d.metadata.copy(update=dict(is_chunk=True))
+                    content=c,
+                    metadata=d.metadata.model_copy(update=dict(is_chunk=True)),
                 )
                 for c in chunks
                 if c.strip() != ""
@@ -287,7 +301,8 @@ class Parser:
             # add_window_ids)
             chunk_docs = [
                 Document(
-                    content=c, metadata=d.metadata.copy(update=dict(is_chunk=True))
+                    content=c,
+                    metadata=d.metadata.model_copy(update=dict(is_chunk=True)),
                 )
                 for c in chunks
                 if c.strip() != ""
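`copy(update=...)` becomes `model_copy(update=...)`; as in v1, the updated fields are not re-validated. A quick sketch with `DocMetaData` (the `is_chunk` update is taken from the code above; the `source` field is assumed):

from langroid.mytypes import DocMetaData

meta = DocMetaData(source="notes.pdf")  # source field assumed present
chunk_meta = meta.model_copy(update=dict(is_chunk=True))
assert chunk_meta.source == "notes.pdf"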
langroid/parsing/repo_loader.py CHANGED
@@ -18,10 +18,12 @@ if TYPE_CHECKING:
     from github.Label import Label
     from github.Repository import Repository
 
+from pydantic import BaseModel, Field
+from pydantic_settings import BaseSettings
+
 from langroid.mytypes import DocMetaData, Document
 from langroid.parsing.document_parser import DocumentParser, DocumentType
 from langroid.parsing.parser import Parser, ParsingConfig
-from langroid.pydantic_v1 import BaseModel, BaseSettings, Field
 
 logger = logging.getLogger(__name__)
 
langroid/parsing/search.py CHANGED
@@ -64,7 +64,7 @@ def find_fuzzy_matches_in_docs(
         return orig_doc_matches
     if len(orig_doc_matches) == 0:
         return []
-    if set(orig_doc_matches[0][0].__fields__) != {"content", "metadata"}:
+    if set(orig_doc_matches[0][0].model_fields) != {"content", "metadata"}:
         # If there are fields beyond just content and metadata,
         # we do NOT want to create new document objects with content fields
         # based on words_before and words_after, since we don't know how to
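`__fields__` survives in v2 but is deprecated; `model_fields` is the replacement and is likewise a dict keyed by field name, so the set comparison carries over unchanged:

from pydantic import BaseModel


class Doc(BaseModel):  # minimal stand-in for langroid's Document
    content: str
    metadata: dict


assert set(Doc.model_fields) == {"content", "metadata"}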
langroid/parsing/url_loader.py CHANGED
@@ -7,12 +7,12 @@ from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional
 
 import markdownify as md
 from dotenv import load_dotenv
+from pydantic_settings import BaseSettings, SettingsConfigDict
 
 from langroid.exceptions import LangroidImportError
 from langroid.mytypes import DocMetaData, Document
 from langroid.parsing.document_parser import DocumentParser, ImagePdfParser
 from langroid.parsing.parser import Parser, ParsingConfig
-from langroid.pydantic_v1 import BaseSettings
 
 if TYPE_CHECKING:
     from firecrawl import FirecrawlApp
@@ -54,20 +54,13 @@ class FirecrawlConfig(BaseCrawlerConfig):
     params: Dict[str, Any] = {}
     timeout: Optional[int] = None
 
-    class Config:
-        # Leverage Pydantic's BaseSettings to
-        # allow setting of fields via env vars,
-        # e.g. FIRECRAWL_MODE=scrape and FIRECRAWL_API_KEY=...
-        env_prefix = "FIRECRAWL_"
+    model_config = SettingsConfigDict(env_prefix="FIRECRAWL_")
 
 
 class ExaCrawlerConfig(BaseCrawlerConfig):
     api_key: str = ""
 
-    class Config:
-        # Allow setting of fields via env vars with prefix EXA_
-        # e.g., EXA_API_KEY=your_api_key
-        env_prefix = "EXA_"
+    model_config = SettingsConfigDict(env_prefix="EXA_")
 
 
 class Crawl4aiConfig(BaseCrawlerConfig):
@@ -81,49 +74,22 @@ class Crawl4aiConfig(BaseCrawlerConfig):
     browser_config: Optional["BrowserConfig"] = None
     run_config: Optional["CrawlerRunConfig"] = None
 
-    _refs_resolved: bool = False
+    model_config = SettingsConfigDict(arbitrary_types_allowed=True)
 
-    def __init_subclass__(cls, **kwargs: Any) -> None:
-        """Resolve forward references when class is first subclassed or instantiated."""
-        super().__init_subclass__(**kwargs)
-        cls._resolve_forward_refs()
 
-    @classmethod
-    def _resolve_forward_refs(cls) -> None:
-        """Resolve forward references only when needed."""
-        if not cls._refs_resolved:
-            try:
-                from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
-                from crawl4ai.content_scraping_strategy import ContentScrapingStrategy
-                from crawl4ai.deep_crawling import DeepCrawlStrategy
-                from crawl4ai.extraction_strategy import ExtractionStrategy
-                from crawl4ai.markdown_generation_strategy import (
-                    MarkdownGenerationStrategy,
-                )
-
-                # Create namespace for update_forward_refs
-                namespace = {
-                    "BrowserConfig": BrowserConfig,
-                    "CrawlerRunConfig": CrawlerRunConfig,
-                    "ContentScrapingStrategy": ContentScrapingStrategy,
-                    "DeepCrawlStrategy": DeepCrawlStrategy,
-                    "ExtractionStrategy": ExtractionStrategy,
-                    "MarkdownGenerationStrategy": MarkdownGenerationStrategy,
-                }
-
-                cls.update_forward_refs(**namespace)
-                cls._refs_resolved = True
-            except ImportError:
-                # If crawl4ai is not installed, leave forward refs as strings
-                pass
-
-    def __init__(self, **kwargs: Any) -> None:
-        """Initialize and ensure forward refs are resolved."""
-        self._resolve_forward_refs()
-        super().__init__(**kwargs)
+# Resolve forward references for Crawl4aiConfig after the class is defined
+try:
+    from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+    from crawl4ai.content_scraping_strategy import ContentScrapingStrategy
+    from crawl4ai.deep_crawling import DeepCrawlStrategy
+    from crawl4ai.extraction_strategy import ExtractionStrategy
+    from crawl4ai.markdown_generation_strategy import MarkdownGenerationStrategy
 
-    class Config:
-        arbitrary_types_allowed = True
+    # Rebuild the model with resolved references
+    Crawl4aiConfig.model_rebuild()
+except ImportError:
+    # If crawl4ai is not installed, leave forward refs as strings
+    pass
 
 
 class BaseCrawler(ABC):
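The per-instance resolution machinery collapses into a one-shot, module-level `model_rebuild()` guarded by the optional import. A generic sketch of this optional-dependency pattern (the package name is hypothetical):

from typing import Optional

from pydantic import BaseModel, ConfigDict


class CrawlerConfig(BaseModel):  # hypothetical, analogous to Crawl4aiConfig
    model_config = ConfigDict(arbitrary_types_allowed=True)
    browser_config: Optional["BrowserConfig"] = None  # forward ref as a string


try:
    from some_crawler_pkg import BrowserConfig  # hypothetical optional dependency

    CrawlerConfig.model_rebuild()  # resolves the ref from this module's namespace
except ImportError:
    # without the dependency the ref stays unresolved;
    # instantiating CrawlerConfig would then raise until model_rebuild() succeeds
    pass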
@@ -347,7 +313,7 @@ class FirecrawlCrawler(BaseCrawler):
                 )
                 processed_urls.add(url)
                 new_pages += 1
-                pbar.update(new_pages)  # Update progress bar with new pages
+                pbar.model_copy(update=new_pages)  # Update progress bar with new pages
 
                 # Break if crawl is complete
                 if status["status"] == "completed":
langroid/parsing/urls.py CHANGED
@@ -9,11 +9,10 @@ from urllib.parse import urldefrag, urljoin, urlparse
 import fire
 import requests
 from bs4 import BeautifulSoup
+from pydantic import BaseModel, HttpUrl, TypeAdapter, ValidationError
 from rich import print
 from rich.prompt import Prompt
 
-from langroid.pydantic_v1 import BaseModel, HttpUrl, ValidationError, parse_obj_as
-
 logger = logging.getLogger(__name__)
 
@@ -106,7 +105,8 @@ class Url(BaseModel):
 
 def is_url(s: str) -> bool:
     try:
-        Url(url=parse_obj_as(HttpUrl, s))
+        url_adapter = TypeAdapter(HttpUrl)
+        Url(url=url_adapter.validate_python(s))
         return True
     except ValidationError:
         return False
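`parse_obj_as` is deprecated in Pydantic v2; `TypeAdapter` is the replacement for validating bare types. The essence of the new `is_url`:

from pydantic import HttpUrl, TypeAdapter, ValidationError

url_adapter = TypeAdapter(HttpUrl)


def is_url(s: str) -> bool:
    try:
        url_adapter.validate_python(s)  # replaces parse_obj_as(HttpUrl, s)
        return True
    except ValidationError:
        return False


assert is_url("https://example.com")
assert not is_url("not a url")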
@@ -133,7 +133,8 @@ def get_urls_paths_bytes_indices(
             byte_list.append(i)
             continue
         try:
-            Url(url=parse_obj_as(HttpUrl, item))
+            url_adapter = TypeAdapter(HttpUrl)
+            Url(url=url_adapter.validate_python(item))
             urls.append(i)
         except ValidationError:
             if os.path.exists(item):
langroid/prompts/prompts_config.py CHANGED
@@ -1,4 +1,4 @@
-from langroid.pydantic_v1 import BaseSettings
+from pydantic_settings import BaseSettings
 
 
 class PromptsConfig(BaseSettings):