langroid 0.58.3__py3-none-any.whl → 0.59.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langroid/agent/base.py +39 -17
- langroid/agent/callbacks/chainlit.py +2 -1
- langroid/agent/chat_agent.py +73 -55
- langroid/agent/chat_document.py +7 -7
- langroid/agent/done_sequence_parser.py +46 -11
- langroid/agent/openai_assistant.py +9 -9
- langroid/agent/special/arangodb/arangodb_agent.py +10 -18
- langroid/agent/special/arangodb/tools.py +3 -3
- langroid/agent/special/doc_chat_agent.py +16 -14
- langroid/agent/special/lance_rag/critic_agent.py +2 -2
- langroid/agent/special/lance_rag/query_planner_agent.py +4 -4
- langroid/agent/special/lance_tools.py +6 -5
- langroid/agent/special/neo4j/neo4j_chat_agent.py +3 -7
- langroid/agent/special/relevance_extractor_agent.py +1 -1
- langroid/agent/special/sql/sql_chat_agent.py +11 -3
- langroid/agent/task.py +53 -94
- langroid/agent/tool_message.py +33 -17
- langroid/agent/tools/file_tools.py +4 -2
- langroid/agent/tools/mcp/fastmcp_client.py +19 -6
- langroid/agent/tools/orchestration.py +22 -17
- langroid/agent/tools/recipient_tool.py +3 -3
- langroid/agent/tools/task_tool.py +22 -16
- langroid/agent/xml_tool_message.py +90 -35
- langroid/cachedb/base.py +1 -1
- langroid/embedding_models/base.py +2 -2
- langroid/embedding_models/models.py +3 -7
- langroid/exceptions.py +4 -1
- langroid/language_models/azure_openai.py +2 -2
- langroid/language_models/base.py +6 -4
- langroid/language_models/config.py +2 -4
- langroid/language_models/model_info.py +9 -1
- langroid/language_models/openai_gpt.py +53 -18
- langroid/language_models/provider_params.py +3 -22
- langroid/mytypes.py +11 -4
- langroid/parsing/code_parser.py +1 -1
- langroid/parsing/file_attachment.py +1 -1
- langroid/parsing/md_parser.py +14 -4
- langroid/parsing/parser.py +22 -7
- langroid/parsing/repo_loader.py +3 -1
- langroid/parsing/search.py +1 -1
- langroid/parsing/url_loader.py +17 -51
- langroid/parsing/urls.py +5 -4
- langroid/prompts/prompts_config.py +1 -1
- langroid/pydantic_v1/__init__.py +61 -4
- langroid/pydantic_v1/main.py +10 -4
- langroid/utils/configuration.py +13 -11
- langroid/utils/constants.py +1 -1
- langroid/utils/globals.py +21 -5
- langroid/utils/html_logger.py +2 -1
- langroid/utils/object_registry.py +1 -1
- langroid/utils/pydantic_utils.py +55 -28
- langroid/utils/types.py +2 -2
- langroid/vector_store/base.py +3 -3
- langroid/vector_store/lancedb.py +5 -5
- langroid/vector_store/meilisearch.py +2 -2
- langroid/vector_store/pineconedb.py +4 -4
- langroid/vector_store/postgres.py +1 -1
- langroid/vector_store/qdrantdb.py +3 -3
- langroid/vector_store/weaviatedb.py +1 -1
- {langroid-0.58.3.dist-info → langroid-0.59.0.dist-info}/METADATA +3 -2
- {langroid-0.58.3.dist-info → langroid-0.59.0.dist-info}/RECORD +63 -63
- {langroid-0.58.3.dist-info → langroid-0.59.0.dist-info}/WHEEL +0 -0
- {langroid-0.58.3.dist-info → langroid-0.59.0.dist-info}/licenses/LICENSE +0 -0
@@ -12,6 +12,7 @@ from typing import (
|
|
12
12
|
Callable,
|
13
13
|
Dict,
|
14
14
|
List,
|
15
|
+
Mapping,
|
15
16
|
Optional,
|
16
17
|
Tuple,
|
17
18
|
Type,
|
@@ -24,6 +25,8 @@ from cerebras.cloud.sdk import AsyncCerebras, Cerebras
|
|
24
25
|
from groq import AsyncGroq, Groq
|
25
26
|
from httpx import Timeout
|
26
27
|
from openai import AsyncOpenAI, OpenAI
|
28
|
+
from pydantic import BaseModel
|
29
|
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
27
30
|
from rich import print
|
28
31
|
from rich.markup import escape
|
29
32
|
|
@@ -78,7 +81,6 @@ from langroid.language_models.utils import (
|
|
78
81
|
retry_with_exponential_backoff,
|
79
82
|
)
|
80
83
|
from langroid.parsing.parse_json import parse_imperfect_json
|
81
|
-
from langroid.pydantic_v1 import BaseModel, BaseSettings
|
82
84
|
from langroid.utils.configuration import settings
|
83
85
|
from langroid.utils.constants import Colors
|
84
86
|
from langroid.utils.system import friendly_error
|
@@ -220,7 +222,7 @@ class OpenAICallParams(BaseModel):
|
|
220
222
|
extra_body: Dict[str, Any] | None = None # additional params for API request body
|
221
223
|
|
222
224
|
def to_dict_exclude_none(self) -> Dict[str, Any]:
|
223
|
-
return {k: v for k, v in self.
|
225
|
+
return {k: v for k, v in self.model_dump().items() if v is not None}
|
224
226
|
|
225
227
|
|
226
228
|
class LiteLLMProxyConfig(BaseSettings):
|
@@ -229,8 +231,7 @@ class LiteLLMProxyConfig(BaseSettings):
|
|
229
231
|
api_key: str = "" # read from env var LITELLM_API_KEY if set
|
230
232
|
api_base: str = "" # read from env var LITELLM_API_BASE if set
|
231
233
|
|
232
|
-
|
233
|
-
env_prefix = "LITELLM_"
|
234
|
+
model_config = SettingsConfigDict(env_prefix="LITELLM_")
|
234
235
|
|
235
236
|
|
236
237
|
class OpenAIGPTConfig(LLMConfig):
|
@@ -259,7 +260,7 @@ class OpenAIGPTConfig(LLMConfig):
|
|
259
260
|
litellm_proxy: LiteLLMProxyConfig = LiteLLMProxyConfig()
|
260
261
|
ollama: bool = False # use ollama's OpenAI-compatible endpoint?
|
261
262
|
min_output_tokens: int = 1
|
262
|
-
use_chat_for_completion = True # do not change this, for OpenAI models!
|
263
|
+
use_chat_for_completion: bool = True # do not change this, for OpenAI models!
|
263
264
|
timeout: int = 20
|
264
265
|
temperature: float = 0.2
|
265
266
|
seed: int | None = 42
|
@@ -316,8 +317,43 @@ class OpenAIGPTConfig(LLMConfig):
|
|
316
317
|
|
317
318
|
super().__init__(**kwargs)
|
318
319
|
|
319
|
-
|
320
|
-
|
320
|
+
model_config = SettingsConfigDict(env_prefix="OPENAI_")
|
321
|
+
|
322
|
+
def model_copy(
|
323
|
+
self, *, update: Mapping[str, Any] | None = None, deep: bool = False
|
324
|
+
) -> "OpenAIGPTConfig":
|
325
|
+
"""
|
326
|
+
Override model_copy to handle unpicklable fields properly.
|
327
|
+
|
328
|
+
This preserves fields like http_client_factory during normal copying
|
329
|
+
while still allowing exclusion for pickling operations.
|
330
|
+
"""
|
331
|
+
# Save references to unpicklable fields
|
332
|
+
http_client_factory = self.http_client_factory
|
333
|
+
streamer = self.streamer
|
334
|
+
streamer_async = self.streamer_async
|
335
|
+
|
336
|
+
# Get the current model data, excluding problematic fields
|
337
|
+
data = self.model_dump(
|
338
|
+
exclude={"http_client_factory", "streamer", "streamer_async"}
|
339
|
+
)
|
340
|
+
|
341
|
+
# Apply any updates
|
342
|
+
if update:
|
343
|
+
data.update(update)
|
344
|
+
|
345
|
+
# Create a new instance with the copied data
|
346
|
+
new_instance = self.__class__(**data)
|
347
|
+
|
348
|
+
# Restore the unpicklable fields if they weren't overridden by update
|
349
|
+
if "http_client_factory" not in (update or {}):
|
350
|
+
new_instance.http_client_factory = http_client_factory
|
351
|
+
if "streamer" not in (update or {}):
|
352
|
+
new_instance.streamer = streamer
|
353
|
+
if "streamer_async" not in (update or {}):
|
354
|
+
new_instance.streamer_async = streamer_async
|
355
|
+
|
356
|
+
return new_instance
|
321
357
|
|
322
358
|
def _validate_litellm(self) -> None:
|
323
359
|
"""
|
@@ -330,12 +366,12 @@ class OpenAIGPTConfig(LLMConfig):
|
|
330
366
|
import litellm
|
331
367
|
except ImportError:
|
332
368
|
raise LangroidImportError("litellm", "litellm")
|
369
|
+
|
333
370
|
litellm.telemetry = False
|
334
371
|
litellm.drop_params = True # drop un-supported params without crashing
|
335
|
-
# modify params to fit the model expectations, and avoid crashing
|
336
|
-
# (e.g. anthropic doesn't like first msg to be system msg)
|
337
372
|
litellm.modify_params = True
|
338
373
|
self.seed = None # some local mdls don't support seed
|
374
|
+
|
339
375
|
if self.api_key == DUMMY_API_KEY:
|
340
376
|
keys_dict = litellm.utils.validate_environment(self.chat_model)
|
341
377
|
missing_keys = keys_dict.get("missing_keys", [])
|
@@ -365,8 +401,7 @@ class OpenAIGPTConfig(LLMConfig):
|
|
365
401
|
class DynamicConfig(OpenAIGPTConfig):
|
366
402
|
pass
|
367
403
|
|
368
|
-
DynamicConfig.
|
369
|
-
|
404
|
+
DynamicConfig.model_config = SettingsConfigDict(env_prefix=prefix.upper() + "_")
|
370
405
|
return DynamicConfig
|
371
406
|
|
372
407
|
|
@@ -407,7 +442,7 @@ class OpenAIGPT(LanguageModel):
|
|
407
442
|
config: configuration for openai-gpt model
|
408
443
|
"""
|
409
444
|
# copy the config to avoid modifying the original
|
410
|
-
config = config.
|
445
|
+
config = config.model_copy()
|
411
446
|
super().__init__(config)
|
412
447
|
self.config: OpenAIGPTConfig = config
|
413
448
|
# save original model name such as `provider/model` before
|
@@ -1477,7 +1512,7 @@ class OpenAIGPT(LanguageModel):
|
|
1477
1512
|
|
1478
1513
|
if has_function:
|
1479
1514
|
function_call = LLMFunctionCall(name=function_name)
|
1480
|
-
function_call_dict = function_call.
|
1515
|
+
function_call_dict = function_call.model_dump()
|
1481
1516
|
if function_args == "":
|
1482
1517
|
function_call.arguments = None
|
1483
1518
|
else:
|
@@ -1529,7 +1564,7 @@ class OpenAIGPT(LanguageModel):
|
|
1529
1564
|
),
|
1530
1565
|
),
|
1531
1566
|
),
|
1532
|
-
openai_response.
|
1567
|
+
openai_response.model_dump(),
|
1533
1568
|
)
|
1534
1569
|
|
1535
1570
|
def _cache_store(self, k: str, v: Any) -> None:
|
@@ -1680,7 +1715,7 @@ class OpenAIGPT(LanguageModel):
|
|
1680
1715
|
cached, hashed_key, response = completions_with_backoff(**args)
|
1681
1716
|
# assume response is an actual response rather than a streaming event
|
1682
1717
|
if not isinstance(response, dict):
|
1683
|
-
response = response.
|
1718
|
+
response = response.model_dump()
|
1684
1719
|
if "message" in response["choices"][0]:
|
1685
1720
|
msg = response["choices"][0]["message"]["content"].strip()
|
1686
1721
|
else:
|
@@ -1758,7 +1793,7 @@ class OpenAIGPT(LanguageModel):
|
|
1758
1793
|
)
|
1759
1794
|
# assume response is an actual response rather than a streaming event
|
1760
1795
|
if not isinstance(response, dict):
|
1761
|
-
response = response.
|
1796
|
+
response = response.model_dump()
|
1762
1797
|
if "message" in response["choices"][0]:
|
1763
1798
|
msg = response["choices"][0]["message"]["content"].strip()
|
1764
1799
|
else:
|
@@ -2056,7 +2091,7 @@ class OpenAIGPT(LanguageModel):
|
|
2056
2091
|
if functions is not None:
|
2057
2092
|
args.update(
|
2058
2093
|
dict(
|
2059
|
-
functions=[f.
|
2094
|
+
functions=[f.model_dump() for f in functions],
|
2060
2095
|
function_call=function_call,
|
2061
2096
|
)
|
2062
2097
|
)
|
@@ -2074,7 +2109,7 @@ class OpenAIGPT(LanguageModel):
|
|
2074
2109
|
tools=[
|
2075
2110
|
dict(
|
2076
2111
|
type="function",
|
2077
|
-
function=t.function.
|
2112
|
+
function=t.function.model_dump()
|
2078
2113
|
| ({"strict": t.strict} if t.strict is not None else {}),
|
2079
2114
|
)
|
2080
2115
|
for t in tools
|
@@ -4,7 +4,7 @@ Provider-specific parameter configurations for various LLM providers.
|
|
4
4
|
|
5
5
|
from typing import Any, Dict, Optional
|
6
6
|
|
7
|
-
from
|
7
|
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
8
8
|
|
9
9
|
# Constants
|
10
10
|
LANGDB_BASE_URL = "https://api.us-east-1.langdb.ai"
|
@@ -24,10 +24,7 @@ class LangDBParams(BaseSettings):
|
|
24
24
|
thread_id: Optional[str] = None
|
25
25
|
base_url: str = LANGDB_BASE_URL
|
26
26
|
|
27
|
-
|
28
|
-
# allow setting of fields via env vars,
|
29
|
-
# e.g. LANGDB_PROJECT_ID=1234
|
30
|
-
env_prefix = "LANGDB_"
|
27
|
+
model_config = SettingsConfigDict(env_prefix="LANGDB_")
|
31
28
|
|
32
29
|
|
33
30
|
class PortkeyParams(BaseSettings):
|
@@ -61,10 +58,7 @@ class PortkeyParams(BaseSettings):
|
|
61
58
|
custom_headers: Optional[Dict[str, str]] = None # Optional: additional headers
|
62
59
|
base_url: str = PORTKEY_BASE_URL
|
63
60
|
|
64
|
-
|
65
|
-
# allow setting of fields via env vars,
|
66
|
-
# e.g. PORTKEY_API_KEY=xxx, PORTKEY_PROVIDER=anthropic
|
67
|
-
env_prefix = "PORTKEY_"
|
61
|
+
model_config = SettingsConfigDict(env_prefix="PORTKEY_")
|
68
62
|
|
69
63
|
def get_headers(self) -> Dict[str, str]:
|
70
64
|
"""Generate Portkey-specific headers from parameters."""
|
@@ -73,7 +67,6 @@ class PortkeyParams(BaseSettings):
|
|
73
67
|
|
74
68
|
headers = {}
|
75
69
|
|
76
|
-
# API key - from params or environment
|
77
70
|
if self.api_key and self.api_key != DUMMY_API_KEY:
|
78
71
|
headers["x-portkey-api-key"] = self.api_key
|
79
72
|
else:
|
@@ -81,45 +74,35 @@ class PortkeyParams(BaseSettings):
|
|
81
74
|
if portkey_key:
|
82
75
|
headers["x-portkey-api-key"] = portkey_key
|
83
76
|
|
84
|
-
# Provider
|
85
77
|
if self.provider:
|
86
78
|
headers["x-portkey-provider"] = self.provider
|
87
79
|
|
88
|
-
# Virtual key
|
89
80
|
if self.virtual_key:
|
90
81
|
headers["x-portkey-virtual-key"] = self.virtual_key
|
91
82
|
|
92
|
-
# Trace ID
|
93
83
|
if self.trace_id:
|
94
84
|
headers["x-portkey-trace-id"] = self.trace_id
|
95
85
|
|
96
|
-
# Metadata
|
97
86
|
if self.metadata:
|
98
87
|
headers["x-portkey-metadata"] = json.dumps(self.metadata)
|
99
88
|
|
100
|
-
# Retry configuration
|
101
89
|
if self.retry:
|
102
90
|
headers["x-portkey-retry"] = json.dumps(self.retry)
|
103
91
|
|
104
|
-
# Cache configuration
|
105
92
|
if self.cache:
|
106
93
|
headers["x-portkey-cache"] = json.dumps(self.cache)
|
107
94
|
|
108
|
-
# Cache force refresh
|
109
95
|
if self.cache_force_refresh is not None:
|
110
96
|
headers["x-portkey-cache-force-refresh"] = str(
|
111
97
|
self.cache_force_refresh
|
112
98
|
).lower()
|
113
99
|
|
114
|
-
# User identifier
|
115
100
|
if self.user:
|
116
101
|
headers["x-portkey-user"] = self.user
|
117
102
|
|
118
|
-
# Organization identifier
|
119
103
|
if self.organization:
|
120
104
|
headers["x-portkey-organization"] = self.organization
|
121
105
|
|
122
|
-
# Add any custom headers
|
123
106
|
if self.custom_headers:
|
124
107
|
headers.update(self.custom_headers)
|
125
108
|
|
@@ -138,7 +121,6 @@ class PortkeyParams(BaseSettings):
|
|
138
121
|
_, provider, model = parts
|
139
122
|
return provider, model
|
140
123
|
else:
|
141
|
-
# Fallback: just remove "portkey/" prefix and return empty provider
|
142
124
|
model = model_string.replace("portkey/", "")
|
143
125
|
return "", model
|
144
126
|
|
@@ -157,7 +139,6 @@ class PortkeyParams(BaseSettings):
|
|
157
139
|
"""
|
158
140
|
import os
|
159
141
|
|
160
|
-
# Common environment variable patterns for different providers
|
161
142
|
env_patterns = [
|
162
143
|
f"{provider.upper()}_API_KEY",
|
163
144
|
f"{provider.upper()}_KEY",
|
langroid/mytypes.py
CHANGED
@@ -3,7 +3,7 @@ from textwrap import dedent
|
|
3
3
|
from typing import Any, Callable, Dict, List, Union
|
4
4
|
from uuid import uuid4
|
5
5
|
|
6
|
-
from
|
6
|
+
from pydantic import BaseModel, ConfigDict, Field, field_validator
|
7
7
|
|
8
8
|
Number = Union[int, float]
|
9
9
|
Embedding = List[Number]
|
@@ -51,13 +51,21 @@ class DocMetaData(BaseModel):
|
|
51
51
|
id: str = Field(default_factory=lambda: str(uuid4()))
|
52
52
|
window_ids: List[str] = [] # for RAG: ids of chunks around this one
|
53
53
|
|
54
|
+
@field_validator("id", mode="before")
|
55
|
+
@classmethod
|
56
|
+
def convert_id_to_string(cls, v: Any) -> str:
|
57
|
+
"""Convert id to string if it's not already."""
|
58
|
+
if v is None:
|
59
|
+
return str(uuid4())
|
60
|
+
return str(v)
|
61
|
+
|
54
62
|
def dict_bool_int(self, *args: Any, **kwargs: Any) -> Dict[str, Any]:
|
55
63
|
"""
|
56
64
|
Special dict method to convert bool fields to int, to appease some
|
57
65
|
downstream libraries, e.g. Chroma which complains about bool fields in
|
58
66
|
metadata.
|
59
67
|
"""
|
60
|
-
original_dict = super().
|
68
|
+
original_dict = super().model_dump(*args, **kwargs)
|
61
69
|
|
62
70
|
for key, value in original_dict.items():
|
63
71
|
if isinstance(value, bool):
|
@@ -92,8 +100,7 @@ class DocMetaData(BaseModel):
|
|
92
100
|
)
|
93
101
|
return ", ".join(components)
|
94
102
|
|
95
|
-
|
96
|
-
extra = Extra.allow
|
103
|
+
model_config = ConfigDict(extra="allow")
|
97
104
|
|
98
105
|
|
99
106
|
class Document(BaseModel):
|
langroid/parsing/code_parser.py
CHANGED
@@ -2,12 +2,12 @@ from functools import reduce
|
|
2
2
|
from typing import Callable, List
|
3
3
|
|
4
4
|
import tiktoken
|
5
|
+
from pydantic_settings import BaseSettings
|
5
6
|
from pygments import lex
|
6
7
|
from pygments.lexers import get_lexer_by_name
|
7
8
|
from pygments.token import Token
|
8
9
|
|
9
10
|
from langroid.mytypes import Document
|
10
|
-
from langroid.pydantic_v1 import BaseSettings
|
11
11
|
|
12
12
|
|
13
13
|
def chunk_code(
|
langroid/parsing/md_parser.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
import re
|
2
|
-
from typing import List
|
2
|
+
from typing import Any, List
|
3
3
|
|
4
|
-
from
|
4
|
+
from pydantic import BaseModel, Field, field_validator
|
5
5
|
|
6
6
|
HEADER_CONTEXT_SEP = "\n...\n"
|
7
7
|
|
@@ -24,8 +24,8 @@ class Node(BaseModel):
|
|
24
24
|
# Forward references will be resolved with the update_forward_refs call below.
|
25
25
|
|
26
26
|
|
27
|
-
# Resolve forward references for Node (required for recursive models in Pydantic
|
28
|
-
Node.
|
27
|
+
# Resolve forward references for Node (required for recursive models in Pydantic)
|
28
|
+
Node.model_rebuild()
|
29
29
|
|
30
30
|
|
31
31
|
def _cleanup_text(text: str) -> str:
|
@@ -180,6 +180,16 @@ class MarkdownChunkConfig(BaseModel):
|
|
180
180
|
rollup: bool = True # whether to roll up chunks
|
181
181
|
header_context_sep: str = HEADER_CONTEXT_SEP # separator for header context
|
182
182
|
|
183
|
+
@field_validator("chunk_size", mode="before")
|
184
|
+
@classmethod
|
185
|
+
def convert_chunk_size_to_int(cls, v: Any) -> int:
|
186
|
+
"""Convert chunk_size to int, maintaining backward compatibility
|
187
|
+
with Pydantic V1.
|
188
|
+
"""
|
189
|
+
if isinstance(v, float):
|
190
|
+
return int(v)
|
191
|
+
return int(v)
|
192
|
+
|
183
193
|
|
184
194
|
# A simple tokenizer that counts tokens as whitespace-separated words.
|
185
195
|
def count_words(text: str) -> int:
|
langroid/parsing/parser.py
CHANGED
@@ -4,6 +4,8 @@ from enum import Enum
|
|
4
4
|
from typing import Any, Dict, List, Literal, Optional
|
5
5
|
|
6
6
|
import tiktoken
|
7
|
+
from pydantic import field_validator, model_validator
|
8
|
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
7
9
|
|
8
10
|
from langroid.mytypes import Document
|
9
11
|
from langroid.parsing.md_parser import (
|
@@ -12,7 +14,6 @@ from langroid.parsing.md_parser import (
|
|
12
14
|
count_words,
|
13
15
|
)
|
14
16
|
from langroid.parsing.para_sentence_split import create_chunks, remove_extra_whitespace
|
15
|
-
from langroid.pydantic_v1 import BaseSettings, root_validator
|
16
17
|
from langroid.utils.object_registry import ObjectRegistry
|
17
18
|
|
18
19
|
logger = logging.getLogger(__name__)
|
@@ -32,8 +33,7 @@ class BaseParsingConfig(BaseSettings):
|
|
32
33
|
|
33
34
|
library: str
|
34
35
|
|
35
|
-
|
36
|
-
extra = "ignore" # Ignore unknown settings
|
36
|
+
model_config = SettingsConfigDict(extra="ignore") # Ignore unknown settings
|
37
37
|
|
38
38
|
|
39
39
|
class LLMPdfParserConfig(BaseSettings):
|
@@ -69,7 +69,8 @@ class PdfParsingConfig(BaseParsingConfig):
|
|
69
69
|
llm_parser_config: Optional[LLMPdfParserConfig] = None
|
70
70
|
marker_config: Optional[MarkerConfig] = None
|
71
71
|
|
72
|
-
@
|
72
|
+
@model_validator(mode="before")
|
73
|
+
@classmethod
|
73
74
|
def enable_configs(cls, values: Dict[str, Any]) -> Dict[str, Any]:
|
74
75
|
"""Ensure correct config is set based on library selection."""
|
75
76
|
library = values.get("library")
|
@@ -114,6 +115,17 @@ class ParsingConfig(BaseSettings):
|
|
114
115
|
chunk_size_variation: float = 0.30 # max variation from chunk_size
|
115
116
|
overlap: int = 50 # overlap between chunks
|
116
117
|
max_chunks: int = 10_000
|
118
|
+
|
119
|
+
@field_validator("chunk_size", mode="before")
|
120
|
+
@classmethod
|
121
|
+
def convert_chunk_size_to_int(cls, v: Any) -> int:
|
122
|
+
"""Convert chunk_size to int, maintaining backward compatibility
|
123
|
+
with Pydantic V1.
|
124
|
+
"""
|
125
|
+
if isinstance(v, float):
|
126
|
+
return int(v)
|
127
|
+
return int(v)
|
128
|
+
|
117
129
|
# offset to subtract from page numbers:
|
118
130
|
# e.g. if physical page 12 is displayed as page 1, set page_number_offset = 11
|
119
131
|
page_number_offset: int = 0
|
@@ -203,7 +215,8 @@ class Parser:
|
|
203
215
|
# add_window_ids)
|
204
216
|
chunk_docs = [
|
205
217
|
Document(
|
206
|
-
content=c,
|
218
|
+
content=c,
|
219
|
+
metadata=d.metadata.model_copy(update=dict(is_chunk=True)),
|
207
220
|
)
|
208
221
|
for c in chunks
|
209
222
|
if c.strip() != ""
|
@@ -255,7 +268,8 @@ class Parser:
|
|
255
268
|
# add_window_ids)
|
256
269
|
chunk_docs = [
|
257
270
|
Document(
|
258
|
-
content=c,
|
271
|
+
content=c,
|
272
|
+
metadata=d.metadata.model_copy(update=dict(is_chunk=True)),
|
259
273
|
)
|
260
274
|
for c in chunks
|
261
275
|
if c.strip() != ""
|
@@ -287,7 +301,8 @@ class Parser:
|
|
287
301
|
# add_window_ids)
|
288
302
|
chunk_docs = [
|
289
303
|
Document(
|
290
|
-
content=c,
|
304
|
+
content=c,
|
305
|
+
metadata=d.metadata.model_copy(update=dict(is_chunk=True)),
|
291
306
|
)
|
292
307
|
for c in chunks
|
293
308
|
if c.strip() != ""
|
langroid/parsing/repo_loader.py
CHANGED
@@ -18,10 +18,12 @@ if TYPE_CHECKING:
|
|
18
18
|
from github.Label import Label
|
19
19
|
from github.Repository import Repository
|
20
20
|
|
21
|
+
from pydantic import BaseModel, Field
|
22
|
+
from pydantic_settings import BaseSettings
|
23
|
+
|
21
24
|
from langroid.mytypes import DocMetaData, Document
|
22
25
|
from langroid.parsing.document_parser import DocumentParser, DocumentType
|
23
26
|
from langroid.parsing.parser import Parser, ParsingConfig
|
24
|
-
from langroid.pydantic_v1 import BaseModel, BaseSettings, Field
|
25
27
|
|
26
28
|
logger = logging.getLogger(__name__)
|
27
29
|
|
langroid/parsing/search.py
CHANGED
@@ -64,7 +64,7 @@ def find_fuzzy_matches_in_docs(
|
|
64
64
|
return orig_doc_matches
|
65
65
|
if len(orig_doc_matches) == 0:
|
66
66
|
return []
|
67
|
-
if set(orig_doc_matches[0][0].
|
67
|
+
if set(orig_doc_matches[0][0].model_fields) != {"content", "metadata"}:
|
68
68
|
# If there are fields beyond just content and metadata,
|
69
69
|
# we do NOT want to create new document objects with content fields
|
70
70
|
# based on words_before and words_after, since we don't know how to
|
langroid/parsing/url_loader.py
CHANGED
@@ -7,12 +7,12 @@ from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional
|
|
7
7
|
|
8
8
|
import markdownify as md
|
9
9
|
from dotenv import load_dotenv
|
10
|
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
10
11
|
|
11
12
|
from langroid.exceptions import LangroidImportError
|
12
13
|
from langroid.mytypes import DocMetaData, Document
|
13
14
|
from langroid.parsing.document_parser import DocumentParser, ImagePdfParser
|
14
15
|
from langroid.parsing.parser import Parser, ParsingConfig
|
15
|
-
from langroid.pydantic_v1 import BaseSettings
|
16
16
|
|
17
17
|
if TYPE_CHECKING:
|
18
18
|
from firecrawl import FirecrawlApp
|
@@ -54,20 +54,13 @@ class FirecrawlConfig(BaseCrawlerConfig):
|
|
54
54
|
params: Dict[str, Any] = {}
|
55
55
|
timeout: Optional[int] = None
|
56
56
|
|
57
|
-
|
58
|
-
# Leverage Pydantic's BaseSettings to
|
59
|
-
# allow setting of fields via env vars,
|
60
|
-
# e.g. FIRECRAWL_MODE=scrape and FIRECRAWL_API_KEY=...
|
61
|
-
env_prefix = "FIRECRAWL_"
|
57
|
+
model_config = SettingsConfigDict(env_prefix="FIRECRAWL_")
|
62
58
|
|
63
59
|
|
64
60
|
class ExaCrawlerConfig(BaseCrawlerConfig):
|
65
61
|
api_key: str = ""
|
66
62
|
|
67
|
-
|
68
|
-
# Allow setting of fields via env vars with prefix EXA_
|
69
|
-
# e.g., EXA_API_KEY=your_api_key
|
70
|
-
env_prefix = "EXA_"
|
63
|
+
model_config = SettingsConfigDict(env_prefix="EXA_")
|
71
64
|
|
72
65
|
|
73
66
|
class Crawl4aiConfig(BaseCrawlerConfig):
|
@@ -81,49 +74,22 @@ class Crawl4aiConfig(BaseCrawlerConfig):
|
|
81
74
|
browser_config: Optional["BrowserConfig"] = None
|
82
75
|
run_config: Optional["CrawlerRunConfig"] = None
|
83
76
|
|
84
|
-
|
77
|
+
model_config = SettingsConfigDict(arbitrary_types_allowed=True)
|
85
78
|
|
86
|
-
def __init_subclass__(cls, **kwargs: Any) -> None:
|
87
|
-
"""Resolve forward references when class is first subclassed or instantiated."""
|
88
|
-
super().__init_subclass__(**kwargs)
|
89
|
-
cls._resolve_forward_refs()
|
90
79
|
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
from crawl4ai.deep_crawling import DeepCrawlStrategy
|
99
|
-
from crawl4ai.extraction_strategy import ExtractionStrategy
|
100
|
-
from crawl4ai.markdown_generation_strategy import (
|
101
|
-
MarkdownGenerationStrategy,
|
102
|
-
)
|
103
|
-
|
104
|
-
# Create namespace for update_forward_refs
|
105
|
-
namespace = {
|
106
|
-
"BrowserConfig": BrowserConfig,
|
107
|
-
"CrawlerRunConfig": CrawlerRunConfig,
|
108
|
-
"ContentScrapingStrategy": ContentScrapingStrategy,
|
109
|
-
"DeepCrawlStrategy": DeepCrawlStrategy,
|
110
|
-
"ExtractionStrategy": ExtractionStrategy,
|
111
|
-
"MarkdownGenerationStrategy": MarkdownGenerationStrategy,
|
112
|
-
}
|
113
|
-
|
114
|
-
cls.update_forward_refs(**namespace)
|
115
|
-
cls._refs_resolved = True
|
116
|
-
except ImportError:
|
117
|
-
# If crawl4ai is not installed, leave forward refs as strings
|
118
|
-
pass
|
119
|
-
|
120
|
-
def __init__(self, **kwargs: Any) -> None:
|
121
|
-
"""Initialize and ensure forward refs are resolved."""
|
122
|
-
self._resolve_forward_refs()
|
123
|
-
super().__init__(**kwargs)
|
80
|
+
# Resolve forward references for Crawl4aiConfig after the class is defined
|
81
|
+
try:
|
82
|
+
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
83
|
+
from crawl4ai.content_scraping_strategy import ContentScrapingStrategy
|
84
|
+
from crawl4ai.deep_crawling import DeepCrawlStrategy
|
85
|
+
from crawl4ai.extraction_strategy import ExtractionStrategy
|
86
|
+
from crawl4ai.markdown_generation_strategy import MarkdownGenerationStrategy
|
124
87
|
|
125
|
-
|
126
|
-
|
88
|
+
# Rebuild the model with resolved references
|
89
|
+
Crawl4aiConfig.model_rebuild()
|
90
|
+
except ImportError:
|
91
|
+
# If crawl4ai is not installed, leave forward refs as strings
|
92
|
+
pass
|
127
93
|
|
128
94
|
|
129
95
|
class BaseCrawler(ABC):
|
@@ -347,7 +313,7 @@ class FirecrawlCrawler(BaseCrawler):
|
|
347
313
|
)
|
348
314
|
processed_urls.add(url)
|
349
315
|
new_pages += 1
|
350
|
-
pbar.update
|
316
|
+
pbar.model_copy(update=new_pages) # Update progress bar with new pages
|
351
317
|
|
352
318
|
# Break if crawl is complete
|
353
319
|
if status["status"] == "completed":
|
langroid/parsing/urls.py
CHANGED
@@ -9,11 +9,10 @@ from urllib.parse import urldefrag, urljoin, urlparse
|
|
9
9
|
import fire
|
10
10
|
import requests
|
11
11
|
from bs4 import BeautifulSoup
|
12
|
+
from pydantic import BaseModel, HttpUrl, TypeAdapter, ValidationError
|
12
13
|
from rich import print
|
13
14
|
from rich.prompt import Prompt
|
14
15
|
|
15
|
-
from langroid.pydantic_v1 import BaseModel, HttpUrl, ValidationError, parse_obj_as
|
16
|
-
|
17
16
|
logger = logging.getLogger(__name__)
|
18
17
|
|
19
18
|
|
@@ -106,7 +105,8 @@ class Url(BaseModel):
|
|
106
105
|
|
107
106
|
def is_url(s: str) -> bool:
|
108
107
|
try:
|
109
|
-
|
108
|
+
url_adapter = TypeAdapter(HttpUrl)
|
109
|
+
Url(url=url_adapter.validate_python(s))
|
110
110
|
return True
|
111
111
|
except ValidationError:
|
112
112
|
return False
|
@@ -133,7 +133,8 @@ def get_urls_paths_bytes_indices(
|
|
133
133
|
byte_list.append(i)
|
134
134
|
continue
|
135
135
|
try:
|
136
|
-
|
136
|
+
url_adapter = TypeAdapter(HttpUrl)
|
137
|
+
Url(url=url_adapter.validate_python(item))
|
137
138
|
urls.append(i)
|
138
139
|
except ValidationError:
|
139
140
|
if os.path.exists(item):
|