langroid 0.45.10__py3-none-any.whl → 0.47.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries; it is provided for informational purposes only.
- langroid/agent/special/doc_chat_agent.py +13 -3
- langroid/embedding_models/models.py +54 -14
- langroid/language_models/openai_gpt.py +51 -2
- langroid/parsing/url_loader.py +309 -89
- {langroid-0.45.10.dist-info → langroid-0.47.0.dist-info}/METADATA +3 -1
- {langroid-0.45.10.dist-info → langroid-0.47.0.dist-info}/RECORD +8 -8
- {langroid-0.45.10.dist-info → langroid-0.47.0.dist-info}/WHEEL +0 -0
- {langroid-0.45.10.dist-info → langroid-0.47.0.dist-info}/licenses/LICENSE +0 -0
langroid/agent/special/doc_chat_agent.py CHANGED

@@ -50,7 +50,7 @@ from langroid.parsing.search import (
     preprocess_text,
 )
 from langroid.parsing.table_loader import describe_dataframe
-from langroid.parsing.url_loader import URLLoader
+from langroid.parsing.url_loader import BaseCrawlerConfig, TrafilaturaConfig, URLLoader
 from langroid.parsing.urls import get_list_from_user, get_urls_paths_bytes_indices
 from langroid.prompts.prompts_config import PromptsConfig
 from langroid.prompts.templates import SUMMARY_ANSWER_PROMPT_GPT4
@@ -192,6 +192,7 @@ class DocChatAgentConfig(ChatAgentConfig):
             library="pymupdf4llm",
         ),
     )
+    crawler_config: Optional[BaseCrawlerConfig] = TrafilaturaConfig()

     # Allow vecdb to be None in case we want to explicitly set it later
     vecdb: Optional[VectorStoreConfig] = QdrantDBConfig(
@@ -336,11 +337,15 @@ class DocChatAgent(ChatAgent):
         urls_meta = {u: idx2meta[u] for u in url_idxs}
         paths_meta = {p: idx2meta[p] for p in path_idxs}
         docs: List[Document] = []
-        parser = Parser(self.config.parsing)
+        parser: Parser = Parser(self.config.parsing)
         if len(urls) > 0:
             for ui in url_idxs:
                 meta = urls_meta.get(ui, {})
-                loader = URLLoader(
+                loader = URLLoader(
+                    urls=[all_paths[ui]],
+                    parsing_config=self.config.parsing,
+                    crawler_config=self.config.crawler_config,
+                )  # type: ignore
                 url_docs = loader.load()
                 # update metadata of each doc with meta
                 for d in url_docs:
@@ -466,6 +471,11 @@ class DocChatAgent(ChatAgent):
             docs = docs[: self.config.parsing.max_chunks]
         # vecdb should take care of adding docs in batches;
         # batching can be controlled via vecdb.config.batch_size
+        if not docs:
+            logging.warning(
+                "No documents to ingest after processing. Skipping VecDB addition."
+            )
+            return 0  # Return 0 since no documents were added
         self.vecdb.add_documents(docs)
         self.original_docs_length = self.doc_length(docs)
         self.setup_documents(docs, filter=self.config.filter)
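For orientation, here is a minimal sketch of how the new `crawler_config` field might be used when constructing a `DocChatAgent`. The imports mirror the ones added in this diff; the ingestion call mentioned in the comment is illustrative rather than taken from this diff.

```python
from langroid.agent.special.doc_chat_agent import DocChatAgent, DocChatAgentConfig
from langroid.parsing.url_loader import FirecrawlConfig, TrafilaturaConfig

# Default behavior: Trafilatura-based crawling, matching the field default above.
cfg = DocChatAgentConfig(crawler_config=TrafilaturaConfig(threads=4))

# Alternative: Firecrawl-based crawling; the key can also come from FIRECRAWL_API_KEY.
cfg_firecrawl = DocChatAgentConfig(crawler_config=FirecrawlConfig(mode="scrape"))

agent = DocChatAgent(cfg)
# URL ingestion on this agent now goes through URLLoader with
# parsing_config=cfg.parsing and crawler_config=cfg.crawler_config,
# per the hunk above.
```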
langroid/embedding_models/models.py CHANGED

@@ -10,6 +10,7 @@ from openai import AzureOpenAI, OpenAI

 from langroid.embedding_models.base import EmbeddingModel, EmbeddingModelsConfig
 from langroid.exceptions import LangroidImportError
+from langroid.language_models.openai_gpt import LangDBParams
 from langroid.mytypes import Embeddings
 from langroid.parsing.utils import batched

@@ -24,6 +25,7 @@ class OpenAIEmbeddingsConfig(EmbeddingModelsConfig):
     organization: str = ""
     dims: int = 1536
     context_length: int = 8192
+    langdb_params: LangDBParams = LangDBParams()

     class Config:
         # enable auto-loading of env vars with OPENAI_ prefix, e.g.
@@ -136,11 +138,13 @@ class EmbeddingFunctionCallable:
         """
         embeds = []
         if isinstance(self.embed_model, (OpenAIEmbeddings, AzureOpenAIEmbeddings)):
-
+            # Truncate texts to context length while preserving text format
+            truncated_texts = self.embed_model.truncate_texts(input)

-
+            # Process in batches
+            for batch in batched(truncated_texts, self.batch_size):
                 result = self.embed_model.client.embeddings.create(
-                    input=batch, model=self.embed_model.config.model_name
+                    input=batch, model=self.embed_model.config.model_name  # type: ignore
                 )
                 batch_embeds = [d.embedding for d in result.data]
                 embeds.extend(batch_embeds)
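The loop above now passes whatever `truncate_texts` returns straight to `client.embeddings.create`; as the next hunk shows, that is a list of token-id lists for plain OpenAI models and a list of decoded strings for LangDB. A standalone sketch of the same truncation step, with a placeholder model name and context length:

```python
import tiktoken

# Placeholders: the real values come from OpenAIEmbeddingsConfig.
context_length = 8192
tokenizer = tiktoken.encoding_for_model("text-embedding-ada-002")

texts = ["a very long document " * 2000, "a short text"]
truncated_tokens = [
    tokenizer.encode(text, disallowed_special=())[:context_length] for text in texts
]

# Plain OpenAI/Azure: token-id lists can be sent as `input` to embeddings.create.
# LangDB: decode back to strings, since its endpoint accepts only strings.
truncated_texts = [tokenizer.decode(tokens) for tokens in truncated_tokens]
```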
@@ -183,30 +187,66 @@ class OpenAIEmbeddings(EmbeddingModel):
         super().__init__()
         self.config = config
         load_dotenv()
-
+
+        # Check if using LangDB
+        self.is_langdb = self.config.model_name.startswith("langdb/")
+
+        if self.is_langdb:
+            self.config.model_name = self.config.model_name.replace("langdb/", "")
+            self.config.api_base = self.config.langdb_params.base_url
+            project_id = self.config.langdb_params.project_id
+            if project_id:
+                self.config.api_base += "/" + project_id + "/v1"
+            self.config.api_key = self.config.langdb_params.api_key
+
+        if not self.config.api_key:
+            self.config.api_key = os.getenv("OPENAI_API_KEY", "")
+
         self.config.organization = os.getenv("OPENAI_ORGANIZATION", "")
+
         if self.config.api_key == "":
-
-
-
-
-
-
-
+            if self.is_langdb:
+                raise ValueError(
+                    """
+                    LANGDB_API_KEY must be set in .env or your environment
+                    to use OpenAIEmbeddings via LangDB.
+                    """
+                )
+            else:
+                raise ValueError(
+                    """
+                    OPENAI_API_KEY must be set in .env or your environment
+                    to use OpenAIEmbeddings.
+                    """
+                )
+
+        self.client = OpenAI(
+            base_url=self.config.api_base,
+            api_key=self.config.api_key,
+            organization=self.config.organization,
+        )
+        model_for_tokenizer = self.config.model_name
+        if model_for_tokenizer.startswith("openai/"):
+            self.config.model_name = model_for_tokenizer.replace("openai/", "")
         self.tokenizer = tiktoken.encoding_for_model(self.config.model_name)

-    def truncate_texts(self, texts: List[str]) -> List[List[int]]:
+    def truncate_texts(self, texts: List[str]) -> List[str] | List[List[int]]:
         """
         Truncate texts to the embedding model's context length.
         TODO: Maybe we should show warning, and consider doing T5 summarization?
         """
-
+        truncated_tokens = [
             self.tokenizer.encode(text, disallowed_special=())[
                 : self.config.context_length
             ]
             for text in texts
         ]

+        if self.is_langdb:
+            # LangDB embedding endpt only works with strings, not tokens
+            return [self.tokenizer.decode(tokens) for tokens in truncated_tokens]
+        return truncated_tokens
+
     def embedding_fn(self) -> Callable[[List[str]], Embeddings]:
         return EmbeddingFunctionCallable(self, self.config.batch_size)

@@ -256,7 +296,7 @@ class AzureOpenAIEmbeddings(EmbeddingModel):
         )
         self.tokenizer = tiktoken.encoding_for_model(self.config.model_name)

-    def truncate_texts(self, texts: List[str]) -> List[List[int]]:
+    def truncate_texts(self, texts: List[str]) -> List[str] | List[List[int]]:
         """
         Truncate texts to the embedding model's context length.
         TODO: Maybe we should show warning, and consider doing T5 summarization?
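The `langdb/` model-name prefix plus `langdb_params` is the whole switch for routing embeddings through LangDB. A minimal configuration sketch; the model name, key, and project id are placeholders:

```python
from langroid.embedding_models.models import OpenAIEmbeddings, OpenAIEmbeddingsConfig
from langroid.language_models.openai_gpt import LangDBParams

embed_cfg = OpenAIEmbeddingsConfig(
    model_name="langdb/openai/text-embedding-3-small",  # "langdb/" enables the LangDB path
    langdb_params=LangDBParams(
        api_key="sk-...",         # or set LANGDB_API_KEY in the environment
        project_id="my-project",  # appended to base_url as "/<project_id>/v1"
    ),
)

embedder = OpenAIEmbeddings(embed_cfg)
embed_fn = embedder.embedding_fn()
vectors = embed_fn(["hello world"])  # truncated, then sent as strings (not token ids) to LangDB
```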
langroid/language_models/openai_gpt.py CHANGED

@@ -66,7 +66,7 @@ from langroid.language_models.utils import (
     retry_with_exponential_backoff,
 )
 from langroid.parsing.parse_json import parse_imperfect_json
-from langroid.pydantic_v1 import BaseModel
+from langroid.pydantic_v1 import BaseModel, BaseSettings
 from langroid.utils.configuration import settings
 from langroid.utils.constants import Colors
 from langroid.utils.system import friendly_error
@@ -82,9 +82,13 @@ DEEPSEEK_BASE_URL = "https://api.deepseek.com/v1"
 OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"
 GEMINI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta/openai"
 GLHF_BASE_URL = "https://glhf.chat/api/openai/v1"
+LANGDB_BASE_URL = "https://api.us-east-1.langdb.ai"
 OLLAMA_API_KEY = "ollama"
 DUMMY_API_KEY = "xxx"

+VLLM_API_KEY = os.environ.get("VLLM_API_KEY", DUMMY_API_KEY)
+LLAMACPP_API_KEY = os.environ.get("LLAMA_API_KEY", DUMMY_API_KEY)
+

 openai_chat_model_pref_list = [
     OpenAIChatModel.GPT4o,
@@ -177,6 +181,24 @@ def noop() -> None:
     return None


+class LangDBParams(BaseSettings):
+    """
+    Parameters specific to LangDB integration.
+    """
+
+    api_key: str = DUMMY_API_KEY
+    project_id: str = ""
+    label: Optional[str] = None
+    run_id: Optional[str] = None
+    thread_id: Optional[str] = None
+    base_url: str = LANGDB_BASE_URL
+
+    class Config:
+        # allow setting of fields via env vars,
+        # e.g. LANGDB_PROJECT_ID=1234
+        env_prefix = "LANGDB_"
+
+
 class OpenAICallParams(BaseModel):
     """
     Various params that can be sent to an OpenAI API chat-completion call.
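Because `LangDBParams` is a `BaseSettings` subclass with `env_prefix = "LANGDB_"`, its fields can be supplied from the environment instead of code. A small sketch with placeholder values:

```python
import os

from langroid.language_models.openai_gpt import LangDBParams

# Placeholders: in practice these would live in .env or the shell environment.
os.environ["LANGDB_API_KEY"] = "sk-..."
os.environ["LANGDB_PROJECT_ID"] = "my-project"

params = LangDBParams()   # picks up LANGDB_* env vars via pydantic BaseSettings
print(params.project_id)  # "my-project"
print(params.base_url)    # LANGDB_BASE_URL default unless LANGDB_BASE_URL is set
```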
@@ -253,6 +275,8 @@ class OpenAIGPTConfig(LLMConfig):
     # e.g. "mistral-instruct-v0.2 (a fuzzy search is done to find the closest match)
     formatter: str | None = None
     hf_formatter: HFFormatter | None = None
+    langdb_params: LangDBParams = LangDBParams()
+    headers: Dict[str, str] = {}

     def __init__(self, **kwargs) -> None:  # type: ignore
         local_model = "api_base" in kwargs and kwargs["api_base"] is not None
@@ -496,6 +520,7 @@ class OpenAIGPT(LanguageModel):
         self.is_deepseek = self.is_deepseek_model()
         self.is_glhf = self.config.chat_model.startswith("glhf/")
         self.is_openrouter = self.config.chat_model.startswith("openrouter/")
+        self.is_langdb = self.config.chat_model.startswith("langdb/")

         if self.is_groq:
             # use groq-specific client
@@ -544,18 +569,39 @@ class OpenAIGPT(LanguageModel):
             self.api_base = DEEPSEEK_BASE_URL
             if self.api_key == OPENAI_API_KEY:
                 self.api_key = os.getenv("DEEPSEEK_API_KEY", DUMMY_API_KEY)
+        elif self.is_langdb:
+            self.config.chat_model = self.config.chat_model.replace("langdb/", "")
+            self.api_base = self.config.langdb_params.base_url
+            project_id = self.config.langdb_params.project_id
+            if project_id:
+                self.api_base += "/" + project_id + "/v1"
+            if self.api_key == OPENAI_API_KEY:
+                self.api_key = self.config.langdb_params.api_key or DUMMY_API_KEY
+
+            if self.config.langdb_params:
+                params = self.config.langdb_params
+                if params.project_id:
+                    self.config.headers["x-project-id"] = params.project_id
+                if params.label:
+                    self.config.headers["x-label"] = params.label
+                if params.run_id:
+                    self.config.headers["x-run-id"] = params.run_id
+                if params.thread_id:
+                    self.config.headers["x-thread-id"] = params.thread_id

         self.client = OpenAI(
             api_key=self.api_key,
             base_url=self.api_base,
             organization=self.config.organization,
             timeout=Timeout(self.config.timeout),
+            default_headers=self.config.headers,
         )
         self.async_client = AsyncOpenAI(
             api_key=self.api_key,
             organization=self.config.organization,
             base_url=self.api_base,
             timeout=Timeout(self.config.timeout),
+            default_headers=self.config.headers,
         )

         self.cache: CacheDB | None = None
@@ -1028,6 +1074,7 @@ class OpenAIGPT(LanguageModel):
             OpenAIResponse object (with choices, usage)

         """
+
         completion = ""
         reasoning = ""
         function_args = ""
@@ -1075,7 +1122,9 @@
         )

     @staticmethod
-    def tool_deltas_to_tools(
+    def tool_deltas_to_tools(
+        tools: List[Dict[str, Any]],
+    ) -> Tuple[
         str,
         List[OpenAIToolCall],
         List[Dict[str, Any]],
langroid/parsing/url_loader.py CHANGED

@@ -1,120 +1,340 @@
 import logging
 import os
+from abc import ABC, abstractmethod
 from tempfile import NamedTemporaryFile
-from typing import List,
+from typing import TYPE_CHECKING, Any, Dict, List, Optional

-import
+from dotenv import load_dotenv

+from langroid.exceptions import LangroidImportError
 from langroid.mytypes import DocMetaData, Document
 from langroid.parsing.document_parser import DocumentParser, ImagePdfParser
 from langroid.parsing.parser import Parser, ParsingConfig
+from langroid.pydantic_v1 import BaseSettings

-
+if TYPE_CHECKING:
+    from firecrawl import FirecrawlApp

+load_dotenv()

-
-    """
-    Load a list of URLs and extract the text content.
-    Alternative approaches could use `bs4` or `scrapy`.
-
-    TODO - this currently does not handle cookie dialogs,
-    i.e. if there is a cookie pop-up, most/all of the extracted
-    content could be cookie policy text.
-    We could use `playwright` to simulate a user clicking
-    the "accept" button on the cookie dialog.
-    """
-
-    def __init__(self, urls: List[str], parser: Parser = Parser(ParsingConfig())):
-        self.urls = urls
-        self.parser = parser
+logging.getLogger("url_loader").setLevel(logging.WARNING)

-    @no_type_check
-    def load(self) -> List[Document]:
-        import trafilatura
-        from trafilatura.downloads import (
-            add_to_compressed_dict,
-            buffered_downloads,
-            load_download_buffer,
-        )

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+# Base crawler config and specific configurations
+class BaseCrawlerConfig(BaseSettings):
+    """Base configuration for web crawlers."""
+
+    parser: Optional[Parser] = None
+
+
+class TrafilaturaConfig(BaseCrawlerConfig):
+    """Configuration for Trafilatura crawler."""
+
+    threads: int = 4
+
+
+class FirecrawlConfig(BaseCrawlerConfig):
+    """Configuration for Firecrawl crawler."""
+
+    api_key: str = ""
+    mode: str = "scrape"
+    params: Dict[str, Any] = {}
+    timeout: Optional[int] = None
+
+    class Config:
+        # Leverage Pydantic's BaseSettings to
+        # allow setting of fields via env vars,
+        # e.g. FIRECRAWL_MODE=scrape and FIRECRAWL_API_KEY=...
+        env_prefix = "FIRECRAWL_"
+
+
+class BaseCrawler(ABC):
+    """Abstract base class for web crawlers."""
+
+    def __init__(self, config: BaseCrawlerConfig):
+        """Initialize the base crawler.
+
+        Args:
+            config: Configuration for the crawler
+        """
+        self.parser = config.parser if self.needs_parser else None
+        self.config: BaseCrawlerConfig = config
+
+    @property
+    @abstractmethod
+    def needs_parser(self) -> bool:
+        """Indicates whether the crawler requires a parser."""
+        pass
+
+    @abstractmethod
+    def crawl(self, urls: List[str]) -> List[Document]:
+        pass
+
+    def _process_document(self, url: str) -> List[Document]:
+        if self.parser:
+            import requests
+            from requests.structures import CaseInsensitiveDict
+
+            if self._is_document_url(url):
+                try:
+                    doc_parser = DocumentParser.create(url, self.parser.config)
                     new_chunks = doc_parser.get_doc_chunks()
-                    if
+                    if not new_chunks:
                         # If the document is empty, try to extract images
                         img_parser = ImagePdfParser(url, self.parser.config)
                         new_chunks = img_parser.get_doc_chunks()
-
-
-
+                    return new_chunks
+                except Exception as e:
+                    logging.error(f"Error parsing {url}: {e}")
+                    return []
+
+            else:
+                try:
+                    headers = requests.head(url).headers
+                except Exception as e:
+                    logging.warning(f"Error getting headers for {url}: {e}")
+                    headers = CaseInsensitiveDict()
+
+                content_type = headers.get("Content-Type", "").lower()
+                temp_file_suffix = None
+                if "application/pdf" in content_type:
+                    temp_file_suffix = ".pdf"
+                elif (
+                    "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+                    in content_type
+                ):
+                    temp_file_suffix = ".docx"
+                elif "application/msword" in content_type:
+                    temp_file_suffix = ".doc"
+
+                if temp_file_suffix:
                     try:
-            headers = requests.head(url).headers
-        except Exception as e:
-            logging.warning(f"Error getting headers for {url}: {e}")
-            headers = {}
-        content_type = headers.get("Content-Type", "").lower()
-        temp_file_suffix = None
-        if "application/pdf" in content_type:
-            temp_file_suffix = ".pdf"
-        elif (
-            "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
-            in content_type
-        ):
-            temp_file_suffix = ".docx"
-        elif "application/msword" in content_type:
-            temp_file_suffix = ".doc"
-
-        if temp_file_suffix:
-            # Download the document content
                         response = requests.get(url)
                         with NamedTemporaryFile(
                             delete=False, suffix=temp_file_suffix
                         ) as temp_file:
                             temp_file.write(response.content)
                             temp_file_path = temp_file.name
-            # Process the downloaded document
                         doc_parser = DocumentParser.create(
                             temp_file_path, self.parser.config
                         )
-            docs
-            # Clean up the temporary file
+                        docs = doc_parser.get_doc_chunks()
                         os.remove(temp_file_path)
-
-
-
-
-
+                        return docs
+                    except Exception as e:
+                        logging.error(f"Error downloading/parsing {url}: {e}")
+                        return []
+        return []
+
+    def _is_document_url(self, url: str) -> bool:
+        return any(url.lower().endswith(ext) for ext in [".pdf", ".docx", ".doc"])
+
+
+class CrawlerFactory:
+    """Factory for creating web crawlers."""
+
+    @staticmethod
+    def create_crawler(config: BaseCrawlerConfig) -> BaseCrawler:
+        """Create a crawler instance based on configuration type.
+
+        Args:
+            config: Configuration for the crawler
+
+        Returns:
+            A BaseCrawler instance
+
+        Raises:
+            ValueError: If config type is not supported
+        """
+        if isinstance(config, TrafilaturaConfig):
+            return TrafilaturaCrawler(config)
+        elif isinstance(config, FirecrawlConfig):
+            return FirecrawlCrawler(config)
+        else:
+            raise ValueError(f"Unsupported crawler configuration type: {type(config)}")
+
+
+class TrafilaturaCrawler(BaseCrawler):
+    """Crawler implementation using Trafilatura."""
+
+    def __init__(self, config: TrafilaturaConfig):
+        """Initialize the Trafilatura crawler.
+
+        Args:
+            config: Configuration for the crawler
+        """
+        super().__init__(config)
+        self.config: TrafilaturaConfig = config
+
+    @property
+    def needs_parser(self) -> bool:
+        return True
+
+    def crawl(self, urls: List[str]) -> List[Document]:
+        import trafilatura
+        from trafilatura.downloads import (
+            add_to_compressed_dict,
+            buffered_downloads,
+            load_download_buffer,
+        )
+
+        docs = []
+        dl_dict = add_to_compressed_dict(urls)
+
+        while not dl_dict.done:
+            buffer, dl_dict = load_download_buffer(dl_dict, sleep_time=5)
+            for url, result in buffered_downloads(buffer, self.config.threads):
+                parsed_doc = self._process_document(url)
+                if parsed_doc:
+                    docs.extend(parsed_doc)
+                else:
+                    text = trafilatura.extract(
+                        result, no_fallback=False, favor_recall=True
+                    )
+                    if text is None and result is not None and isinstance(result, str):
+                        text = result
+                    if text:
+                        docs.append(
+                            Document(content=text, metadata=DocMetaData(source=url))
                         )
-
-
-
-
-
-
-
-
-
+
+        return docs
+
+
+class FirecrawlCrawler(BaseCrawler):
+    """Crawler implementation using Firecrawl."""
+
+    def __init__(self, config: FirecrawlConfig) -> None:
+        """Initialize the Firecrawl crawler.
+
+        Args:
+            config: Configuration for the crawler
+        """
+        super().__init__(config)
+        self.config: FirecrawlConfig = config
+
+    @property
+    def needs_parser(self) -> bool:
+        return False
+
+    def _return_save_incremental_results(
+        self, app: "FirecrawlApp", crawl_id: str, output_dir: str = "firecrawl_output"
+    ) -> List[Document]:
+        # Code used verbatim from firecrawl blog with few modifications
+        # https://www.firecrawl.dev/blog/mastering-the-crawl-endpoint-in-firecrawl
+        import json
+        import time
+        from pathlib import Path
+
+        from tqdm import tqdm
+
+        pbar = tqdm(desc="Pages saved", unit=" pages", dynamic_ncols=True)
+        Path(output_dir).mkdir(parents=True, exist_ok=True)
+        processed_urls: set[str] = set()
+        docs = []
+
+        while True:
+            # Check current status
+            status = app.check_crawl_status(crawl_id)
+            new_pages = 0
+
+            # Save new pages
+            for page in status["data"]:
+                url = page["metadata"]["url"]
+                if url not in processed_urls:
+                    content = page.get("markdown", "")
+                    filename = f"{output_dir}/{len(processed_urls)}.md"
+                    with open(filename, "w") as f:
+                        f.write(content)
+                    docs.append(
+                        Document(content=content, metadata=DocMetaData(source=url))
+                    )
+                    processed_urls.add(url)
+                    new_pages += 1
+            pbar.update(new_pages)  # Update progress bar with new pages
+
+            # Break if crawl is complete
+            if status["status"] == "completed":
+                print(f"Saved {len(processed_urls)} pages.")
+                with open(f"{output_dir}/full_results.json", "w") as f:
+                    json.dump(status, f, indent=2)
+                break
+
+            time.sleep(5)  # Wait before checking again
+        return docs
+
+    def crawl(self, urls: List[str]) -> List[Document]:
+        try:
+            from firecrawl import FirecrawlApp
+        except ImportError:
+            raise LangroidImportError("firecrawl", "firecrawl")
+
+        app = FirecrawlApp(api_key=self.config.api_key)
+        docs = []
+        params = self.config.params.copy()  # Create a copy of the existing params
+
+        if self.config.timeout is not None:
+            params["timeout"] = self.config.timeout  # Add/override timeout in params
+
+        if self.config.mode == "scrape":
+            for url in urls:
+                try:
+                    result = app.scrape_url(url, params=params)
+                    metadata = result.get(
+                        "metadata", {}
+                    )  # Default to empty dict if missing
+                    status_code = metadata.get("statusCode")
+
+                    if status_code == 200:
+                        docs.append(
+                            Document(
+                                content=result["markdown"],
+                                metadata=DocMetaData(source=url),
                             )
+                        )
+                except Exception as e:
+                    logging.warning(
+                        f"Firecrawl encountered an error for {url}: {e}. "
+                        "Skipping but continuing."
+                    )
+        elif self.config.mode == "crawl":
+            if not isinstance(urls, list) or len(urls) != 1:
+                raise ValueError(
+                    "Crawl mode expects 'urls' to be a list containing a single URL."
+                )
+
+            # Start the crawl
+            crawl_status = app.async_crawl_url(url=urls[0], params=params)
+
+            # Save results incrementally
+            docs = self._return_save_incremental_results(app, crawl_status["id"])
         return docs
+
+
+class URLLoader:
+    """Loads URLs and extracts text using a specified crawler."""
+
+    def __init__(
+        self,
+        urls: List[Any],
+        parsing_config: ParsingConfig = ParsingConfig(),
+        crawler_config: Optional[BaseCrawlerConfig] = None,
+    ):
+        """Initialize the URL loader.
+
+        Args:
+            urls: List of URLs to load
+            parsing_config: Configuration for parsing
+            crawler_config: Configuration for the crawler
+        """
+        self.urls = urls
+        self.parsing_config = parsing_config
+
+        if crawler_config is None:
+            crawler_config = TrafilaturaConfig(parser=Parser(parsing_config))
+
+        self.crawler = CrawlerFactory.create_crawler(crawler_config)
+
+    def load(self) -> List[Document]:
+        """Load the URLs using the specified crawler."""
+        return self.crawler.crawl(self.urls)
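The rewritten loader can also be used directly, outside `DocChatAgent`. A minimal sketch, assuming the `firecrawl` extra is installed for the Firecrawl variant; URLs are placeholders:

```python
from langroid.parsing.parser import Parser, ParsingConfig
from langroid.parsing.url_loader import FirecrawlConfig, TrafilaturaConfig, URLLoader

# Default path: with crawler_config=None, a TrafilaturaConfig is built whose parser
# wraps parsing_config (see URLLoader.__init__ above).
loader = URLLoader(
    urls=["https://example.com/article"],  # placeholder URL
    parsing_config=ParsingConfig(),
)
docs = loader.load()

# Explicit Trafilatura config: pass a parser so PDF/DOCX/DOC links are parsed too.
traf_loader = URLLoader(
    urls=["https://example.com/report.pdf"],  # placeholder URL
    crawler_config=TrafilaturaConfig(threads=4, parser=Parser(ParsingConfig())),
)

# Firecrawl path: "scrape" fetches each URL; "crawl" expects a single seed URL.
# The API key can be set here or via FIRECRAWL_API_KEY in the environment.
fc_loader = URLLoader(
    urls=["https://example.com"],  # placeholder URL
    crawler_config=FirecrawlConfig(mode="scrape"),
)
fc_docs = fc_loader.load()
```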
{langroid-0.45.10.dist-info → langroid-0.47.0.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: langroid
-Version: 0.45.10
+Version: 0.47.0
 Summary: Harness LLMs with Multi-Agent Programming
 Author-email: Prasad Chalasani <pchalasani@gmail.com>
 License: MIT
@@ -121,6 +121,8 @@ Provides-Extra: exa
 Requires-Dist: exa-py>=1.8.7; extra == 'exa'
 Provides-Extra: fastembed
 Requires-Dist: fastembed<0.4.0,>=0.3.1; extra == 'fastembed'
+Provides-Extra: firecrawl
+Requires-Dist: firecrawl-py>=1.13.5; extra == 'firecrawl'
 Provides-Extra: google-genai
 Requires-Dist: google-genai>=1.0.0; extra == 'google-genai'
 Provides-Extra: google-generativeai
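The new extra backs the lazy `from firecrawl import FirecrawlApp` in `FirecrawlCrawler.crawl`. An optional sanity check after installing it (e.g. `pip install "langroid[firecrawl]"`):

```python
# Hypothetical check: verifies the optional dependency behind the new extra is importable.
try:
    from firecrawl import FirecrawlApp  # noqa: F401

    print("firecrawl extra installed; FirecrawlConfig can be used")
except ImportError:
    print("firecrawl not installed; add the 'firecrawl' extra to use FirecrawlConfig")
```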
{langroid-0.45.10.dist-info → langroid-0.47.0.dist-info}/RECORD CHANGED

@@ -14,7 +14,7 @@ langroid/agent/xml_tool_message.py,sha256=6SshYZJKIfi4mkE-gIoSwjkEYekQ8GwcSiCv7a
 langroid/agent/callbacks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 langroid/agent/callbacks/chainlit.py,sha256=UHB6P_J40vsVnssosqkpkOVWRf9NK4TOY0_G2g_Arsg,20900
 langroid/agent/special/__init__.py,sha256=gik_Xtm_zV7U9s30Mn8UX3Gyuy4jTjQe9zjiE3HWmEo,1273
-langroid/agent/special/doc_chat_agent.py,sha256=
+langroid/agent/special/doc_chat_agent.py,sha256=_CwxBx2gLs5BoUI_4CpqHyzua5ljFfqytV9mwjHdmbY,65233
 langroid/agent/special/lance_doc_chat_agent.py,sha256=s8xoRs0gGaFtDYFUSIRchsgDVbS5Q3C2b2mr3V1Fd-Q,10419
 langroid/agent/special/lance_tools.py,sha256=qS8x4wi8mrqfbYV2ztFzrcxyhHQ0ZWOc-zkYiH7awj0,2105
 langroid/agent/special/relevance_extractor_agent.py,sha256=zIx8GUdVo1aGW6ASla0NPQjYYIpmriK_TYMijqAx3F8,4796
@@ -59,7 +59,7 @@ langroid/cachedb/momento_cachedb.py,sha256=YEOJ62hEcV6iIeMr5aGgRYgWQqFYaej9gEDEc
 langroid/cachedb/redis_cachedb.py,sha256=7kgnbf4b5CKsCrlL97mHWKvdvlLt8zgn7lc528jEpiE,5141
 langroid/embedding_models/__init__.py,sha256=KyYxR3jDFUCfYjSuCL86qjAmrq6mXXjOT4lFNOKVj6Y,955
 langroid/embedding_models/base.py,sha256=Ml7oA6PzQm0wZmIYn3fhF7dvZCi-amviWUwOeBegH3A,2562
-langroid/embedding_models/models.py,sha256=
+langroid/embedding_models/models.py,sha256=iGRrQR7ehDunA_7cPMu3CiHFugYWDkauOsiqHH-bv9s,20725
 langroid/embedding_models/remote_embeds.py,sha256=6_kjXByVbqhY9cGwl9R83ZcYC2km-nGieNNAo1McHaY,5151
 langroid/embedding_models/protoc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 langroid/embedding_models/protoc/embeddings.proto,sha256=_O-SgFpTaylQeOTgSpxhEJ7CUw7PeCQQJLaPqpPYKJg,321
@@ -72,7 +72,7 @@ langroid/language_models/base.py,sha256=mDYmFCBCLdq8_Uvws4MiewwEgcOCP8Qb0e5yUXr3
 langroid/language_models/config.py,sha256=9Q8wk5a7RQr8LGMT_0WkpjY8S4ywK06SalVRjXlfCiI,378
 langroid/language_models/mock_lm.py,sha256=5BgHKDVRWFbUwDT_PFgTZXz9-k8wJSA2e3PZmyDgQ1k,4022
 langroid/language_models/model_info.py,sha256=tfBBxL0iUf2mVN6CjcvqflzFUVg2oZqOJZexZ8jHTYA,12216
-langroid/language_models/openai_gpt.py,sha256=
+langroid/language_models/openai_gpt.py,sha256=Re4T1my9rhOPI-w4JCluhAZUVUIbW2AZJ3MIJMYjRuk,79633
 langroid/language_models/utils.py,sha256=L4_CbihDMTGcsg0TOG1Yd5JFEto46--h7CX_14m89sQ,5016
 langroid/language_models/prompt_formatter/__init__.py,sha256=2-5cdE24XoFDhifOLl8yiscohil1ogbP1ECkYdBlBsk,372
 langroid/language_models/prompt_formatter/base.py,sha256=eDS1sgRNZVnoajwV_ZIha6cba5Dt8xjgzdRbPITwx3Q,1221
@@ -91,7 +91,7 @@ langroid/parsing/routing.py,sha256=-FcnlqldzL4ZoxuDwXjQPNHgBe9F9-F4R6q7b_z9CvI,1
 langroid/parsing/search.py,sha256=0NJ5-Rou_BbrHAD7O9b20bKjZJnbadjObvGm4Zq8Kis,9818
 langroid/parsing/spider.py,sha256=hAVM6wxh1pQ0EN4tI5wMBtAjIk0T-xnpi-ZUzWybhos,3258
 langroid/parsing/table_loader.py,sha256=qNM4obT_0Y4tjrxNBCNUYjKQ9oETCZ7FbolKBTcz-GM,3410
-langroid/parsing/url_loader.py,sha256=
+langroid/parsing/url_loader.py,sha256=tNLyCo8A08GcB8KFr04YKDO9KFHyqNacKU0-DuWlu4I,11721
 langroid/parsing/urls.py,sha256=Tjzr64YsCusiYkY0LEGB5-rSuX8T2P_4DVoOFKAeKuI,8081
 langroid/parsing/utils.py,sha256=WwqzOhbQRlorbVvddDIZKv9b1KqZCBDm955lgIHDXRw,12828
 langroid/parsing/web_search.py,sha256=sARV1Tku4wiInhuCz0kRaMHcoF6Ok6CLu7vapLS8hjs,8222
@@ -127,7 +127,7 @@ langroid/vector_store/pineconedb.py,sha256=otxXZNaBKb9f_H75HTaU3lMHiaR2NUp5MqwLZ
 langroid/vector_store/postgres.py,sha256=wHPtIi2qM4fhO4pMQr95pz1ZCe7dTb2hxl4VYspGZoA,16104
 langroid/vector_store/qdrantdb.py,sha256=O6dSBoDZ0jzfeVBd7LLvsXu083xs2fxXtPa9gGX3JX4,18443
 langroid/vector_store/weaviatedb.py,sha256=Yn8pg139gOy3zkaPfoTbMXEEBCiLiYa1MU5d_3UA1K4,11847
-langroid-0.
-langroid-0.
-langroid-0.
-langroid-0.
+langroid-0.47.0.dist-info/METADATA,sha256=kI3V76rm2kss0FLvAbM1XvinKBBxsovU386BTBDs2Ss,63473
+langroid-0.47.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+langroid-0.47.0.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
+langroid-0.47.0.dist-info/RECORD,,
{langroid-0.45.10.dist-info → langroid-0.47.0.dist-info}/WHEEL: file without changes
{langroid-0.45.10.dist-info → langroid-0.47.0.dist-info}/licenses/LICENSE: file without changes