langroid 0.58.2__py3-none-any.whl → 0.59.0b1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- langroid/agent/base.py +39 -17
- langroid/agent/base.py-e +2216 -0
- langroid/agent/callbacks/chainlit.py +2 -1
- langroid/agent/chat_agent.py +73 -55
- langroid/agent/chat_agent.py-e +2086 -0
- langroid/agent/chat_document.py +7 -7
- langroid/agent/chat_document.py-e +513 -0
- langroid/agent/openai_assistant.py +9 -9
- langroid/agent/openai_assistant.py-e +882 -0
- langroid/agent/special/arangodb/arangodb_agent.py +10 -18
- langroid/agent/special/arangodb/arangodb_agent.py-e +648 -0
- langroid/agent/special/arangodb/tools.py +3 -3
- langroid/agent/special/doc_chat_agent.py +16 -14
- langroid/agent/special/lance_rag/critic_agent.py +2 -2
- langroid/agent/special/lance_rag/query_planner_agent.py +4 -4
- langroid/agent/special/lance_tools.py +6 -5
- langroid/agent/special/lance_tools.py-e +61 -0
- langroid/agent/special/neo4j/neo4j_chat_agent.py +3 -7
- langroid/agent/special/neo4j/neo4j_chat_agent.py-e +430 -0
- langroid/agent/special/relevance_extractor_agent.py +1 -1
- langroid/agent/special/sql/sql_chat_agent.py +11 -3
- langroid/agent/task.py +9 -87
- langroid/agent/task.py-e +2418 -0
- langroid/agent/tool_message.py +33 -17
- langroid/agent/tool_message.py-e +400 -0
- langroid/agent/tools/file_tools.py +4 -2
- langroid/agent/tools/file_tools.py-e +234 -0
- langroid/agent/tools/mcp/fastmcp_client.py +19 -6
- langroid/agent/tools/mcp/fastmcp_client.py-e +584 -0
- langroid/agent/tools/orchestration.py +22 -17
- langroid/agent/tools/orchestration.py-e +301 -0
- langroid/agent/tools/recipient_tool.py +3 -3
- langroid/agent/tools/task_tool.py +22 -16
- langroid/agent/tools/task_tool.py-e +249 -0
- langroid/agent/xml_tool_message.py +90 -35
- langroid/agent/xml_tool_message.py-e +392 -0
- langroid/cachedb/base.py +1 -1
- langroid/embedding_models/base.py +2 -2
- langroid/embedding_models/models.py +3 -7
- langroid/embedding_models/models.py-e +563 -0
- langroid/exceptions.py +4 -1
- langroid/language_models/azure_openai.py +2 -2
- langroid/language_models/azure_openai.py-e +134 -0
- langroid/language_models/base.py +6 -4
- langroid/language_models/base.py-e +812 -0
- langroid/language_models/client_cache.py +64 -0
- langroid/language_models/config.py +2 -4
- langroid/language_models/config.py-e +18 -0
- langroid/language_models/model_info.py +9 -1
- langroid/language_models/model_info.py-e +483 -0
- langroid/language_models/openai_gpt.py +119 -20
- langroid/language_models/openai_gpt.py-e +2280 -0
- langroid/language_models/provider_params.py +3 -22
- langroid/language_models/provider_params.py-e +153 -0
- langroid/mytypes.py +11 -4
- langroid/mytypes.py-e +132 -0
- langroid/parsing/code_parser.py +1 -1
- langroid/parsing/file_attachment.py +1 -1
- langroid/parsing/file_attachment.py-e +246 -0
- langroid/parsing/md_parser.py +14 -4
- langroid/parsing/md_parser.py-e +574 -0
- langroid/parsing/parser.py +22 -7
- langroid/parsing/parser.py-e +410 -0
- langroid/parsing/repo_loader.py +3 -1
- langroid/parsing/repo_loader.py-e +812 -0
- langroid/parsing/search.py +1 -1
- langroid/parsing/url_loader.py +17 -51
- langroid/parsing/url_loader.py-e +683 -0
- langroid/parsing/urls.py +5 -4
- langroid/parsing/urls.py-e +279 -0
- langroid/prompts/prompts_config.py +1 -1
- langroid/pydantic_v1/__init__.py +45 -6
- langroid/pydantic_v1/__init__.py-e +36 -0
- langroid/pydantic_v1/main.py +11 -4
- langroid/pydantic_v1/main.py-e +11 -0
- langroid/utils/configuration.py +13 -11
- langroid/utils/configuration.py-e +141 -0
- langroid/utils/constants.py +1 -1
- langroid/utils/constants.py-e +32 -0
- langroid/utils/globals.py +21 -5
- langroid/utils/globals.py-e +49 -0
- langroid/utils/html_logger.py +2 -1
- langroid/utils/html_logger.py-e +825 -0
- langroid/utils/object_registry.py +1 -1
- langroid/utils/object_registry.py-e +66 -0
- langroid/utils/pydantic_utils.py +55 -28
- langroid/utils/pydantic_utils.py-e +602 -0
- langroid/utils/types.py +2 -2
- langroid/utils/types.py-e +113 -0
- langroid/vector_store/base.py +3 -3
- langroid/vector_store/lancedb.py +5 -5
- langroid/vector_store/lancedb.py-e +404 -0
- langroid/vector_store/meilisearch.py +2 -2
- langroid/vector_store/pineconedb.py +4 -4
- langroid/vector_store/pineconedb.py-e +427 -0
- langroid/vector_store/postgres.py +1 -1
- langroid/vector_store/qdrantdb.py +3 -3
- langroid/vector_store/weaviatedb.py +1 -1
- {langroid-0.58.2.dist-info → langroid-0.59.0b1.dist-info}/METADATA +3 -2
- langroid-0.59.0b1.dist-info/RECORD +181 -0
- langroid/agent/special/doc_chat_task.py +0 -0
- langroid/mcp/__init__.py +0 -1
- langroid/mcp/server/__init__.py +0 -1
- langroid-0.58.2.dist-info/RECORD +0 -145
- {langroid-0.58.2.dist-info → langroid-0.59.0b1.dist-info}/WHEEL +0 -0
- {langroid-0.58.2.dist-info → langroid-0.59.0b1.dist-info}/licenses/LICENSE +0 -0
langroid/parsing/url_loader.py-e
@@ -0,0 +1,683 @@
import asyncio
import logging
import os
from abc import ABC, abstractmethod
from tempfile import NamedTemporaryFile
from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional

import markdownify as md
from dotenv import load_dotenv

from langroid.exceptions import LangroidImportError
from langroid.mytypes import DocMetaData, Document
from langroid.parsing.document_parser import DocumentParser, ImagePdfParser
from langroid.parsing.parser import Parser, ParsingConfig
from pydantic_settings import BaseSettings
from pydantic import ConfigDict

if TYPE_CHECKING:
    from firecrawl import FirecrawlApp

try:
    from crawl4ai import CrawlResult
    from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
    from crawl4ai.content_scraping_strategy import ContentScrapingStrategy
    from crawl4ai.deep_crawling import DeepCrawlStrategy
    from crawl4ai.extraction_strategy import ExtractionStrategy
    from crawl4ai.markdown_generation_strategy import MarkdownGenerationStrategy
except ImportError:
    raise LangroidImportError("crawl4ai", "crawl4ai")

load_dotenv()

logging.getLogger("url_loader").setLevel(logging.WARNING)


# Base crawler config and specific configurations
class BaseCrawlerConfig(BaseSettings):
    """Base configuration for web crawlers."""

    parser: Optional[Parser] = None


class TrafilaturaConfig(BaseCrawlerConfig):
    """Configuration for Trafilatura crawler."""

    threads: int = 4
    format: str = "markdown"  # or "xml" or "txt"


class FirecrawlConfig(BaseCrawlerConfig):
    """Configuration for Firecrawl crawler."""

    api_key: str = ""
    mode: str = "scrape"
    params: Dict[str, Any] = {}
    timeout: Optional[int] = None

    model_config = ConfigDict(env_prefix="FIRECRAWL_")

class ExaCrawlerConfig(BaseCrawlerConfig):
    api_key: str = ""

    model_config = ConfigDict(env_prefix="EXA_")

class Crawl4aiConfig(BaseCrawlerConfig):
    """Configuration for the Crawl4aiCrawler."""

    crawl_mode: Literal["simple", "deep"] = "simple"
    extraction_strategy: Optional["ExtractionStrategy"] = None
    markdown_strategy: Optional["MarkdownGenerationStrategy"] = None
    deep_crawl_strategy: Optional["DeepCrawlStrategy"] = None
    scraping_strategy: Optional["ContentScrapingStrategy"] = None
    browser_config: Optional["BrowserConfig"] = None
    run_config: Optional["CrawlerRunConfig"] = None

    _refs_resolved: bool = False

    def __init_subclass__(cls, **kwargs: Any) -> None:
        """Resolve forward references when class is first subclassed or instantiated."""
        super().__init_subclass__(**kwargs)
        cls._resolve_forward_refs()

    @classmethod
    def _resolve_forward_refs(cls) -> None:
        """Resolve forward references only when needed."""
        if not cls._refs_resolved:
            try:
                from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
                from crawl4ai.content_scraping_strategy import ContentScrapingStrategy
                from crawl4ai.deep_crawling import DeepCrawlStrategy
                from crawl4ai.extraction_strategy import ExtractionStrategy
                from crawl4ai.markdown_generation_strategy import (
                    MarkdownGenerationStrategy,
                )

                # Create namespace for update_forward_refs
                namespace = {
                    "BrowserConfig": BrowserConfig,
                    "CrawlerRunConfig": CrawlerRunConfig,
                    "ContentScrapingStrategy": ContentScrapingStrategy,
                    "DeepCrawlStrategy": DeepCrawlStrategy,
                    "ExtractionStrategy": ExtractionStrategy,
                    "MarkdownGenerationStrategy": MarkdownGenerationStrategy,
                }

                cls.update_forward_refs(**namespace)
                cls._refs_resolved = True
            except ImportError:
                # If crawl4ai is not installed, leave forward refs as strings
                pass

    def __init__(self, **kwargs: Any) -> None:
        """Initialize and ensure forward refs are resolved."""
        self._resolve_forward_refs()
        super().__init__(**kwargs)

    model_config = ConfigDict(arbitrary_types_allowed=True)

class BaseCrawler(ABC):
    """Abstract base class for web crawlers."""

    def __init__(self, config: BaseCrawlerConfig):
        """Initialize the base crawler.

        Args:
            config: Configuration for the crawler
        """
        self.parser = config.parser if self.needs_parser else None
        self.config: BaseCrawlerConfig = config

    @property
    @abstractmethod
    def needs_parser(self) -> bool:
        """Indicates whether the crawler requires a parser."""
        pass

    @abstractmethod
    def crawl(self, urls: List[str]) -> List[Document]:
        pass

    def _process_document(self, url: str) -> List[Document]:
        if self.parser:
            import requests
            from requests.structures import CaseInsensitiveDict

            if self._is_document_url(url):
                try:
                    doc_parser = DocumentParser.create(url, self.parser.config)
                    new_chunks = doc_parser.get_doc_chunks()
                    if not new_chunks:
                        # If the document is empty, try to extract images
                        img_parser = ImagePdfParser(url, self.parser.config)
                        new_chunks = img_parser.get_doc_chunks()
                    return new_chunks
                except Exception as e:
                    logging.error(f"Error parsing {url}: {e}")
                    return []

            else:
                try:
                    headers = requests.head(url).headers
                except Exception as e:
                    logging.warning(f"Error getting headers for {url}: {e}")
                    headers = CaseInsensitiveDict()

                content_type = headers.get("Content-Type", "").lower()
                temp_file_suffix = None
                if "application/pdf" in content_type:
                    temp_file_suffix = ".pdf"
                elif (
                    "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
                    in content_type
                ):
                    temp_file_suffix = ".docx"
                elif "application/msword" in content_type:
                    temp_file_suffix = ".doc"

                if temp_file_suffix:
                    try:
                        response = requests.get(url)
                        with NamedTemporaryFile(
                            delete=False, suffix=temp_file_suffix
                        ) as temp_file:
                            temp_file.write(response.content)
                            temp_file_path = temp_file.name
                        doc_parser = DocumentParser.create(
                            temp_file_path, self.parser.config
                        )
                        docs = doc_parser.get_doc_chunks()
                        os.remove(temp_file_path)
                        return docs
                    except Exception as e:
                        logging.error(f"Error downloading/parsing {url}: {e}")
                        return []
        return []

    def _is_document_url(self, url: str) -> bool:
        return any(url.lower().endswith(ext) for ext in [".pdf", ".docx", ".doc"])


class CrawlerFactory:
    """Factory for creating web crawlers."""

    @staticmethod
    def create_crawler(config: BaseCrawlerConfig) -> BaseCrawler:
        """Create a crawler instance based on configuration type.

        Args:
            config: Configuration for the crawler

        Returns:
            A BaseCrawler instance

        Raises:
            ValueError: If config type is not supported
        """
        if isinstance(config, TrafilaturaConfig):
            return TrafilaturaCrawler(config)
        elif isinstance(config, FirecrawlConfig):
            return FirecrawlCrawler(config)
        elif isinstance(config, ExaCrawlerConfig):
            return ExaCrawler(config)
        elif isinstance(config, Crawl4aiConfig):
            return Crawl4aiCrawler(config)
        else:
            raise ValueError(f"Unsupported crawler configuration type: {type(config)}")


class TrafilaturaCrawler(BaseCrawler):
    """Crawler implementation using Trafilatura."""

    def __init__(self, config: TrafilaturaConfig):
        """Initialize the Trafilatura crawler.

        Args:
            config: Configuration for the crawler
        """
        super().__init__(config)
        self.config: TrafilaturaConfig = config

    @property
    def needs_parser(self) -> bool:
        return True

    def crawl(self, urls: List[str]) -> List[Document]:
        import trafilatura
        from trafilatura.downloads import (
            add_to_compressed_dict,
            buffered_downloads,
            load_download_buffer,
        )

        docs = []
        dl_dict = add_to_compressed_dict(urls)

        while not dl_dict.done:
            buffer, dl_dict = load_download_buffer(dl_dict, sleep_time=5)
            for url, result in buffered_downloads(buffer, self.config.threads):
                parsed_doc = self._process_document(url)
                if parsed_doc:
                    docs.extend(parsed_doc)
                else:
                    text = trafilatura.extract(
                        result,
                        no_fallback=False,
                        favor_recall=True,
                        include_formatting=True,
                        output_format=self.config.format,
                        with_metadata=True,  # Title, date, author... at start of text
                    )
                    if self.config.format in ["xml", "html"]:
                        # heading_style="ATX" for markdown headings, i.e. #, ##, etc.
                        text = md.markdownify(text, heading_style="ATX")
                    if text is None and result is not None and isinstance(result, str):
                        text = result
                    if text:
                        docs.append(
                            Document(content=text, metadata=DocMetaData(source=url))
                        )

        return docs


class FirecrawlCrawler(BaseCrawler):
    """Crawler implementation using Firecrawl."""

    def __init__(self, config: FirecrawlConfig) -> None:
        """Initialize the Firecrawl crawler.

        Args:
            config: Configuration for the crawler
        """
        super().__init__(config)
        self.config: FirecrawlConfig = config

    @property
    def needs_parser(self) -> bool:
        return False

    def _return_save_incremental_results(
        self, app: "FirecrawlApp", crawl_id: str, output_dir: str = "firecrawl_output"
    ) -> List[Document]:
        # Code used verbatim from firecrawl blog with few modifications
        # https://www.firecrawl.dev/blog/mastering-the-crawl-endpoint-in-firecrawl
        import json
        import time
        from pathlib import Path

        from tqdm import tqdm

        pbar = tqdm(desc="Pages saved", unit=" pages", dynamic_ncols=True)
        Path(output_dir).mkdir(parents=True, exist_ok=True)
        processed_urls: set[str] = set()
        docs = []

        while True:
            # Check current status
            status = app.check_crawl_status(crawl_id)
            new_pages = 0

            # Save new pages
            for page in status["data"]:
                url = page["metadata"]["url"]
                if url not in processed_urls:
                    content = page.get("markdown", "")
                    filename = f"{output_dir}/{len(processed_urls)}.md"
                    with open(filename, "w") as f:
                        f.write(content)
                    docs.append(
                        Document(
                            content=content,
                            metadata=DocMetaData(
                                source=url,
                                title=page["metadata"].get("title", "Unknown Title"),
                            ),
                        )
                    )
                    processed_urls.add(url)
                    new_pages += 1
            pbar.model_copy(update=new_pages)  # Update progress bar with new pages

            # Break if crawl is complete
            if status["status"] == "completed":
                print(f"Saved {len(processed_urls)} pages.")
                with open(f"{output_dir}/full_results.json", "w") as f:
                    json.dump(status, f, indent=2)
                break

            time.sleep(5)  # Wait before checking again
        return docs

    def crawl(self, urls: List[str]) -> List[Document]:
        try:
            from firecrawl import FirecrawlApp
        except ImportError:
            raise LangroidImportError("firecrawl", "firecrawl")

        app = FirecrawlApp(api_key=self.config.api_key)
        docs = []
        params = self.config.params.model_copy()  # Create a copy of the existing params

        if self.config.timeout is not None:
            params["timeout"] = self.config.timeout  # Add/override timeout in params

        if self.config.mode == "scrape":
            for url in urls:
                try:
                    result = app.scrape_url(url, params=params)
                    metadata = result.get(
                        "metadata", {}
                    )  # Default to empty dict if missing
                    status_code = metadata.get("statusCode")

                    if status_code == 200:
                        docs.append(
                            Document(
                                content=result["markdown"],
                                metadata=DocMetaData(
                                    source=url,
                                    title=metadata.get("title", "Unknown Title"),
                                ),
                            )
                        )
                except Exception as e:
                    logging.warning(
                        f"Firecrawl encountered an error for {url}: {e}. "
                        "Skipping but continuing."
                    )
        elif self.config.mode == "crawl":
            if not isinstance(urls, list) or len(urls) != 1:
                raise ValueError(
                    "Crawl mode expects 'urls' to be a list containing a single URL."
                )

            # Start the crawl
            crawl_status = app.async_crawl_url(url=urls[0], params=params)

            # Save results incrementally
            docs = self._return_save_incremental_results(app, crawl_status["id"])
        return docs


class ExaCrawler(BaseCrawler):
    """Crawler implementation using Exa API."""

    def __init__(self, config: ExaCrawlerConfig) -> None:
        """Initialize the Exa crawler.

        Args:
            config: Configuration for the crawler
        """
        super().__init__(config)
        self.config: ExaCrawlerConfig = config

    @property
    def needs_parser(self) -> bool:
        return True

    def crawl(self, urls: List[str]) -> List[Document]:
        """Crawl the given URLs using Exa SDK.

        Args:
            urls: List of URLs to crawl

        Returns:
            List of Documents with content extracted from the URLs

        Raises:
            LangroidImportError: If the exa package is not installed
            ValueError: If the Exa API key is not set
        """
        try:
            from exa_py import Exa
        except ImportError:
            raise LangroidImportError("exa", "exa")

        if not self.config.api_key:
            raise ValueError("EXA_API_KEY key is required in your env or .env")

        exa = Exa(self.config.api_key)
        docs = []

        try:
            for url in urls:
                parsed_doc_chunks = self._process_document(url)
                if parsed_doc_chunks:
                    docs.extend(parsed_doc_chunks)
                    continue
                else:
                    results = exa.get_contents(
                        [url],
                        livecrawl="always",
                        text={
                            "include_html_tags": True,
                        },
                    )
                    result = results.results[0]
                    if result.text:
                        md_text = md.markdownify(result.text, heading_style="ATX")
                        # append a NON-chunked document
                        # (metadata.is_chunk = False, so will be chunked downstream)
                        docs.append(
                            Document(
                                content=md_text,
                                metadata=DocMetaData(
                                    source=url,
                                    title=getattr(result, "title", "Unknown Title"),
                                    published_date=getattr(
                                        result, "published_date", "Unknown Date"
                                    ),
                                ),
                            )
                        )

        except Exception as e:
            logging.error(f"Error retrieving content from Exa API: {e}")

        return docs


class Crawl4aiCrawler(BaseCrawler):
    """
    Crawler implementation using the crawl4ai library.

    This crawler intelligently dispatches URLs. Standard web pages are rendered
    and scraped using the crawl4ai browser engine. Direct links to documents
    (PDF, DOCX, etc.) are delegated to the framework's internal DocumentParser.
    """

    def __init__(self, config: Crawl4aiConfig) -> None:
        """Initialize the Crawl4ai crawler."""
        super().__init__(config)
        self.config: Crawl4aiConfig = config

    @property
    def needs_parser(self) -> bool:
        """
        Indicates that this crawler relies on the framework's DocumentParser
        for handling specific file types like PDF, DOCX, etc., which
        the browser engine cannot parse directly.
        """
        return True

    def crawl(self, urls: List[str]) -> List[Document]:
        """
        Executes the crawl by separating document URLs from web page URLs.

        - Document URLs (.pdf, .docx, etc.) are processed using `_process_document`.
        - Web page URLs are handled using the async crawl4ai engine.
        """
        all_documents: List[Document] = []
        webpage_urls: List[str] = []

        # Step 1: Separate URLs into documents and web pages
        for url in urls:
            parsed_doc_chunks = self._process_document(url)
            if parsed_doc_chunks:
                all_documents.extend(parsed_doc_chunks)
            else:
                webpage_urls.append(url)

        # Step 2: Process web page URLs asynchronously
        if webpage_urls:
            try:
                loop = asyncio.get_running_loop()
                if loop.is_running():
                    import nest_asyncio

                    nest_asyncio.apply()
                web_docs = asyncio.run(self._async_crawl(webpage_urls))
            except RuntimeError:
                web_docs = asyncio.run(self._async_crawl(webpage_urls))

            all_documents.extend(web_docs)

        return all_documents

    def _translate_result_to_document(
        self, result: "CrawlResult"
    ) -> Optional[Document]:
        """Converts a crawl4ai CrawlResult into the framework's Document format."""
        if not result.success:
            logging.warning(
                f"Crawl4ai failed for URL {result.url}: {result.error_message}"
            )
            return None

        content = ""
        if result.extracted_content:
            content = result.extracted_content
        elif result.markdown:
            if (
                hasattr(result.markdown, "fit_markdown")
                and result.markdown.fit_markdown
            ):
                content = result.markdown.fit_markdown
            elif hasattr(result.markdown, "raw_markdown"):
                content = result.markdown.raw_markdown
            else:
                content = str(result.markdown)

        if not content:
            logging.warning(f"Crawl4ai returned no content for URL {result.url}")
            return None

        # Extract metadata safely
        title = "Unknown Title"
        published_date = "Unknown Date"

        if result.metadata:
            title = result.metadata.get("title", "Unknown Title")
            # Try common date field names
            for date_field in [
                "published_date",
                "datePublished",
                "article:published_time",
                "pubdate",
            ]:
                if date_field in result.metadata:
                    published_date = result.metadata.get(date_field)
                    break

        meta = DocMetaData(
            source=result.url,
            title=title,
            published_date=published_date,
            # Note: source_content is meant for reference content, not metadata
            # Keeping it minimal as other crawlers don't populate it
        )
        return Document(content=content, metadata=meta)

    async def _async_crawl(self, urls: List[str]) -> List[Document]:
        try:
            from crawl4ai import AsyncWebCrawler

            # Import configs here for lazy loading
            from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
        except ImportError:
            raise LangroidImportError(
                "crawl4ai", "pip install 'crawl4ai[all]' or 'crawl4ai'"
            )

        # CHANGE 2: Handle the new optional config fields.
        # Use the user-provided config if it exists, otherwise create a default one.
        browser_config = self.config.browser_config or BrowserConfig()
        run_config = self.config.run_config or CrawlerRunConfig()

        if self.config.extraction_strategy:
            run_config.extraction_strategy = self.config.extraction_strategy
        if self.config.markdown_strategy:
            run_config.markdown_generator = self.config.markdown_strategy
        if self.config.deep_crawl_strategy:
            run_config.deep_crawl_strategy = self.config.deep_crawl_strategy
        if self.config.scraping_strategy:
            run_config.scraping_strategy = self.config.scraping_strategy

        crawled_documents: List[Document] = []

        async with AsyncWebCrawler(config=browser_config) as crawler:
            if self.config.crawl_mode == "simple":
                for url in urls:
                    result = await crawler.arun(url, config=run_config)
                    doc = self._translate_result_to_document(result)
                    if doc:
                        crawled_documents.append(doc)

            elif self.config.crawl_mode == "deep":
                if not urls:
                    return []
                if not run_config.deep_crawl_strategy:
                    logging.warning(
                        "Deep crawl mode requires a deep_crawl_strategy in the config."
                    )
                    return []

                # In deep crawl mode, `crawl4ai` will discover and crawl pages
                # starting from the seed URL. It will not process direct document links
                # found during the deep crawl; it is designed to follow hyperlinks.
                crawl_results = await crawler.arun(urls[0], config=run_config)

                if isinstance(crawl_results, list):
                    for result in crawl_results:
                        doc = self._translate_result_to_document(result)
                        if doc:
                            crawled_documents.append(doc)
                else:
                    async for result in crawl_results:
                        doc = self._translate_result_to_document(result)
                        if doc:
                            crawled_documents.append(doc)

        return crawled_documents


class URLLoader:
    """Loads URLs and extracts text using a specified crawler."""

    def __init__(
        self,
        urls: List[Any],
        parsing_config: ParsingConfig = ParsingConfig(),
        crawler_config: Optional[BaseCrawlerConfig] = None,
    ):
        """Initialize the URL loader.

        Args:
            urls: List of URLs to load
            parsing_config: Configuration for parsing
            crawler_config: Configuration for the crawler
        """
        self.urls = urls
        self.parsing_config = parsing_config

        if crawler_config is None:
            crawler_config = TrafilaturaConfig(parser=Parser(parsing_config))

        self.crawler = CrawlerFactory.create_crawler(crawler_config)
        if self.crawler.needs_parser:
            self.crawler.parser = Parser(parsing_config)

    def load(self) -> List[Document]:
        """Load the URLs using the specified crawler."""
        return self.crawler.crawl(self.urls)