langroid 0.56.19__py3-none-any.whl → 0.58.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langroid/agent/chat_document.py +67 -19
- langroid/agent/task.py +96 -1
- langroid/parsing/url_loader.py +234 -1
- langroid/utils/html_logger.py +825 -0
- {langroid-0.56.19.dist-info → langroid-0.58.0.dist-info}/METADATA +6 -1
- {langroid-0.56.19.dist-info → langroid-0.58.0.dist-info}/RECORD +8 -7
- {langroid-0.56.19.dist-info → langroid-0.58.0.dist-info}/WHEEL +0 -0
- {langroid-0.56.19.dist-info → langroid-0.58.0.dist-info}/licenses/LICENSE +0 -0
langroid/agent/chat_document.py
CHANGED
@@ -217,25 +217,29 @@ class ChatDocument(Document):
         """
         tool_type = ""  # FUNC or TOOL
         tool = ""  # tool name or function name
-        [... 19 removed lines; their old content is not shown in this view ...]
+
+        # Skip tool detection for system messages - they contain tool instructions,
+        # not actual tool calls
+        if self.metadata.sender != Entity.SYSTEM:
+            oai_tools = (
+                []
+                if self.oai_tool_calls is None
+                else [t for t in self.oai_tool_calls if t.function is not None]
+            )
+            if self.function_call is not None:
+                tool_type = "FUNC"
+                tool = self.function_call.name
+            elif len(oai_tools) > 0:
+                tool_type = "OAI_TOOL"
+                tool = ",".join(t.function.name for t in oai_tools)  # type: ignore
+            else:
+                try:
+                    json_tools = self.get_tool_names()
+                except Exception:
+                    json_tools = []
+                if json_tools != []:
+                    tool_type = "TOOL"
+                    tool = json_tools[0]
         recipient = self.metadata.recipient
         content = self.content
         sender_entity = self.metadata.sender
@@ -340,6 +344,50 @@ class ChatDocument(Document):
             ),
         )
 
+    @staticmethod
+    def from_LLMMessage(
+        message: LLMMessage,
+        sender_name: str = "",
+        recipient: str = "",
+    ) -> "ChatDocument":
+        """
+        Convert LLMMessage to ChatDocument.
+
+        Args:
+            message (LLMMessage): LLMMessage to convert.
+            sender_name (str): Name of the sender. Defaults to "".
+            recipient (str): Name of the recipient. Defaults to "".
+
+        Returns:
+            ChatDocument: ChatDocument representation of this LLMMessage.
+        """
+        # Map LLMMessage Role to ChatDocument Entity
+        role_to_entity = {
+            Role.USER: Entity.USER,
+            Role.SYSTEM: Entity.SYSTEM,
+            Role.ASSISTANT: Entity.LLM,
+            Role.FUNCTION: Entity.LLM,
+            Role.TOOL: Entity.LLM,
+        }
+
+        sender_entity = role_to_entity.get(message.role, Entity.USER)
+
+        return ChatDocument(
+            content=message.content or "",
+            content_any=message.content,
+            files=message.files,
+            function_call=message.function_call,
+            oai_tool_calls=message.tool_calls,
+            metadata=ChatDocMetaData(
+                source=sender_entity,
+                sender=sender_entity,
+                sender_name=sender_name,
+                recipient=recipient,
+                oai_tool_id=message.tool_call_id,
+                tool_ids=[message.tool_id] if message.tool_id else [],
+            ),
+        )
+
     @staticmethod
     def to_LLMMessage(
         message: Union[str, "ChatDocument"],
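
A minimal sketch of how the new from_LLMMessage helper can be used (the LLMMessage/Role import path shown is the usual langroid location; the message built here is purely illustrative):

    from langroid.agent.chat_document import ChatDocument
    from langroid.language_models.base import LLMMessage, Role

    # Build a plain LLM message and convert it to a ChatDocument,
    # e.g. to log a system message the same way as other messages.
    sys_msg = LLMMessage(role=Role.SYSTEM, content="You are a helpful assistant.")
    chat_doc = ChatDocument.from_LLMMessage(sys_msg, sender_name="root")
    # Role.SYSTEM maps to Entity.SYSTEM via the role_to_entity table above
    print(chat_doc.metadata.sender, chat_doc.content)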
langroid/agent/task.py
CHANGED
@@ -55,6 +55,7 @@ from langroid.utils.constants import (
     SEND_TO,
     USER_QUIT_STRINGS,
 )
+from langroid.utils.html_logger import HTMLLogger
 from langroid.utils.logging import RichFileLogger, setup_file_logger
 from langroid.utils.object_registry import scheduled_cleanup
 from langroid.utils.system import hash
@@ -154,6 +155,7 @@ class TaskConfig(BaseModel):
     restart_as_subtask: bool = False
     logs_dir: str = "logs"
     enable_loggers: bool = True
+    enable_html_logging: bool = True
     addressing_prefix: str = ""
     allow_subtask_multi_oai_tools: bool = True
     recognize_string_signals: bool = True
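
The new flag sits next to the existing logging options on TaskConfig; a small configuration sketch using only the fields shown in this hunk:

    from langroid.agent.task import Task, TaskConfig

    config = TaskConfig(
        logs_dir="logs",           # existing: directory where log files are written
        enable_loggers=True,       # existing: TSV / plain-text loggers
        enable_html_logging=True,  # new in 0.58.0: also write an HTML log
    )
    # task = Task(agent, config=config)  # agent construction elided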
@@ -343,6 +345,7 @@ class Task:
         self.session_id: str = ""
         self.logger: None | RichFileLogger = None
         self.tsv_logger: None | logging.Logger = None
+        self.html_logger: Optional[HTMLLogger] = None
         self.color_log: bool = False if settings.notebook else True
 
         self.n_stalled_steps = 0  # how many consecutive steps with no progress?
@@ -637,7 +640,20 @@ class Task:
 
         self._show_pending_message_if_debug()
         self.init_loggers()
-        [... 1 removed line; its old content is not shown in this view ...]
+        # Log system message if it exists
+        if (
+            hasattr(self.agent, "_create_system_and_tools_message")
+            and hasattr(self.agent, "system_message")
+            and self.agent.system_message
+        ):
+            system_msg = self.agent._create_system_and_tools_message()
+            system_message_chat_doc = ChatDocument.from_LLMMessage(
+                system_msg,
+                sender_name=self.name or "system",
+            )
+            # log the system message
+            self.log_message(Entity.SYSTEM, system_message_chat_doc, mark=True)
+        self.log_message(Entity.USER, self.pending_message, mark=True)
         return self.pending_message
 
     def init_loggers(self) -> None:
@@ -667,6 +683,34 @@ class Task:
             header = ChatDocLoggerFields().tsv_header()
             self.tsv_logger.info(f" \tTask\tResponder\t{header}")
 
+        # HTML logger
+        if self.config.enable_html_logging:
+            if (
+                self.caller is not None
+                and hasattr(self.caller, "html_logger")
+                and self.caller.html_logger is not None
+            ):
+                self.html_logger = self.caller.html_logger
+            elif not hasattr(self, "html_logger") or self.html_logger is None:
+                from langroid.utils.html_logger import HTMLLogger
+
+                model_info = ""
+                if (
+                    hasattr(self, "agent")
+                    and hasattr(self.agent, "config")
+                    and hasattr(self.agent.config, "llm")
+                ):
+                    model_info = getattr(self.agent.config.llm, "chat_model", "")
+                self.html_logger = HTMLLogger(
+                    filename=self.name,
+                    log_dir=self.config.logs_dir,
+                    model_info=model_info,
+                    append=False,
+                )
+                # Log clickable file:// link to the HTML log
+                html_log_path = self.html_logger.file_path.resolve()
+                logger.warning(f"📊 HTML Log: file://{html_log_path}")
+
     def reset_all_sub_tasks(self) -> None:
         """
         Recursively reset message history & state of own agent and
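
The HTMLLogger surface used above (the constructor arguments, file_path, and the log()/close() calls that appear later in this diff) suggests the following rough stand-alone sketch; the comments are assumptions, not documented behavior:

    from langroid.utils.html_logger import HTMLLogger

    html_logger = HTMLLogger(
        filename="my-task",   # presumably written under log_dir as <filename>.html
        log_dir="logs",
        model_info="gpt-4o",  # free-form string displayed with the log
        append=False,
    )
    print(f"file://{html_logger.file_path.resolve()}")  # clickable link, as in init_loggers
    # html_logger.log(<pydantic fields object>)  # one call per logged message
    html_logger.close()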
@@ -2037,6 +2081,8 @@ class Task:
             mark (bool, optional): Whether to mark the message as the final result of
                 a `task.step()` call. Defaults to False.
         """
+        from langroid.agent.chat_document import ChatDocLoggerFields
+
         default_values = ChatDocLoggerFields().dict().values()
         msg_str_tsv = "\t".join(str(v) for v in default_values)
         if msg is not None:
@@ -2077,6 +2123,48 @@ class Task:
             resp_str = str(resp)
             self.tsv_logger.info(f"{mark_str}\t{task_name}\t{resp_str}\t{msg_str_tsv}")
 
+        # HTML logger
+        if self.html_logger is not None:
+            if msg is None:
+                # Create a minimal fields object for None messages
+                from langroid.agent.chat_document import ChatDocLoggerFields
+
+                fields_dict = {
+                    "responder": str(resp),
+                    "mark": "*" if mark else "",
+                    "task_name": self.name or "root",
+                    "content": "",
+                    "sender_entity": str(resp),
+                    "sender_name": "",
+                    "recipient": "",
+                    "block": None,
+                    "tool_type": "",
+                    "tool": "",
+                }
+            else:
+                # Get fields from the message
+                fields = msg.log_fields()
+                fields_dict = fields.dict()
+                fields_dict.update(
+                    {
+                        "responder": str(resp),
+                        "mark": "*" if mark else "",
+                        "task_name": self.name or "root",
+                    }
+                )
+
+            # Create a ChatDocLoggerFields-like object for the HTML logger
+            # Create a simple BaseModel subclass dynamically
+            from langroid.pydantic_v1 import BaseModel
+
+            class LogFields(BaseModel):
+                class Config:
+                    extra = "allow"  # Allow extra fields
+
+            # Create instance with the fields from fields_dict
+            log_obj = LogFields(**fields_dict)
+            self.html_logger.log(log_obj)
+
     def _valid_recipient(self, recipient: str) -> bool:
         """
         Is the recipient among the list of responders?
@@ -2335,6 +2423,13 @@ class Task:
         # Check if we matched the entire sequence
         return seq_idx == len(sequence.events)
 
+    def close_loggers(self) -> None:
+        """Close all loggers to ensure clean shutdown."""
+        if hasattr(self, "logger") and self.logger is not None:
+            self.logger.close()
+        if hasattr(self, "html_logger") and self.html_logger is not None:
+            self.html_logger.close()
+
     def _matches_sequence_with_current(
         self,
         msg_chain: List[ChatDocument],
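
Taken together, a sketch of the intended lifecycle: HTML logging is on by default (enable_html_logging=True), and the new close_loggers() gives an explicit shutdown hook. The agent and task setup below is the standard langroid API and assumes an LLM is configured (e.g. an OpenAI key in the environment):

    from langroid.agent.chat_agent import ChatAgent, ChatAgentConfig
    from langroid.agent.task import Task

    agent = ChatAgent(ChatAgentConfig(name="assistant"))
    task = Task(agent, interactive=False)
    try:
        result = task.run("What is 2 + 2?", turns=2)
        # messages also go to the HTML log under logs/ (named after the task, presumably)
    finally:
        # New in 0.58.0: closes the Rich file logger and the HTML logger.
        task.close_loggers()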
langroid/parsing/url_loader.py
CHANGED
@@ -1,8 +1,9 @@
+import asyncio
 import logging
 import os
 from abc import ABC, abstractmethod
 from tempfile import NamedTemporaryFile
-from typing import TYPE_CHECKING, Any, Dict, List, Optional
+from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional
 
 import markdownify as md
 from dotenv import load_dotenv
@@ -16,6 +17,16 @@ from langroid.pydantic_v1 import BaseSettings
 if TYPE_CHECKING:
     from firecrawl import FirecrawlApp
 
+try:
+    from crawl4ai import CrawlResult
+    from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+    from crawl4ai.content_scraping_strategy import ContentScrapingStrategy
+    from crawl4ai.deep_crawling import DeepCrawlStrategy
+    from crawl4ai.extraction_strategy import ExtractionStrategy
+    from crawl4ai.markdown_generation_strategy import MarkdownGenerationStrategy
+except ImportError:
+    raise LangroidImportError("crawl4ai", "crawl-4-ai")
+
 load_dotenv()
 
 logging.getLogger("url_loader").setLevel(logging.WARNING)
@@ -59,6 +70,52 @@ class ExaCrawlerConfig(BaseCrawlerConfig):
         env_prefix = "EXA_"
 
 
+def _resolve_crawl4ai_forward_refs(cls: Any) -> Any:
+    """
+    A class decorator that resolves forward references for fields in a Pydantic
+    model that depend on the optional 'crawl4ai' library.
+    """
+    try:
+        from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig  # noqa: F401
+        from crawl4ai.content_scraping_strategy import (  # noqa: F401
+            ContentScrapingStrategy,
+        )
+        from crawl4ai.deep_crawling import DeepCrawlStrategy  # noqa: F401
+        from crawl4ai.extraction_strategy import ExtractionStrategy  # noqa: F401
+        from crawl4ai.markdown_generation_strategy import (  # noqa: F401
+            MarkdownGenerationStrategy,
+        )
+
+        # Create a namespace dictionary from locals() but exclude 'cls'.
+        # This prevents the TypeError.
+        namespace = {name: value for name, value in locals().items() if name != "cls"}
+        cls.update_forward_refs(**namespace)
+
+    except ImportError:
+        # If crawl4ai is not installed, do nothing.
+        pass
+    return cls
+
+
+@_resolve_crawl4ai_forward_refs
+class Crawl4aiConfig(BaseCrawlerConfig):
+    """
+    Configuration for the Crawl4aiCrawler.
+    """
+
+    crawl_mode: Literal["simple", "deep"] = "simple"
+    extraction_strategy: Optional["ExtractionStrategy"] = None
+    markdown_strategy: Optional["MarkdownGenerationStrategy"] = None
+    deep_crawl_strategy: Optional["DeepCrawlStrategy"] = None
+    scraping_strategy: Optional["ContentScrapingStrategy"] = None
+
+    browser_config: Optional["BrowserConfig"] = None
+    run_config: Optional["CrawlerRunConfig"] = None
+
+    class Config:
+        arbitrary_types_allowed = True
+
+
 class BaseCrawler(ABC):
     """Abstract base class for web crawlers."""
 
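
A minimal sketch of constructing the new config (only classes that appear in this diff are used; every field has a default, so Crawl4aiConfig() alone also works):

    from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
    from langroid.parsing.url_loader import Crawl4aiConfig

    crawl_config = Crawl4aiConfig(
        crawl_mode="simple",             # "deep" additionally needs a deep_crawl_strategy
        browser_config=BrowserConfig(),  # optional; a default BrowserConfig is used if omitted
        run_config=CrawlerRunConfig(),   # optional; the strategy fields above override its settings
    )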
@@ -163,6 +220,8 @@ class CrawlerFactory:
             return FirecrawlCrawler(config)
         elif isinstance(config, ExaCrawlerConfig):
             return ExaCrawler(config)
+        elif isinstance(config, Crawl4aiConfig):
+            return Crawl4aiCrawler(config)
         else:
             raise ValueError(f"Unsupported crawler configuration type: {type(config)}")
 
@@ -419,6 +478,180 @@ class ExaCrawler(BaseCrawler):
         return docs
 
 
+class Crawl4aiCrawler(BaseCrawler):
+    """
+    Crawler implementation using the crawl4ai library.
+
+    This crawler intelligently dispatches URLs. Standard web pages are rendered
+    and scraped using the crawl4ai browser engine. Direct links to documents
+    (PDF, DOCX, etc.) are delegated to the framework's internal DocumentParser.
+    """
+
+    def __init__(self, config: Crawl4aiConfig) -> None:
+        """Initialize the Crawl4ai crawler."""
+        super().__init__(config)
+        self.config: Crawl4aiConfig = config
+
+    @property
+    def needs_parser(self) -> bool:
+        """
+        Indicates that this crawler relies on the framework's DocumentParser
+        for handling specific file types like PDF, DOCX, etc., which
+        the browser engine cannot parse directly.
+        """
+        return True
+
+    def crawl(self, urls: List[str]) -> List[Document]:
+        """
+        Executes the crawl by separating document URLs from web page URLs.
+
+        - Document URLs (.pdf, .docx, etc.) are processed using `_process_document`.
+        - Web page URLs are handled using the async crawl4ai engine.
+        """
+        all_documents: List[Document] = []
+        webpage_urls: List[str] = []
+
+        # Step 1: Separate URLs into documents and web pages
+        for url in urls:
+            parsed_doc_chunks = self._process_document(url)
+            if parsed_doc_chunks:
+                all_documents.extend(parsed_doc_chunks)
+            else:
+                webpage_urls.append(url)
+
+        # Step 2: Process web page URLs asynchronously
+        if webpage_urls:
+            try:
+                loop = asyncio.get_running_loop()
+                if loop.is_running():
+                    import nest_asyncio
+
+                    nest_asyncio.apply()
+                web_docs = asyncio.run(self._async_crawl(webpage_urls))
+            except RuntimeError:
+                web_docs = asyncio.run(self._async_crawl(webpage_urls))
+
+            all_documents.extend(web_docs)
+
+        return all_documents
+
+    def _translate_result_to_document(
+        self, result: "CrawlResult"
+    ) -> Optional[Document]:
+        """Converts a crawl4ai CrawlResult into the framework's Document format."""
+        if not result.success:
+            logging.warning(
+                f"Crawl4ai failed for URL {result.url}: {result.error_message}"
+            )
+            return None
+
+        content = ""
+        if result.extracted_content:
+            content = result.extracted_content
+        elif result.markdown:
+            if (
+                hasattr(result.markdown, "fit_markdown")
+                and result.markdown.fit_markdown
+            ):
+                content = result.markdown.fit_markdown
+            elif hasattr(result.markdown, "raw_markdown"):
+                content = result.markdown.raw_markdown
+            else:
+                content = str(result.markdown)
+
+        if not content:
+            logging.warning(f"Crawl4ai returned no content for URL {result.url}")
+            return None
+
+        # Extract metadata safely
+        title = "Unknown Title"
+        published_date = "Unknown Date"
+
+        if result.metadata:
+            title = result.metadata.get("title", "Unknown Title")
+            # Try common date field names
+            for date_field in [
+                "published_date",
+                "datePublished",
+                "article:published_time",
+                "pubdate",
+            ]:
+                if date_field in result.metadata:
+                    published_date = result.metadata.get(date_field)
+                    break
+
+        meta = DocMetaData(
+            source=result.url,
+            title=title,
+            published_date=published_date,
+            # Note: source_content is meant for reference content, not metadata
+            # Keeping it minimal as other crawlers don't populate it
+        )
+        return Document(content=content, metadata=meta)
+
+    async def _async_crawl(self, urls: List[str]) -> List[Document]:
+        try:
+            from crawl4ai import AsyncWebCrawler
+
+            # Import configs here for lazy loading
+            from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+        except ImportError:
+            raise LangroidImportError(
+                "crawl4ai", "pip install 'crawl4ai[all]' or 'crawl4ai'"
+            )
+
+        # CHANGE 2: Handle the new optional config fields.
+        # Use the user-provided config if it exists, otherwise create a default one.
+        browser_config = self.config.browser_config or BrowserConfig()
+        run_config = self.config.run_config or CrawlerRunConfig()
+
+        if self.config.extraction_strategy:
+            run_config.extraction_strategy = self.config.extraction_strategy
+        if self.config.markdown_strategy:
+            run_config.markdown_generator = self.config.markdown_strategy
+        if self.config.deep_crawl_strategy:
+            run_config.deep_crawl_strategy = self.config.deep_crawl_strategy
+        if self.config.scraping_strategy:
+            run_config.scraping_strategy = self.config.scraping_strategy
+
+        crawled_documents: List[Document] = []
+
+        async with AsyncWebCrawler(config=browser_config) as crawler:
+            if self.config.crawl_mode == "simple":
+                for url in urls:
+                    result = await crawler.arun(url, config=run_config)
+                    doc = self._translate_result_to_document(result)
+                    if doc:
+                        crawled_documents.append(doc)
+
+            elif self.config.crawl_mode == "deep":
+                if not urls:
+                    return []
+                if not run_config.deep_crawl_strategy:
+                    logging.warning(
+                        "Deep crawl mode requires a deep_crawl_strategy in the config."
+                    )
+                    return []
+
+                # In deep crawl mode, `crawl4ai` will discover and crawl pages
+                # starting from the seed URL. It will not process direct document links
+                # found during the deep crawl; it is designed to follow hyperlinks.
+                crawl_results = await crawler.arun(urls[0], config=run_config)
+
+                if isinstance(crawl_results, list):
+                    for result in crawl_results:
+                        doc = self._translate_result_to_document(result)
+                        if doc:
+                            crawled_documents.append(doc)
+                else:
+                    async for result in crawl_results:
+                        doc = self._translate_result_to_document(result)
+                        if doc:
+                            crawled_documents.append(doc)
+
+        return crawled_documents
+
+
 class URLLoader:
     """Loads URLs and extracts text using a specified crawler."""
 
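
Putting the pieces together, a sketch of loading pages through the new crawler. The URLLoader constructor is assumed here to take the crawler config as crawler_config (mirroring how the other crawler configs in this module are dispatched by CrawlerFactory), and load() is assumed to return a list of Document objects:

    from langroid.parsing.url_loader import Crawl4aiConfig, URLLoader

    loader = URLLoader(
        urls=["https://example.com"],
        crawler_config=Crawl4aiConfig(crawl_mode="simple"),
    )
    docs = loader.load()  # web pages via crawl4ai; PDF/DOCX links go through DocumentParser
    for doc in docs:
        print(doc.metadata.source, len(doc.content))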