langroid 0.58.2__py3-none-any.whl → 0.59.0b1__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (106)
  1. langroid/agent/base.py +39 -17
  2. langroid/agent/base.py-e +2216 -0
  3. langroid/agent/callbacks/chainlit.py +2 -1
  4. langroid/agent/chat_agent.py +73 -55
  5. langroid/agent/chat_agent.py-e +2086 -0
  6. langroid/agent/chat_document.py +7 -7
  7. langroid/agent/chat_document.py-e +513 -0
  8. langroid/agent/openai_assistant.py +9 -9
  9. langroid/agent/openai_assistant.py-e +882 -0
  10. langroid/agent/special/arangodb/arangodb_agent.py +10 -18
  11. langroid/agent/special/arangodb/arangodb_agent.py-e +648 -0
  12. langroid/agent/special/arangodb/tools.py +3 -3
  13. langroid/agent/special/doc_chat_agent.py +16 -14
  14. langroid/agent/special/lance_rag/critic_agent.py +2 -2
  15. langroid/agent/special/lance_rag/query_planner_agent.py +4 -4
  16. langroid/agent/special/lance_tools.py +6 -5
  17. langroid/agent/special/lance_tools.py-e +61 -0
  18. langroid/agent/special/neo4j/neo4j_chat_agent.py +3 -7
  19. langroid/agent/special/neo4j/neo4j_chat_agent.py-e +430 -0
  20. langroid/agent/special/relevance_extractor_agent.py +1 -1
  21. langroid/agent/special/sql/sql_chat_agent.py +11 -3
  22. langroid/agent/task.py +9 -87
  23. langroid/agent/task.py-e +2418 -0
  24. langroid/agent/tool_message.py +33 -17
  25. langroid/agent/tool_message.py-e +400 -0
  26. langroid/agent/tools/file_tools.py +4 -2
  27. langroid/agent/tools/file_tools.py-e +234 -0
  28. langroid/agent/tools/mcp/fastmcp_client.py +19 -6
  29. langroid/agent/tools/mcp/fastmcp_client.py-e +584 -0
  30. langroid/agent/tools/orchestration.py +22 -17
  31. langroid/agent/tools/orchestration.py-e +301 -0
  32. langroid/agent/tools/recipient_tool.py +3 -3
  33. langroid/agent/tools/task_tool.py +22 -16
  34. langroid/agent/tools/task_tool.py-e +249 -0
  35. langroid/agent/xml_tool_message.py +90 -35
  36. langroid/agent/xml_tool_message.py-e +392 -0
  37. langroid/cachedb/base.py +1 -1
  38. langroid/embedding_models/base.py +2 -2
  39. langroid/embedding_models/models.py +3 -7
  40. langroid/embedding_models/models.py-e +563 -0
  41. langroid/exceptions.py +4 -1
  42. langroid/language_models/azure_openai.py +2 -2
  43. langroid/language_models/azure_openai.py-e +134 -0
  44. langroid/language_models/base.py +6 -4
  45. langroid/language_models/base.py-e +812 -0
  46. langroid/language_models/client_cache.py +64 -0
  47. langroid/language_models/config.py +2 -4
  48. langroid/language_models/config.py-e +18 -0
  49. langroid/language_models/model_info.py +9 -1
  50. langroid/language_models/model_info.py-e +483 -0
  51. langroid/language_models/openai_gpt.py +119 -20
  52. langroid/language_models/openai_gpt.py-e +2280 -0
  53. langroid/language_models/provider_params.py +3 -22
  54. langroid/language_models/provider_params.py-e +153 -0
  55. langroid/mytypes.py +11 -4
  56. langroid/mytypes.py-e +132 -0
  57. langroid/parsing/code_parser.py +1 -1
  58. langroid/parsing/file_attachment.py +1 -1
  59. langroid/parsing/file_attachment.py-e +246 -0
  60. langroid/parsing/md_parser.py +14 -4
  61. langroid/parsing/md_parser.py-e +574 -0
  62. langroid/parsing/parser.py +22 -7
  63. langroid/parsing/parser.py-e +410 -0
  64. langroid/parsing/repo_loader.py +3 -1
  65. langroid/parsing/repo_loader.py-e +812 -0
  66. langroid/parsing/search.py +1 -1
  67. langroid/parsing/url_loader.py +17 -51
  68. langroid/parsing/url_loader.py-e +683 -0
  69. langroid/parsing/urls.py +5 -4
  70. langroid/parsing/urls.py-e +279 -0
  71. langroid/prompts/prompts_config.py +1 -1
  72. langroid/pydantic_v1/__init__.py +45 -6
  73. langroid/pydantic_v1/__init__.py-e +36 -0
  74. langroid/pydantic_v1/main.py +11 -4
  75. langroid/pydantic_v1/main.py-e +11 -0
  76. langroid/utils/configuration.py +13 -11
  77. langroid/utils/configuration.py-e +141 -0
  78. langroid/utils/constants.py +1 -1
  79. langroid/utils/constants.py-e +32 -0
  80. langroid/utils/globals.py +21 -5
  81. langroid/utils/globals.py-e +49 -0
  82. langroid/utils/html_logger.py +2 -1
  83. langroid/utils/html_logger.py-e +825 -0
  84. langroid/utils/object_registry.py +1 -1
  85. langroid/utils/object_registry.py-e +66 -0
  86. langroid/utils/pydantic_utils.py +55 -28
  87. langroid/utils/pydantic_utils.py-e +602 -0
  88. langroid/utils/types.py +2 -2
  89. langroid/utils/types.py-e +113 -0
  90. langroid/vector_store/base.py +3 -3
  91. langroid/vector_store/lancedb.py +5 -5
  92. langroid/vector_store/lancedb.py-e +404 -0
  93. langroid/vector_store/meilisearch.py +2 -2
  94. langroid/vector_store/pineconedb.py +4 -4
  95. langroid/vector_store/pineconedb.py-e +427 -0
  96. langroid/vector_store/postgres.py +1 -1
  97. langroid/vector_store/qdrantdb.py +3 -3
  98. langroid/vector_store/weaviatedb.py +1 -1
  99. {langroid-0.58.2.dist-info → langroid-0.59.0b1.dist-info}/METADATA +3 -2
  100. langroid-0.59.0b1.dist-info/RECORD +181 -0
  101. langroid/agent/special/doc_chat_task.py +0 -0
  102. langroid/mcp/__init__.py +0 -1
  103. langroid/mcp/server/__init__.py +0 -1
  104. langroid-0.58.2.dist-info/RECORD +0 -145
  105. {langroid-0.58.2.dist-info → langroid-0.59.0b1.dist-info}/WHEEL +0 -0
  106. {langroid-0.58.2.dist-info → langroid-0.59.0b1.dist-info}/licenses/LICENSE +0 -0
langroid/parsing/url_loader.py-e (new file)
@@ -0,0 +1,683 @@
+import asyncio
+import logging
+import os
+from abc import ABC, abstractmethod
+from tempfile import NamedTemporaryFile
+from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional
+
+import markdownify as md
+from dotenv import load_dotenv
+
+from langroid.exceptions import LangroidImportError
+from langroid.mytypes import DocMetaData, Document
+from langroid.parsing.document_parser import DocumentParser, ImagePdfParser
+from langroid.parsing.parser import Parser, ParsingConfig
+from pydantic_settings import BaseSettings
+from pydantic import ConfigDict
+
+if TYPE_CHECKING:
+    from firecrawl import FirecrawlApp
+
+try:
+    from crawl4ai import CrawlResult
+    from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+    from crawl4ai.content_scraping_strategy import ContentScrapingStrategy
+    from crawl4ai.deep_crawling import DeepCrawlStrategy
+    from crawl4ai.extraction_strategy import ExtractionStrategy
+    from crawl4ai.markdown_generation_strategy import MarkdownGenerationStrategy
+except ImportError:
+    raise LangroidImportError("crawl4ai", "crawl4ai")
+
+load_dotenv()
+
+logging.getLogger("url_loader").setLevel(logging.WARNING)
+
+
+# Base crawler config and specific configurations
+class BaseCrawlerConfig(BaseSettings):
+    """Base configuration for web crawlers."""
+
+    parser: Optional[Parser] = None
+
+
+class TrafilaturaConfig(BaseCrawlerConfig):
+    """Configuration for Trafilatura crawler."""
+
+    threads: int = 4
+    format: str = "markdown"  # or "xml" or "txt"
+
+
+class FirecrawlConfig(BaseCrawlerConfig):
+    """Configuration for Firecrawl crawler."""
+
+    api_key: str = ""
+    mode: str = "scrape"
+    params: Dict[str, Any] = {}
+    timeout: Optional[int] = None
+
+    model_config = ConfigDict(env_prefix="FIRECRAWL_")
+
+class ExaCrawlerConfig(BaseCrawlerConfig):
+    api_key: str = ""
+
+    model_config = ConfigDict(env_prefix="EXA_")
+
+class Crawl4aiConfig(BaseCrawlerConfig):
+    """Configuration for the Crawl4aiCrawler."""
+
+    crawl_mode: Literal["simple", "deep"] = "simple"
+    extraction_strategy: Optional["ExtractionStrategy"] = None
+    markdown_strategy: Optional["MarkdownGenerationStrategy"] = None
+    deep_crawl_strategy: Optional["DeepCrawlStrategy"] = None
+    scraping_strategy: Optional["ContentScrapingStrategy"] = None
+    browser_config: Optional["BrowserConfig"] = None
+    run_config: Optional["CrawlerRunConfig"] = None
+
+    _refs_resolved: bool = False
+
+    def __init_subclass__(cls, **kwargs: Any) -> None:
+        """Resolve forward references when class is first subclassed or instantiated."""
+        super().__init_subclass__(**kwargs)
+        cls._resolve_forward_refs()
+
+    @classmethod
+    def _resolve_forward_refs(cls) -> None:
+        """Resolve forward references only when needed."""
+        if not cls._refs_resolved:
+            try:
+                from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+                from crawl4ai.content_scraping_strategy import ContentScrapingStrategy
+                from crawl4ai.deep_crawling import DeepCrawlStrategy
+                from crawl4ai.extraction_strategy import ExtractionStrategy
+                from crawl4ai.markdown_generation_strategy import (
+                    MarkdownGenerationStrategy,
+                )
+
+                # Create namespace for update_forward_refs
+                namespace = {
+                    "BrowserConfig": BrowserConfig,
+                    "CrawlerRunConfig": CrawlerRunConfig,
+                    "ContentScrapingStrategy": ContentScrapingStrategy,
+                    "DeepCrawlStrategy": DeepCrawlStrategy,
+                    "ExtractionStrategy": ExtractionStrategy,
+                    "MarkdownGenerationStrategy": MarkdownGenerationStrategy,
+                }
+
+                cls.update_forward_refs(**namespace)
+                cls._refs_resolved = True
+            except ImportError:
+                # If crawl4ai is not installed, leave forward refs as strings
+                pass
+
+    def __init__(self, **kwargs: Any) -> None:
+        """Initialize and ensure forward refs are resolved."""
+        self._resolve_forward_refs()
+        super().__init__(**kwargs)
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+class BaseCrawler(ABC):
+    """Abstract base class for web crawlers."""
+
+    def __init__(self, config: BaseCrawlerConfig):
+        """Initialize the base crawler.
+
+        Args:
+            config: Configuration for the crawler
+        """
+        self.parser = config.parser if self.needs_parser else None
+        self.config: BaseCrawlerConfig = config
+
+    @property
+    @abstractmethod
+    def needs_parser(self) -> bool:
+        """Indicates whether the crawler requires a parser."""
+        pass
+
+    @abstractmethod
+    def crawl(self, urls: List[str]) -> List[Document]:
+        pass
+
+    def _process_document(self, url: str) -> List[Document]:
+        if self.parser:
+            import requests
+            from requests.structures import CaseInsensitiveDict
+
+            if self._is_document_url(url):
+                try:
+                    doc_parser = DocumentParser.create(url, self.parser.config)
+                    new_chunks = doc_parser.get_doc_chunks()
+                    if not new_chunks:
+                        # If the document is empty, try to extract images
+                        img_parser = ImagePdfParser(url, self.parser.config)
+                        new_chunks = img_parser.get_doc_chunks()
+                    return new_chunks
+                except Exception as e:
+                    logging.error(f"Error parsing {url}: {e}")
+                    return []
+
+            else:
+                try:
+                    headers = requests.head(url).headers
+                except Exception as e:
+                    logging.warning(f"Error getting headers for {url}: {e}")
+                    headers = CaseInsensitiveDict()
+
+                content_type = headers.get("Content-Type", "").lower()
+                temp_file_suffix = None
+                if "application/pdf" in content_type:
+                    temp_file_suffix = ".pdf"
+                elif (
+                    "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+                    in content_type
+                ):
+                    temp_file_suffix = ".docx"
+                elif "application/msword" in content_type:
+                    temp_file_suffix = ".doc"
+
+                if temp_file_suffix:
+                    try:
+                        response = requests.get(url)
+                        with NamedTemporaryFile(
+                            delete=False, suffix=temp_file_suffix
+                        ) as temp_file:
+                            temp_file.write(response.content)
+                            temp_file_path = temp_file.name
+                        doc_parser = DocumentParser.create(
+                            temp_file_path, self.parser.config
+                        )
+                        docs = doc_parser.get_doc_chunks()
+                        os.remove(temp_file_path)
+                        return docs
+                    except Exception as e:
+                        logging.error(f"Error downloading/parsing {url}: {e}")
+                        return []
+        return []
+
+    def _is_document_url(self, url: str) -> bool:
+        return any(url.lower().endswith(ext) for ext in [".pdf", ".docx", ".doc"])
+
+
+class CrawlerFactory:
+    """Factory for creating web crawlers."""
+
+    @staticmethod
+    def create_crawler(config: BaseCrawlerConfig) -> BaseCrawler:
+        """Create a crawler instance based on configuration type.
+
+        Args:
+            config: Configuration for the crawler
+
+        Returns:
+            A BaseCrawler instance
+
+        Raises:
+            ValueError: If config type is not supported
+        """
+        if isinstance(config, TrafilaturaConfig):
+            return TrafilaturaCrawler(config)
+        elif isinstance(config, FirecrawlConfig):
+            return FirecrawlCrawler(config)
+        elif isinstance(config, ExaCrawlerConfig):
+            return ExaCrawler(config)
+        elif isinstance(config, Crawl4aiConfig):
+            return Crawl4aiCrawler(config)
+        else:
+            raise ValueError(f"Unsupported crawler configuration type: {type(config)}")
+
+
+class TrafilaturaCrawler(BaseCrawler):
+    """Crawler implementation using Trafilatura."""
+
+    def __init__(self, config: TrafilaturaConfig):
+        """Initialize the Trafilatura crawler.
+
+        Args:
+            config: Configuration for the crawler
+        """
+        super().__init__(config)
+        self.config: TrafilaturaConfig = config
+
+    @property
+    def needs_parser(self) -> bool:
+        return True
+
+    def crawl(self, urls: List[str]) -> List[Document]:
+        import trafilatura
+        from trafilatura.downloads import (
+            add_to_compressed_dict,
+            buffered_downloads,
+            load_download_buffer,
+        )
+
+        docs = []
+        dl_dict = add_to_compressed_dict(urls)
+
+        while not dl_dict.done:
+            buffer, dl_dict = load_download_buffer(dl_dict, sleep_time=5)
+            for url, result in buffered_downloads(buffer, self.config.threads):
+                parsed_doc = self._process_document(url)
+                if parsed_doc:
+                    docs.extend(parsed_doc)
+                else:
+                    text = trafilatura.extract(
+                        result,
+                        no_fallback=False,
+                        favor_recall=True,
+                        include_formatting=True,
+                        output_format=self.config.format,
+                        with_metadata=True,  # Title, date, author... at start of text
+                    )
+                    if self.config.format in ["xml", "html"]:
+                        # heading_style="ATX" for markdown headings, i.e. #, ##, etc.
+                        text = md.markdownify(text, heading_style="ATX")
+                    if text is None and result is not None and isinstance(result, str):
+                        text = result
+                    if text:
+                        docs.append(
+                            Document(content=text, metadata=DocMetaData(source=url))
+                        )
+
+        return docs
+
+
+class FirecrawlCrawler(BaseCrawler):
+    """Crawler implementation using Firecrawl."""
+
+    def __init__(self, config: FirecrawlConfig) -> None:
+        """Initialize the Firecrawl crawler.
+
+        Args:
+            config: Configuration for the crawler
+        """
+        super().__init__(config)
+        self.config: FirecrawlConfig = config
+
+    @property
+    def needs_parser(self) -> bool:
+        return False
+
+    def _return_save_incremental_results(
+        self, app: "FirecrawlApp", crawl_id: str, output_dir: str = "firecrawl_output"
+    ) -> List[Document]:
+        # Code used verbatim from firecrawl blog with few modifications
+        # https://www.firecrawl.dev/blog/mastering-the-crawl-endpoint-in-firecrawl
+        import json
+        import time
+        from pathlib import Path
+
+        from tqdm import tqdm
+
+        pbar = tqdm(desc="Pages saved", unit=" pages", dynamic_ncols=True)
+        Path(output_dir).mkdir(parents=True, exist_ok=True)
+        processed_urls: set[str] = set()
+        docs = []
+
+        while True:
+            # Check current status
+            status = app.check_crawl_status(crawl_id)
+            new_pages = 0
+
+            # Save new pages
+            for page in status["data"]:
+                url = page["metadata"]["url"]
+                if url not in processed_urls:
+                    content = page.get("markdown", "")
+                    filename = f"{output_dir}/{len(processed_urls)}.md"
+                    with open(filename, "w") as f:
+                        f.write(content)
+                    docs.append(
+                        Document(
+                            content=content,
+                            metadata=DocMetaData(
+                                source=url,
+                                title=page["metadata"].get("title", "Unknown Title"),
+                            ),
+                        )
+                    )
+                    processed_urls.add(url)
+                    new_pages += 1
+            pbar.model_copy(update=new_pages)  # Update progress bar with new pages
+
+            # Break if crawl is complete
+            if status["status"] == "completed":
+                print(f"Saved {len(processed_urls)} pages.")
+                with open(f"{output_dir}/full_results.json", "w") as f:
+                    json.dump(status, f, indent=2)
+                break
+
+            time.sleep(5)  # Wait before checking again
+        return docs
+
+    def crawl(self, urls: List[str]) -> List[Document]:
+        try:
+            from firecrawl import FirecrawlApp
+        except ImportError:
+            raise LangroidImportError("firecrawl", "firecrawl")
+
+        app = FirecrawlApp(api_key=self.config.api_key)
+        docs = []
+        params = self.config.params.model_copy()  # Create a copy of the existing params
+
+        if self.config.timeout is not None:
+            params["timeout"] = self.config.timeout  # Add/override timeout in params
+
+        if self.config.mode == "scrape":
+            for url in urls:
+                try:
+                    result = app.scrape_url(url, params=params)
+                    metadata = result.get(
+                        "metadata", {}
+                    )  # Default to empty dict if missing
+                    status_code = metadata.get("statusCode")
+
+                    if status_code == 200:
+                        docs.append(
+                            Document(
+                                content=result["markdown"],
+                                metadata=DocMetaData(
+                                    source=url,
+                                    title=metadata.get("title", "Unknown Title"),
+                                ),
+                            )
+                        )
+                except Exception as e:
+                    logging.warning(
+                        f"Firecrawl encountered an error for {url}: {e}. "
+                        "Skipping but continuing."
+                    )
+        elif self.config.mode == "crawl":
+            if not isinstance(urls, list) or len(urls) != 1:
+                raise ValueError(
+                    "Crawl mode expects 'urls' to be a list containing a single URL."
+                )
+
+            # Start the crawl
+            crawl_status = app.async_crawl_url(url=urls[0], params=params)
+
+            # Save results incrementally
+            docs = self._return_save_incremental_results(app, crawl_status["id"])
+        return docs
+
+
+class ExaCrawler(BaseCrawler):
+    """Crawler implementation using Exa API."""
+
+    def __init__(self, config: ExaCrawlerConfig) -> None:
+        """Initialize the Exa crawler.
+
+        Args:
+            config: Configuration for the crawler
+        """
+        super().__init__(config)
+        self.config: ExaCrawlerConfig = config
+
+    @property
+    def needs_parser(self) -> bool:
+        return True
+
+    def crawl(self, urls: List[str]) -> List[Document]:
+        """Crawl the given URLs using Exa SDK.
+
+        Args:
+            urls: List of URLs to crawl
+
+        Returns:
+            List of Documents with content extracted from the URLs
+
+        Raises:
+            LangroidImportError: If the exa package is not installed
+            ValueError: If the Exa API key is not set
+        """
+        try:
+            from exa_py import Exa
+        except ImportError:
+            raise LangroidImportError("exa", "exa")
+
+        if not self.config.api_key:
+            raise ValueError("EXA_API_KEY key is required in your env or .env")
+
+        exa = Exa(self.config.api_key)
+        docs = []
+
+        try:
+            for url in urls:
+                parsed_doc_chunks = self._process_document(url)
+                if parsed_doc_chunks:
+                    docs.extend(parsed_doc_chunks)
+                    continue
+                else:
+                    results = exa.get_contents(
+                        [url],
+                        livecrawl="always",
+                        text={
+                            "include_html_tags": True,
+                        },
+                    )
+                    result = results.results[0]
+                    if result.text:
+                        md_text = md.markdownify(result.text, heading_style="ATX")
+                        # append a NON-chunked document
+                        # (metadata.is_chunk = False, so will be chunked downstream)
+                        docs.append(
+                            Document(
+                                content=md_text,
+                                metadata=DocMetaData(
+                                    source=url,
+                                    title=getattr(result, "title", "Unknown Title"),
+                                    published_date=getattr(
+                                        result, "published_date", "Unknown Date"
+                                    ),
+                                ),
+                            )
+                        )
+
+        except Exception as e:
+            logging.error(f"Error retrieving content from Exa API: {e}")
+
+        return docs
+
+
+class Crawl4aiCrawler(BaseCrawler):
+    """
+    Crawler implementation using the crawl4ai library.
+
+    This crawler intelligently dispatches URLs. Standard web pages are rendered
+    and scraped using the crawl4ai browser engine. Direct links to documents
+    (PDF, DOCX, etc.) are delegated to the framework's internal DocumentParser.
+    """
+
+    def __init__(self, config: Crawl4aiConfig) -> None:
+        """Initialize the Crawl4ai crawler."""
+        super().__init__(config)
+        self.config: Crawl4aiConfig = config
+
+    @property
+    def needs_parser(self) -> bool:
+        """
+        Indicates that this crawler relies on the framework's DocumentParser
+        for handling specific file types like PDF, DOCX, etc., which
+        the browser engine cannot parse directly.
+        """
+        return True
+
+    def crawl(self, urls: List[str]) -> List[Document]:
+        """
+        Executes the crawl by separating document URLs from web page URLs.
+
+        - Document URLs (.pdf, .docx, etc.) are processed using `_process_document`.
+        - Web page URLs are handled using the async crawl4ai engine.
+        """
+        all_documents: List[Document] = []
+        webpage_urls: List[str] = []
+
+        # Step 1: Separate URLs into documents and web pages
+        for url in urls:
+            parsed_doc_chunks = self._process_document(url)
+            if parsed_doc_chunks:
+                all_documents.extend(parsed_doc_chunks)
+            else:
+                webpage_urls.append(url)
+
+        # Step 2: Process web page URLs asynchronously
+        if webpage_urls:
+            try:
+                loop = asyncio.get_running_loop()
+                if loop.is_running():
+                    import nest_asyncio
+
+                    nest_asyncio.apply()
+                web_docs = asyncio.run(self._async_crawl(webpage_urls))
+            except RuntimeError:
+                web_docs = asyncio.run(self._async_crawl(webpage_urls))
+
+            all_documents.extend(web_docs)
+
+        return all_documents
+
+    def _translate_result_to_document(
+        self, result: "CrawlResult"
+    ) -> Optional[Document]:
+        """Converts a crawl4ai CrawlResult into the framework's Document format."""
+        if not result.success:
+            logging.warning(
+                f"Crawl4ai failed for URL {result.url}: {result.error_message}"
+            )
+            return None
+
+        content = ""
+        if result.extracted_content:
+            content = result.extracted_content
+        elif result.markdown:
+            if (
+                hasattr(result.markdown, "fit_markdown")
+                and result.markdown.fit_markdown
+            ):
+                content = result.markdown.fit_markdown
+            elif hasattr(result.markdown, "raw_markdown"):
+                content = result.markdown.raw_markdown
+            else:
+                content = str(result.markdown)
+
+        if not content:
+            logging.warning(f"Crawl4ai returned no content for URL {result.url}")
+            return None
+
+        # Extract metadata safely
+        title = "Unknown Title"
+        published_date = "Unknown Date"
+
+        if result.metadata:
+            title = result.metadata.get("title", "Unknown Title")
+            # Try common date field names
+            for date_field in [
+                "published_date",
+                "datePublished",
+                "article:published_time",
+                "pubdate",
+            ]:
+                if date_field in result.metadata:
+                    published_date = result.metadata.get(date_field)
+                    break
+
+        meta = DocMetaData(
+            source=result.url,
+            title=title,
+            published_date=published_date,
+            # Note: source_content is meant for reference content, not metadata
+            # Keeping it minimal as other crawlers don't populate it
+        )
+        return Document(content=content, metadata=meta)
+
+    async def _async_crawl(self, urls: List[str]) -> List[Document]:
+        try:
+            from crawl4ai import AsyncWebCrawler
+
+            # Import configs here for lazy loading
+            from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+        except ImportError:
+            raise LangroidImportError(
+                "crawl4ai", "pip install 'crawl4ai[all]' or 'crawl4ai'"
+            )
+
+        # CHANGE 2: Handle the new optional config fields.
+        # Use the user-provided config if it exists, otherwise create a default one.
+        browser_config = self.config.browser_config or BrowserConfig()
+        run_config = self.config.run_config or CrawlerRunConfig()
+
+        if self.config.extraction_strategy:
+            run_config.extraction_strategy = self.config.extraction_strategy
+        if self.config.markdown_strategy:
+            run_config.markdown_generator = self.config.markdown_strategy
+        if self.config.deep_crawl_strategy:
+            run_config.deep_crawl_strategy = self.config.deep_crawl_strategy
+        if self.config.scraping_strategy:
+            run_config.scraping_strategy = self.config.scraping_strategy
+
+        crawled_documents: List[Document] = []
+
+        async with AsyncWebCrawler(config=browser_config) as crawler:
+            if self.config.crawl_mode == "simple":
+                for url in urls:
+                    result = await crawler.arun(url, config=run_config)
+                    doc = self._translate_result_to_document(result)
+                    if doc:
+                        crawled_documents.append(doc)
+
+            elif self.config.crawl_mode == "deep":
+                if not urls:
+                    return []
+                if not run_config.deep_crawl_strategy:
+                    logging.warning(
+                        "Deep crawl mode requires a deep_crawl_strategy in the config."
+                    )
+                    return []
+
+                # In deep crawl mode, `crawl4ai` will discover and crawl pages
+                # starting from the seed URL. It will not process direct document links
+                # found during the deep crawl; it is designed to follow hyperlinks.
+                crawl_results = await crawler.arun(urls[0], config=run_config)

+                if isinstance(crawl_results, list):
+                    for result in crawl_results:
+                        doc = self._translate_result_to_document(result)
+                        if doc:
+                            crawled_documents.append(doc)
+                else:
+                    async for result in crawl_results:
+                        doc = self._translate_result_to_document(result)
+                        if doc:
+                            crawled_documents.append(doc)
+
+        return crawled_documents
+
+
+class URLLoader:
+    """Loads URLs and extracts text using a specified crawler."""
+
+    def __init__(
+        self,
+        urls: List[Any],
+        parsing_config: ParsingConfig = ParsingConfig(),
+        crawler_config: Optional[BaseCrawlerConfig] = None,
+    ):
+        """Initialize the URL loader.
+
+        Args:
+            urls: List of URLs to load
+            parsing_config: Configuration for parsing
+            crawler_config: Configuration for the crawler
+        """
+        self.urls = urls
+        self.parsing_config = parsing_config
+
+        if crawler_config is None:
+            crawler_config = TrafilaturaConfig(parser=Parser(parsing_config))
+
+        self.crawler = CrawlerFactory.create_crawler(crawler_config)
+        if self.crawler.needs_parser:
+            self.crawler.parser = Parser(parsing_config)
+
+    def load(self) -> List[Document]:
+        """Load the URLs using the specified crawler."""
+        return self.crawler.crawl(self.urls)
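
A minimal usage sketch of the crawler configuration API shown above, assuming the same classes are exported from langroid.parsing.url_loader and that the optional crawl4ai dependency is installed; this is an illustration, not code from the package:

from langroid.parsing.url_loader import Crawl4aiConfig, URLLoader

# With no crawler_config, URLLoader falls back to TrafilaturaConfig and
# extracts text from each page with trafilatura.
loader = URLLoader(urls=["https://example.com/some-page"])
docs = loader.load()  # returns List[Document]

# Opt in to the crawl4ai-based crawler: "simple" mode crawls each URL once,
# while "deep" mode additionally requires a deep_crawl_strategy in Crawl4aiConfig.
c4ai_loader = URLLoader(
    urls=["https://example.com"],
    crawler_config=Crawl4aiConfig(crawl_mode="simple"),
)
web_docs = c4ai_loader.load()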