langroid 0.57.0__py3-none-any.whl → 0.58.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their public registry, and is provided for informational purposes only.
langroid/parsing/url_loader.py
@@ -1,8 +1,9 @@
+ import asyncio
  import logging
  import os
  from abc import ABC, abstractmethod
  from tempfile import NamedTemporaryFile
- from typing import TYPE_CHECKING, Any, Dict, List, Optional
+ from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional

  import markdownify as md
  from dotenv import load_dotenv
@@ -16,6 +17,16 @@ from langroid.pydantic_v1 import BaseSettings
  if TYPE_CHECKING:
      from firecrawl import FirecrawlApp

+ try:
+     from crawl4ai import CrawlResult
+     from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+     from crawl4ai.content_scraping_strategy import ContentScrapingStrategy
+     from crawl4ai.deep_crawling import DeepCrawlStrategy
+     from crawl4ai.extraction_strategy import ExtractionStrategy
+     from crawl4ai.markdown_generation_strategy import MarkdownGenerationStrategy
+ except ImportError:
+     raise LangroidImportError("crawl4ai", "crawl-4-ai")
+
  load_dotenv()

  logging.getLogger("url_loader").setLevel(logging.WARNING)
@@ -59,6 +70,62 @@ class ExaCrawlerConfig(BaseCrawlerConfig):
          env_prefix = "EXA_"


+ class Crawl4aiConfig(BaseCrawlerConfig):
+     """Configuration for the Crawl4aiCrawler."""
+
+     crawl_mode: Literal["simple", "deep"] = "simple"
+     extraction_strategy: Optional["ExtractionStrategy"] = None
+     markdown_strategy: Optional["MarkdownGenerationStrategy"] = None
+     deep_crawl_strategy: Optional["DeepCrawlStrategy"] = None
+     scraping_strategy: Optional["ContentScrapingStrategy"] = None
+     browser_config: Optional["BrowserConfig"] = None
+     run_config: Optional["CrawlerRunConfig"] = None
+
+     _refs_resolved: bool = False
+
+     def __init_subclass__(cls, **kwargs: Any) -> None:
+         """Resolve forward references when class is first subclassed or instantiated."""
+         super().__init_subclass__(**kwargs)
+         cls._resolve_forward_refs()
+
+     @classmethod
+     def _resolve_forward_refs(cls) -> None:
+         """Resolve forward references only when needed."""
+         if not cls._refs_resolved:
+             try:
+                 from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+                 from crawl4ai.content_scraping_strategy import ContentScrapingStrategy
+                 from crawl4ai.deep_crawling import DeepCrawlStrategy
+                 from crawl4ai.extraction_strategy import ExtractionStrategy
+                 from crawl4ai.markdown_generation_strategy import (
+                     MarkdownGenerationStrategy,
+                 )
+
+                 # Create namespace for update_forward_refs
+                 namespace = {
+                     "BrowserConfig": BrowserConfig,
+                     "CrawlerRunConfig": CrawlerRunConfig,
+                     "ContentScrapingStrategy": ContentScrapingStrategy,
+                     "DeepCrawlStrategy": DeepCrawlStrategy,
+                     "ExtractionStrategy": ExtractionStrategy,
+                     "MarkdownGenerationStrategy": MarkdownGenerationStrategy,
+                 }
+
+                 cls.update_forward_refs(**namespace)
+                 cls._refs_resolved = True
+             except ImportError:
+                 # If crawl4ai is not installed, leave forward refs as strings
+                 pass
+
+     def __init__(self, **kwargs: Any) -> None:
+         """Initialize and ensure forward refs are resolved."""
+         self._resolve_forward_refs()
+         super().__init__(**kwargs)
+
+     class Config:
+         arbitrary_types_allowed = True
+
+
  class BaseCrawler(ABC):
      """Abstract base class for web crawlers."""

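For orientation, here is a minimal usage sketch of the new config class. It only exercises fields visible in the hunk above; the `BrowserConfig(headless=True)` argument is an illustrative crawl4ai option rather than something taken from this diff, and the sketch assumes crawl4ai is installed.

```python
# Minimal sketch (not part of the diff): constructing a Crawl4aiConfig.
# Assumes the crawl4ai package is available; BrowserConfig / CrawlerRunConfig
# are crawl4ai classes, as imported in the hunk above.
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig

from langroid.parsing.url_loader import Crawl4aiConfig

# Defaults: crawl_mode="simple", crawl4ai's own browser/run defaults.
simple_cfg = Crawl4aiConfig()

# Override the browser and run configs via the new optional fields.
custom_cfg = Crawl4aiConfig(
    crawl_mode="simple",
    browser_config=BrowserConfig(headless=True),
    run_config=CrawlerRunConfig(),
)
```

Because `arbitrary_types_allowed` is set on the inner `Config`, these crawl4ai objects can be passed directly as field values.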
@@ -163,6 +230,8 @@ class CrawlerFactory:
              return FirecrawlCrawler(config)
          elif isinstance(config, ExaCrawlerConfig):
              return ExaCrawler(config)
+         elif isinstance(config, Crawl4aiConfig):
+             return Crawl4aiCrawler(config)
          else:
              raise ValueError(f"Unsupported crawler configuration type: {type(config)}")

@@ -419,6 +488,180 @@ class ExaCrawler(BaseCrawler):
          return docs


+ class Crawl4aiCrawler(BaseCrawler):
+     """
+     Crawler implementation using the crawl4ai library.
+
+     This crawler intelligently dispatches URLs. Standard web pages are rendered
+     and scraped using the crawl4ai browser engine. Direct links to documents
+     (PDF, DOCX, etc.) are delegated to the framework's internal DocumentParser.
+     """
+
+     def __init__(self, config: Crawl4aiConfig) -> None:
+         """Initialize the Crawl4ai crawler."""
+         super().__init__(config)
+         self.config: Crawl4aiConfig = config
+
+     @property
+     def needs_parser(self) -> bool:
+         """
+         Indicates that this crawler relies on the framework's DocumentParser
+         for handling specific file types like PDF, DOCX, etc., which
+         the browser engine cannot parse directly.
+         """
+         return True
+
+     def crawl(self, urls: List[str]) -> List[Document]:
+         """
+         Executes the crawl by separating document URLs from web page URLs.
+
+         - Document URLs (.pdf, .docx, etc.) are processed using `_process_document`.
+         - Web page URLs are handled using the async crawl4ai engine.
+         """
+         all_documents: List[Document] = []
+         webpage_urls: List[str] = []
+
+         # Step 1: Separate URLs into documents and web pages
+         for url in urls:
+             parsed_doc_chunks = self._process_document(url)
+             if parsed_doc_chunks:
+                 all_documents.extend(parsed_doc_chunks)
+             else:
+                 webpage_urls.append(url)
+
+         # Step 2: Process web page URLs asynchronously
+         if webpage_urls:
+             try:
+                 loop = asyncio.get_running_loop()
+                 if loop.is_running():
+                     import nest_asyncio
+
+                     nest_asyncio.apply()
+                 web_docs = asyncio.run(self._async_crawl(webpage_urls))
+             except RuntimeError:
+                 web_docs = asyncio.run(self._async_crawl(webpage_urls))
+
+             all_documents.extend(web_docs)
+
+         return all_documents
+
+     def _translate_result_to_document(
+         self, result: "CrawlResult"
+     ) -> Optional[Document]:
+         """Converts a crawl4ai CrawlResult into the framework's Document format."""
+         if not result.success:
+             logging.warning(
+                 f"Crawl4ai failed for URL {result.url}: {result.error_message}"
+             )
+             return None
+
+         content = ""
+         if result.extracted_content:
+             content = result.extracted_content
+         elif result.markdown:
+             if (
+                 hasattr(result.markdown, "fit_markdown")
+                 and result.markdown.fit_markdown
+             ):
+                 content = result.markdown.fit_markdown
+             elif hasattr(result.markdown, "raw_markdown"):
+                 content = result.markdown.raw_markdown
+             else:
+                 content = str(result.markdown)
+
+         if not content:
+             logging.warning(f"Crawl4ai returned no content for URL {result.url}")
+             return None
+
+         # Extract metadata safely
+         title = "Unknown Title"
+         published_date = "Unknown Date"
+
+         if result.metadata:
+             title = result.metadata.get("title", "Unknown Title")
+             # Try common date field names
+             for date_field in [
+                 "published_date",
+                 "datePublished",
+                 "article:published_time",
+                 "pubdate",
+             ]:
+                 if date_field in result.metadata:
+                     published_date = result.metadata.get(date_field)
+                     break
+
+         meta = DocMetaData(
+             source=result.url,
+             title=title,
+             published_date=published_date,
+             # Note: source_content is meant for reference content, not metadata
+             # Keeping it minimal as other crawlers don't populate it
+         )
+         return Document(content=content, metadata=meta)
+
+     async def _async_crawl(self, urls: List[str]) -> List[Document]:
+         try:
+             from crawl4ai import AsyncWebCrawler
+
+             # Import configs here for lazy loading
+             from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+         except ImportError:
+             raise LangroidImportError(
+                 "crawl4ai", "pip install 'crawl4ai[all]' or 'crawl4ai'"
+             )
+
+         # CHANGE 2: Handle the new optional config fields.
+         # Use the user-provided config if it exists, otherwise create a default one.
+         browser_config = self.config.browser_config or BrowserConfig()
+         run_config = self.config.run_config or CrawlerRunConfig()
+
+         if self.config.extraction_strategy:
+             run_config.extraction_strategy = self.config.extraction_strategy
+         if self.config.markdown_strategy:
+             run_config.markdown_generator = self.config.markdown_strategy
+         if self.config.deep_crawl_strategy:
+             run_config.deep_crawl_strategy = self.config.deep_crawl_strategy
+         if self.config.scraping_strategy:
+             run_config.scraping_strategy = self.config.scraping_strategy
+
+         crawled_documents: List[Document] = []
+
+         async with AsyncWebCrawler(config=browser_config) as crawler:
+             if self.config.crawl_mode == "simple":
+                 for url in urls:
+                     result = await crawler.arun(url, config=run_config)
+                     doc = self._translate_result_to_document(result)
+                     if doc:
+                         crawled_documents.append(doc)
+
+             elif self.config.crawl_mode == "deep":
+                 if not urls:
+                     return []
+                 if not run_config.deep_crawl_strategy:
+                     logging.warning(
+                         "Deep crawl mode requires a deep_crawl_strategy in the config."
+                     )
+                     return []
+
+                 # In deep crawl mode, `crawl4ai` will discover and crawl pages
+                 # starting from the seed URL. It will not process direct document links
+                 # found during the deep crawl; it is designed to follow hyperlinks.
+                 crawl_results = await crawler.arun(urls[0], config=run_config)
+
+                 if isinstance(crawl_results, list):
+                     for result in crawl_results:
+                         doc = self._translate_result_to_document(result)
+                         if doc:
+                             crawled_documents.append(doc)
+                 else:
+                     async for result in crawl_results:
+                         doc = self._translate_result_to_document(result)
+                         if doc:
+                             crawled_documents.append(doc)
+
+         return crawled_documents
+
+
  class URLLoader:
      """Loads URLs and extracts text using a specified crawler."""

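Taken together with the factory change above, an end-to-end usage sketch looks like the following. The `crawler_config` keyword and `load()` call on `URLLoader` are not shown in this diff, so treat the exact signature as an assumption; the extra name comes from the METADATA hunk below.

```python
# Sketch only: routing URLs through the new Crawl4aiCrawler via URLLoader.
# Prerequisite (per the METADATA change below): pip install "langroid[crawl-4-ai]"
from langroid.parsing.url_loader import Crawl4aiConfig, URLLoader

loader = URLLoader(
    urls=[
        "https://example.com/blog/post",   # web page: rendered by the crawl4ai engine
        "https://example.com/report.pdf",  # document link: delegated to DocumentParser
    ],
    crawler_config=Crawl4aiConfig(),  # dispatched to Crawl4aiCrawler by CrawlerFactory
)
docs = loader.load()
for doc in docs:
    print(doc.metadata.source, len(doc.content))
```

For `crawl_mode="deep"`, the hunk above shows that a `deep_crawl_strategy` must be set on the config (otherwise the crawler logs a warning and returns nothing), and only the first URL in the list is used as the seed.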
langroid-0.57.0.dist-info/METADATA → langroid-0.58.1.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: langroid
- Version: 0.57.0
+ Version: 0.58.1
  Summary: Harness LLMs with Multi-Agent Programming
  Author-email: Prasad Chalasani <pchalasani@gmail.com>
  License: MIT
@@ -94,6 +94,8 @@ Requires-Dist: chainlit<3.0.0,>=2.0.1; extra == 'chainlit'
  Requires-Dist: python-socketio<6.0.0,>=5.11.0; extra == 'chainlit'
  Provides-Extra: chromadb
  Requires-Dist: chromadb<=0.4.23,>=0.4.21; extra == 'chromadb'
+ Provides-Extra: crawl-4-ai
+ Requires-Dist: crawl4ai>=0.6.3; extra == 'crawl-4-ai'
  Provides-Extra: db
  Requires-Dist: psycopg2-binary>=2.9.10; extra == 'db'
  Requires-Dist: psycopg2<3.0.0,>=2.9.7; extra == 'db'
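The new extra makes crawl4ai an opt-in dependency. A minimal availability check, assuming the extra name and version pin exactly as declared in this hunk:

```python
# After: pip install "langroid[crawl-4-ai]"   (pulls in crawl4ai>=0.6.3)
import importlib.util

if importlib.util.find_spec("crawl4ai") is None:
    raise SystemExit(
        "crawl4ai is not installed; install langroid with the 'crawl-4-ai' extra."
    )
```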
@@ -344,6 +346,11 @@ teacher_task.run()
  <details>
  <summary> <b>Click to expand</b></summary>

+ - **Jul 2025:**
+   - [0.58.0](https://github.com/langroid/langroid/releases/tag/0.58.0) Crawl4AI integration -
+     browser-based web crawling with Playwright for JavaScript-heavy sites, no API key required (thank you @abab-dev!).
+   - [0.57.0](https://github.com/langroid/langroid/releases/tag/0.57.0) HTML Logger for interactive task visualization -
+     self-contained HTML logs with collapsible entries, auto-refresh, and persistent UI state.
  - **Jun 2025:**
    - [0.56.0](https://github.com/langroid/langroid/releases/tag/0.56.0) `TaskTool` for delegating tasks to sub-agents -
      enables agents to spawn sub-agents with specific tools and configurations.
langroid-0.57.0.dist-info/RECORD → langroid-0.58.1.dist-info/RECORD
@@ -102,7 +102,7 @@ langroid/parsing/routing.py,sha256=-FcnlqldzL4ZoxuDwXjQPNHgBe9F9-F4R6q7b_z9CvI,1
  langroid/parsing/search.py,sha256=0NJ5-Rou_BbrHAD7O9b20bKjZJnbadjObvGm4Zq8Kis,9818
  langroid/parsing/spider.py,sha256=hAVM6wxh1pQ0EN4tI5wMBtAjIk0T-xnpi-ZUzWybhos,3258
  langroid/parsing/table_loader.py,sha256=qNM4obT_0Y4tjrxNBCNUYjKQ9oETCZ7FbolKBTcz-GM,3410
- langroid/parsing/url_loader.py,sha256=NQuCxa-hTOuxLZDq4xKLvPfGVB4IWFzh2ItqWq297DI,15675
+ langroid/parsing/url_loader.py,sha256=ELLSimgdf0-oscmtnDxXJcL-W9jVdLDNfQ90ouCDfHE,25402
  langroid/parsing/urls.py,sha256=Tjzr64YsCusiYkY0LEGB5-rSuX8T2P_4DVoOFKAeKuI,8081
  langroid/parsing/utils.py,sha256=WwqzOhbQRlorbVvddDIZKv9b1KqZCBDm955lgIHDXRw,12828
  langroid/parsing/web_search.py,sha256=atk8wIpOfiGTvW8yL_26TvjvyY2zD24xIHIi0QjEklI,8599
@@ -139,7 +139,7 @@ langroid/vector_store/pineconedb.py,sha256=otxXZNaBKb9f_H75HTaU3lMHiaR2NUp5MqwLZ
  langroid/vector_store/postgres.py,sha256=wHPtIi2qM4fhO4pMQr95pz1ZCe7dTb2hxl4VYspGZoA,16104
  langroid/vector_store/qdrantdb.py,sha256=ZYrT9mxoUCx_67Qzb5xnkWuFG12rfe30yAg4NgG2ueA,19168
  langroid/vector_store/weaviatedb.py,sha256=Yn8pg139gOy3zkaPfoTbMXEEBCiLiYa1MU5d_3UA1K4,11847
- langroid-0.57.0.dist-info/METADATA,sha256=y3GSqdrxEQMaV2iMTmioT9KkTGBIo4oHoPRQexgCXdw,65744
- langroid-0.57.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- langroid-0.57.0.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
- langroid-0.57.0.dist-info/RECORD,,
+ langroid-0.58.1.dist-info/METADATA,sha256=mkfwCAdL_zIZxy4v12_ikK7AZLMmbHwwEkh_RJNt3QE,66270
+ langroid-0.58.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ langroid-0.58.1.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
+ langroid-0.58.1.dist-info/RECORD,,