langroid 0.57.0__py3-none-any.whl → 0.58.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
langroid/parsing/url_loader.py

@@ -1,8 +1,9 @@
+ import asyncio
  import logging
  import os
  from abc import ABC, abstractmethod
  from tempfile import NamedTemporaryFile
- from typing import TYPE_CHECKING, Any, Dict, List, Optional
+ from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional

  import markdownify as md
  from dotenv import load_dotenv
@@ -16,6 +17,16 @@ from langroid.pydantic_v1 import BaseSettings
  if TYPE_CHECKING:
      from firecrawl import FirecrawlApp

+     try:
+         from crawl4ai import CrawlResult
+         from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+         from crawl4ai.content_scraping_strategy import ContentScrapingStrategy
+         from crawl4ai.deep_crawling import DeepCrawlStrategy
+         from crawl4ai.extraction_strategy import ExtractionStrategy
+         from crawl4ai.markdown_generation_strategy import MarkdownGenerationStrategy
+     except ImportError:
+         raise LangroidImportError("crawl4ai", "crawl-4-ai")
+
  load_dotenv()

  logging.getLogger("url_loader").setLevel(logging.WARNING)
@@ -59,6 +70,52 @@ class ExaCrawlerConfig(BaseCrawlerConfig):
          env_prefix = "EXA_"


+ def _resolve_crawl4ai_forward_refs(cls: Any) -> Any:
+     """
+     A class decorator that resolves forward references for fields in a Pydantic
+     model that depend on the optional 'crawl4ai' library.
+     """
+     try:
+         from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig  # noqa: F401
+         from crawl4ai.content_scraping_strategy import (  # noqa: F401
+             ContentScrapingStrategy,
+         )
+         from crawl4ai.deep_crawling import DeepCrawlStrategy  # noqa: F401
+         from crawl4ai.extraction_strategy import ExtractionStrategy  # noqa: F401
+         from crawl4ai.markdown_generation_strategy import (  # noqa: F401
+             MarkdownGenerationStrategy,
+         )
+
+         # Create a namespace dictionary from locals() but exclude 'cls'.
+         # This prevents the TypeError.
+         namespace = {name: value for name, value in locals().items() if name != "cls"}
+         cls.update_forward_refs(**namespace)
+
+     except ImportError:
+         # If crawl4ai is not installed, do nothing.
+         pass
+     return cls
+
+
+ @_resolve_crawl4ai_forward_refs
+ class Crawl4aiConfig(BaseCrawlerConfig):
+     """
+     Configuration for the Crawl4aiCrawler.
+     """
+
+     crawl_mode: Literal["simple", "deep"] = "simple"
+     extraction_strategy: Optional["ExtractionStrategy"] = None
+     markdown_strategy: Optional["MarkdownGenerationStrategy"] = None
+     deep_crawl_strategy: Optional["DeepCrawlStrategy"] = None
+     scraping_strategy: Optional["ContentScrapingStrategy"] = None
+
+     browser_config: Optional["BrowserConfig"] = None
+     run_config: Optional["CrawlerRunConfig"] = None
+
+     class Config:
+         arbitrary_types_allowed = True
+
+
  class BaseCrawler(ABC):
      """Abstract base class for web crawlers."""

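With the `crawl-4-ai` extra installed (`pip install "langroid[crawl-4-ai]"`), the strategy fields above accept crawl4ai objects directly, and the decorator resolves the string annotations at import time. A minimal sketch of building a config for each mode; `BFSDeepCrawlStrategy` and its `max_depth` parameter come from the crawl4ai library and are an assumption here, not part of this diff:

```python
# Sketch only: assumes crawl4ai>=0.6.3 is installed via the crawl-4-ai extra.
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy  # assumed crawl4ai API

from langroid.parsing.url_loader import Crawl4aiConfig

# Default: "simple" mode crawls each supplied URL exactly once.
simple_cfg = Crawl4aiConfig()

# "deep" mode requires a deep_crawl_strategy; max_depth=1 follows links
# one hop from the seed URL (parameter name per crawl4ai's documentation).
deep_cfg = Crawl4aiConfig(
    crawl_mode="deep",
    deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=1),
)
```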
@@ -163,6 +220,8 @@ class CrawlerFactory:
              return FirecrawlCrawler(config)
          elif isinstance(config, ExaCrawlerConfig):
              return ExaCrawler(config)
+         elif isinstance(config, Crawl4aiConfig):
+             return Crawl4aiCrawler(config)
          else:
              raise ValueError(f"Unsupported crawler configuration type: {type(config)}")

@@ -419,6 +478,180 @@ class ExaCrawler(BaseCrawler):
          return docs


+ class Crawl4aiCrawler(BaseCrawler):
+     """
+     Crawler implementation using the crawl4ai library.
+
+     This crawler intelligently dispatches URLs. Standard web pages are rendered
+     and scraped using the crawl4ai browser engine. Direct links to documents
+     (PDF, DOCX, etc.) are delegated to the framework's internal DocumentParser.
+     """
+
+     def __init__(self, config: Crawl4aiConfig) -> None:
+         """Initialize the Crawl4ai crawler."""
+         super().__init__(config)
+         self.config: Crawl4aiConfig = config
+
+     @property
+     def needs_parser(self) -> bool:
+         """
+         Indicates that this crawler relies on the framework's DocumentParser
+         for handling specific file types like PDF, DOCX, etc., which
+         the browser engine cannot parse directly.
+         """
+         return True
+
+     def crawl(self, urls: List[str]) -> List[Document]:
+         """
+         Executes the crawl by separating document URLs from web page URLs.
+
+         - Document URLs (.pdf, .docx, etc.) are processed using `_process_document`.
+         - Web page URLs are handled using the async crawl4ai engine.
+         """
+         all_documents: List[Document] = []
+         webpage_urls: List[str] = []
+
+         # Step 1: Separate URLs into documents and web pages
+         for url in urls:
+             parsed_doc_chunks = self._process_document(url)
+             if parsed_doc_chunks:
+                 all_documents.extend(parsed_doc_chunks)
+             else:
+                 webpage_urls.append(url)
+
+         # Step 2: Process web page URLs asynchronously
+         if webpage_urls:
+             try:
+                 loop = asyncio.get_running_loop()
+                 if loop.is_running():
+                     import nest_asyncio
+
+                     nest_asyncio.apply()
+                 web_docs = asyncio.run(self._async_crawl(webpage_urls))
+             except RuntimeError:
+                 web_docs = asyncio.run(self._async_crawl(webpage_urls))
+
+             all_documents.extend(web_docs)
+
+         return all_documents
+
+     def _translate_result_to_document(
+         self, result: "CrawlResult"
+     ) -> Optional[Document]:
+         """Converts a crawl4ai CrawlResult into the framework's Document format."""
+         if not result.success:
+             logging.warning(
+                 f"Crawl4ai failed for URL {result.url}: {result.error_message}"
+             )
+             return None
+
+         content = ""
+         if result.extracted_content:
+             content = result.extracted_content
+         elif result.markdown:
+             if (
+                 hasattr(result.markdown, "fit_markdown")
+                 and result.markdown.fit_markdown
+             ):
+                 content = result.markdown.fit_markdown
+             elif hasattr(result.markdown, "raw_markdown"):
+                 content = result.markdown.raw_markdown
+             else:
+                 content = str(result.markdown)
+
+         if not content:
+             logging.warning(f"Crawl4ai returned no content for URL {result.url}")
+             return None
+
+         # Extract metadata safely
+         title = "Unknown Title"
+         published_date = "Unknown Date"
+
+         if result.metadata:
+             title = result.metadata.get("title", "Unknown Title")
+             # Try common date field names
+             for date_field in [
+                 "published_date",
+                 "datePublished",
+                 "article:published_time",
+                 "pubdate",
+             ]:
+                 if date_field in result.metadata:
+                     published_date = result.metadata.get(date_field)
+                     break
+
+         meta = DocMetaData(
+             source=result.url,
+             title=title,
+             published_date=published_date,
+             # Note: source_content is meant for reference content, not metadata
+             # Keeping it minimal as other crawlers don't populate it
+         )
+         return Document(content=content, metadata=meta)
+
+     async def _async_crawl(self, urls: List[str]) -> List[Document]:
+         try:
+             from crawl4ai import AsyncWebCrawler
+
+             # Import configs here for lazy loading
+             from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+         except ImportError:
+             raise LangroidImportError(
+                 "crawl4ai", "pip install 'crawl4ai[all]' or 'crawl4ai'"
+             )
+
+         # CHANGE 2: Handle the new optional config fields.
+         # Use the user-provided config if it exists, otherwise create a default one.
+         browser_config = self.config.browser_config or BrowserConfig()
+         run_config = self.config.run_config or CrawlerRunConfig()
+
+         if self.config.extraction_strategy:
+             run_config.extraction_strategy = self.config.extraction_strategy
+         if self.config.markdown_strategy:
+             run_config.markdown_generator = self.config.markdown_strategy
+         if self.config.deep_crawl_strategy:
+             run_config.deep_crawl_strategy = self.config.deep_crawl_strategy
+         if self.config.scraping_strategy:
+             run_config.scraping_strategy = self.config.scraping_strategy
+
+         crawled_documents: List[Document] = []
+
+         async with AsyncWebCrawler(config=browser_config) as crawler:
+             if self.config.crawl_mode == "simple":
+                 for url in urls:
+                     result = await crawler.arun(url, config=run_config)
+                     doc = self._translate_result_to_document(result)
+                     if doc:
+                         crawled_documents.append(doc)
+
+             elif self.config.crawl_mode == "deep":
+                 if not urls:
+                     return []
+                 if not run_config.deep_crawl_strategy:
+                     logging.warning(
+                         "Deep crawl mode requires a deep_crawl_strategy in the config."
+                     )
+                     return []
+
+                 # In deep crawl mode, `crawl4ai` will discover and crawl pages
+                 # starting from the seed URL. It will not process direct document links
+                 # found during the deep crawl; it is designed to follow hyperlinks.
+                 crawl_results = await crawler.arun(urls[0], config=run_config)
+
+                 if isinstance(crawl_results, list):
+                     for result in crawl_results:
+                         doc = self._translate_result_to_document(result)
+                         if doc:
+                             crawled_documents.append(doc)
+                 else:
+                     async for result in crawl_results:
+                         doc = self._translate_result_to_document(result)
+                         if doc:
+                             crawled_documents.append(doc)
+
+         return crawled_documents
+
+
  class URLLoader:
      """Loads URLs and extracts text using a specified crawler."""

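Putting the pieces together: once constructed, a `Crawl4aiConfig` is used like the other crawler configs above. A rough usage sketch follows; the `URLLoader` keyword names and the `load()` call mirror how the existing crawlers are driven and are assumptions here, not shown in this diff:

```python
from langroid.parsing.url_loader import Crawl4aiConfig, URLLoader

# Assumption: URLLoader accepts the crawler config via `crawler_config`,
# as it does for the Firecrawl/Exa configs.
loader = URLLoader(
    urls=[
        "https://example.com/blog/post",       # rendered by the crawl4ai engine
        "https://example.com/reports/q1.pdf",  # routed to the DocumentParser
    ],
    crawler_config=Crawl4aiConfig(),
)
docs = loader.load()  # List[Document], one document (or chunk set) per URL
```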
langroid-0.57.0.dist-info/METADATA → langroid-0.58.0.dist-info/METADATA

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: langroid
- Version: 0.57.0
+ Version: 0.58.0
  Summary: Harness LLMs with Multi-Agent Programming
  Author-email: Prasad Chalasani <pchalasani@gmail.com>
  License: MIT
@@ -94,6 +94,8 @@ Requires-Dist: chainlit<3.0.0,>=2.0.1; extra == 'chainlit'
  Requires-Dist: python-socketio<6.0.0,>=5.11.0; extra == 'chainlit'
  Provides-Extra: chromadb
  Requires-Dist: chromadb<=0.4.23,>=0.4.21; extra == 'chromadb'
+ Provides-Extra: crawl-4-ai
+ Requires-Dist: crawl4ai>=0.6.3; extra == 'crawl-4-ai'
  Provides-Extra: db
  Requires-Dist: psycopg2-binary>=2.9.10; extra == 'db'
  Requires-Dist: psycopg2<3.0.0,>=2.9.7; extra == 'db'
@@ -344,6 +346,9 @@ teacher_task.run()
  <details>
  <summary> <b>Click to expand</b></summary>

+ - **Jul 2025:**
+   - [0.57.0](https://github.com/langroid/langroid/releases/tag/0.57.0) HTML Logger for interactive task visualization -
+     self-contained HTML logs with collapsible entries, auto-refresh, and persistent UI state.
  - **Jun 2025:**
    - [0.56.0](https://github.com/langroid/langroid/releases/tag/0.56.0) `TaskTool` for delegating tasks to sub-agents -
      enables agents to spawn sub-agents with specific tools and configurations.
langroid-0.57.0.dist-info/RECORD → langroid-0.58.0.dist-info/RECORD

@@ -102,7 +102,7 @@ langroid/parsing/routing.py,sha256=-FcnlqldzL4ZoxuDwXjQPNHgBe9F9-F4R6q7b_z9CvI,1
  langroid/parsing/search.py,sha256=0NJ5-Rou_BbrHAD7O9b20bKjZJnbadjObvGm4Zq8Kis,9818
  langroid/parsing/spider.py,sha256=hAVM6wxh1pQ0EN4tI5wMBtAjIk0T-xnpi-ZUzWybhos,3258
  langroid/parsing/table_loader.py,sha256=qNM4obT_0Y4tjrxNBCNUYjKQ9oETCZ7FbolKBTcz-GM,3410
- langroid/parsing/url_loader.py,sha256=NQuCxa-hTOuxLZDq4xKLvPfGVB4IWFzh2ItqWq297DI,15675
+ langroid/parsing/url_loader.py,sha256=E2PCxlHMKsNjn6HdQYkMgCMLJYaut-ALGVHXVVwyzbU,24721
  langroid/parsing/urls.py,sha256=Tjzr64YsCusiYkY0LEGB5-rSuX8T2P_4DVoOFKAeKuI,8081
  langroid/parsing/utils.py,sha256=WwqzOhbQRlorbVvddDIZKv9b1KqZCBDm955lgIHDXRw,12828
  langroid/parsing/web_search.py,sha256=atk8wIpOfiGTvW8yL_26TvjvyY2zD24xIHIi0QjEklI,8599
@@ -139,7 +139,7 @@ langroid/vector_store/pineconedb.py,sha256=otxXZNaBKb9f_H75HTaU3lMHiaR2NUp5MqwLZ
  langroid/vector_store/postgres.py,sha256=wHPtIi2qM4fhO4pMQr95pz1ZCe7dTb2hxl4VYspGZoA,16104
  langroid/vector_store/qdrantdb.py,sha256=ZYrT9mxoUCx_67Qzb5xnkWuFG12rfe30yAg4NgG2ueA,19168
  langroid/vector_store/weaviatedb.py,sha256=Yn8pg139gOy3zkaPfoTbMXEEBCiLiYa1MU5d_3UA1K4,11847
- langroid-0.57.0.dist-info/METADATA,sha256=y3GSqdrxEQMaV2iMTmioT9KkTGBIo4oHoPRQexgCXdw,65744
- langroid-0.57.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- langroid-0.57.0.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
- langroid-0.57.0.dist-info/RECORD,,
+ langroid-0.58.0.dist-info/METADATA,sha256=cG1GUrTBX5XQUKLBT5f-3fnPr3j36xvRAf28K7UDUqM,66056
+ langroid-0.58.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ langroid-0.58.0.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
+ langroid-0.58.0.dist-info/RECORD,,