langroid 0.48.0__py3-none-any.whl → 0.48.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
langroid/mytypes.py CHANGED
@@ -3,7 +3,7 @@ from textwrap import dedent
3
3
  from typing import Any, Callable, Dict, List, Union
4
4
  from uuid import uuid4
5
5
 
6
- from langroid.pydantic_v1 import BaseModel, Extra, Field
6
+ from langroid.pydantic_v1 import BaseModel, Extra, Field, validator
7
7
 
8
8
  Number = Union[int, float]
9
9
  Embedding = List[Number]
@@ -45,10 +45,19 @@ class DocMetaData(BaseModel):
45
45
 
46
46
  source: str = "context" # just reference
47
47
  source_content: str = "context" # reference and content
48
+ title: str = "Unknown Title"
49
+ published_date: str = "Unknown Date"
48
50
  is_chunk: bool = False # if it is a chunk, don't split
49
51
  id: str = Field(default_factory=lambda: str(uuid4()))
50
52
  window_ids: List[str] = [] # for RAG: ids of chunks around this one
51
53
 
54
+ @validator("source", "source_content", "id", "title", "published_date")
55
+ def ensure_not_empty(cls, v: str) -> str:
56
+ """Ensure required string fields are not empty."""
57
+ if not v:
58
+ raise ValueError("Field cannot be empty")
59
+ return v
60
+
52
61
  def dict_bool_int(self, *args: Any, **kwargs: Any) -> Dict[str, Any]:
53
62
  """
54
63
  Special dict method to convert bool fields to int, to appease some
@@ -258,7 +258,13 @@ class FirecrawlCrawler(BaseCrawler):
258
258
  with open(filename, "w") as f:
259
259
  f.write(content)
260
260
  docs.append(
261
- Document(content=content, metadata=DocMetaData(source=url))
261
+ Document(
262
+ content=content,
263
+ metadata=DocMetaData(
264
+ source=url,
265
+ title=page["metadata"].get("title", "Unknown Title"),
266
+ ),
267
+ )
262
268
  )
263
269
  processed_urls.add(url)
264
270
  new_pages += 1
@@ -300,7 +306,10 @@ class FirecrawlCrawler(BaseCrawler):
300
306
  docs.append(
301
307
  Document(
302
308
  content=result["markdown"],
303
- metadata=DocMetaData(source=url),
309
+ metadata=DocMetaData(
310
+ source=url,
311
+ title=metadata.get("title", "Unknown Title"),
312
+ ),
304
313
  )
305
314
  )
306
315
  except Exception as e:
@@ -336,7 +345,7 @@ class ExaCrawler(BaseCrawler):
336
345
 
337
346
  @property
338
347
  def needs_parser(self) -> bool:
339
- return False
348
+ return True
340
349
 
341
350
  def crawl(self, urls: List[str]) -> List[Document]:
342
351
  """Crawl the given URLs using Exa SDK.
@@ -363,12 +372,29 @@ class ExaCrawler(BaseCrawler):
363
372
  docs = []
364
373
 
365
374
  try:
366
- results = exa.get_contents(urls, text=True)
367
-
368
- for result in results.results:
369
- if result.text:
370
- metadata = DocMetaData(source=result.url)
371
- docs.append(Document(content=result.text, metadata=metadata))
375
+ for url in urls:
376
+ parsed_doc_chunks = self._process_document(url)
377
+ if parsed_doc_chunks:
378
+ docs.extend(parsed_doc_chunks)
379
+ continue
380
+ else:
381
+ results = exa.get_contents([url], livecrawl="always", text=True)
382
+ result = results.results[0]
383
+ if result.text:
384
+ # append a NON-chunked document
385
+ # (metadata.is_chunk = False, so will be chunked downstream)
386
+ docs.append(
387
+ Document(
388
+ content=result.text,
389
+ metadata=DocMetaData(
390
+ source=url,
391
+ title=getattr(result, "title", "Unknown Title"),
392
+ published_date=getattr(
393
+ result, "published_date", "Unknown Date"
394
+ ),
395
+ ),
396
+ )
397
+ )
372
398
 
373
399
  except Exception as e:
374
400
  logging.error(f"Error retrieving content from Exa API: {e}")
@@ -399,6 +425,8 @@ class URLLoader:
399
425
  crawler_config = TrafilaturaConfig(parser=Parser(parsing_config))
400
426
 
401
427
  self.crawler = CrawlerFactory.create_crawler(crawler_config)
428
+ if self.crawler.needs_parser:
429
+ self.crawler.parser = Parser(parsing_config)
402
430
 
403
431
  def load(self) -> List[Document]:
404
432
  """Load the URLs using the specified crawler."""
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: langroid
3
- Version: 0.48.0
3
+ Version: 0.48.2
4
4
  Summary: Harness LLMs with Multi-Agent Programming
5
5
  Author-email: Prasad Chalasani <pchalasani@gmail.com>
6
6
  License: MIT
@@ -1,6 +1,6 @@
1
1
  langroid/__init__.py,sha256=z_fCOLQJPOw3LLRPBlFB5-2HyCjpPgQa4m4iY5Fvb8Y,1800
2
2
  langroid/exceptions.py,sha256=OPjece_8cwg94DLPcOGA1ddzy5bGh65pxzcHMnssTz8,2995
3
- langroid/mytypes.py,sha256=wfb320SFnZVTv_CgcLWsvoKBXxAFfY4EISeue8MFqpQ,2912
3
+ langroid/mytypes.py,sha256=yzsPpDQqfndMP8ZX9zuQY_oLuUTkW2VJ_iLPARmKoLE,3268
4
4
  langroid/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
5
  langroid/agent/__init__.py,sha256=ll0Cubd2DZ-fsCMl7e10hf9ZjFGKzphfBco396IKITY,786
6
6
  langroid/agent/base.py,sha256=U-UjdpxIFqkzRIB5-LYwHrhMSNI3sDbfnNRqIhrtsyI,79568
@@ -91,7 +91,7 @@ langroid/parsing/routing.py,sha256=-FcnlqldzL4ZoxuDwXjQPNHgBe9F9-F4R6q7b_z9CvI,1
91
91
  langroid/parsing/search.py,sha256=0NJ5-Rou_BbrHAD7O9b20bKjZJnbadjObvGm4Zq8Kis,9818
92
92
  langroid/parsing/spider.py,sha256=hAVM6wxh1pQ0EN4tI5wMBtAjIk0T-xnpi-ZUzWybhos,3258
93
93
  langroid/parsing/table_loader.py,sha256=qNM4obT_0Y4tjrxNBCNUYjKQ9oETCZ7FbolKBTcz-GM,3410
94
- langroid/parsing/url_loader.py,sha256=UiKlokh8AE0Qz9d4uIGPGJ-1yUfXrJjjuRBzsbxvurg,13552
94
+ langroid/parsing/url_loader.py,sha256=DvgkdCZ3gDlAajH0dIUjea4YyXkziK-g36WnaE1J_WI,14884
95
95
  langroid/parsing/urls.py,sha256=Tjzr64YsCusiYkY0LEGB5-rSuX8T2P_4DVoOFKAeKuI,8081
96
96
  langroid/parsing/utils.py,sha256=WwqzOhbQRlorbVvddDIZKv9b1KqZCBDm955lgIHDXRw,12828
97
97
  langroid/parsing/web_search.py,sha256=sARV1Tku4wiInhuCz0kRaMHcoF6Ok6CLu7vapLS8hjs,8222
@@ -127,7 +127,7 @@ langroid/vector_store/pineconedb.py,sha256=otxXZNaBKb9f_H75HTaU3lMHiaR2NUp5MqwLZ
127
127
  langroid/vector_store/postgres.py,sha256=wHPtIi2qM4fhO4pMQr95pz1ZCe7dTb2hxl4VYspGZoA,16104
128
128
  langroid/vector_store/qdrantdb.py,sha256=O6dSBoDZ0jzfeVBd7LLvsXu083xs2fxXtPa9gGX3JX4,18443
129
129
  langroid/vector_store/weaviatedb.py,sha256=Yn8pg139gOy3zkaPfoTbMXEEBCiLiYa1MU5d_3UA1K4,11847
130
- langroid-0.48.0.dist-info/METADATA,sha256=mhJmePv93Tsvw2Q1C2EeKrmtFzLmzxGBobBU8RAqpZs,63606
131
- langroid-0.48.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
132
- langroid-0.48.0.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
133
- langroid-0.48.0.dist-info/RECORD,,
130
+ langroid-0.48.2.dist-info/METADATA,sha256=kCjeNq2-TNlc0DM8DRitNPJsHUhubpKGNJ2q-Mp6rY4,63606
131
+ langroid-0.48.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
132
+ langroid-0.48.2.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
133
+ langroid-0.48.2.dist-info/RECORD,,