langroid 0.48.0__py3-none-any.whl → 0.48.1__py3-none-any.whl

This diff compares the contents of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
langroid/mytypes.py CHANGED
@@ -45,6 +45,8 @@ class DocMetaData(BaseModel):
 
     source: str = "context"  # just reference
     source_content: str = "context"  # reference and content
+    title: str = "unknown"
+    published_date: str = "unknown"
     is_chunk: bool = False  # if it is a chunk, don't split
     id: str = Field(default_factory=lambda: str(uuid4()))
     window_ids: List[str] = []  # for RAG: ids of chunks around this one
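
The two new fields default to "unknown", so existing DocMetaData call sites keep working unchanged. A minimal sketch of populating them (the values below are illustrative, not from the package):

    from langroid.mytypes import DocMetaData, Document

    doc = Document(
        content="Example page text",
        metadata=DocMetaData(
            source="https://example.com/post",
            title="Example Post",            # new in 0.48.1
            published_date="2024-01-01",     # new in 0.48.1
        ),
    )
    # defaults are preserved for callers that don't set the new fields:
    assert DocMetaData(source="https://example.com").title == "unknown"
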
langroid/parsing/url_loader.py CHANGED
@@ -258,7 +258,13 @@ class FirecrawlCrawler(BaseCrawler):
                 with open(filename, "w") as f:
                     f.write(content)
                 docs.append(
-                    Document(content=content, metadata=DocMetaData(source=url))
+                    Document(
+                        content=content,
+                        metadata=DocMetaData(
+                            source=url,
+                            title=page["metadata"].get("title", ""),
+                        ),
+                    )
                 )
                 processed_urls.add(url)
                 new_pages += 1
@@ -300,7 +306,10 @@ class FirecrawlCrawler(BaseCrawler):
             docs.append(
                 Document(
                     content=result["markdown"],
-                    metadata=DocMetaData(source=url),
+                    metadata=DocMetaData(
+                        source=url,
+                        title=metadata.get("title", ""),
+                    ),
                 )
             )
         except Exception as e:
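
Both Firecrawl hunks thread the scraped page title into the new DocMetaData.title field, reading it defensively with .get() since Firecrawl scrape metadata may omit a title. A sketch of the pattern with a stubbed page dict (the dict contents are hypothetical):

    from langroid.mytypes import DocMetaData, Document

    page = {"markdown": "# Hello", "metadata": {"title": "Hello Page"}}
    title = page["metadata"].get("title", "")  # "" when no title is returned
    doc = Document(
        content=page["markdown"],
        metadata=DocMetaData(source="https://example.com", title=title),
    )
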
@@ -336,7 +345,7 @@ class ExaCrawler(BaseCrawler):
 
     @property
     def needs_parser(self) -> bool:
-        return False
+        return True
 
     def crawl(self, urls: List[str]) -> List[Document]:
         """Crawl the given URLs using Exa SDK.
@@ -363,12 +372,29 @@ class ExaCrawler(BaseCrawler):
         docs = []
 
         try:
-            results = exa.get_contents(urls, text=True)
-
-            for result in results.results:
-                if result.text:
-                    metadata = DocMetaData(source=result.url)
-                    docs.append(Document(content=result.text, metadata=metadata))
+            for url in urls:
+                parsed_doc_chunks = self._process_document(url)
+                if parsed_doc_chunks:
+                    docs.extend(parsed_doc_chunks)
+                    continue
+                else:
+                    results = exa.get_contents([url], livecrawl="always", text=True)
+                    result = results.results[0]
+                    if result.text:
+                        # append a NON-chunked document
+                        # (metadata.is_chunk = False, so will be chunked downstream)
+                        docs.append(
+                            Document(
+                                content=result.text,
+                                metadata=DocMetaData(
+                                    source=url,
+                                    title=getattr(result, "title", ""),
+                                    published_date=getattr(
+                                        result, "published_date", ""
+                                    ),
+                                ),
+                            )
+                        )
 
         except Exception as e:
             logging.error(f"Error retrieving content from Exa API: {e}")
@@ -399,6 +425,8 @@ class URLLoader:
             crawler_config = TrafilaturaConfig(parser=Parser(parsing_config))
 
         self.crawler = CrawlerFactory.create_crawler(crawler_config)
+        if self.crawler.needs_parser:
+            self.crawler.parser = Parser(parsing_config)
 
     def load(self) -> List[Document]:
         """Load the URLs using the specified crawler."""
langroid-0.48.0.dist-info/METADATA → langroid-0.48.1.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: langroid
-Version: 0.48.0
+Version: 0.48.1
 Summary: Harness LLMs with Multi-Agent Programming
 Author-email: Prasad Chalasani <pchalasani@gmail.com>
 License: MIT
langroid-0.48.0.dist-info/RECORD → langroid-0.48.1.dist-info/RECORD RENAMED
@@ -1,6 +1,6 @@
 langroid/__init__.py,sha256=z_fCOLQJPOw3LLRPBlFB5-2HyCjpPgQa4m4iY5Fvb8Y,1800
 langroid/exceptions.py,sha256=OPjece_8cwg94DLPcOGA1ddzy5bGh65pxzcHMnssTz8,2995
-langroid/mytypes.py,sha256=wfb320SFnZVTv_CgcLWsvoKBXxAFfY4EISeue8MFqpQ,2912
+langroid/mytypes.py,sha256=ZW06CyhOPtemUvAGl5m4uPMHd8kEeEfwq04d4U8PntE,2975
 langroid/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 langroid/agent/__init__.py,sha256=ll0Cubd2DZ-fsCMl7e10hf9ZjFGKzphfBco396IKITY,786
 langroid/agent/base.py,sha256=U-UjdpxIFqkzRIB5-LYwHrhMSNI3sDbfnNRqIhrtsyI,79568
@@ -91,7 +91,7 @@ langroid/parsing/routing.py,sha256=-FcnlqldzL4ZoxuDwXjQPNHgBe9F9-F4R6q7b_z9CvI,1
 langroid/parsing/search.py,sha256=0NJ5-Rou_BbrHAD7O9b20bKjZJnbadjObvGm4Zq8Kis,9818
 langroid/parsing/spider.py,sha256=hAVM6wxh1pQ0EN4tI5wMBtAjIk0T-xnpi-ZUzWybhos,3258
 langroid/parsing/table_loader.py,sha256=qNM4obT_0Y4tjrxNBCNUYjKQ9oETCZ7FbolKBTcz-GM,3410
-langroid/parsing/url_loader.py,sha256=UiKlokh8AE0Qz9d4uIGPGJ-1yUfXrJjjuRBzsbxvurg,13552
+langroid/parsing/url_loader.py,sha256=Y1kFi6DoIjIxuQmMwR9SPVyHfeCJAe41eofdXUIA1fQ,14833
 langroid/parsing/urls.py,sha256=Tjzr64YsCusiYkY0LEGB5-rSuX8T2P_4DVoOFKAeKuI,8081
 langroid/parsing/utils.py,sha256=WwqzOhbQRlorbVvddDIZKv9b1KqZCBDm955lgIHDXRw,12828
 langroid/parsing/web_search.py,sha256=sARV1Tku4wiInhuCz0kRaMHcoF6Ok6CLu7vapLS8hjs,8222
@@ -127,7 +127,7 @@ langroid/vector_store/pineconedb.py,sha256=otxXZNaBKb9f_H75HTaU3lMHiaR2NUp5MqwLZ
 langroid/vector_store/postgres.py,sha256=wHPtIi2qM4fhO4pMQr95pz1ZCe7dTb2hxl4VYspGZoA,16104
 langroid/vector_store/qdrantdb.py,sha256=O6dSBoDZ0jzfeVBd7LLvsXu083xs2fxXtPa9gGX3JX4,18443
 langroid/vector_store/weaviatedb.py,sha256=Yn8pg139gOy3zkaPfoTbMXEEBCiLiYa1MU5d_3UA1K4,11847
-langroid-0.48.0.dist-info/METADATA,sha256=mhJmePv93Tsvw2Q1C2EeKrmtFzLmzxGBobBU8RAqpZs,63606
-langroid-0.48.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-langroid-0.48.0.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
-langroid-0.48.0.dist-info/RECORD,,
+langroid-0.48.1.dist-info/METADATA,sha256=5tA8WlsZ5n91APjQVDaNBVmUNwOgZ11jfdQunonoW5w,63606
+langroid-0.48.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+langroid-0.48.1.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
+langroid-0.48.1.dist-info/RECORD,,