langroid 0.48.0__py3-none-any.whl → 0.48.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
- langroid/mytypes.py +2 -0
- langroid/parsing/url_loader.py +37 -9
- {langroid-0.48.0.dist-info → langroid-0.48.1.dist-info}/METADATA +1 -1
- {langroid-0.48.0.dist-info → langroid-0.48.1.dist-info}/RECORD +6 -6
- {langroid-0.48.0.dist-info → langroid-0.48.1.dist-info}/WHEEL +0 -0
- {langroid-0.48.0.dist-info → langroid-0.48.1.dist-info}/licenses/LICENSE +0 -0
langroid/mytypes.py
CHANGED
@@ -45,6 +45,8 @@ class DocMetaData(BaseModel):
 
     source: str = "context" # just reference
     source_content: str = "context" # reference and content
+    title: str = "unknown"
+    published_date: str = "unknown"
     is_chunk: bool = False # if it is a chunk, don't split
     id: str = Field(default_factory=lambda: str(uuid4()))
     window_ids: List[str] = [] # for RAG: ids of chunks around this one
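The two new fields default to "unknown", so existing DocMetaData consumers are unaffected; crawlers can now attach a page title and publication date explicitly. A minimal sketch (not part of the diff), assuming Document and DocMetaData are imported from langroid.mytypes:

    from langroid.mytypes import DocMetaData, Document

    # Both new fields are optional and fall back to "unknown" when omitted.
    doc = Document(
        content="Page text extracted by a crawler ...",
        metadata=DocMetaData(
            source="https://example.com/post",  # just reference
            title="Example post",               # new in 0.48.1
            published_date="2024-03-01",        # new in 0.48.1
        ),
    )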
langroid/parsing/url_loader.py
CHANGED
@@ -258,7 +258,13 @@ class FirecrawlCrawler(BaseCrawler):
                     with open(filename, "w") as f:
                         f.write(content)
                     docs.append(
-                        Document(
+                        Document(
+                            content=content,
+                            metadata=DocMetaData(
+                                source=url,
+                                title=page["metadata"].get("title", ""),
+                            ),
+                        )
                     )
                     processed_urls.add(url)
                     new_pages += 1
@@ -300,7 +306,10 @@ class FirecrawlCrawler(BaseCrawler):
                 docs.append(
                     Document(
                         content=result["markdown"],
-                        metadata=DocMetaData(
+                        metadata=DocMetaData(
+                            source=url,
+                            title=metadata.get("title", ""),
+                        ),
                     )
                 )
             except Exception as e:
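Both FirecrawlCrawler paths in the two hunks above (crawl mode and scrape mode) now populate source and title on DocMetaData instead of leaving the metadata at its defaults. A sketch of that mapping; the page dict here is a hypothetical stand-in for what the Firecrawl SDK returns per page:

    from langroid.mytypes import DocMetaData, Document

    # Hypothetical Firecrawl page result; real field names come from the Firecrawl SDK.
    url = "https://example.com"
    page = {
        "markdown": "# Example\nBody text ...",
        "metadata": {"title": "Example"},
    }

    doc = Document(
        content=page["markdown"],
        metadata=DocMetaData(
            source=url,
            title=page["metadata"].get("title", ""),  # same fallback as in the diff
        ),
    )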
@@ -336,7 +345,7 @@ class ExaCrawler(BaseCrawler):
 
     @property
     def needs_parser(self) -> bool:
-        return
+        return True
 
     def crawl(self, urls: List[str]) -> List[Document]:
         """Crawl the given URLs using Exa SDK.
@@ -363,12 +372,29 @@ class ExaCrawler(BaseCrawler):
         docs = []
 
         try:
-
-
-
-
-
-
+            for url in urls:
+                parsed_doc_chunks = self._process_document(url)
+                if parsed_doc_chunks:
+                    docs.extend(parsed_doc_chunks)
+                    continue
+                else:
+                    results = exa.get_contents([url], livecrawl="always", text=True)
+                    result = results.results[0]
+                    if result.text:
+                        # append a NON-chunked document
+                        # (metadata.is_chunk = False, so will be chunked downstream)
+                        docs.append(
+                            Document(
+                                content=result.text,
+                                metadata=DocMetaData(
+                                    source=url,
+                                    title=getattr(result, "title", ""),
+                                    published_date=getattr(
+                                        result, "published_date", ""
+                                    ),
+                                ),
+                            )
+                        )
 
         except Exception as e:
             logging.error(f"Error retrieving content from Exa API: {e}")
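The rewritten ExaCrawler.crawl body processes URLs one at a time: _process_document handles document-type URLs (e.g. PDFs) and returns already-chunked Documents, while everything else falls back to Exa's get_contents with live crawling, yielding a single un-chunked Document whose new title and published_date fields are filled from the Exa result. A self-contained sketch of that per-URL branching; crawl_one, try_parser, and exa_client are illustrative stand-ins, not langroid or Exa API names:

    from typing import Callable, List
    from langroid.mytypes import DocMetaData, Document

    def crawl_one(url: str, try_parser: Callable, exa_client) -> List[Document]:
        # Illustrative stand-in for the per-URL logic in the hunk above.
        chunks = try_parser(url)  # parser-based path, e.g. _process_document
        if chunks:                # parser already produced chunked Documents
            return chunks
        result = exa_client.get_contents([url], livecrawl="always", text=True).results[0]
        if not result.text:
            return []
        # One whole Document per URL; is_chunk stays False so it can be split downstream.
        return [
            Document(
                content=result.text,
                metadata=DocMetaData(
                    source=url,
                    title=getattr(result, "title", ""),
                    published_date=getattr(result, "published_date", ""),
                ),
            )
        ]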
@@ -399,6 +425,8 @@ class URLLoader:
             crawler_config = TrafilaturaConfig(parser=Parser(parsing_config))
 
         self.crawler = CrawlerFactory.create_crawler(crawler_config)
+        if self.crawler.needs_parser:
+            self.crawler.parser = Parser(parsing_config)
 
     def load(self) -> List[Document]:
         """Load the URLs using the specified crawler."""
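Since ExaCrawler.needs_parser now returns True, this new URLLoader wiring hands the crawler a Parser built from the same parsing config, so document URLs can be parsed rather than skipped. A hedged usage sketch; the URLLoader constructor arguments shown here are assumptions based on the class above, not confirmed by this diff:

    from langroid.parsing.url_loader import URLLoader

    # Hypothetical invocation: exact constructor arguments may differ in 0.48.1.
    loader = URLLoader(urls=["https://example.com/blog", "https://example.com/report.pdf"])
    for doc in loader.load():
        # title and published_date are the metadata fields added in this release
        print(doc.metadata.source, doc.metadata.title, doc.metadata.published_date)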
{langroid-0.48.0.dist-info → langroid-0.48.1.dist-info}/RECORD
CHANGED
@@ -1,6 +1,6 @@
 langroid/__init__.py,sha256=z_fCOLQJPOw3LLRPBlFB5-2HyCjpPgQa4m4iY5Fvb8Y,1800
 langroid/exceptions.py,sha256=OPjece_8cwg94DLPcOGA1ddzy5bGh65pxzcHMnssTz8,2995
-langroid/mytypes.py,sha256=
+langroid/mytypes.py,sha256=ZW06CyhOPtemUvAGl5m4uPMHd8kEeEfwq04d4U8PntE,2975
 langroid/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 langroid/agent/__init__.py,sha256=ll0Cubd2DZ-fsCMl7e10hf9ZjFGKzphfBco396IKITY,786
 langroid/agent/base.py,sha256=U-UjdpxIFqkzRIB5-LYwHrhMSNI3sDbfnNRqIhrtsyI,79568
@@ -91,7 +91,7 @@ langroid/parsing/routing.py,sha256=-FcnlqldzL4ZoxuDwXjQPNHgBe9F9-F4R6q7b_z9CvI,1
 langroid/parsing/search.py,sha256=0NJ5-Rou_BbrHAD7O9b20bKjZJnbadjObvGm4Zq8Kis,9818
 langroid/parsing/spider.py,sha256=hAVM6wxh1pQ0EN4tI5wMBtAjIk0T-xnpi-ZUzWybhos,3258
 langroid/parsing/table_loader.py,sha256=qNM4obT_0Y4tjrxNBCNUYjKQ9oETCZ7FbolKBTcz-GM,3410
-langroid/parsing/url_loader.py,sha256=
+langroid/parsing/url_loader.py,sha256=Y1kFi6DoIjIxuQmMwR9SPVyHfeCJAe41eofdXUIA1fQ,14833
 langroid/parsing/urls.py,sha256=Tjzr64YsCusiYkY0LEGB5-rSuX8T2P_4DVoOFKAeKuI,8081
 langroid/parsing/utils.py,sha256=WwqzOhbQRlorbVvddDIZKv9b1KqZCBDm955lgIHDXRw,12828
 langroid/parsing/web_search.py,sha256=sARV1Tku4wiInhuCz0kRaMHcoF6Ok6CLu7vapLS8hjs,8222
@@ -127,7 +127,7 @@ langroid/vector_store/pineconedb.py,sha256=otxXZNaBKb9f_H75HTaU3lMHiaR2NUp5MqwLZ
 langroid/vector_store/postgres.py,sha256=wHPtIi2qM4fhO4pMQr95pz1ZCe7dTb2hxl4VYspGZoA,16104
 langroid/vector_store/qdrantdb.py,sha256=O6dSBoDZ0jzfeVBd7LLvsXu083xs2fxXtPa9gGX3JX4,18443
 langroid/vector_store/weaviatedb.py,sha256=Yn8pg139gOy3zkaPfoTbMXEEBCiLiYa1MU5d_3UA1K4,11847
-langroid-0.48.
-langroid-0.48.
-langroid-0.48.
-langroid-0.48.
+langroid-0.48.1.dist-info/METADATA,sha256=5tA8WlsZ5n91APjQVDaNBVmUNwOgZ11jfdQunonoW5w,63606
+langroid-0.48.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+langroid-0.48.1.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
+langroid-0.48.1.dist-info/RECORD,,
{langroid-0.48.0.dist-info → langroid-0.48.1.dist-info}/WHEEL
File without changes
{langroid-0.48.0.dist-info → langroid-0.48.1.dist-info}/licenses/LICENSE
File without changes