langroid 0.48.0__py3-none-any.whl → 0.48.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langroid/mytypes.py +10 -1
- langroid/parsing/url_loader.py +37 -9
- {langroid-0.48.0.dist-info → langroid-0.48.2.dist-info}/METADATA +1 -1
- {langroid-0.48.0.dist-info → langroid-0.48.2.dist-info}/RECORD +6 -6
- {langroid-0.48.0.dist-info → langroid-0.48.2.dist-info}/WHEEL +0 -0
- {langroid-0.48.0.dist-info → langroid-0.48.2.dist-info}/licenses/LICENSE +0 -0
langroid/mytypes.py
CHANGED
@@ -3,7 +3,7 @@ from textwrap import dedent
|
|
3
3
|
from typing import Any, Callable, Dict, List, Union
|
4
4
|
from uuid import uuid4
|
5
5
|
|
6
|
-
from langroid.pydantic_v1 import BaseModel, Extra, Field
|
6
|
+
from langroid.pydantic_v1 import BaseModel, Extra, Field, validator
|
7
7
|
|
8
8
|
Number = Union[int, float]
|
9
9
|
Embedding = List[Number]
|
@@ -45,10 +45,19 @@ class DocMetaData(BaseModel):
|
|
45
45
|
|
46
46
|
source: str = "context" # just reference
|
47
47
|
source_content: str = "context" # reference and content
|
48
|
+
title: str = "Unknown Title"
|
49
|
+
published_date: str = "Unknown Date"
|
48
50
|
is_chunk: bool = False # if it is a chunk, don't split
|
49
51
|
id: str = Field(default_factory=lambda: str(uuid4()))
|
50
52
|
window_ids: List[str] = [] # for RAG: ids of chunks around this one
|
51
53
|
|
54
|
+
@validator("source", "source_content", "id", "title", "published_date")
|
55
|
+
def ensure_not_empty(cls, v: str) -> str:
|
56
|
+
"""Ensure required string fields are not empty."""
|
57
|
+
if not v:
|
58
|
+
raise ValueError("Field cannot be empty")
|
59
|
+
return v
|
60
|
+
|
52
61
|
def dict_bool_int(self, *args: Any, **kwargs: Any) -> Dict[str, Any]:
|
53
62
|
"""
|
54
63
|
Special dict method to convert bool fields to int, to appease some
|
langroid/parsing/url_loader.py
CHANGED
@@ -258,7 +258,13 @@ class FirecrawlCrawler(BaseCrawler):
|
|
258
258
|
with open(filename, "w") as f:
|
259
259
|
f.write(content)
|
260
260
|
docs.append(
|
261
|
-
Document(
|
261
|
+
Document(
|
262
|
+
content=content,
|
263
|
+
metadata=DocMetaData(
|
264
|
+
source=url,
|
265
|
+
title=page["metadata"].get("title", "Unknown Title"),
|
266
|
+
),
|
267
|
+
)
|
262
268
|
)
|
263
269
|
processed_urls.add(url)
|
264
270
|
new_pages += 1
|
@@ -300,7 +306,10 @@ class FirecrawlCrawler(BaseCrawler):
|
|
300
306
|
docs.append(
|
301
307
|
Document(
|
302
308
|
content=result["markdown"],
|
303
|
-
metadata=DocMetaData(
|
309
|
+
metadata=DocMetaData(
|
310
|
+
source=url,
|
311
|
+
title=metadata.get("title", "Unknown Title"),
|
312
|
+
),
|
304
313
|
)
|
305
314
|
)
|
306
315
|
except Exception as e:
|
@@ -336,7 +345,7 @@ class ExaCrawler(BaseCrawler):
|
|
336
345
|
|
337
346
|
@property
|
338
347
|
def needs_parser(self) -> bool:
|
339
|
-
return
|
348
|
+
return True
|
340
349
|
|
341
350
|
def crawl(self, urls: List[str]) -> List[Document]:
|
342
351
|
"""Crawl the given URLs using Exa SDK.
|
@@ -363,12 +372,29 @@ class ExaCrawler(BaseCrawler):
|
|
363
372
|
docs = []
|
364
373
|
|
365
374
|
try:
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
375
|
+
for url in urls:
|
376
|
+
parsed_doc_chunks = self._process_document(url)
|
377
|
+
if parsed_doc_chunks:
|
378
|
+
docs.extend(parsed_doc_chunks)
|
379
|
+
continue
|
380
|
+
else:
|
381
|
+
results = exa.get_contents([url], livecrawl="always", text=True)
|
382
|
+
result = results.results[0]
|
383
|
+
if result.text:
|
384
|
+
# append a NON-chunked document
|
385
|
+
# (metadata.is_chunk = False, so will be chunked downstream)
|
386
|
+
docs.append(
|
387
|
+
Document(
|
388
|
+
content=result.text,
|
389
|
+
metadata=DocMetaData(
|
390
|
+
source=url,
|
391
|
+
title=getattr(result, "title", "Unknown Title"),
|
392
|
+
published_date=getattr(
|
393
|
+
result, "published_date", "Unknown Date"
|
394
|
+
),
|
395
|
+
),
|
396
|
+
)
|
397
|
+
)
|
372
398
|
|
373
399
|
except Exception as e:
|
374
400
|
logging.error(f"Error retrieving content from Exa API: {e}")
|
@@ -399,6 +425,8 @@ class URLLoader:
|
|
399
425
|
crawler_config = TrafilaturaConfig(parser=Parser(parsing_config))
|
400
426
|
|
401
427
|
self.crawler = CrawlerFactory.create_crawler(crawler_config)
|
428
|
+
if self.crawler.needs_parser:
|
429
|
+
self.crawler.parser = Parser(parsing_config)
|
402
430
|
|
403
431
|
def load(self) -> List[Document]:
|
404
432
|
"""Load the URLs using the specified crawler."""
|
@@ -1,6 +1,6 @@
|
|
1
1
|
langroid/__init__.py,sha256=z_fCOLQJPOw3LLRPBlFB5-2HyCjpPgQa4m4iY5Fvb8Y,1800
|
2
2
|
langroid/exceptions.py,sha256=OPjece_8cwg94DLPcOGA1ddzy5bGh65pxzcHMnssTz8,2995
|
3
|
-
langroid/mytypes.py,sha256=
|
3
|
+
langroid/mytypes.py,sha256=yzsPpDQqfndMP8ZX9zuQY_oLuUTkW2VJ_iLPARmKoLE,3268
|
4
4
|
langroid/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
5
|
langroid/agent/__init__.py,sha256=ll0Cubd2DZ-fsCMl7e10hf9ZjFGKzphfBco396IKITY,786
|
6
6
|
langroid/agent/base.py,sha256=U-UjdpxIFqkzRIB5-LYwHrhMSNI3sDbfnNRqIhrtsyI,79568
|
@@ -91,7 +91,7 @@ langroid/parsing/routing.py,sha256=-FcnlqldzL4ZoxuDwXjQPNHgBe9F9-F4R6q7b_z9CvI,1
|
|
91
91
|
langroid/parsing/search.py,sha256=0NJ5-Rou_BbrHAD7O9b20bKjZJnbadjObvGm4Zq8Kis,9818
|
92
92
|
langroid/parsing/spider.py,sha256=hAVM6wxh1pQ0EN4tI5wMBtAjIk0T-xnpi-ZUzWybhos,3258
|
93
93
|
langroid/parsing/table_loader.py,sha256=qNM4obT_0Y4tjrxNBCNUYjKQ9oETCZ7FbolKBTcz-GM,3410
|
94
|
-
langroid/parsing/url_loader.py,sha256=
|
94
|
+
langroid/parsing/url_loader.py,sha256=DvgkdCZ3gDlAajH0dIUjea4YyXkziK-g36WnaE1J_WI,14884
|
95
95
|
langroid/parsing/urls.py,sha256=Tjzr64YsCusiYkY0LEGB5-rSuX8T2P_4DVoOFKAeKuI,8081
|
96
96
|
langroid/parsing/utils.py,sha256=WwqzOhbQRlorbVvddDIZKv9b1KqZCBDm955lgIHDXRw,12828
|
97
97
|
langroid/parsing/web_search.py,sha256=sARV1Tku4wiInhuCz0kRaMHcoF6Ok6CLu7vapLS8hjs,8222
|
@@ -127,7 +127,7 @@ langroid/vector_store/pineconedb.py,sha256=otxXZNaBKb9f_H75HTaU3lMHiaR2NUp5MqwLZ
|
|
127
127
|
langroid/vector_store/postgres.py,sha256=wHPtIi2qM4fhO4pMQr95pz1ZCe7dTb2hxl4VYspGZoA,16104
|
128
128
|
langroid/vector_store/qdrantdb.py,sha256=O6dSBoDZ0jzfeVBd7LLvsXu083xs2fxXtPa9gGX3JX4,18443
|
129
129
|
langroid/vector_store/weaviatedb.py,sha256=Yn8pg139gOy3zkaPfoTbMXEEBCiLiYa1MU5d_3UA1K4,11847
|
130
|
-
langroid-0.48.
|
131
|
-
langroid-0.48.
|
132
|
-
langroid-0.48.
|
133
|
-
langroid-0.48.
|
130
|
+
langroid-0.48.2.dist-info/METADATA,sha256=kCjeNq2-TNlc0DM8DRitNPJsHUhubpKGNJ2q-Mp6rY4,63606
|
131
|
+
langroid-0.48.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
132
|
+
langroid-0.48.2.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
|
133
|
+
langroid-0.48.2.dist-info/RECORD,,
|
File without changes
|
File without changes
|