langroid 0.36.0__py3-none-any.whl → 0.37.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langroid/agent/special/doc_chat_agent.py +14 -10
- langroid/embedding_models/models.py +2 -2
- langroid/exceptions.py +16 -4
- langroid/parsing/code_parser.py +1 -1
- langroid/parsing/document_parser.py +167 -64
- langroid/parsing/parser.py +11 -7
- langroid/parsing/utils.py +2 -2
- langroid/utils/output/citations.py +32 -12
- langroid/vector_store/base.py +1 -1
- langroid/vector_store/chromadb.py +12 -1
- langroid/vector_store/qdrantdb.py +1 -1
- langroid/vector_store/weaviatedb.py +5 -5
- {langroid-0.36.0.dist-info → langroid-0.37.0.dist-info}/METADATA +33 -16
- {langroid-0.36.0.dist-info → langroid-0.37.0.dist-info}/RECORD +16 -16
- {langroid-0.36.0.dist-info → langroid-0.37.0.dist-info}/WHEEL +0 -0
- {langroid-0.36.0.dist-info → langroid-0.37.0.dist-info}/licenses/LICENSE +0 -0
langroid/agent/special/doc_chat_agent.py CHANGED

```diff
@@ -15,6 +15,7 @@ pip install "langroid[hf-embeddings]"
 """
 
 import logging
+import textwrap
 from collections import OrderedDict
 from functools import cache
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, no_type_check
@@ -81,7 +82,7 @@ You will be given various passages from these documents, and asked to answer questions
 about them, or summarize them into coherent answers.
 """
 
-CHUNK_ENRICHMENT_DELIMITER = "<##-##-##>"
+CHUNK_ENRICHMENT_DELIMITER = "\n<##-##-##>"
 
 has_sentence_transformers = False
 try:
@@ -99,7 +100,7 @@ hf_embed_config = SentenceTransformerEmbeddingsConfig(
 
 oai_embed_config = OpenAIEmbeddingsConfig(
     model_type="openai",
-    model_name="text-embedding-ada-002",
+    model_name="text-embedding-3-small",
     dims=1536,
 )
 
```
```diff
@@ -188,8 +189,8 @@ class DocChatAgentConfig(ChatAgentConfig):
             # NOTE: PDF parsing is extremely challenging, and each library
             # has its own strengths and weaknesses.
             # Try one that works for your use case.
-            # or "unstructured", "
-            library="
+            # or "unstructured", "fitz", "pymupdf4llm", "pypdf"
+            library="pymupdf4llm",
         ),
     )
 
```
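With `pymupdf4llm` now the default PDF library in `DocChatAgentConfig`, setups that relied on a different parser can pin one explicitly. A minimal sketch, assuming the import paths below match your installed version:

```python
from langroid.agent.special.doc_chat_agent import DocChatAgentConfig
from langroid.parsing.parser import ParsingConfig, PdfParsingConfig

# Pin the PDF parser instead of relying on the new default ("pymupdf4llm").
config = DocChatAgentConfig(
    parsing=ParsingConfig(
        pdf=PdfParsingConfig(library="fitz"),
    )
)
```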
```diff
@@ -810,9 +811,11 @@ class DocChatAgent(ChatAgent):
         return "\n".join(
             [
                 f"""
-                [{i+1}]
+                -----[EXTRACT #{i+1}]----------
                 {content}
                 {source}
+                -----END OF EXTRACT------------
+
                 """
                 for i, (content, source) in enumerate(zip(contents, sources))
             ]
```
```diff
@@ -949,12 +952,13 @@ class DocChatAgent(ChatAgent):
                 continue
 
             # Combine original content with questions in a structured way
-            combined_content =
-
-
+            combined_content = textwrap.dedent(
+                f"""\
+                {doc.content}
                 {enrichment_config.delimiter}
                 {enrichment}
-                """
+                """
+            )
 
             new_doc = doc.copy(
                 update={
```
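The `textwrap.dedent(f"""\ ...""")` pattern above strips the indentation that a triple-quoted string picks up inside a method body, and the backslash after the opening quotes suppresses the leading newline. A standalone illustration (not langroid code; names are placeholders):

```python
import textwrap

def combine(content: str, delimiter: str, enrichment: str) -> str:
    # Backslash removes the initial newline; dedent() removes the common
    # leading indentation of the literal's lines.
    return textwrap.dedent(
        f"""\
        {content}
        {delimiter}
        {enrichment}
        """
    )

print(combine("chunk text", "<##-##-##>", "generated questions"))
```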
```diff
@@ -1440,7 +1444,7 @@ class DocChatAgent(ChatAgent):
         delimiter = self.config.chunk_enrichment_config.delimiter
         return [
             (
-                doc.copy(update={"content": doc.content.split(delimiter)[0]
+                doc.copy(update={"content": doc.content.split(delimiter)[0]})
                 if doc.content and getattr(doc.metadata, "has_enrichment", False)
                 else doc
             )
```
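Together with the leading `\n` added to `CHUNK_ENRICHMENT_DELIMITER` earlier in this diff, recovering the original chunk reduces to a split on the delimiter. A rough illustration of the behavior, not the library's exact code:

```python
CHUNK_ENRICHMENT_DELIMITER = "\n<##-##-##>"

enriched = "Original chunk text." + CHUNK_ENRICHMENT_DELIMITER + "\nHypothetical questions"
original = enriched.split(CHUNK_ENRICHMENT_DELIMITER)[0]
assert original == "Original chunk text."  # no trailing newline left behind
```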
langroid/embedding_models/models.py CHANGED

```diff
@@ -18,7 +18,7 @@ AzureADTokenProvider = Callable[[], str]
 
 class OpenAIEmbeddingsConfig(EmbeddingModelsConfig):
     model_type: str = "openai"
-    model_name: str = "text-embedding-ada-002"
+    model_name: str = "text-embedding-3-large"
     api_key: str = ""
     api_base: Optional[str] = None
     organization: str = ""
@@ -28,7 +28,7 @@ class OpenAIEmbeddingsConfig(EmbeddingModelsConfig):
 
 class AzureOpenAIEmbeddingsConfig(EmbeddingModelsConfig):
     model_type: str = "azure-openai"
-    model_name: str = "text-embedding-ada-002"
+    model_name: str = "text-embedding-3-large"
     api_key: str = ""
     api_base: str = ""
     deployment_name: Optional[str] = None
```
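Both configs now default to `text-embedding-3-large`, so code that implicitly depended on the previous default should pin the model explicitly. A hedged sketch (the `dims` field appears in the `doc_chat_agent.py` hunk above):

```python
from langroid.embedding_models.models import OpenAIEmbeddingsConfig

# Pin the embedding model rather than relying on the changed default.
# dims=1536 matches text-embedding-3-small; adjust if you pick another model.
embed_cfg = OpenAIEmbeddingsConfig(
    model_type="openai",
    model_name="text-embedding-3-small",
    dims=1536,
)
```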
langroid/exceptions.py CHANGED

```diff
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import List, Optional
 
 
 class XMLException(Exception):
@@ -15,7 +15,7 @@ class LangroidImportError(ImportError):
     def __init__(
         self,
         package: Optional[str] = None,
-        extra: Optional[str] = None,
+        extra: Optional[str | List[str]] = None,
         error: str = "",
         *args: object,
     ) -> None:
@@ -33,9 +33,21 @@ class LangroidImportError(ImportError):
             error = f"{package} is not installed by default with Langroid.\n"
 
         if extra:
+            if isinstance(extra, list):
+                help_preamble = f"""
+                If you want to use it, please install langroid with one of these
+                extras: {', '.join(extra)}. The examples below use the first one,
+                i.e. {extra[0]}.
+                """
+                extra = extra[0]
+            else:
+                help_preamble = f"""
+                If you want to use it, please install langroid with the
+                `{extra}` extra.
+                """
+
             install_help = f"""
-                If you want to use it, please install langroid
-                with the `{extra}` extra, for example:
+                {help_preamble}
 
                 If you are using pip:
                 pip install "langroid[{extra}]"
```
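With `extra` widened to accept a list, callers can name every extra that pulls in an optional package; the message recommends all of them and uses the first for the concrete install commands. A usage sketch mirroring how the new parsers in this release raise it:

```python
from langroid.exceptions import LangroidImportError

try:
    import docling  # optional dependency
except ImportError:
    # Any of these extras installs docling; the sample commands in the error
    # message use the first one, e.g. pip install "langroid[docling]"
    raise LangroidImportError("docling", ["docling", "pdf-parsers", "all", "doc-chat"])
```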
langroid/parsing/document_parser.py CHANGED
```diff
@@ -3,9 +3,10 @@ from __future__ import annotations
 import itertools
 import logging
 import re
+import tempfile
 from enum import Enum
 from io import BytesIO
-from typing import TYPE_CHECKING, Any, Generator, List, Tuple
+from typing import TYPE_CHECKING, Any, Dict, Generator, List, Tuple
 
 from langroid.exceptions import LangroidImportError
 from langroid.utils.object_registry import ObjectRegistry
@@ -15,18 +16,24 @@ try:
 except ImportError:
     if not TYPE_CHECKING:
         fitz = None
+try:
+    import pymupdf4llm
+except ImportError:
+    if not TYPE_CHECKING:
+        pymupdf4llm = None
 
 try:
-    import pdfplumber
+    import docling
 except ImportError:
     if not TYPE_CHECKING:
-        pdfplumber = None
+        docling = None
 
 try:
-    import
+    import pypdf
 except ImportError:
     if not TYPE_CHECKING:
-
+        pypdf = None
+
 
 import requests
 from bs4 import BeautifulSoup
```
```diff
@@ -41,6 +48,7 @@ logger = logging.getLogger(__name__)
 
 
 class DocumentType(str, Enum):
+    # TODO add `md` (Markdown) and `html`
     PDF = "pdf"
     DOCX = "docx"
     DOC = "doc"
@@ -139,10 +147,12 @@ class DocumentParser(Parser):
         if inferred_doc_type == DocumentType.PDF:
             if config.pdf.library == "fitz":
                 return FitzPDFParser(source, config)
+            elif config.pdf.library == "pymupdf4llm":
+                return PyMuPDF4LLMParser(source, config)
+            elif config.pdf.library == "docling":
+                return DoclingParser(source, config)
             elif config.pdf.library == "pypdf":
                 return PyPDFParser(source, config)
-            elif config.pdf.library == "pdfplumber":
-                return PDFPlumberParser(source, config)
             elif config.pdf.library == "unstructured":
                 return UnstructuredPDFParser(source, config)
             elif config.pdf.library == "pdf2image":
```
```diff
@@ -307,8 +317,11 @@ class DocumentParser(Parser):
         """Yield each page in the PDF."""
         raise NotImplementedError
 
-    def
-        """
+    def get_document_from_page(self, page: Any) -> Document:
+        """
+        Get Langroid Document object (with possible metadata)
+        corresponding to a given page.
+        """
         raise NotImplementedError
 
     def fix_text(self, text: str) -> str:
@@ -335,7 +348,10 @@ class DocumentParser(Parser):
         """
 
         text = "".join(
-            [
+            [
+                self.get_document_from_page(page).content
+                for _, page in self.iterate_pages()
+            ]
         )
         return Document(content=text, metadata=DocMetaData(source=self.source))
 
@@ -359,7 +375,10 @@ class DocumentParser(Parser):
         common_id = ObjectRegistry.new_id()
         n_chunks = 0  # how many chunk so far
         for i, page in self.iterate_pages():
-
+            # not used but could be useful, esp to blend the
+            # metadata from the pages into the chunks
+            page_doc = self.get_document_from_page(page)
+            page_text = page_doc.content
             split += self.tokenizer.encode(page_text)
             pages.append(str(i + 1))
             # split could be so long it needs to be split
```
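The base class now asks each parser for a `Document` per page instead of a raw string, so per-page metadata can ride along. A minimal sketch of a custom parser against this interface (a hypothetical subclass, not part of langroid):

```python
from typing import Any, Generator, Tuple

from langroid.mytypes import DocMetaData, Document
from langroid.parsing.document_parser import DocumentParser


class LineParser(DocumentParser):
    """Hypothetical parser that treats each line of a text file as a 'page'."""

    def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
        text = self.doc_bytes.getvalue().decode("utf-8", errors="ignore")
        for i, line in enumerate(text.splitlines()):
            yield i, line

    def get_document_from_page(self, page: Any) -> Document:
        # Each 'page' round-trips through a Document, so metadata can be
        # attached per page rather than only at whole-document level.
        return Document(
            content=self.fix_text(page),
            metadata=DocMetaData(source=self.source),
        )
```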
```diff
@@ -422,81 +441,152 @@ class FitzPDFParser(DocumentParser):
             yield i, page
         doc.close()
 
-    def
+    def get_document_from_page(self, page: "fitz.Page") -> Document:
         """
-
+        Get Document object from a given `fitz` page.
 
         Args:
             page (fitz.Page): The `fitz` page object.
 
         Returns:
-
+            Document: Document object, with content and possible metadata.
         """
-        return
+        return Document(
+            content=self.fix_text(page.get_text()),
+            metadata=DocMetaData(source=self.source),
+        )
 
 
-class
+class PyMuPDF4LLMParser(DocumentParser):
     """
-    Parser for processing PDFs using the `
+    Parser for processing PDFs using the `pymupdf4llm` library.
     """
 
-    def iterate_pages(self) -> Generator[Tuple[int,
+    def iterate_pages(self) -> Generator[Tuple[int, "fitz.Page"], None, None]:
         """
-        Yield each page in the PDF using `
+        Yield each page in the PDF using `fitz`.
 
         Returns:
-            Generator[
+            Generator[fitz.Page]: Generator yielding each page.
         """
-        if
-            raise LangroidImportError(
-
-
+        if fitz is None:
+            raise LangroidImportError(
+                "pymupdf4llm", ["pymupdf4llm", "all", "pdf-parsers", "doc-chat"]
+            )
+        doc: fitz.Document = fitz.open(stream=self.doc_bytes, filetype="pdf")
+        pages: List[Dict[str, Any]] = pymupdf4llm.to_markdown(doc, page_chunks=True)
+        for i, page in enumerate(pages):
             yield i, page
+        doc.close()
 
-    def
+    def get_document_from_page(self, page: Dict[str, Any]) -> Document:
         """
-
+        Get Document object corresponding to a given "page-chunk"
+        dictionary, see:
+        https://pymupdf.readthedocs.io/en/latest/pymupdf4llm/api.html
+
 
         Args:
-            page (
+            page (Dict[str,Any]): The "page-chunk" dictionary.
 
         Returns:
-
+            Document: Document object, with content and possible metadata.
         """
-        return
+        return Document(
+            content=self.fix_text(page.get("text", "")),
+            # TODO could possible use other metadata from page, see above link.
+            metadata=DocMetaData(source=self.source),
+        )
 
 
-class
+class DoclingParser(DocumentParser):
     """
-    Parser for processing PDFs using the `
+    Parser for processing PDFs using the `docling` library.
    """
 
-    def iterate_pages(
-
-
+    def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
+        """
+        Yield each page in the PDF using `docling`.
+
+        Returns:
+            Generator[docling.Page]: Generator yielding each page.
         """
-
+        if docling is None:
+            raise LangroidImportError(
+                "docling", ["docling", "pdf-parsers", "all", "doc-chat"]
+            )
+        from docling.datamodel.document import TextItem  # type: ignore
+        from docling.document_converter import (  # type: ignore
+            ConversionResult,
+            DocumentConverter,
+        )
+
+        converter = DocumentConverter()
+        file_path = self.source
+        if file_path == "bytes":
+            with tempfile.NamedTemporaryFile(delete=False) as tmp:
+                tmp.write(self.doc_bytes.getvalue())
+                file_path = tmp.name
+        result: ConversionResult = converter.convert(file_path)
+        doc = result.document
+        n_pages = doc.num_pages()  # type: ignore
+        for i in range(n_pages):
+            texts = [
+                item[0].text
+                for item in doc.iterate_items(page_no=i + 1)
+                if isinstance(item[0], TextItem)
+            ]
+            text = "\n".join(texts)
+            yield i, text
+
+    def get_document_from_page(self, page: str) -> Document:
+        """
+        Get Document object from a given `docling` "page" (actually a chunk).
+
+        Args:
+            page (docling.chunking.DocChunk): The `docling` chunk
 
         Returns:
-
+            Document: Document object, with content and possible metadata.
         """
-
-
-
-
-
+        return Document(
+            content=self.fix_text(page),
+            metadata=DocMetaData(source=self.source),
+        )
+
 
-
+class PyPDFParser(DocumentParser):
+    """
+    Parser for processing PDFs using the `pypdf` library.
+    """
+
+    def iterate_pages(self) -> Generator[Tuple[int, pypdf.PageObject], None, None]:
         """
-
+        Yield each page in the PDF using `pypdf`.
+
+        Returns:
+            Generator[pypdf.pdf.PageObject]: Generator yielding each page.
+        """
+        if pypdf is None:
+            raise LangroidImportError("pypdf", "pdf-parsers")
+        reader = pypdf.PdfReader(self.doc_bytes)
+        for i, page in enumerate(reader.pages):
+            yield i, page
+
+    def get_document_from_page(self, page: pypdf.PageObject) -> Document:
+        """
+        Get Document object from a given `pypdf` page.
 
         Args:
-            page (
+            page (pypdf.pdf.PageObject): The `pypdf` page object.
 
         Returns:
-
+            Document: Document object, with content and possible metadata.
         """
-        return
+        return Document(
+            content=self.fix_text(page.extract_text()),
+            metadata=DocMetaData(source=self.source),
+        )
 
 
 class ImagePdfParser(DocumentParser):
```
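A usage sketch of the new parsers via the `DocumentParser.create` factory shown earlier; the file path is a placeholder, and `get_doc` as the whole-document accessor is assumed unchanged in this release:

```python
from langroid.parsing.document_parser import DocumentParser
from langroid.parsing.parser import ParsingConfig, PdfParsingConfig

cfg = ParsingConfig(pdf=PdfParsingConfig(library="pymupdf4llm"))  # or "docling"
parser = DocumentParser.create("my_paper.pdf", cfg)  # hypothetical local file
doc = parser.get_doc()  # one Document concatenating all page contents
```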
```diff
@@ -516,15 +606,15 @@ class ImagePdfParser(DocumentParser):
         for i, image in enumerate(images):
             yield i, image
 
-    def
+    def get_document_from_page(self, page: "Image") -> Document:  # type: ignore
         """
-
+        Get Document object corresponding to a given `pdf2image` page.
 
         Args:
             page (Image): The PIL Image object.
 
         Returns:
-
+            Document: Document object, with content and possible metadata.
         """
         try:
             import pytesseract
@@ -532,7 +622,10 @@ class ImagePdfParser(DocumentParser):
             raise LangroidImportError("pytesseract", "pdf-parsers")
 
         text = pytesseract.image_to_string(page)
-        return
+        return Document(
+            content=self.fix_text(text),
+            metadata=DocMetaData(source=self.source),
+        )
 
 
 class UnstructuredPDFParser(DocumentParser):
```
```diff
@@ -564,8 +657,8 @@ class UnstructuredPDFParser(DocumentParser):
                 The `unstructured` library failed to parse the pdf.
                 Please try a different library by setting the `library` field
                 in the `pdf` section of the `parsing` field in the config file.
-
-                fitz,
+                Other supported libraries are:
+                fitz, pymupdf4llm, pypdf
                 """
             )
 
@@ -584,18 +677,21 @@ class UnstructuredPDFParser(DocumentParser):
             if page_elements:
                 yield page_number, page_elements
 
-    def
+    def get_document_from_page(self, page: Any) -> Document:
         """
-
+        Get Document object from a given `unstructured` element.
 
         Args:
             page (unstructured element): The `unstructured` element object.
 
         Returns:
-
+            Document: Document object, with content and possible metadata.
         """
         text = " ".join(el.text for el in page)
-        return
+        return Document(
+            content=self.fix_text(text),
+            metadata=DocMetaData(source=self.source),
+        )
 
 
 class UnstructuredDocxParser(DocumentParser):
```
```diff
@@ -632,9 +728,9 @@ class UnstructuredDocxParser(DocumentParser):
             if page_elements:
                 yield page_number, page_elements
 
-    def
+    def get_document_from_page(self, page: Any) -> Document:
         """
-
+        Get Document object from a given `unstructured` element.
 
         Note:
             The concept of "pages" doesn't actually exist in the .docx file format in
@@ -647,10 +743,13 @@ class UnstructuredDocxParser(DocumentParser):
             page (unstructured element): The `unstructured` element object.
 
         Returns:
-
+            Document object, with content and possible metadata.
         """
         text = " ".join(el.text for el in page)
-        return
+        return Document(
+            content=self.fix_text(text),
+            metadata=DocMetaData(source=self.source),
+        )
 
 
 class UnstructuredDocParser(UnstructuredDocxParser):
@@ -704,15 +803,19 @@ class PythonDocxParser(DocumentParser):
         for i, para in enumerate(doc.paragraphs, start=1):
             yield i, [para]
 
-    def
+    def get_document_from_page(self, page: Any) -> Document:
         """
-
+        Get Document object from a given 'page', which in this case is a single
+        paragraph.
 
         Args:
             page (list): A list containing a single Paragraph object.
 
         Returns:
-
+            Document: Document object, with content and possible metadata.
         """
         paragraph = page[0]
-        return
+        return Document(
+            content=self.fix_text(paragraph.text),
+            metadata=DocMetaData(source=self.source),
+        )
```
langroid/parsing/parser.py CHANGED

```diff
@@ -23,11 +23,12 @@ class Splitter(str, Enum):
 class PdfParsingConfig(BaseSettings):
     library: Literal[
         "fitz",
-        "pdfplumber",
+        "pymupdf4llm",
+        "docling",
         "pypdf",
         "unstructured",
         "pdf2image",
-    ] = "
+    ] = "pymupdf4llm"
 
 
 class DocxParsingConfig(BaseSettings):
```
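Since `"pdfplumber"` is no longer an accepted value of the `library` Literal (its parser is removed in the `document_parser.py` hunks above), configs that pinned it need to move to one of the remaining options:

```python
from langroid.parsing.parser import PdfParsingConfig

# 0.36.x configs with library="pdfplumber" must switch, e.g.:
pdf_cfg = PdfParsingConfig(library="docling")  # or "fitz", "pymupdf4llm", "pypdf"
```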
```diff
@@ -40,6 +41,7 @@ class DocParsingConfig(BaseSettings):
 
 class ParsingConfig(BaseSettings):
     splitter: str = Splitter.TOKENS
+    chunk_by_page: bool = False  # split by page?
     chunk_size: int = 200  # aim for this many tokens per chunk
     overlap: int = 50  # overlap between chunks
     max_chunks: int = 10_000
@@ -49,7 +51,7 @@ class ParsingConfig(BaseSettings):
     n_similar_docs: int = 4
     n_neighbor_ids: int = 5  # window size to store around each chunk
     separators: List[str] = ["\n\n", "\n", " ", ""]
-    token_encoding_model: str = "text-embedding-ada-002"
+    token_encoding_model: str = "text-embedding-3-large"
     pdf: PdfParsingConfig = PdfParsingConfig()
     docx: DocxParsingConfig = DocxParsingConfig()
     doc: DocParsingConfig = DocParsingConfig()
```
```diff
@@ -61,7 +63,7 @@ class Parser:
         try:
             self.tokenizer = tiktoken.encoding_for_model(config.token_encoding_model)
         except Exception:
-            self.tokenizer = tiktoken.encoding_for_model("text-embedding-ada-002")
+            self.tokenizer = tiktoken.encoding_for_model("text-embedding-3-small")
 
     def num_tokens(self, text: str) -> int:
         tokens = self.tokenizer.encode(text)
```
```diff
@@ -267,9 +269,11 @@ class Parser:
             # Truncate the chunk text at the punctuation mark
             chunk_text = chunk_text[: last_punctuation + 1]
 
-            #
-            #
-
+            # Replace redundant (3 or more) newlines with 2 newlines to preserve
+            # paragraph separation!
+            # But do NOT strip leading/trailing whitespace, to preserve formatting
+            # (e.g. code blocks, or in case we want to stitch chunks back together)
+            chunk_text_to_append = re.sub(r"\n{3,}", "\n\n", chunk_text)
 
             if len(chunk_text_to_append) > self.config.discard_chunk_chars:
                 # Append the chunk text to the list of chunks
```
langroid/parsing/utils.py CHANGED

```diff
@@ -310,9 +310,9 @@ def extract_numbered_segments(s: str, specs: str) -> str:
             ]
 
             # If we extracted any segments from this paragraph,
-            # join them and append to results
+            # join them with ellipsis (...) and append to results.
             if extracted_segments:
-                extracted_paragraphs.append("
+                extracted_paragraphs.append("...".join(extracted_segments))
 
     return "\n\n".join(extracted_paragraphs)
 
```
langroid/utils/output/citations.py CHANGED

```diff
@@ -17,25 +17,45 @@ def extract_markdown_references(md_string: str) -> list[int]:
     return sorted(set(int(match) for match in matches))
 
 
-def format_footnote_text(content: str, width: int =
+def format_footnote_text(content: str, width: int = 0) -> str:
     """
-    Formats the content
-
-
-    lines
+    Formats the content so that each original line is individually processed.
+    - If width=0, no wrapping is done (lines remain as is).
+    - If width>0, lines are wrapped to that width.
+    - Blank lines remain blank (with indentation).
+    - Everything is indented by 4 spaces (for markdown footnotes).
 
     Args:
         content (str): The text of the footnote to be formatted.
-        width (int): Maximum width of the text lines.
+        width (int): Maximum width of the text lines. If 0, lines are not wrapped.
 
     Returns:
         str: Properly formatted markdown footnote text.
     """
     import textwrap
 
-
-
-
-
-
-
+    indent = "    "  # 4 spaces for markdown footnotes
+    lines = content.split("\n")  # keep original line structure
+
+    output_lines = []
+    for line in lines:
+        # If the line is empty (or just spaces), keep it blank (but indented)
+        if not line.strip():
+            output_lines.append(indent)
+            continue
+
+        if width > 0:
+            # Wrap each non-empty line to the specified width
+            wrapped = textwrap.wrap(line, width=width)
+            if not wrapped:
+                # If textwrap gives nothing, add a blank (indented) line
+                output_lines.append(indent)
+            else:
+                for subline in wrapped:
+                    output_lines.append(indent + subline)
+        else:
+            # No wrapping: just indent the original line
+            output_lines.append(indent + line)
+
+    # Join them with newline so we preserve the paragraph/blank line structure
+    return "\n".join(output_lines)
```
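With the new default `width=0`, footnote text is indented but never re-wrapped; a positive width restores per-line wrapping. Expected behavior given the code above (import path per the module shown in the RECORD):

```python
from langroid.utils.output.citations import format_footnote_text

text = "First line\n\nSecond paragraph, kept on its own line"
print(format_footnote_text(text))            # width=0: 4-space indent, no wrapping
print(format_footnote_text(text, width=20))  # each line wrapped to 20 chars, then indented
```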
langroid/vector_store/base.py CHANGED

```diff
@@ -264,7 +264,7 @@ class VectorStore(ABC):
             metadata = copy.deepcopy(id2metadata[w[0]])
             metadata.window_ids = w
             document = Document(
-                content="
+                content="".join([d.content for d in self.get_documents_by_ids(w)]),
                 metadata=metadata,
             )
             # make a fresh id since content is in general different
```
langroid/vector_store/chromadb.py CHANGED

```diff
@@ -1,6 +1,6 @@
 import json
 import logging
-from typing import Any, Dict, List, Optional, Sequence, Tuple
+from typing import Any, Dict, List, Literal, Optional, Sequence, Tuple
 
 from langroid.embedding_models.base import (
     EmbeddingModelsConfig,
@@ -18,6 +18,10 @@ logger = logging.getLogger(__name__)
 class ChromaDBConfig(VectorStoreConfig):
     collection_name: str = "temp"
     storage_path: str = ".chroma/data"
+    distance: Literal["cosine", "l2", "ip"] = "cosine"
+    construction_ef: int = 100
+    search_ef: int = 100
+    max_neighbors: int = 16
     embedding: EmbeddingModelsConfig = OpenAIEmbeddingsConfig()
     host: str = "127.0.0.1"
     port: int = 6333
@@ -109,6 +113,13 @@ class ChromaDB(VectorStore):
             name=self.config.collection_name,
             embedding_function=self.embedding_fn,
             get_or_create=not replace,
+            metadata={
+                "hnsw:space": self.config.distance,
+                "hnsw:construction_ef": self.config.construction_ef,
+                "hnsw:search_ef": self.config.search_ef,
+                # we could expose other configs, see:
+                # https://docs.trychroma.com/docs/collections/configure
+            },
         )
 
     def add_documents(self, documents: Sequence[Document]) -> None:
```
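The new `ChromaDBConfig` fields map onto Chroma's HNSW collection metadata (`hnsw:space`, `hnsw:construction_ef`, `hnsw:search_ef`); note that `max_neighbors` is declared on the config but not forwarded in the metadata shown here. A hedged configuration sketch:

```python
from langroid.vector_store.chromadb import ChromaDB, ChromaDBConfig

cfg = ChromaDBConfig(
    collection_name="docs",
    distance="cosine",    # or "l2", "ip"
    construction_ef=200,  # higher: better index quality, slower builds
    search_ef=200,        # higher: better recall, slower queries
)
vecdb = ChromaDB(cfg)
```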
langroid/vector_store/qdrantdb.py CHANGED

```diff
@@ -78,7 +78,7 @@ class QdrantDB(VectorStore):
         super().__init__(config)
         self.config: QdrantDBConfig = config
         self.embedding_fn: EmbeddingFunction = self.embedding_model.embedding_fn()
-        self.embedding_dim = self.
+        self.embedding_dim = len(self.embedding_fn(["test"])[0])
         if self.config.use_sparse_embeddings:
             try:
                 from transformers import AutoModelForMaskedLM, AutoTokenizer
```
langroid/vector_store/weaviatedb.py CHANGED

```diff
@@ -43,8 +43,8 @@ class WeaviateDB(VectorStore):
             load_dotenv()
             key = os.getenv("WEAVIATE_API_KEY")
             url = os.getenv("WEAVIATE_API_URL")
-            if None
-
+            if url is None or key is None:
+                raise ValueError(
                 """WEAVIATE_API_KEY, WEAVIATE_API_URL env variable must be set to use
                 WeaviateDB in cloud mode. Please set these values
                 in your .env file.
@@ -130,9 +130,9 @@ class WeaviateDB(VectorStore):
         vector_index_config = Configure.VectorIndex.hnsw(
             distance_metric=VectorDistances.COSINE,
         )
-        if self.config.embedding
+        if isinstance(self.config.embedding, OpenAIEmbeddingsConfig):
             vectorizer_config = Configure.Vectorizer.text2vec_openai(
-                model=self.
+                model=self.config.embedding.model_name,
             )
         else:
             vectorizer_config = None
@@ -212,7 +212,7 @@ class WeaviateDB(VectorStore):
             return_metadata=MetadataQuery(distance=True),
         )
         return [
-            (self.weaviate_obj_to_doc(item), 1 - item.metadata.distance)
+            (self.weaviate_obj_to_doc(item), 1 - (item.metadata.distance or 1))
             for item in response.objects
         ]
 
```
{langroid-0.36.0.dist-info → langroid-0.37.0.dist-info}/METADATA CHANGED

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: langroid
-Version: 0.36.0
+Version: 0.37.0
 Summary: Harness LLMs with Multi-Agent Programming
 Author-email: Prasad Chalasani <pchalasani@gmail.com>
 License: MIT
@@ -12,6 +12,7 @@ Requires-Dist: async-generator<2.0,>=1.10
 Requires-Dist: bs4<1.0.0,>=0.0.1
 Requires-Dist: cerebras-cloud-sdk<2.0.0,>=1.1.0
 Requires-Dist: colorlog<7.0.0,>=6.7.0
+Requires-Dist: docling<3.0.0,>=2.16.0
 Requires-Dist: docstring-parser<1.0,>=0.16
 Requires-Dist: duckduckgo-search<7.0.0,>=6.0.0
 Requires-Dist: faker<19.0.0,>=18.9.0
@@ -32,9 +33,10 @@ Requires-Dist: onnxruntime<2.0.0,>=1.16.1
 Requires-Dist: openai<2.0.0,>=1.45.0
 Requires-Dist: pandas<3.0.0,>=2.0.3
 Requires-Dist: prettytable<4.0.0,>=3.8.0
-Requires-Dist: pydantic<
+Requires-Dist: pydantic<3.0.0,>=1
 Requires-Dist: pygithub<2.0.0,>=1.58.1
 Requires-Dist: pygments<3.0.0,>=2.15.1
+Requires-Dist: pymupdf4llm<0.1.0,>=0.0.17
 Requires-Dist: pyparsing<4.0.0,>=3.0.9
 Requires-Dist: pytest-rerunfailures<16.0,>=15.0
 Requires-Dist: python-dotenv<2.0.0,>=1.0.0
@@ -55,14 +57,15 @@ Provides-Extra: all
 Requires-Dist: arango-datasets<2.0.0,>=1.2.2; extra == 'all'
 Requires-Dist: chainlit<3.0.0,>=2.0.1; extra == 'all'
 Requires-Dist: chromadb<=0.4.23,>=0.4.21; extra == 'all'
+Requires-Dist: docling<3.0.0,>=2.16.0; extra == 'all'
 Requires-Dist: fastembed<0.4.0,>=0.3.1; extra == 'all'
-Requires-Dist: huggingface-hub<0.
+Requires-Dist: huggingface-hub<1.0.0,>=0.21.2; extra == 'all'
 Requires-Dist: litellm<2.0.0,>=1.30.1; extra == 'all'
 Requires-Dist: metaphor-python<0.2.0,>=0.1.23; extra == 'all'
 Requires-Dist: neo4j<6.0.0,>=5.14.1; extra == 'all'
 Requires-Dist: pdf2image<2.0.0,>=1.17.0; extra == 'all'
-Requires-Dist: pdfplumber<0.11.0,>=0.10.2; extra == 'all'
 Requires-Dist: psycopg2<3.0.0,>=2.9.7; extra == 'all'
+Requires-Dist: pymupdf4llm<0.1.0,>=0.0.17; extra == 'all'
 Requires-Dist: pymupdf<2.0.0,>=1.23.3; extra == 'all'
 Requires-Dist: pymysql<2.0.0,>=1.1.0; extra == 'all'
 Requires-Dist: pypdf>=5.1.0; extra == 'all'
@@ -74,7 +77,7 @@ Requires-Dist: sentence-transformers<3.0.0,>=2.2.2; extra == 'all'
 Requires-Dist: sqlalchemy<3.0.0,>=2.0.19; extra == 'all'
 Requires-Dist: torch<3.0.0,>=2.0.0; extra == 'all'
 Requires-Dist: transformers<5.0.0,>=4.40.1; extra == 'all'
-Requires-Dist: unstructured[docx,pdf,pptx]<0.
+Requires-Dist: unstructured[docx,pdf,pptx]<1.0.0,>=0.16.15; extra == 'all'
 Requires-Dist: weaviate-client>=4.9.6; extra == 'all'
 Provides-Extra: arango
 Requires-Dist: arango-datasets<2.0.0,>=1.2.2; extra == 'arango'
@@ -89,13 +92,16 @@ Requires-Dist: psycopg2<3.0.0,>=2.9.7; extra == 'db'
 Requires-Dist: pymysql<2.0.0,>=1.1.0; extra == 'db'
 Requires-Dist: sqlalchemy<3.0.0,>=2.0.19; extra == 'db'
 Provides-Extra: doc-chat
+Requires-Dist: docling<3.0.0,>=2.16.0; extra == 'doc-chat'
 Requires-Dist: pdf2image<2.0.0,>=1.17.0; extra == 'doc-chat'
-Requires-Dist:
+Requires-Dist: pymupdf4llm<0.1.0,>=0.0.17; extra == 'doc-chat'
 Requires-Dist: pymupdf<2.0.0,>=1.23.3; extra == 'doc-chat'
 Requires-Dist: pypdf>=5.1.0; extra == 'doc-chat'
 Requires-Dist: pytesseract<0.4.0,>=0.3.10; extra == 'doc-chat'
 Requires-Dist: python-docx<2.0.0,>=1.1.0; extra == 'doc-chat'
-Requires-Dist: unstructured[docx,pdf,pptx]<0.
+Requires-Dist: unstructured[docx,pdf,pptx]<1.0.0,>=0.16.15; extra == 'doc-chat'
+Provides-Extra: docling
+Requires-Dist: docling<3.0.0,>=2.16.0; extra == 'docling'
 Provides-Extra: docx
 Requires-Dist: python-docx<2.0.0,>=1.1.0; extra == 'docx'
 Provides-Extra: fastembed
@@ -104,7 +110,7 @@ Provides-Extra: hf-embeddings
 Requires-Dist: sentence-transformers<3.0.0,>=2.2.2; extra == 'hf-embeddings'
 Requires-Dist: torch<3.0.0,>=2.0.0; extra == 'hf-embeddings'
 Provides-Extra: hf-transformers
-Requires-Dist: huggingface-hub<0.
+Requires-Dist: huggingface-hub<1.0.0,>=0.21.2; extra == 'hf-transformers'
 Requires-Dist: sentence-transformers<3.0.0,>=2.2.2; extra == 'hf-transformers'
 Requires-Dist: torch<3.0.0,>=2.0.0; extra == 'hf-transformers'
 Requires-Dist: transformers<5.0.0,>=4.40.1; extra == 'hf-transformers'
@@ -125,13 +131,16 @@ Requires-Dist: pymysql<2.0.0,>=1.1.0; extra == 'mysql'
 Provides-Extra: neo4j
 Requires-Dist: neo4j<6.0.0,>=5.14.1; extra == 'neo4j'
 Provides-Extra: pdf-parsers
+Requires-Dist: docling<3.0.0,>=2.16.0; extra == 'pdf-parsers'
 Requires-Dist: pdf2image<2.0.0,>=1.17.0; extra == 'pdf-parsers'
-Requires-Dist:
+Requires-Dist: pymupdf4llm<0.1.0,>=0.0.17; extra == 'pdf-parsers'
 Requires-Dist: pymupdf<2.0.0,>=1.23.3; extra == 'pdf-parsers'
 Requires-Dist: pypdf>=5.1.0; extra == 'pdf-parsers'
 Requires-Dist: pytesseract<0.4.0,>=0.3.10; extra == 'pdf-parsers'
 Provides-Extra: postgres
 Requires-Dist: psycopg2<3.0.0,>=2.9.7; extra == 'postgres'
+Provides-Extra: pymupdf4llm
+Requires-Dist: pymupdf4llm<0.1.0,>=0.0.17; extra == 'pymupdf4llm'
 Provides-Extra: scrapy
 Requires-Dist: scrapy<3.0.0,>=2.11.0; extra == 'scrapy'
 Provides-Extra: sql
@@ -139,11 +148,11 @@ Requires-Dist: psycopg2<3.0.0,>=2.9.7; extra == 'sql'
 Requires-Dist: pymysql<2.0.0,>=1.1.0; extra == 'sql'
 Requires-Dist: sqlalchemy<3.0.0,>=2.0.19; extra == 'sql'
 Provides-Extra: transformers
-Requires-Dist: huggingface-hub<0.
+Requires-Dist: huggingface-hub<1.0.0,>=0.21.2; extra == 'transformers'
 Requires-Dist: torch<3.0.0,>=2.0.0; extra == 'transformers'
 Requires-Dist: transformers<5.0.0,>=4.40.1; extra == 'transformers'
 Provides-Extra: unstructured
-Requires-Dist: unstructured[docx,pdf,pptx]<0.
+Requires-Dist: unstructured[docx,pdf,pptx]<1.0.0,>=0.16.15; extra == 'unstructured'
 Provides-Extra: vecdbs
 Requires-Dist: chromadb<=0.4.23,>=0.4.21; extra == 'vecdbs'
 Requires-Dist: lancedb<0.9.0,>=0.8.2; extra == 'vecdbs'
@@ -292,20 +301,28 @@ teacher_task.run()
 <summary> <b>Click to expand</b></summary>
 
 - **Jan 2025:**
-  - [0.
+  - [0.36.0](https://github.com/langroid/langroid/releases/tag/0.36.0): Weaviate vector-db support (thanks @abab-dev).
+  - [0.35.0](https://github.com/langroid/langroid/releases/tag/0.35.0): Capture/Stream reasoning content from
+    Reasoning LLMs (e.g. DeepSeek, OpenAI o1) in addition to final answer.
+  - [0.34.0](https://github.com/langroid/langroid/releases/tag/0.34.0): DocChatAgent
+    chunk enrichment to improve retrieval. (collaboration with @dfm88).
+  - [0.33.0](https://github.com/langroid/langroid/releases/tag/0.33.3) Move from Poetry to uv! (thanks @abab-dev).
   - [0.32.0](https://github.com/langroid/langroid/releases/tag/0.32.0) DeepSeek v3 support.
 - **Dec 2024:**
   - [0.31.0](https://github.com/langroid/langroid/releases/tag/0.31.0) Azure OpenAI Embeddings
-  - [0.30.0](https://github.com/langroid/langroid/releases/tag/0.30.0) Llama-cpp embeddings.
-  - [0.29.0](https://github.com/langroid/langroid/releases/tag/0.29.0) Custom Azure OpenAI Client
+  - [0.30.0](https://github.com/langroid/langroid/releases/tag/0.30.0) Llama-cpp embeddings (thanks @Kwigg).
+  - [0.29.0](https://github.com/langroid/langroid/releases/tag/0.29.0) Custom Azure OpenAI Client (thanks
+    @johannestang).
   - [0.28.0](https://github.com/langroid/langroid/releases/tag/0.28.0) `ToolMessage`: `_handler` field to override
-    default handler method name in `request` field.
+    default handler method name in `request` field (thanks @alexagr).
   - [0.27.0](https://github.com/langroid/langroid/releases/tag/0.27.0) OpenRouter Support.
   - [0.26.0](https://github.com/langroid/langroid/releases/tag/0.26.0) Update to latest Chainlit.
-  - [0.25.0](https://github.com/langroid/langroid/releases/tag/0.25.0) True Async Methods for agent and
+  - [0.25.0](https://github.com/langroid/langroid/releases/tag/0.25.0) True Async Methods for agent and
+    user-response (thanks @alexagr).
 - **Nov 2024:**
   - **[0.24.0](https://langroid.github.io/langroid/notes/structured-output/)**:
     Enables support for `Agent`s with strict JSON schema output format on compatible LLMs and strict mode for the OpenAI tools API.
+    (thanks @nilspalumbo).
   - **[0.23.0](https://langroid.github.io/langroid/tutorials/local-llm-setup/#local-llms-hosted-on-glhfchat)**:
     support for LLMs (e.g. `Qwen2.5-Coder-32b-Instruct`) hosted on glhf.chat
   - **[0.22.0](https://langroid.github.io/langroid/notes/large-tool-results/)**:
```
{langroid-0.36.0.dist-info → langroid-0.37.0.dist-info}/RECORD CHANGED

```diff
@@ -1,5 +1,5 @@
 langroid/__init__.py,sha256=z_fCOLQJPOw3LLRPBlFB5-2HyCjpPgQa4m4iY5Fvb8Y,1800
-langroid/exceptions.py,sha256=
+langroid/exceptions.py,sha256=OPjece_8cwg94DLPcOGA1ddzy5bGh65pxzcHMnssTz8,2995
 langroid/mytypes.py,sha256=h1eMq1ZwTLVezObPfCseWNWbEOzP7mAKu2XoS63W1cM,2647
 langroid/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 langroid/agent/__init__.py,sha256=ll0Cubd2DZ-fsCMl7e10hf9ZjFGKzphfBco396IKITY,786
@@ -14,7 +14,7 @@ langroid/agent/xml_tool_message.py,sha256=6SshYZJKIfi4mkE-gIoSwjkEYekQ8GwcSiCv7a
 langroid/agent/callbacks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 langroid/agent/callbacks/chainlit.py,sha256=RH8qUXaZE5o2WQz3WJQ1SdFtASGlxWCA6_HYz_3meDQ,20822
 langroid/agent/special/__init__.py,sha256=gik_Xtm_zV7U9s30Mn8UX3Gyuy4jTjQe9zjiE3HWmEo,1273
-langroid/agent/special/doc_chat_agent.py,sha256=
+langroid/agent/special/doc_chat_agent.py,sha256=qoXp6PKI7oAQs8rgj934NzZaEEKsPICcgYl_iQY0bac,64818
 langroid/agent/special/lance_doc_chat_agent.py,sha256=s8xoRs0gGaFtDYFUSIRchsgDVbS5Q3C2b2mr3V1Fd-Q,10419
 langroid/agent/special/lance_tools.py,sha256=qS8x4wi8mrqfbYV2ztFzrcxyhHQ0ZWOc-zkYiH7awj0,2105
 langroid/agent/special/relevance_extractor_agent.py,sha256=zIx8GUdVo1aGW6ASla0NPQjYYIpmriK_TYMijqAx3F8,4796
@@ -57,7 +57,7 @@ langroid/cachedb/momento_cachedb.py,sha256=YEOJ62hEcV6iIeMr5aGgRYgWQqFYaej9gEDEc
 langroid/cachedb/redis_cachedb.py,sha256=7kgnbf4b5CKsCrlL97mHWKvdvlLt8zgn7lc528jEpiE,5141
 langroid/embedding_models/__init__.py,sha256=XhVIMQJbQRpImcnhA9sJR7h6r7QgPo1SKDCvwEUD9j4,851
 langroid/embedding_models/base.py,sha256=DUhvzALoW2UMbtmLxP4eJTfPii99WjUNX7bwFpj_K-0,2395
-langroid/embedding_models/models.py,sha256=
+langroid/embedding_models/models.py,sha256=YppD52U1lbeygt8_SuPNi6piOV_FgBltZWH5e3l7iso,16776
 langroid/embedding_models/remote_embeds.py,sha256=6_kjXByVbqhY9cGwl9R83ZcYC2km-nGieNNAo1McHaY,5151
 langroid/embedding_models/protoc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 langroid/embedding_models/protoc/embeddings.proto,sha256=_O-SgFpTaylQeOTgSpxhEJ7CUw7PeCQQJLaPqpPYKJg,321
@@ -77,11 +77,11 @@ langroid/language_models/prompt_formatter/hf_formatter.py,sha256=PVJppmjRvD-2DF-
 langroid/language_models/prompt_formatter/llama2_formatter.py,sha256=YdcO88qyBeuMENVIVvVqSYuEpvYSTndUe_jd6hVTko4,2899
 langroid/parsing/__init__.py,sha256=ZgSAfgTC6VsTLFlRSWT-TwYco7SQeRMeZG-49MnKYGY,936
 langroid/parsing/agent_chats.py,sha256=sbZRV9ujdM5QXvvuHVjIi2ysYSYlap-uqfMMUKulrW0,1068
-langroid/parsing/code_parser.py,sha256=
-langroid/parsing/document_parser.py,sha256=
+langroid/parsing/code_parser.py,sha256=5ze0MBytrGGkU69pA_bJDjRm6QZz_QYfPcIwkagUa7U,3796
+langroid/parsing/document_parser.py,sha256=1DjkoiieuPxlPtX-3FGzr3frDSKOjfKM4PhaKbVNQ1c,28570
 langroid/parsing/para_sentence_split.py,sha256=AJBzZojP3zpB-_IMiiHismhqcvkrVBQ3ZINoQyx_bE4,2000
 langroid/parsing/parse_json.py,sha256=aADo38bAHQhC8on4aWZZzVzSDy-dK35vRLZsFI2ewh8,4756
-langroid/parsing/parser.py,sha256=
+langroid/parsing/parser.py,sha256=WDv4QnNtAcLSiPe6cPhHOa-aMhrt3OV-kKnVXdgwtmI,12276
 langroid/parsing/repo_loader.py,sha256=3GjvPJS6Vf5L6gV2zOU8s-Tf1oq_fZm-IB_RL_7CTsY,29373
 langroid/parsing/routing.py,sha256=-FcnlqldzL4ZoxuDwXjQPNHgBe9F9-F4R6q7b_z9CvI,1232
 langroid/parsing/search.py,sha256=0i_r0ESb5HEQfagA2g7_uMQyxYPADWVbdcN9ixZhS4E,8992
@@ -89,7 +89,7 @@ langroid/parsing/spider.py,sha256=hAVM6wxh1pQ0EN4tI5wMBtAjIk0T-xnpi-ZUzWybhos,32
 langroid/parsing/table_loader.py,sha256=qNM4obT_0Y4tjrxNBCNUYjKQ9oETCZ7FbolKBTcz-GM,3410
 langroid/parsing/url_loader.py,sha256=JK48KktLRDBfjrt4nsUfy92M6yGdEeicAqOum2MdULM,4656
 langroid/parsing/urls.py,sha256=XjpaV5onG7gKQ5iQeFTzHSw5P08Aqw0g-rMUu61lR6s,7988
-langroid/parsing/utils.py,sha256=
+langroid/parsing/utils.py,sha256=YrV2GNL4EOBGknA4AClPGdJ4S5B31radrt-Ou8OAKoU,12749
 langroid/parsing/web_search.py,sha256=8rW8EI3tyHITaB2l9MT_6yLMeQfo8y-Ih-8N2v2uMpk,4931
 langroid/prompts/__init__.py,sha256=RW11vK6jiLPuaUh4GpeFvstti73gkm8_rDMtrbo2YsU,142
 langroid/prompts/dialog.py,sha256=SpfiSyofSgy2pwD1YboHR_yHO3LEEMbv6j2sm874jKo,331
@@ -111,18 +111,18 @@ langroid/utils/types.py,sha256=4GrOnU3HLWh-UwaUPp7LlB3V413q3K5OSzc0ggDoQ6A,2510
 langroid/utils/algorithms/__init__.py,sha256=WylYoZymA0fnzpB4vrsH_0n7WsoLhmuZq8qxsOCjUpM,41
 langroid/utils/algorithms/graph.py,sha256=JbdpPnUOhw4-D6O7ou101JLA3xPCD0Lr3qaPoFCaRfo,2866
 langroid/utils/output/__init__.py,sha256=7P0f--4IZneNsTxXY5fd6d6iW-CeVe-KSsl-87sbBPc,340
-langroid/utils/output/citations.py,sha256=
+langroid/utils/output/citations.py,sha256=mQhRXVN-uhmKd2z32UZQBE0adZGEaQJ7cVXLfkrcZJI,2221
 langroid/utils/output/printing.py,sha256=yzPJZN-8_jyOJmI9N_oLwEDfjMwVgk3IDiwnZ4eK_AE,2962
 langroid/utils/output/status.py,sha256=rzbE7mDJcgNNvdtylCseQcPGCGghtJvVq3lB-OPJ49E,1049
 langroid/vector_store/__init__.py,sha256=BcoOm1tG3y0EqjkIGmMOHkY9iTUhDHgyruknWDKgqIg,1214
-langroid/vector_store/base.py,sha256=
-langroid/vector_store/chromadb.py,sha256=
+langroid/vector_store/base.py,sha256=suBanIt0iKEgnMnGdQOyWS58guG20Jyy-GK4DMMuYL0,14208
+langroid/vector_store/chromadb.py,sha256=XkpW7pnSf6Lk7Nf1BEIw-zjYGYchoWHgrhnJX7YmxD8,8725
 langroid/vector_store/lancedb.py,sha256=b3_vWkTjG8mweZ7ZNlUD-NjmQP_rLBZfyKWcxt2vosA,14855
 langroid/vector_store/meilisearch.py,sha256=6frB7GFWeWmeKzRfLZIvzRjllniZ1cYj3HmhHQICXLs,11663
 langroid/vector_store/momento.py,sha256=UNHGT6jXuQtqY9f6MdqGU14bVnS0zHgIJUa30ULpUJo,10474
-langroid/vector_store/qdrantdb.py,sha256=
-langroid/vector_store/weaviatedb.py,sha256=
-langroid-0.
-langroid-0.
-langroid-0.
-langroid-0.
+langroid/vector_store/qdrantdb.py,sha256=Cen6f-y6witiR53UQ-5a605Reo0gTj3ygXpE_ehYoZo,18116
+langroid/vector_store/weaviatedb.py,sha256=C6jd1Twl5_jux3JYyrcTfQb63Lk9HuiUzVF4NahXuGo,10642
+langroid-0.37.0.dist-info/METADATA,sha256=hlweiAhkVzVb_sVOPF-adwqwDPpAUUsgE1wJFRYNnKg,60524
+langroid-0.37.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+langroid-0.37.0.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
+langroid-0.37.0.dist-info/RECORD,,
```
{langroid-0.36.0.dist-info → langroid-0.37.0.dist-info}/WHEEL: file without changes
{langroid-0.36.0.dist-info → langroid-0.37.0.dist-info}/licenses/LICENSE: file without changes