langroid 0.36.1__py3-none-any.whl → 0.37.0__py3-none-any.whl
- langroid/agent/special/doc_chat_agent.py +3 -3
- langroid/embedding_models/models.py +2 -2
- langroid/exceptions.py +16 -4
- langroid/parsing/code_parser.py +1 -1
- langroid/parsing/document_parser.py +167 -64
- langroid/parsing/parser.py +6 -4
- langroid/vector_store/chromadb.py +12 -1
- langroid/vector_store/qdrantdb.py +1 -1
- langroid/vector_store/weaviatedb.py +5 -5
- {langroid-0.36.1.dist-info → langroid-0.37.0.dist-info}/METADATA +20 -11
- {langroid-0.36.1.dist-info → langroid-0.37.0.dist-info}/RECORD +13 -13
- {langroid-0.36.1.dist-info → langroid-0.37.0.dist-info}/WHEEL +0 -0
- {langroid-0.36.1.dist-info → langroid-0.37.0.dist-info}/licenses/LICENSE +0 -0
langroid/agent/special/doc_chat_agent.py
CHANGED
@@ -100,7 +100,7 @@ hf_embed_config = SentenceTransformerEmbeddingsConfig(
 
 oai_embed_config = OpenAIEmbeddingsConfig(
     model_type="openai",
-    model_name="text-embedding-
+    model_name="text-embedding-3-small",
     dims=1536,
 )
 
@@ -189,8 +189,8 @@ class DocChatAgentConfig(ChatAgentConfig):
         # NOTE: PDF parsing is extremely challenging, and each library
         # has its own strengths and weaknesses.
         # Try one that works for your use case.
-        # or "unstructured", "
-        library="
+        # or "unstructured", "fitz", "pymupdf4llm", "pypdf"
+        library="pymupdf4llm",
     ),
 )
 
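As a usage note, here is a minimal sketch of selecting the new default PDF library explicitly (class and field names are taken from the hunks in this diff; the `parsing` field on `DocChatAgentConfig` is assumed from langroid's config layout):

from langroid.agent.special.doc_chat_agent import DocChatAgentConfig
from langroid.parsing.parser import ParsingConfig, PdfParsingConfig

# "pymupdf4llm" is the new default; "fitz", "docling", "pypdf",
# "unstructured", "pdf2image" remain valid choices per parser.py below.
config = DocChatAgentConfig(
    parsing=ParsingConfig(
        pdf=PdfParsingConfig(library="pymupdf4llm"),
    ),
)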
langroid/embedding_models/models.py
CHANGED
@@ -18,7 +18,7 @@ AzureADTokenProvider = Callable[[], str]
 
 class OpenAIEmbeddingsConfig(EmbeddingModelsConfig):
     model_type: str = "openai"
-    model_name: str = "text-embedding-
+    model_name: str = "text-embedding-3-large"
     api_key: str = ""
     api_base: Optional[str] = None
     organization: str = ""
@@ -28,7 +28,7 @@ class OpenAIEmbeddingsConfig(EmbeddingModelsConfig):
 
 class AzureOpenAIEmbeddingsConfig(EmbeddingModelsConfig):
     model_type: str = "azure-openai"
-    model_name: str = "text-embedding-
+    model_name: str = "text-embedding-3-large"
     api_key: str = ""
     api_base: str = ""
     deployment_name: Optional[str] = None
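The default embedding model moves to OpenAI's text-embedding-3 family. A minimal sketch (constructing these configs needs no API key; `dims=1536` mirrors the doc_chat_agent example above):

from langroid.embedding_models.models import OpenAIEmbeddingsConfig

default_cfg = OpenAIEmbeddingsConfig()  # defaults to "text-embedding-3-large"
small_cfg = OpenAIEmbeddingsConfig(
    model_name="text-embedding-3-small",
    dims=1536,  # smaller, cheaper variant
)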
langroid/exceptions.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import List, Optional
 
 
 class XMLException(Exception):
@@ -15,7 +15,7 @@ class LangroidImportError(ImportError):
     def __init__(
         self,
         package: Optional[str] = None,
-        extra: Optional[str] = None,
+        extra: Optional[str | List[str]] = None,
         error: str = "",
         *args: object,
     ) -> None:
@@ -33,9 +33,21 @@ class LangroidImportError(ImportError):
             error = f"{package} is not installed by default with Langroid.\n"
 
         if extra:
+            if isinstance(extra, list):
+                help_preamble = f"""
+                If you want to use it, please install langroid with one of these
+                extras: {', '.join(extra)}. The examples below use the first one,
+                i.e. {extra[0]}.
+                """
+                extra = extra[0]
+            else:
+                help_preamble = f"""
+                If you want to use it, please install langroid with the
+                `{extra}` extra.
+                """
+
             install_help = f"""
-
-            with the `{extra}` extra, for example:
+            {help_preamble}
 
             If you are using pip:
             pip install "langroid[{extra}]"
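`extra` may now be a list of extras; the message names all of them and builds the install examples from the first. A minimal sketch of the new behavior, using the same arguments DoclingParser passes later in this diff:

from langroid.exceptions import LangroidImportError

try:
    raise LangroidImportError(
        "docling", ["docling", "pdf-parsers", "all", "doc-chat"]
    )
except LangroidImportError as e:
    # the message lists all four extras; install examples use "docling"
    print(e)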
langroid/parsing/document_parser.py
CHANGED
@@ -3,9 +3,10 @@ from __future__ import annotations
 import itertools
 import logging
 import re
+import tempfile
 from enum import Enum
 from io import BytesIO
-from typing import TYPE_CHECKING, Any, Generator, List, Tuple
+from typing import TYPE_CHECKING, Any, Dict, Generator, List, Tuple
 
 from langroid.exceptions import LangroidImportError
 from langroid.utils.object_registry import ObjectRegistry
@@ -15,18 +16,24 @@ try:
 except ImportError:
     if not TYPE_CHECKING:
         fitz = None
+try:
+    import pymupdf4llm
+except ImportError:
+    if not TYPE_CHECKING:
+        pymupdf4llm = None
 
 try:
-    import
+    import docling
 except ImportError:
     if not TYPE_CHECKING:
-
+        docling = None
 
 try:
-    import
+    import pypdf
 except ImportError:
     if not TYPE_CHECKING:
-
+        pypdf = None
+
 
 import requests
 from bs4 import BeautifulSoup
@@ -41,6 +48,7 @@ logger = logging.getLogger(__name__)
 
 
 class DocumentType(str, Enum):
+    # TODO add `md` (Markdown) and `html`
     PDF = "pdf"
     DOCX = "docx"
     DOC = "doc"
@@ -139,10 +147,12 @@ class DocumentParser(Parser):
         if inferred_doc_type == DocumentType.PDF:
             if config.pdf.library == "fitz":
                 return FitzPDFParser(source, config)
+            elif config.pdf.library == "pymupdf4llm":
+                return PyMuPDF4LLMParser(source, config)
+            elif config.pdf.library == "docling":
+                return DoclingParser(source, config)
             elif config.pdf.library == "pypdf":
                 return PyPDFParser(source, config)
-            elif config.pdf.library == "pdfplumber":
-                return PDFPlumberParser(source, config)
             elif config.pdf.library == "unstructured":
                 return UnstructuredPDFParser(source, config)
             elif config.pdf.library == "pdf2image":
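This dispatch chain is reached through DocumentParser's factory classmethod, named `create` in langroid to the best of our reading (treat the name as an assumption). A sketch of a caller:

from langroid.parsing.document_parser import DocumentParser
from langroid.parsing.parser import ParsingConfig, PdfParsingConfig

parser = DocumentParser.create(  # factory name assumed
    "sample.pdf",  # hypothetical local file
    ParsingConfig(pdf=PdfParsingConfig(library="docling")),
)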
@@ -307,8 +317,11 @@ class DocumentParser(Parser):
         """Yield each page in the PDF."""
         raise NotImplementedError
 
-    def
-        """
+    def get_document_from_page(self, page: Any) -> Document:
+        """
+        Get Langroid Document object (with possible metadata)
+        corresponding to a given page.
+        """
         raise NotImplementedError
 
     def fix_text(self, text: str) -> str:
@@ -335,7 +348,10 @@ class DocumentParser(Parser):
         """
 
         text = "".join(
-            [
+            [
+                self.get_document_from_page(page).content
+                for _, page in self.iterate_pages()
+            ]
         )
         return Document(content=text, metadata=DocMetaData(source=self.source))
 
@@ -359,7 +375,10 @@ class DocumentParser(Parser):
         common_id = ObjectRegistry.new_id()
         n_chunks = 0  # how many chunk so far
         for i, page in self.iterate_pages():
-
+            # not used but could be useful, esp to blend the
+            # metadata from the pages into the chunks
+            page_doc = self.get_document_from_page(page)
+            page_text = page_doc.content
             split += self.tokenizer.encode(page_text)
             pages.append(str(i + 1))
             # split could be so long it needs to be split
@@ -422,81 +441,152 @@ class FitzPDFParser(DocumentParser):
             yield i, page
         doc.close()
 
-    def
+    def get_document_from_page(self, page: "fitz.Page") -> Document:
         """
-
+        Get Document object from a given `fitz` page.
 
         Args:
             page (fitz.Page): The `fitz` page object.
 
         Returns:
-
+            Document: Document object, with content and possible metadata.
         """
-        return
+        return Document(
+            content=self.fix_text(page.get_text()),
+            metadata=DocMetaData(source=self.source),
+        )
 
 
-class
+class PyMuPDF4LLMParser(DocumentParser):
     """
-    Parser for processing PDFs using the `
+    Parser for processing PDFs using the `pymupdf4llm` library.
     """
 
-    def iterate_pages(self) -> Generator[Tuple[int,
+    def iterate_pages(self) -> Generator[Tuple[int, "fitz.Page"], None, None]:
         """
-        Yield each page in the PDF using `
+        Yield each page in the PDF using `fitz`.
 
         Returns:
-            Generator[
+            Generator[fitz.Page]: Generator yielding each page.
         """
-        if
-            raise LangroidImportError(
-
-
+        if fitz is None:
+            raise LangroidImportError(
+                "pymupdf4llm", ["pymupdf4llm", "all", "pdf-parsers", "doc-chat"]
+            )
+        doc: fitz.Document = fitz.open(stream=self.doc_bytes, filetype="pdf")
+        pages: List[Dict[str, Any]] = pymupdf4llm.to_markdown(doc, page_chunks=True)
+        for i, page in enumerate(pages):
             yield i, page
+        doc.close()
 
-    def
+    def get_document_from_page(self, page: Dict[str, Any]) -> Document:
         """
-
+        Get Document object corresponding to a given "page-chunk"
+        dictionary, see:
+        https://pymupdf.readthedocs.io/en/latest/pymupdf4llm/api.html
+
 
         Args:
-            page (
+            page (Dict[str,Any]): The "page-chunk" dictionary.
 
         Returns:
-
+            Document: Document object, with content and possible metadata.
         """
-        return
+        return Document(
+            content=self.fix_text(page.get("text", "")),
+            # TODO could possible use other metadata from page, see above link.
+            metadata=DocMetaData(source=self.source),
+        )
 
 
-class
+class DoclingParser(DocumentParser):
     """
-    Parser for processing PDFs using the `
+    Parser for processing PDFs using the `docling` library.
     """
 
-    def iterate_pages(
-
-
+    def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
+        """
+        Yield each page in the PDF using `docling`.
+
+        Returns:
+            Generator[docling.Page]: Generator yielding each page.
         """
-
+        if docling is None:
+            raise LangroidImportError(
+                "docling", ["docling", "pdf-parsers", "all", "doc-chat"]
+            )
+        from docling.datamodel.document import TextItem  # type: ignore
+        from docling.document_converter import (  # type: ignore
+            ConversionResult,
+            DocumentConverter,
+        )
+
+        converter = DocumentConverter()
+        file_path = self.source
+        if file_path == "bytes":
+            with tempfile.NamedTemporaryFile(delete=False) as tmp:
+                tmp.write(self.doc_bytes.getvalue())
+                file_path = tmp.name
+        result: ConversionResult = converter.convert(file_path)
+        doc = result.document
+        n_pages = doc.num_pages()  # type: ignore
+        for i in range(n_pages):
+            texts = [
+                item[0].text
+                for item in doc.iterate_items(page_no=i + 1)
+                if isinstance(item[0], TextItem)
+            ]
+            text = "\n".join(texts)
+            yield i, text
+
+    def get_document_from_page(self, page: str) -> Document:
+        """
+        Get Document object from a given `docling` "page" (actually a chunk).
+
+        Args:
+            page (docling.chunking.DocChunk): The `docling` chunk
 
         Returns:
-
+            Document: Document object, with content and possible metadata.
         """
-
-
-
-
-
+        return Document(
+            content=self.fix_text(page),
+            metadata=DocMetaData(source=self.source),
+        )
+
 
-
+class PyPDFParser(DocumentParser):
+    """
+    Parser for processing PDFs using the `pypdf` library.
+    """
+
+    def iterate_pages(self) -> Generator[Tuple[int, pypdf.PageObject], None, None]:
         """
-
+        Yield each page in the PDF using `pypdf`.
+
+        Returns:
+            Generator[pypdf.pdf.PageObject]: Generator yielding each page.
+        """
+        if pypdf is None:
+            raise LangroidImportError("pypdf", "pdf-parsers")
+        reader = pypdf.PdfReader(self.doc_bytes)
+        for i, page in enumerate(reader.pages):
+            yield i, page
+
+    def get_document_from_page(self, page: pypdf.PageObject) -> Document:
+        """
+        Get Document object from a given `pypdf` page.
 
         Args:
-            page (
+            page (pypdf.pdf.PageObject): The `pypdf` page object.
 
         Returns:
-
+            Document: Document object, with content and possible metadata.
         """
-        return
+        return Document(
+            content=self.fix_text(page.extract_text()),
+            metadata=DocMetaData(source=self.source),
+        )
 
 
 class ImagePdfParser(DocumentParser):
@@ -516,15 +606,15 @@ class ImagePdfParser(DocumentParser):
         for i, image in enumerate(images):
             yield i, image
 
-    def
+    def get_document_from_page(self, page: "Image") -> Document:  # type: ignore
         """
-
+        Get Document object corresponding to a given `pdf2image` page.
 
         Args:
             page (Image): The PIL Image object.
 
         Returns:
-
+            Document: Document object, with content and possible metadata.
         """
         try:
             import pytesseract
@@ -532,7 +622,10 @@ class ImagePdfParser(DocumentParser):
             raise LangroidImportError("pytesseract", "pdf-parsers")
 
         text = pytesseract.image_to_string(page)
-        return
+        return Document(
+            content=self.fix_text(text),
+            metadata=DocMetaData(source=self.source),
+        )
 
 
 class UnstructuredPDFParser(DocumentParser):
@@ -564,8 +657,8 @@ class UnstructuredPDFParser(DocumentParser):
                 The `unstructured` library failed to parse the pdf.
                 Please try a different library by setting the `library` field
                 in the `pdf` section of the `parsing` field in the config file.
-
-                fitz,
+                Other supported libraries are:
+                fitz, pymupdf4llm, pypdf
                 """
             )
 
@@ -584,18 +677,21 @@ class UnstructuredPDFParser(DocumentParser):
             if page_elements:
                 yield page_number, page_elements
 
-    def
+    def get_document_from_page(self, page: Any) -> Document:
         """
-
+        Get Document object from a given `unstructured` element.
 
         Args:
             page (unstructured element): The `unstructured` element object.
 
         Returns:
-
+            Document: Document object, with content and possible metadata.
         """
         text = " ".join(el.text for el in page)
-        return
+        return Document(
+            content=self.fix_text(text),
+            metadata=DocMetaData(source=self.source),
+        )
 
 
 class UnstructuredDocxParser(DocumentParser):
@@ -632,9 +728,9 @@ class UnstructuredDocxParser(DocumentParser):
             if page_elements:
                 yield page_number, page_elements
 
-    def
+    def get_document_from_page(self, page: Any) -> Document:
         """
-
+        Get Document object from a given `unstructured` element.
 
         Note:
             The concept of "pages" doesn't actually exist in the .docx file format in
@@ -647,10 +743,13 @@ class UnstructuredDocxParser(DocumentParser):
             page (unstructured element): The `unstructured` element object.
 
         Returns:
-
+            Document object, with content and possible metadata.
         """
         text = " ".join(el.text for el in page)
-        return
+        return Document(
+            content=self.fix_text(text),
+            metadata=DocMetaData(source=self.source),
+        )
 
 
 class UnstructuredDocParser(UnstructuredDocxParser):
@@ -704,15 +803,19 @@ class PythonDocxParser(DocumentParser):
         for i, para in enumerate(doc.paragraphs, start=1):
             yield i, [para]
 
-    def
+    def get_document_from_page(self, page: Any) -> Document:
         """
-
+        Get Document object from a given 'page', which in this case is a single
+        paragraph.
 
         Args:
             page (list): A list containing a single Paragraph object.
 
         Returns:
-
+            Document: Document object, with content and possible metadata.
         """
         paragraph = page[0]
-        return
+        return Document(
+            content=self.fix_text(paragraph.text),
+            metadata=DocMetaData(source=self.source),
+        )
langroid/parsing/parser.py
CHANGED
@@ -23,11 +23,12 @@ class Splitter(str, Enum):
 class PdfParsingConfig(BaseSettings):
     library: Literal[
         "fitz",
-        "
+        "pymupdf4llm",
+        "docling",
         "pypdf",
         "unstructured",
         "pdf2image",
-    ] = "
+    ] = "pymupdf4llm"
 
 
 class DocxParsingConfig(BaseSettings):
@@ -40,6 +41,7 @@ class DocParsingConfig(BaseSettings):
 
 class ParsingConfig(BaseSettings):
     splitter: str = Splitter.TOKENS
+    chunk_by_page: bool = False  # split by page?
     chunk_size: int = 200  # aim for this many tokens per chunk
     overlap: int = 50  # overlap between chunks
     max_chunks: int = 10_000
@@ -49,7 +51,7 @@ class ParsingConfig(BaseSettings):
     n_similar_docs: int = 4
     n_neighbor_ids: int = 5  # window size to store around each chunk
     separators: List[str] = ["\n\n", "\n", " ", ""]
-    token_encoding_model: str = "text-embedding-
+    token_encoding_model: str = "text-embedding-3-large"
     pdf: PdfParsingConfig = PdfParsingConfig()
     docx: DocxParsingConfig = DocxParsingConfig()
     doc: DocParsingConfig = DocParsingConfig()
@@ -61,7 +63,7 @@ class Parser:
         try:
             self.tokenizer = tiktoken.encoding_for_model(config.token_encoding_model)
         except Exception:
-            self.tokenizer = tiktoken.encoding_for_model("text-embedding-
+            self.tokenizer = tiktoken.encoding_for_model("text-embedding-3-small")
 
     def num_tokens(self, text: str) -> int:
         tokens = self.tokenizer.encode(text)
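A sketch of the updated ParsingConfig, with the new `chunk_by_page` flag and the bumped token-encoding default (all field names and values come from the hunks above):

from langroid.parsing.parser import ParsingConfig, PdfParsingConfig

cfg = ParsingConfig(
    chunk_by_page=True,  # new flag: one chunk per page
    chunk_size=200,
    overlap=50,
    token_encoding_model="text-embedding-3-large",
    pdf=PdfParsingConfig(library="pymupdf4llm"),  # the new default
)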
langroid/vector_store/chromadb.py
CHANGED
@@ -1,6 +1,6 @@
 import json
 import logging
-from typing import Any, Dict, List, Optional, Sequence, Tuple
+from typing import Any, Dict, List, Literal, Optional, Sequence, Tuple
 
 from langroid.embedding_models.base import (
     EmbeddingModelsConfig,
@@ -18,6 +18,10 @@ logger = logging.getLogger(__name__)
 class ChromaDBConfig(VectorStoreConfig):
     collection_name: str = "temp"
     storage_path: str = ".chroma/data"
+    distance: Literal["cosine", "l2", "ip"] = "cosine"
+    construction_ef: int = 100
+    search_ef: int = 100
+    max_neighbors: int = 16
     embedding: EmbeddingModelsConfig = OpenAIEmbeddingsConfig()
     host: str = "127.0.0.1"
     port: int = 6333
@@ -109,6 +113,13 @@ class ChromaDB(VectorStore):
             name=self.config.collection_name,
             embedding_function=self.embedding_fn,
             get_or_create=not replace,
+            metadata={
+                "hnsw:space": self.config.distance,
+                "hnsw:construction_ef": self.config.construction_ef,
+                "hnsw:search_ef": self.config.search_ef,
+                # we could expose other configs, see:
+                # https://docs.trychroma.com/docs/collections/configure
+            },
         )
 
     def add_documents(self, documents: Sequence[Document]) -> None:
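A sketch of the new ChromaDB HNSW knobs (field names from the hunk above; the first three flow into the collection's `hnsw:*` metadata shown there):

from langroid.vector_store.chromadb import ChromaDBConfig

cfg = ChromaDBConfig(
    collection_name="docs",
    distance="cosine",    # or "l2", "ip"
    construction_ef=100,  # build-time accuracy/speed trade-off
    search_ef=100,        # query-time accuracy/speed trade-off
    max_neighbors=16,     # HNSW graph degree
)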
langroid/vector_store/qdrantdb.py
CHANGED
@@ -78,7 +78,7 @@ class QdrantDB(VectorStore):
         super().__init__(config)
         self.config: QdrantDBConfig = config
         self.embedding_fn: EmbeddingFunction = self.embedding_model.embedding_fn()
-        self.embedding_dim = self.
+        self.embedding_dim = len(self.embedding_fn(["test"])[0])
         if self.config.use_sparse_embeddings:
             try:
                 from transformers import AutoModelForMaskedLM, AutoTokenizer
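The fix above derives the vector size by embedding a probe string instead of trusting a configured constant, so it always matches the active model. The same idea in isolation:

from typing import Callable, List

def infer_dim(embedding_fn: Callable[[List[str]], List[List[float]]]) -> int:
    # embed a throwaway string and measure the resulting vector
    return len(embedding_fn(["test"])[0])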
langroid/vector_store/weaviatedb.py
CHANGED
@@ -43,8 +43,8 @@ class WeaviateDB(VectorStore):
         load_dotenv()
         key = os.getenv("WEAVIATE_API_KEY")
         url = os.getenv("WEAVIATE_API_URL")
-        if None
-
+        if url is None or key is None:
+            raise ValueError(
                 """WEAVIATE_API_KEY, WEAVIATE_API_URL env variable must be set to use
                 WeaviateDB in cloud mode. Please set these values
                 in your .env file.
@@ -130,9 +130,9 @@ class WeaviateDB(VectorStore):
         vector_index_config = Configure.VectorIndex.hnsw(
             distance_metric=VectorDistances.COSINE,
         )
-        if self.config.embedding
+        if isinstance(self.config.embedding, OpenAIEmbeddingsConfig):
             vectorizer_config = Configure.Vectorizer.text2vec_openai(
-                model=self.
+                model=self.config.embedding.model_name,
             )
         else:
             vectorizer_config = None
@@ -212,7 +212,7 @@ class WeaviateDB(VectorStore):
             return_metadata=MetadataQuery(distance=True),
         )
         return [
-            (self.weaviate_obj_to_doc(item), 1 - item.metadata.distance)
+            (self.weaviate_obj_to_doc(item), 1 - (item.metadata.distance or 1))
             for item in response.objects
         ]
 
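The scoring fix guards against Weaviate returning no distance. In isolation (note the trade-off: a genuine distance of 0.0 is also falsy, so an exact match would score 0 under this guard):

from typing import Optional

def cosine_score(distance: Optional[float]) -> float:
    # None -> fall back to distance 1, i.e. score 0
    return 1 - (distance or 1)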
{langroid-0.36.1.dist-info → langroid-0.37.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: langroid
-Version: 0.
+Version: 0.37.0
 Summary: Harness LLMs with Multi-Agent Programming
 Author-email: Prasad Chalasani <pchalasani@gmail.com>
 License: MIT
@@ -12,6 +12,7 @@ Requires-Dist: async-generator<2.0,>=1.10
 Requires-Dist: bs4<1.0.0,>=0.0.1
 Requires-Dist: cerebras-cloud-sdk<2.0.0,>=1.1.0
 Requires-Dist: colorlog<7.0.0,>=6.7.0
+Requires-Dist: docling<3.0.0,>=2.16.0
 Requires-Dist: docstring-parser<1.0,>=0.16
 Requires-Dist: duckduckgo-search<7.0.0,>=6.0.0
 Requires-Dist: faker<19.0.0,>=18.9.0
@@ -32,9 +33,10 @@ Requires-Dist: onnxruntime<2.0.0,>=1.16.1
 Requires-Dist: openai<2.0.0,>=1.45.0
 Requires-Dist: pandas<3.0.0,>=2.0.3
 Requires-Dist: prettytable<4.0.0,>=3.8.0
-Requires-Dist: pydantic<
+Requires-Dist: pydantic<3.0.0,>=1
 Requires-Dist: pygithub<2.0.0,>=1.58.1
 Requires-Dist: pygments<3.0.0,>=2.15.1
+Requires-Dist: pymupdf4llm<0.1.0,>=0.0.17
 Requires-Dist: pyparsing<4.0.0,>=3.0.9
 Requires-Dist: pytest-rerunfailures<16.0,>=15.0
 Requires-Dist: python-dotenv<2.0.0,>=1.0.0
@@ -55,14 +57,15 @@ Provides-Extra: all
 Requires-Dist: arango-datasets<2.0.0,>=1.2.2; extra == 'all'
 Requires-Dist: chainlit<3.0.0,>=2.0.1; extra == 'all'
 Requires-Dist: chromadb<=0.4.23,>=0.4.21; extra == 'all'
+Requires-Dist: docling<3.0.0,>=2.16.0; extra == 'all'
 Requires-Dist: fastembed<0.4.0,>=0.3.1; extra == 'all'
-Requires-Dist: huggingface-hub<0.
+Requires-Dist: huggingface-hub<1.0.0,>=0.21.2; extra == 'all'
 Requires-Dist: litellm<2.0.0,>=1.30.1; extra == 'all'
 Requires-Dist: metaphor-python<0.2.0,>=0.1.23; extra == 'all'
 Requires-Dist: neo4j<6.0.0,>=5.14.1; extra == 'all'
 Requires-Dist: pdf2image<2.0.0,>=1.17.0; extra == 'all'
-Requires-Dist: pdfplumber<0.11.0,>=0.10.2; extra == 'all'
 Requires-Dist: psycopg2<3.0.0,>=2.9.7; extra == 'all'
+Requires-Dist: pymupdf4llm<0.1.0,>=0.0.17; extra == 'all'
 Requires-Dist: pymupdf<2.0.0,>=1.23.3; extra == 'all'
 Requires-Dist: pymysql<2.0.0,>=1.1.0; extra == 'all'
 Requires-Dist: pypdf>=5.1.0; extra == 'all'
@@ -74,7 +77,7 @@ Requires-Dist: sentence-transformers<3.0.0,>=2.2.2; extra == 'all'
 Requires-Dist: sqlalchemy<3.0.0,>=2.0.19; extra == 'all'
 Requires-Dist: torch<3.0.0,>=2.0.0; extra == 'all'
 Requires-Dist: transformers<5.0.0,>=4.40.1; extra == 'all'
-Requires-Dist: unstructured[docx,pdf,pptx]<0.
+Requires-Dist: unstructured[docx,pdf,pptx]<1.0.0,>=0.16.15; extra == 'all'
 Requires-Dist: weaviate-client>=4.9.6; extra == 'all'
 Provides-Extra: arango
 Requires-Dist: arango-datasets<2.0.0,>=1.2.2; extra == 'arango'
@@ -89,13 +92,16 @@ Requires-Dist: psycopg2<3.0.0,>=2.9.7; extra == 'db'
 Requires-Dist: pymysql<2.0.0,>=1.1.0; extra == 'db'
 Requires-Dist: sqlalchemy<3.0.0,>=2.0.19; extra == 'db'
 Provides-Extra: doc-chat
+Requires-Dist: docling<3.0.0,>=2.16.0; extra == 'doc-chat'
 Requires-Dist: pdf2image<2.0.0,>=1.17.0; extra == 'doc-chat'
-Requires-Dist:
+Requires-Dist: pymupdf4llm<0.1.0,>=0.0.17; extra == 'doc-chat'
 Requires-Dist: pymupdf<2.0.0,>=1.23.3; extra == 'doc-chat'
 Requires-Dist: pypdf>=5.1.0; extra == 'doc-chat'
 Requires-Dist: pytesseract<0.4.0,>=0.3.10; extra == 'doc-chat'
 Requires-Dist: python-docx<2.0.0,>=1.1.0; extra == 'doc-chat'
-Requires-Dist: unstructured[docx,pdf,pptx]<0.
+Requires-Dist: unstructured[docx,pdf,pptx]<1.0.0,>=0.16.15; extra == 'doc-chat'
+Provides-Extra: docling
+Requires-Dist: docling<3.0.0,>=2.16.0; extra == 'docling'
 Provides-Extra: docx
 Requires-Dist: python-docx<2.0.0,>=1.1.0; extra == 'docx'
 Provides-Extra: fastembed
@@ -104,7 +110,7 @@ Provides-Extra: hf-embeddings
 Requires-Dist: sentence-transformers<3.0.0,>=2.2.2; extra == 'hf-embeddings'
 Requires-Dist: torch<3.0.0,>=2.0.0; extra == 'hf-embeddings'
 Provides-Extra: hf-transformers
-Requires-Dist: huggingface-hub<0.
+Requires-Dist: huggingface-hub<1.0.0,>=0.21.2; extra == 'hf-transformers'
 Requires-Dist: sentence-transformers<3.0.0,>=2.2.2; extra == 'hf-transformers'
 Requires-Dist: torch<3.0.0,>=2.0.0; extra == 'hf-transformers'
 Requires-Dist: transformers<5.0.0,>=4.40.1; extra == 'hf-transformers'
@@ -125,13 +131,16 @@ Requires-Dist: pymysql<2.0.0,>=1.1.0; extra == 'mysql'
 Provides-Extra: neo4j
 Requires-Dist: neo4j<6.0.0,>=5.14.1; extra == 'neo4j'
 Provides-Extra: pdf-parsers
+Requires-Dist: docling<3.0.0,>=2.16.0; extra == 'pdf-parsers'
 Requires-Dist: pdf2image<2.0.0,>=1.17.0; extra == 'pdf-parsers'
-Requires-Dist:
+Requires-Dist: pymupdf4llm<0.1.0,>=0.0.17; extra == 'pdf-parsers'
 Requires-Dist: pymupdf<2.0.0,>=1.23.3; extra == 'pdf-parsers'
 Requires-Dist: pypdf>=5.1.0; extra == 'pdf-parsers'
 Requires-Dist: pytesseract<0.4.0,>=0.3.10; extra == 'pdf-parsers'
 Provides-Extra: postgres
 Requires-Dist: psycopg2<3.0.0,>=2.9.7; extra == 'postgres'
+Provides-Extra: pymupdf4llm
+Requires-Dist: pymupdf4llm<0.1.0,>=0.0.17; extra == 'pymupdf4llm'
 Provides-Extra: scrapy
 Requires-Dist: scrapy<3.0.0,>=2.11.0; extra == 'scrapy'
 Provides-Extra: sql
@@ -139,11 +148,11 @@ Requires-Dist: psycopg2<3.0.0,>=2.9.7; extra == 'sql'
 Requires-Dist: pymysql<2.0.0,>=1.1.0; extra == 'sql'
 Requires-Dist: sqlalchemy<3.0.0,>=2.0.19; extra == 'sql'
 Provides-Extra: transformers
-Requires-Dist: huggingface-hub<0.
+Requires-Dist: huggingface-hub<1.0.0,>=0.21.2; extra == 'transformers'
 Requires-Dist: torch<3.0.0,>=2.0.0; extra == 'transformers'
 Requires-Dist: transformers<5.0.0,>=4.40.1; extra == 'transformers'
 Provides-Extra: unstructured
-Requires-Dist: unstructured[docx,pdf,pptx]<0.
+Requires-Dist: unstructured[docx,pdf,pptx]<1.0.0,>=0.16.15; extra == 'unstructured'
 Provides-Extra: vecdbs
 Requires-Dist: chromadb<=0.4.23,>=0.4.21; extra == 'vecdbs'
 Requires-Dist: lancedb<0.9.0,>=0.8.2; extra == 'vecdbs'
{langroid-0.36.1.dist-info → langroid-0.37.0.dist-info}/RECORD
CHANGED
@@ -1,5 +1,5 @@
 langroid/__init__.py,sha256=z_fCOLQJPOw3LLRPBlFB5-2HyCjpPgQa4m4iY5Fvb8Y,1800
-langroid/exceptions.py,sha256=
+langroid/exceptions.py,sha256=OPjece_8cwg94DLPcOGA1ddzy5bGh65pxzcHMnssTz8,2995
 langroid/mytypes.py,sha256=h1eMq1ZwTLVezObPfCseWNWbEOzP7mAKu2XoS63W1cM,2647
 langroid/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 langroid/agent/__init__.py,sha256=ll0Cubd2DZ-fsCMl7e10hf9ZjFGKzphfBco396IKITY,786
@@ -14,7 +14,7 @@ langroid/agent/xml_tool_message.py,sha256=6SshYZJKIfi4mkE-gIoSwjkEYekQ8GwcSiCv7a
 langroid/agent/callbacks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 langroid/agent/callbacks/chainlit.py,sha256=RH8qUXaZE5o2WQz3WJQ1SdFtASGlxWCA6_HYz_3meDQ,20822
 langroid/agent/special/__init__.py,sha256=gik_Xtm_zV7U9s30Mn8UX3Gyuy4jTjQe9zjiE3HWmEo,1273
-langroid/agent/special/doc_chat_agent.py,sha256=
+langroid/agent/special/doc_chat_agent.py,sha256=qoXp6PKI7oAQs8rgj934NzZaEEKsPICcgYl_iQY0bac,64818
 langroid/agent/special/lance_doc_chat_agent.py,sha256=s8xoRs0gGaFtDYFUSIRchsgDVbS5Q3C2b2mr3V1Fd-Q,10419
 langroid/agent/special/lance_tools.py,sha256=qS8x4wi8mrqfbYV2ztFzrcxyhHQ0ZWOc-zkYiH7awj0,2105
 langroid/agent/special/relevance_extractor_agent.py,sha256=zIx8GUdVo1aGW6ASla0NPQjYYIpmriK_TYMijqAx3F8,4796
@@ -57,7 +57,7 @@ langroid/cachedb/momento_cachedb.py,sha256=YEOJ62hEcV6iIeMr5aGgRYgWQqFYaej9gEDEc
 langroid/cachedb/redis_cachedb.py,sha256=7kgnbf4b5CKsCrlL97mHWKvdvlLt8zgn7lc528jEpiE,5141
 langroid/embedding_models/__init__.py,sha256=XhVIMQJbQRpImcnhA9sJR7h6r7QgPo1SKDCvwEUD9j4,851
 langroid/embedding_models/base.py,sha256=DUhvzALoW2UMbtmLxP4eJTfPii99WjUNX7bwFpj_K-0,2395
-langroid/embedding_models/models.py,sha256=
+langroid/embedding_models/models.py,sha256=YppD52U1lbeygt8_SuPNi6piOV_FgBltZWH5e3l7iso,16776
 langroid/embedding_models/remote_embeds.py,sha256=6_kjXByVbqhY9cGwl9R83ZcYC2km-nGieNNAo1McHaY,5151
 langroid/embedding_models/protoc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 langroid/embedding_models/protoc/embeddings.proto,sha256=_O-SgFpTaylQeOTgSpxhEJ7CUw7PeCQQJLaPqpPYKJg,321
@@ -77,11 +77,11 @@ langroid/language_models/prompt_formatter/hf_formatter.py,sha256=PVJppmjRvD-2DF-
 langroid/language_models/prompt_formatter/llama2_formatter.py,sha256=YdcO88qyBeuMENVIVvVqSYuEpvYSTndUe_jd6hVTko4,2899
 langroid/parsing/__init__.py,sha256=ZgSAfgTC6VsTLFlRSWT-TwYco7SQeRMeZG-49MnKYGY,936
 langroid/parsing/agent_chats.py,sha256=sbZRV9ujdM5QXvvuHVjIi2ysYSYlap-uqfMMUKulrW0,1068
-langroid/parsing/code_parser.py,sha256=
-langroid/parsing/document_parser.py,sha256=
+langroid/parsing/code_parser.py,sha256=5ze0MBytrGGkU69pA_bJDjRm6QZz_QYfPcIwkagUa7U,3796
+langroid/parsing/document_parser.py,sha256=1DjkoiieuPxlPtX-3FGzr3frDSKOjfKM4PhaKbVNQ1c,28570
 langroid/parsing/para_sentence_split.py,sha256=AJBzZojP3zpB-_IMiiHismhqcvkrVBQ3ZINoQyx_bE4,2000
 langroid/parsing/parse_json.py,sha256=aADo38bAHQhC8on4aWZZzVzSDy-dK35vRLZsFI2ewh8,4756
-langroid/parsing/parser.py,sha256=
+langroid/parsing/parser.py,sha256=WDv4QnNtAcLSiPe6cPhHOa-aMhrt3OV-kKnVXdgwtmI,12276
 langroid/parsing/repo_loader.py,sha256=3GjvPJS6Vf5L6gV2zOU8s-Tf1oq_fZm-IB_RL_7CTsY,29373
 langroid/parsing/routing.py,sha256=-FcnlqldzL4ZoxuDwXjQPNHgBe9F9-F4R6q7b_z9CvI,1232
 langroid/parsing/search.py,sha256=0i_r0ESb5HEQfagA2g7_uMQyxYPADWVbdcN9ixZhS4E,8992
@@ -116,13 +116,13 @@ langroid/utils/output/printing.py,sha256=yzPJZN-8_jyOJmI9N_oLwEDfjMwVgk3IDiwnZ4e
 langroid/utils/output/status.py,sha256=rzbE7mDJcgNNvdtylCseQcPGCGghtJvVq3lB-OPJ49E,1049
 langroid/vector_store/__init__.py,sha256=BcoOm1tG3y0EqjkIGmMOHkY9iTUhDHgyruknWDKgqIg,1214
 langroid/vector_store/base.py,sha256=suBanIt0iKEgnMnGdQOyWS58guG20Jyy-GK4DMMuYL0,14208
-langroid/vector_store/chromadb.py,sha256=
+langroid/vector_store/chromadb.py,sha256=XkpW7pnSf6Lk7Nf1BEIw-zjYGYchoWHgrhnJX7YmxD8,8725
 langroid/vector_store/lancedb.py,sha256=b3_vWkTjG8mweZ7ZNlUD-NjmQP_rLBZfyKWcxt2vosA,14855
 langroid/vector_store/meilisearch.py,sha256=6frB7GFWeWmeKzRfLZIvzRjllniZ1cYj3HmhHQICXLs,11663
 langroid/vector_store/momento.py,sha256=UNHGT6jXuQtqY9f6MdqGU14bVnS0zHgIJUa30ULpUJo,10474
-langroid/vector_store/qdrantdb.py,sha256=
-langroid/vector_store/weaviatedb.py,sha256=
-langroid-0.
-langroid-0.
-langroid-0.
-langroid-0.
+langroid/vector_store/qdrantdb.py,sha256=Cen6f-y6witiR53UQ-5a605Reo0gTj3ygXpE_ehYoZo,18116
+langroid/vector_store/weaviatedb.py,sha256=C6jd1Twl5_jux3JYyrcTfQb63Lk9HuiUzVF4NahXuGo,10642
+langroid-0.37.0.dist-info/METADATA,sha256=hlweiAhkVzVb_sVOPF-adwqwDPpAUUsgE1wJFRYNnKg,60524
+langroid-0.37.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+langroid-0.37.0.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
+langroid-0.37.0.dist-info/RECORD,,
{langroid-0.36.1.dist-info → langroid-0.37.0.dist-info}/WHEEL
File without changes
{langroid-0.36.1.dist-info → langroid-0.37.0.dist-info}/licenses/LICENSE
File without changes