langroid 0.1.64__py3-none-any.whl → 0.1.65__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langroid/agent/special/doc_chat_agent.py +6 -4
- langroid/mytypes.py +1 -0
- langroid/parsing/parser.py +10 -3
- langroid/parsing/pdf_parser.py +72 -9
- langroid/parsing/repo_loader.py +9 -3
- langroid/parsing/url_loader.py +8 -3
- {langroid-0.1.64.dist-info → langroid-0.1.65.dist-info}/METADATA +1 -1
- {langroid-0.1.64.dist-info → langroid-0.1.65.dist-info}/RECORD +10 -10
- {langroid-0.1.64.dist-info → langroid-0.1.65.dist-info}/LICENSE +0 -0
- {langroid-0.1.64.dist-info → langroid-0.1.65.dist-info}/WHEEL +0 -0
langroid/agent/special/doc_chat_agent.py
CHANGED
@@ -19,7 +19,7 @@ from langroid.embedding_models.models import OpenAIEmbeddingsConfig
 from langroid.language_models.base import StreamingIfAllowed
 from langroid.language_models.openai_gpt import OpenAIChatModel, OpenAIGPTConfig
 from langroid.mytypes import DocMetaData, Document, Entity
-from langroid.parsing.parser import ParsingConfig, Splitter
+from langroid.parsing.parser import Parser, ParsingConfig, Splitter
 from langroid.parsing.repo_loader import RepoLoader
 from langroid.parsing.url_loader import URLLoader
 from langroid.parsing.urls import get_urls_and_paths
@@ -86,7 +86,8 @@ class DocChatAgentConfig(ChatAgentConfig):
     ]
     parsing: ParsingConfig = ParsingConfig(  # modify as needed
         splitter=Splitter.TOKENS,
-        chunk_size=
+        chunk_size=800,  # aim for this many tokens per chunk
+        overlap=100,  # overlap between chunks
         max_chunks=10_000,
         # aim to have at least this many chars per chunk when
         # truncating due to punctuation
@@ -145,12 +146,13 @@ class DocChatAgent(ChatAgent):
             return
         urls, paths = get_urls_and_paths(self.config.doc_paths)
         docs: List[Document] = []
+        parser = Parser(self.config.parsing)
         if len(urls) > 0:
-            loader = URLLoader(urls=urls)
+            loader = URLLoader(urls=urls, parser=parser)
             docs = loader.load()
         if len(paths) > 0:
             for p in paths:
-                path_docs = RepoLoader.get_documents(p)
+                path_docs = RepoLoader.get_documents(p, parser=parser)
                 docs.extend(path_docs)
         n_docs = len(docs)
         n_splits = self.ingest_docs(docs)
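The net effect of the doc_chat_agent.py changes: a single `Parser`, built from the agent's `ParsingConfig`, is threaded through both the URL and file-path loading routes, so PDFs are token-chunked at load time instead of arriving as one large document. A minimal sketch of the new wiring (config values and class names are from the diff; the URL and surrounding script are illustrative):

```python
from langroid.parsing.parser import Parser, ParsingConfig, Splitter
from langroid.parsing.url_loader import URLLoader

# config values taken from the new defaults in this diff
config = ParsingConfig(
    splitter=Splitter.TOKENS,
    chunk_size=800,  # target tokens per chunk
    overlap=100,     # tokens shared between consecutive chunks
)
parser = Parser(config)

# any .pdf among the URLs is now chunked at load time with these settings
loader = URLLoader(urls=["https://example.com/report.pdf"], parser=parser)
docs = loader.load()
```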
langroid/mytypes.py
CHANGED
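The mytypes.py hunk is not rendered here; the summary above shows a single added line. Judging from the new `d.metadata.is_chunk` checks in parser.py and the `is_chunk=True` metadata written in pdf_parser.py, the addition is presumably a boolean flag on `DocMetaData`. A sketch of what that one-liner likely looks like (the surrounding field is illustrative, not from the diff):

```python
from pydantic import BaseModel

class DocMetaData(BaseModel):
    source: str = "context"  # illustrative existing field
    is_chunk: bool = False   # presumed new flag: True if this doc is already a chunk
```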
langroid/parsing/parser.py
CHANGED
@@ -199,11 +199,18 @@ class Parser:
     def split(self, docs: List[Document]) -> List[Document]:
         if len(docs) == 0:
             return []
+        # some docs are already splits, so don't split them further!
+        chunked_docs = [d for d in docs if d.metadata.is_chunk]
+        big_docs = [d for d in docs if not d.metadata.is_chunk]
+        if len(big_docs) == 0:
+            return chunked_docs
         if self.config.splitter == Splitter.PARA_SENTENCE:
-
+            big_doc_chunks = self.split_para_sentence(big_docs)
         elif self.config.splitter == Splitter.TOKENS:
-
+            big_doc_chunks = self.split_chunk_tokens(big_docs)
         elif self.config.splitter == Splitter.SIMPLE:
-
+            big_doc_chunks = self.split_simple(big_docs)
         else:
             raise ValueError(f"Unknown splitter: {self.config.splitter}")
+
+        return chunked_docs + big_doc_chunks
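The new `split()` logic makes chunking idempotent: documents already flagged as chunks pass through untouched, and only un-chunked ("big") documents are split. A self-contained sketch of that partition-then-merge pattern (the types and the `chunk_fn` callback are illustrative; only the `is_chunk` flag and the control flow come from the diff):

```python
from dataclasses import dataclass, field
from typing import Callable, List

@dataclass
class Meta:
    is_chunk: bool = False

@dataclass
class Doc:
    content: str
    metadata: Meta = field(default_factory=Meta)

def split(docs: List[Doc], chunk_fn: Callable[[Doc], List[Doc]]) -> List[Doc]:
    # docs that are already chunks are passed through unchanged
    chunked = [d for d in docs if d.metadata.is_chunk]
    big = [d for d in docs if not d.metadata.is_chunk]
    if not big:
        return chunked
    # split only the un-chunked docs, then merge with the pass-throughs
    return chunked + [c for d in big for c in chunk_fn(d)]
```

Without the pass-through, re-ingesting documents that pdf_parser.py has already chunked would split them a second time.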
langroid/parsing/pdf_parser.py
CHANGED
@@ -1,15 +1,20 @@
 from io import BytesIO
+from typing import List
 
 import requests
 from pypdf import PdfReader
 
 from langroid.mytypes import DocMetaData, Document
-from langroid.parsing.parser import Parser
+from langroid.parsing.parser import Parser
 
 
 class PdfParser(Parser):
-    def __init__(self,
-        super().__init__(config)
+    def __init__(self, parser: Parser):
+        super().__init__(parser.config)
+
+    @classmethod
+    def from_Parser(cls, parser: Parser) -> "PdfParser":
+        return cls(parser)
 
     @staticmethod
     def _text_from_pdf_reader(reader: PdfReader) -> str:
@@ -25,7 +30,7 @@ class PdfParser(Parser):
             text += page.extract_text()
         return text
 
-    def
+    def _doc_chunks_from_pdf_reader(
         self,
         reader: PdfReader,
         doc: str,
@@ -46,7 +51,8 @@ class PdfParser(Parser):
             overlap (int): number of tokens to overlap between chunks
 
         Returns:
-            List[Document]: a list of `Document` objects,
+            List[Document]: a list of `Document` objects,
+                each containing a chunk of text
         """
 
         split = []  # tokens in curr split
@@ -64,7 +70,10 @@ class PdfParser(Parser):
             docs.append(
                 Document(
                     content=self.tokenizer.decode(split[:chunk_tokens]),
-                    metadata=DocMetaData(
+                    metadata=DocMetaData(
+                        source=f"{doc} pages {pg}",
+                        is_chunk=True,
+                    ),
                 )
             )
             split = split[chunk_tokens - overlap :]
@@ -74,14 +83,45 @@ class PdfParser(Parser):
         docs.append(
             Document(
                 content=self.tokenizer.decode(split[:chunk_tokens]),
-                metadata=DocMetaData(
+                metadata=DocMetaData(
+                    source=f"{doc} pages {pg}",
+                    is_chunk=True,
+                ),
             )
         )
         return docs
 
+    @staticmethod
+    def doc_chunks_from_pdf_url(url: str, parser: Parser) -> List[Document]:
+        """
+        Get chunks of text from pdf URL as a list of Document objects,
+        using the parser's chunk_size and overlap settings.
+
+        Args:
+            url (str): contains the URL to the PDF file
+        Returns:
+            a `Document` object containing the content of the pdf file,
+            and metadata containing url
+        """
+
+        pdfParser = PdfParser.from_Parser(parser)
+        response = requests.get(url)
+        response.raise_for_status()
+        with BytesIO(response.content) as f:
+            reader = PdfReader(f)
+            docs = pdfParser._doc_chunks_from_pdf_reader(
+                reader,
+                doc=url,
+                chunk_tokens=parser.config.chunk_size,
+                overlap=parser.config.overlap,
+            )
+        return docs
+
     @staticmethod
     def get_doc_from_pdf_url(url: str) -> Document:
         """
+        Get entire text from pdf URL as a single document.
+
         Args:
             url (str): contains the URL to the PDF file
         Returns:
@@ -92,9 +132,32 @@ class PdfParser(Parser):
         response.raise_for_status()
         with BytesIO(response.content) as f:
             reader = PdfReader(f)
-            text = _text_from_pdf_reader(reader)
+            text = PdfParser._text_from_pdf_reader(reader)
         return Document(content=text, metadata=DocMetaData(source=str(url)))
 
+    @staticmethod
+    def doc_chunks_from_pdf_path(path: str, parser: Parser) -> List[Document]:
+        """
+        Get chunks of text from pdf path as a list of Document objects,
+        using the parser's chunk_size and overlap settings.
+
+        Args:
+            url (str): contains the URL to the PDF file
+        Returns:
+            a `Document` object containing the content of the pdf file,
+            and metadata containing url
+        """
+
+        pdfParser = PdfParser.from_Parser(parser)
+        reader = PdfReader(path)
+        docs = pdfParser._doc_chunks_from_pdf_reader(
+            reader,
+            doc=path,
+            chunk_tokens=parser.config.chunk_size,
+            overlap=parser.config.overlap,
+        )
+        return docs
+
     @staticmethod
     def get_doc_from_pdf_file(path: str) -> Document:
         """
@@ -107,5 +170,5 @@ class PdfParser(Parser):
         and metadata containing path/url
         """
         reader = PdfReader(path)
-        text = _text_from_pdf_reader(reader)
+        text = PdfParser._text_from_pdf_reader(reader)
         return Document(content=text, metadata=DocMetaData(source=str(path)))
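pdf_parser.py now offers two routes per source (URL or local path): the old whole-document getters, and new chunking variants that reuse a `Parser`'s `chunk_size`/`overlap` and mark each chunk's metadata with `is_chunk=True`. A usage sketch (method names are from the diff; the URL and file paths are placeholders):

```python
from langroid.parsing.parser import Parser, ParsingConfig
from langroid.parsing.pdf_parser import PdfParser

parser = Parser(ParsingConfig(chunk_size=800, overlap=100))

# new in 0.1.65: one Document per token chunk, each marked is_chunk=True
url_chunks = PdfParser.doc_chunks_from_pdf_url("https://example.com/paper.pdf", parser)
path_chunks = PdfParser.doc_chunks_from_pdf_path("docs/paper.pdf", parser)

# still available: the whole PDF as a single Document
whole = PdfParser.get_doc_from_pdf_file("docs/paper.pdf")
```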
langroid/parsing/repo_loader.py
CHANGED
@@ -18,7 +18,8 @@ from github.Repository import Repository
 from pydantic import BaseSettings
 
 from langroid.mytypes import DocMetaData, Document
-from langroid.parsing.
+from langroid.parsing.parser import Parser
+from langroid.parsing.pdf_parser import PdfParser
 
 logger = logging.getLogger(__name__)
 
@@ -443,6 +444,7 @@ class RepoLoader:
         exclude_dirs: Optional[List[str]] = None,
         depth: int = -1,
         lines: Optional[int] = None,
+        parser: Optional[Parser] = None,
     ) -> List[Document]:
         """
         Recursively get all files under a path as Document objects.
@@ -458,6 +460,7 @@ class RepoLoader:
                 which includes all depths.
             lines (int, optional): Number of lines to read from each file.
                 Defaults to None, which reads all lines.
+            parser (Parser, optional): Parser to use to parse files.
 
         Returns:
             List[Document]: List of Document objects representing files.
@@ -490,8 +493,11 @@ class RepoLoader:
 
         for file_path in file_paths:
             _, file_extension = os.path.splitext(file_path)
-            if file_extension == ".pdf":
-
+            if file_extension.lower() == ".pdf":
+                if parser is None:
+                    docs.append(PdfParser.get_doc_from_pdf_file(file_path))
+                else:
+                    docs.extend(PdfParser.doc_chunks_from_pdf_path(file_path, parser))
             else:
                 with open(file_path, "r") as f:
                     if lines is not None:
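The new optional `parser` argument keeps `RepoLoader.get_documents` backward compatible: omitting it preserves the old one-Document-per-PDF behavior, while passing it switches PDFs to chunked loading (the extension check is also now case-insensitive). A sketch (the directory path is a placeholder):

```python
from langroid.parsing.parser import Parser, ParsingConfig
from langroid.parsing.repo_loader import RepoLoader

# without a parser: each PDF under the path stays one Document (old behavior)
docs = RepoLoader.get_documents("path/to/checkout")

# with a parser: PDFs are split into overlapping token chunks
parser = Parser(ParsingConfig(chunk_size=800, overlap=100))
chunked = RepoLoader.get_documents("path/to/checkout", parser=parser)
```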
langroid/parsing/url_loader.py
CHANGED
@@ -9,7 +9,8 @@ from trafilatura.downloads import (
 )
 
 from langroid.mytypes import DocMetaData, Document
-from langroid.parsing.
+from langroid.parsing.parser import Parser
+from langroid.parsing.pdf_parser import PdfParser
 
 logging.getLogger("trafilatura").setLevel(logging.ERROR)
 
@@ -26,8 +27,9 @@ class URLLoader:
         the "accept" button on the cookie dialog.
     """
 
-    def __init__(self, urls: List[str]):
+    def __init__(self, urls: List[str], parser: Parser | None = None):
         self.urls = urls
+        self.parser = parser
 
     @no_type_check
     def load(self) -> List[Document]:
@@ -43,7 +45,10 @@ class URLLoader:
         )
         for url, result in buffered_downloads(buffer, threads):
             if url.lower().endswith(".pdf"):
-
+                if self.parser is None:
+                    docs.append(PdfParser.get_doc_from_pdf_url(url))
+                else:
+                    docs.extend(PdfParser.doc_chunks_from_pdf_url(url, self.parser))
             else:
                 text = trafilatura.extract(
                     result,
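`URLLoader` mirrors the repo_loader change: `parser` defaults to `None`, keeping the old single-Document PDF behavior. Note the `Parser | None` annotation uses PEP 604 union syntax, which requires Python 3.10+ unless `from __future__ import annotations` is in effect. A sketch of both modes (URLs are placeholders):

```python
from langroid.parsing.parser import Parser, ParsingConfig
from langroid.parsing.url_loader import URLLoader

urls = ["https://example.com/page.html", "https://example.com/doc.pdf"]

# parser omitted: the PDF loads as one Document (pre-0.1.65 behavior)
plain_docs = URLLoader(urls=urls).load()

# parser given: the PDF is split into overlapping token chunks
parser = Parser(ParsingConfig(chunk_size=800, overlap=100))
chunked_docs = URLLoader(urls=urls, parser=parser).load()
```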
{langroid-0.1.64.dist-info → langroid-0.1.65.dist-info}/RECORD
CHANGED
@@ -6,7 +6,7 @@ langroid/agent/chat_document.py,sha256=k7Klav3FIBTf2w95bQtxgqBrf2fMo1ydSlklQvv4R
 langroid/agent/helpers.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 langroid/agent/junk,sha256=LxfuuW7Cijsg0szAzT81OjWWv1PMNI-6w_-DspVIO2s,339
 langroid/agent/special/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-langroid/agent/special/doc_chat_agent.py,sha256=
+langroid/agent/special/doc_chat_agent.py,sha256=RmYRHclTz7D8dH3eIBFWINm8Cl5zvICt2Ri-6AHRPqc,18227
 langroid/agent/special/recipient_validator_agent.py,sha256=R3Rit93BNWQar_9stuDBGzmLr2W-IYOQ7oq-tlNNlps,6035
 langroid/agent/special/retriever_agent.py,sha256=DeOB5crFjXBvDEZT9k9ZVinOfFM2VgS6tQWWFyXSk9o,7204
 langroid/agent/special/sql/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -40,18 +40,18 @@ langroid/language_models/prompt_formatter/__init__.py,sha256=47DEQpj8HBSa-_TImW-
 langroid/language_models/prompt_formatter/base.py,sha256=2y_GcwhstvB5ih3haS7l5Fv79jVnFJ_vEw1jqWJzB9k,1247
 langroid/language_models/prompt_formatter/llama2_formatter.py,sha256=YdcO88qyBeuMENVIVvVqSYuEpvYSTndUe_jd6hVTko4,2899
 langroid/language_models/utils.py,sha256=rmnSn-sJ3aKl_wBdeLPkck0Li4Ed6zkCxZYYl7n1V34,4668
-langroid/mytypes.py,sha256=
+langroid/mytypes.py,sha256=nJyTaX-nAe2dwRdF8NZKxUNAy_Hvxgtkn1c9buT-d14,1619
 langroid/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 langroid/parsing/agent_chats.py,sha256=sbZRV9ujdM5QXvvuHVjIi2ysYSYlap-uqfMMUKulrW0,1068
 langroid/parsing/code-parsing.md,sha256=--cyyNiSZSDlIwcjAV4-shKrSiRe2ytF3AdSoS_hD2g,3294
 langroid/parsing/code_parser.py,sha256=BbDAzp35wkYQ9U1dpf1ARL0lVyi0tfqEc6_eox2C090,3727
 langroid/parsing/json.py,sha256=MVqBUfInALQm1QKbcfEvLzWxBz_UztCIyGk7AK5uFPo,1650
 langroid/parsing/para_sentence_split.py,sha256=AJBzZojP3zpB-_IMiiHismhqcvkrVBQ3ZINoQyx_bE4,2000
-langroid/parsing/parser.py,sha256=
-langroid/parsing/pdf_parser.py,sha256=
-langroid/parsing/repo_loader.py,sha256=
+langroid/parsing/parser.py,sha256=R1yvqjBvXKOcX1opCj5nTYuDK2HLAfkonM055DM5CP8,7826
+langroid/parsing/pdf_parser.py,sha256=0FHrxFXwJ5m9xLQlyKrlRGeApGjkJ3gRk1a3fs1h7AI,5890
+langroid/parsing/repo_loader.py,sha256=lQ_9tceOEiPWATf_SL43iubz05G1XXoF4vINsKJHmkY,27324
 langroid/parsing/table_loader.py,sha256=uqbupGr4y_7os18RtaY5GpD0hWcgzROoNy8dQIHB4kc,1767
-langroid/parsing/url_loader.py,sha256=
+langroid/parsing/url_loader.py,sha256=MfYr2zK-1pOMEEc9y_vrMdtC052XMlr57X2ptYKpDX4,2103
 langroid/parsing/url_loader_cookies.py,sha256=Lg4sNpRz9MByWq2mde6T0hKv68VZSV3mtMjNEHuFeSU,2327
 langroid/parsing/urls.py,sha256=_Bcf1iRdT7cQrQ8hnbPX0Jtzxc0lVFaucTS5rJoKA14,3709
 langroid/parsing/utils.py,sha256=zqvZWpZktRJTKx_JAqxaIyoudMdKVdB1zzjnOhVYHS4,2196
@@ -82,7 +82,7 @@ langroid/vector_store/base.py,sha256=QZx3NUNwf2I0r3A7iuoUHIRGbqt_pFGD0hq1R-Yg8iM
 langroid/vector_store/chromadb.py,sha256=s5pQkKjaMP-Tt5A8M10EInFzttaALPbJAq7q4gf0TKg,5235
 langroid/vector_store/qdrant_cloud.py,sha256=3im4Mip0QXLkR6wiqVsjV1QvhSElfxdFSuDKddBDQ-4,188
 langroid/vector_store/qdrantdb.py,sha256=KRvIIj1IZG2zFqejofMnRs2hT86B-27LgBEnuczdqOU,9072
-langroid-0.1.
-langroid-0.1.
-langroid-0.1.
-langroid-0.1.
+langroid-0.1.65.dist-info/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
+langroid-0.1.65.dist-info/WHEEL,sha256=vVCvjcmxuUltf8cYhJ0sJMRDLr1XsPuxEId8YDzbyCY,88
+langroid-0.1.65.dist-info/METADATA,sha256=B5O6tKm5lGdBFcm6Npepa4nYHHZ8zMq0VNkUxSOEo1g,35776
+langroid-0.1.65.dist-info/RECORD,,
{langroid-0.1.64.dist-info → langroid-0.1.65.dist-info}/LICENSE
File without changes
{langroid-0.1.64.dist-info → langroid-0.1.65.dist-info}/WHEEL
File without changes