langroid 0.1.218__py3-none-any.whl → 0.1.219__py3-none-any.whl
This diff shows the changes between two publicly released versions of the langroid package, as they appear in their public registry. It is provided for informational purposes only.
- langroid/agent/special/doc_chat_agent.py +54 -25
- langroid/parsing/document_parser.py +145 -22
- langroid/parsing/repo_loader.py +69 -49
- langroid/parsing/urls.py +18 -9
- langroid/parsing/utils.py +27 -9
- langroid/utils/system.py +1 -1
- {langroid-0.1.218.dist-info → langroid-0.1.219.dist-info}/METADATA +2 -2
- {langroid-0.1.218.dist-info → langroid-0.1.219.dist-info}/RECORD +10 -10
- {langroid-0.1.218.dist-info → langroid-0.1.219.dist-info}/LICENSE +0 -0
- {langroid-0.1.218.dist-info → langroid-0.1.219.dist-info}/WHEEL +0 -0
langroid/agent/special/doc_chat_agent.py
CHANGED
@@ -35,6 +35,7 @@ from langroid.embedding_models.models import OpenAIEmbeddingsConfig
 from langroid.language_models.base import StreamingIfAllowed
 from langroid.language_models.openai_gpt import OpenAIChatModel, OpenAIGPTConfig
 from langroid.mytypes import DocMetaData, Document, Entity
+from langroid.parsing.document_parser import DocumentType
 from langroid.parsing.parser import Parser, ParsingConfig, PdfParsingConfig, Splitter
 from langroid.parsing.repo_loader import RepoLoader
 from langroid.parsing.search import (
@@ -44,7 +45,7 @@ from langroid.parsing.search import (
 )
 from langroid.parsing.table_loader import describe_dataframe
 from langroid.parsing.url_loader import URLLoader
-from langroid.parsing.urls import get_list_from_user,
+from langroid.parsing.urls import get_list_from_user, get_urls_paths_bytes_indices
 from langroid.parsing.utils import batched
 from langroid.prompts.prompts_config import PromptsConfig
 from langroid.prompts.templates import SUMMARY_ANSWER_PROMPT_GPT4
@@ -126,7 +127,7 @@ class DocChatAgentConfig(ChatAgentConfig):
             llm=None  # use the parent's llm unless explicitly set here
         )
     )
-    doc_paths: List[str] = []
+    doc_paths: List[str | bytes] = []
     default_paths: List[str] = [
         "https://news.ycombinator.com/item?id=35629033",
         "https://www.newyorker.com/tech/annals-of-technology/chatgpt-is-a-blurry-jpeg-of-the-web",
@@ -248,62 +249,84 @@ class DocChatAgent(ChatAgent):
                 raise ValueError("VecDB not set")
             self.setup_documents(filter=self.config.filter)
             return
-        self.ingest_doc_paths(self.config.doc_paths)
+        self.ingest_doc_paths(self.config.doc_paths)  # type: ignore

     def ingest_doc_paths(
         self,
-        paths: List[str],
+        paths: str | bytes | List[str | bytes],
         metadata: (
             List[Dict[str, Any]] | Dict[str, Any] | DocMetaData | List[DocMetaData]
         ) = [],
+        doc_type: str | DocumentType | None = None,
     ) -> List[Document]:
         """Split, ingest docs from specified paths,
         do not add these to config.doc_paths.

         Args:
-            paths:
+            paths: document paths, urls or byte-content of docs.
+                The bytes option is intended to support cases where a document
+                has already been read in as bytes (e.g. from an API or a database),
+                and we want to avoid having to write it to a temporary file
+                just to read it back in.
             metadata: List of metadata dicts, one for each path.
                 If a single dict is passed in, it is used for all paths.
+            doc_type: DocumentType to use for parsing, if known.
+                MUST apply to all docs if specified.
+                This is especially useful when the `paths` are of bytes type,
+                to help with document type detection.
         Returns:
             List of Document objects
         """
+        if isinstance(paths, str) or isinstance(paths, bytes):
+            paths = [paths]
         all_paths = paths
-        paths_meta: Dict[
-        urls_meta: Dict[
-
+        paths_meta: Dict[int, Any] = {}
+        urls_meta: Dict[int, Any] = {}
+        idxs = range(len(all_paths))
+        url_idxs, path_idxs, bytes_idxs = get_urls_paths_bytes_indices(all_paths)
+        urls = [all_paths[i] for i in url_idxs]
+        paths = [all_paths[i] for i in path_idxs]
+        bytes_list = [all_paths[i] for i in bytes_idxs]
+        path_idxs.extend(bytes_idxs)
+        paths.extend(bytes_list)
         if (isinstance(metadata, list) and len(metadata) > 0) or not isinstance(
             metadata, list
         ):
             if isinstance(metadata, list):
-
+                idx2meta = {
                     p: (
                         m
                         if isinstance(m, dict)
                         else (isinstance(m, DocMetaData) and m.dict())
                     )  # appease mypy
-                    for p, m in zip(
+                    for p, m in zip(idxs, metadata)
                 }
             elif isinstance(metadata, dict):
-
+                idx2meta = {p: metadata for p in idxs}
             else:
-
-            urls_meta = {u:
-            paths_meta = {p:
+                idx2meta = {p: metadata.dict() for p in idxs}
+            urls_meta = {u: idx2meta[u] for u in url_idxs}
+            paths_meta = {p: idx2meta[p] for p in path_idxs}
         docs: List[Document] = []
         parser = Parser(self.config.parsing)
         if len(urls) > 0:
-            for
-            meta = urls_meta.get(
-            loader = URLLoader(urls=[
+            for ui in url_idxs:
+                meta = urls_meta.get(ui, {})
+                loader = URLLoader(urls=[all_paths[ui]], parser=parser)  # type: ignore
                 url_docs = loader.load()
                 # update metadata of each doc with meta
                 for d in url_docs:
                     d.metadata = d.metadata.copy(update=meta)
                 docs.extend(url_docs)
-        if len(paths) > 0:
-            for
-            meta = paths_meta.get(
-
+        if len(paths) > 0:  # paths OR bytes are handled similarly
+            for pi in path_idxs:
+                meta = paths_meta.get(pi, {})
+                p = all_paths[pi]
+                path_docs = RepoLoader.get_documents(
+                    p,
+                    parser=parser,
+                    doc_type=doc_type,
+                )
                 # update metadata of each doc with meta
                 for d in path_docs:
                     d.metadata = d.metadata.copy(update=meta)
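The net effect of this hunk is that a DocChatAgent can now ingest raw bytes alongside URLs and file paths. A minimal usage sketch, not part of the diff; the file names are hypothetical, the agent config is assumed to be otherwise set up (vector DB, LLM keys), and doc_type is left unset because a hint must apply to every doc in the call:

    from langroid.agent.special.doc_chat_agent import DocChatAgent, DocChatAgentConfig

    agent = DocChatAgent(DocChatAgentConfig())
    pdf_bytes = open("report.pdf", "rb").read()  # stands in for bytes from an API or DB
    agent.ingest_doc_paths(
        ["https://example.com/notes.html", "local/notes.txt", pdf_bytes],
        metadata={"group": "q1-reports"},  # per the docstring, one dict applies to all paths
    )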
@@ -317,11 +340,12 @@ class DocChatAgent(ChatAgent):
         print(
             f"""
         [green]I have processed the following {n_urls} URLs
-        and {n_paths}
+        and {n_paths} docs into {n_splits} parts:
         """.strip()
         )
-
-        print("\n".join(
+        path_reps = [p if isinstance(p, str) else "bytes" for p in paths]
+        print("\n".join([u for u in urls if isinstance(u, str)]))  # appease mypy
+        print("\n".join(path_reps))
         return docs

     def ingest_docs(
@@ -388,6 +412,7 @@ class DocChatAgent(ChatAgent):
                 + ",content="
                 + d.content
             )
+        docs = docs[: self.config.parsing.max_chunks]
         # add embeddings in batches, to stay under limit of embeddings API
         batches = list(batched(docs, self.config.embed_batch_size))
         for batch in batches:
@@ -463,6 +488,10 @@ class DocChatAgent(ChatAgent):
             d.metadata.is_chunk = True
         return self.ingest_docs(docs)

+    def set_filter(self, filter: str) -> None:
+        self.config.filter = filter
+        self.setup_documents(filter=filter)
+
     def setup_documents(
         self,
         docs: List[Document] = [],
@@ -609,7 +638,7 @@ class DocChatAgent(ChatAgent):
         if len(inputs) == 0:
             if is_new_collection:
                 inputs = self.config.default_paths
-        self.config.doc_paths = inputs
+        self.config.doc_paths = inputs  # type: ignore
         self.ingest()

     def llm_response(
langroid/parsing/document_parser.py
CHANGED
@@ -1,3 +1,4 @@
+import itertools
 import logging
 import re
 from enum import Enum
@@ -8,6 +9,7 @@ import fitz
 import pdfplumber
 import pypdf
 import requests
+from bs4 import BeautifulSoup
 from PIL import Image

 from langroid.mytypes import DocMetaData, Document
@@ -20,6 +22,29 @@ class DocumentType(str, Enum):
     PDF = "pdf"
     DOCX = "docx"
     DOC = "doc"
+    TXT = "txt"
+
+
+def is_plain_text(path_or_bytes: str | bytes) -> bool:
+    if isinstance(path_or_bytes, str):
+        if path_or_bytes.startswith(("http://", "https://")):
+            response = requests.get(path_or_bytes)
+            response.raise_for_status()
+            content = response.content[:1024]
+        else:
+            with open(path_or_bytes, "rb") as f:
+                content = f.read(1024)
+    else:
+        content = path_or_bytes[:1024]
+    try:
+        # Attempt to decode the content as UTF-8
+        _ = content.decode("utf-8")
+        # Additional checks can go here, e.g., to verify that the content
+        # doesn't contain too many unusual characters for it to be considered text
+        return True
+    except UnicodeDecodeError:
+        # If decoding fails, it's likely not plain text (or not encoded in UTF-8)
+        return False


 class DocumentParser(Parser):
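Note that the heuristic above samples only the first 1024 bytes and calls anything that decodes as UTF-8 "plain text", so purely ASCII-headed binary content could in principle slip through; this is part of why the doc_type hint exists elsewhere in this diff. A quick sketch of the behavior on in-memory bytes (no file or network access needed):

    from langroid.parsing.document_parser import is_plain_text

    print(is_plain_text(b"hello, world"))       # True: valid UTF-8
    print(is_plain_text(b"\x89PNG\r\n\x1a\n"))  # False: PNG magic bytes fail UTF-8 decode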
@@ -33,19 +58,26 @@ class DocumentParser(Parser):
     """

     @classmethod
-    def create(
+    def create(
+        cls,
+        source: str | bytes,
+        config: ParsingConfig,
+        doc_type: str | DocumentType | None = None,
+    ) -> "DocumentParser":
         """
         Create a DocumentParser instance based on source type
         and config.<source_type>.library specified.

         Args:
-            source (str): The source
+            source (str|bytes): The source, could be a URL, file path,
+                or bytes object.
             config (ParserConfig): The parser configuration.
+            doc_type (str|None): The type of document, if known

         Returns:
             DocumentParser: An instance of a DocumentParser subclass.
         """
-        if DocumentParser._document_type(source) == DocumentType.PDF:
+        if DocumentParser._document_type(source, doc_type) == DocumentType.PDF:
             if config.pdf.library == "fitz":
                 return FitzPDFParser(source, config)
             elif config.pdf.library == "pypdf":
@@ -60,7 +92,7 @@ class DocumentParser(Parser):
             raise ValueError(
                 f"Unsupported PDF library specified: {config.pdf.library}"
             )
-        elif DocumentParser._document_type(source) == DocumentType.DOCX:
+        elif DocumentParser._document_type(source, doc_type) == DocumentType.DOCX:
             if config.docx.library == "unstructured":
                 return UnstructuredDocxParser(source, config)
             elif config.docx.library == "python-docx":
@@ -69,42 +101,78 @@ class DocumentParser(Parser):
             raise ValueError(
                 f"Unsupported DOCX library specified: {config.docx.library}"
             )
-        elif DocumentParser._document_type(source) == DocumentType.DOC:
+        elif DocumentParser._document_type(source, doc_type) == DocumentType.DOC:
             return UnstructuredDocParser(source, config)
         else:
-
+            source_name = source if isinstance(source, str) else "bytes"
+            raise ValueError(f"Unsupported document type: {source_name}")

-    def __init__(self, source: str, config: ParsingConfig):
+    def __init__(self, source: str | bytes, config: ParsingConfig):
         """
-        Initialize the PDFParser.
-
         Args:
-            source (str): The source
+            source (str|bytes): The source, which could be
+                a path, a URL or a bytes object.
         """
         super().__init__(config)
-        self.source = source
         self.config = config
-
+        if isinstance(source, bytes):
+            self.source = "bytes"
+            self.doc_bytes = BytesIO(source)
+        else:
+            self.source = source
+            self.doc_bytes = self._load_doc_as_bytesio()

     @staticmethod
-    def _document_type(
+    def _document_type(
+        source: str | bytes, doc_type: str | DocumentType | None = None
+    ) -> DocumentType:
         """
         Determine the type of document based on the source.

         Args:
-            source (str): The source
+            source (str|bytes): The source, which could be a URL,
+                a file path, or a bytes object.
+            doc_type (str|DocumentType|None): The type of document, if known.

         Returns:
             str: The document type.
         """
-        if
-            return
-
-            return DocumentType.
-
-            return DocumentType.
+        if isinstance(doc_type, DocumentType):
+            return doc_type
+        if doc_type:
+            return DocumentType(doc_type.lower())
+        if is_plain_text(source):
+            return DocumentType.TXT
+        if isinstance(source, str):
+            # detect file type from path extension
+            if source.lower().endswith(".pdf"):
+                return DocumentType.PDF
+            elif source.lower().endswith(".docx"):
+                return DocumentType.DOCX
+            elif source.lower().endswith(".doc"):
+                return DocumentType.DOC
+            else:
+                raise ValueError(f"Unsupported document type: {source}")
         else:
-
+            # must be bytes: attempt to detect type from content
+            # using magic mime type detection
+            import magic
+
+            mime_type = magic.from_buffer(source, mime=True)
+            if mime_type == "application/pdf":
+                return DocumentType.PDF
+            elif mime_type in [
+                "application/vnd.openxmlformats-officedocument"
+                ".wordprocessingml.document",
+                "application/zip",
+            ]:
+                # DOCX files are essentially ZIP files,
+                # but this might catch other ZIP-based formats too!
+                return DocumentType.DOCX
+            elif mime_type == "application/msword":
+                return DocumentType.DOC
+            else:
+                raise ValueError("Unsupported document type from bytes")

     def _load_doc_as_bytesio(self) -> BytesIO:
         """
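The detection order, as the code above reads: an explicit DocumentType or string hint wins outright, the UTF-8 plain-text probe comes next, and only then extension matching (for paths) or libmagic MIME sniffing (for bytes). A sketch, assuming the python-magic package and its libmagic dependency are installed, and a hypothetical local PDF:

    from langroid.parsing.document_parser import DocumentParser, DocumentType

    data = open("report.pdf", "rb").read()  # e.g. bytes already fetched upstream
    assert DocumentParser._document_type(data, "pdf") == DocumentType.PDF  # hint wins
    # Without a hint, real PDF bytes normally fail the UTF-8 probe and are
    # sniffed by magic.from_buffer as "application/pdf":
    print(DocumentParser._document_type(data))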
@@ -121,6 +189,61 @@ class DocumentParser(Parser):
         with open(self.source, "rb") as f:
             return BytesIO(f.read())

+    @staticmethod
+    def chunks_from_path_or_bytes(
+        source: str | bytes,
+        parser: Parser,
+        doc_type: str | DocumentType | None = None,
+        lines: int | None = None,
+    ) -> List[Document]:
+        """
+        Get document chunks from a file path or bytes object.
+        Args:
+            source (str|bytes): The source, which could be a URL, path or bytes object.
+            parser (Parser): The parser instance (for splitting the document).
+            doc_type (str|DocumentType|None): The type of document, if known.
+            lines (int|None): The number of lines to read from a plain text file.
+        Returns:
+            List[Document]: A list of `Document` objects,
+                each containing a chunk of text, determined by the
+                chunking and splitting settings in the parser config.
+        """
+        dtype: DocumentType = DocumentParser._document_type(source, doc_type)
+        if dtype in [DocumentType.PDF, DocumentType.DOC, DocumentType.DOCX]:
+            doc_parser = DocumentParser.create(
+                source,
+                parser.config,
+                doc_type=doc_type,
+            )
+            chunks = doc_parser.get_doc_chunks()
+            if len(chunks) == 0 and dtype == DocumentType.PDF:
+                doc_parser = ImagePdfParser(source, parser.config)
+                chunks = doc_parser.get_doc_chunks()
+            return chunks
+        else:
+            # try getting as plain text; these will be chunked downstream
+            # -- could be a bytes object or a path
+            if isinstance(source, bytes):
+                content = source.decode()
+                if lines is not None:
+                    file_lines = content.splitlines()[:lines]
+                    content = "\n".join(line.strip() for line in file_lines)
+            else:
+                with open(source, "r") as f:
+                    if lines is not None:
+                        file_lines = list(itertools.islice(f, lines))
+                        content = "\n".join(line.strip() for line in file_lines)
+                    else:
+                        content = f.read()
+            soup = BeautifulSoup(content, "html.parser")
+            text = soup.get_text()
+            source_name = source if isinstance(source, str) else "bytes"
+            doc = Document(
+                content=text,
+                metadata=DocMetaData(source=str(source_name)),
+            )
+            return parser.split([doc])
+
     def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
         """Yield each page in the PDF."""
         raise NotImplementedError
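This static method becomes the single per-file entry point that RepoLoader uses further down: structured types (PDF/DOC/DOCX) go through a DocumentParser subclass, and everything else is treated as possibly-HTML text, stripped with BeautifulSoup, and split by the parser. A usage sketch under default configs; the file names are hypothetical:

    from langroid.parsing.document_parser import DocumentParser
    from langroid.parsing.parser import Parser, ParsingConfig

    parser = Parser(ParsingConfig())
    # From a path: read at most 100 lines of text, strip markup, then chunk.
    chunks = DocumentParser.chunks_from_path_or_bytes("notes.txt", parser, lines=100)
    # From bytes: the doc_type hint matters, since bytes carry no file extension.
    pdf_bytes = open("report.pdf", "rb").read()
    pdf_chunks = DocumentParser.chunks_from_path_or_bytes(pdf_bytes, parser, doc_type="pdf")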
@@ -145,7 +268,7 @@ class DocumentParser(Parser):

     def get_doc(self) -> Document:
         """
-        Get entire text from
+        Get entire text from source as a single document.

         Returns:
             a `Document` object containing the content of the pdf file,
langroid/parsing/repo_loader.py
CHANGED
@@ -10,7 +10,6 @@ from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple, Union
 from urllib.parse import urlparse

-from bs4 import BeautifulSoup
 from dotenv import load_dotenv
 from github import Github
 from github.ContentFile import ContentFile
@@ -19,7 +18,7 @@ from github.Repository import Repository
 from pydantic import BaseModel, BaseSettings, Field

 from langroid.mytypes import DocMetaData, Document
-from langroid.parsing.document_parser import DocumentParser,
+from langroid.parsing.document_parser import DocumentParser, DocumentType
 from langroid.parsing.parser import Parser, ParsingConfig

 logger = logging.getLogger(__name__)
@@ -491,18 +490,25 @@ class RepoLoader:

     @staticmethod
     def get_documents(
-        path: str,
+        path: str | bytes,
         parser: Parser = Parser(ParsingConfig()),
         file_types: Optional[List[str]] = None,
         exclude_dirs: Optional[List[str]] = None,
         depth: int = -1,
         lines: Optional[int] = None,
+        doc_type: str | DocumentType | None = None,
     ) -> List[Document]:
         """
         Recursively get all files under a path as Document objects.

         Args:
-            path (str): The path to the directory or file.
+            path (str|bytes): The path to the directory or file, or bytes content.
+                The bytes option is meant to support the case where the content
+                has already been read from a file in an upstream process
+                (e.g. from an API or a database), and we want to avoid having to
+                write it to a temporary file just to read it again.
+                (which can be very slow for large files,
+                especially in a docker container)
             parser (Parser): Parser to use to parse files.
             file_types (List[str], optional): List of file extensions OR
                 filenames OR file_path_names to include.
@@ -513,6 +519,7 @@ class RepoLoader:
                 which includes all depths.
             lines (int, optional): Number of lines to read from each file.
                 Defaults to None, which reads all lines.
+            doc_type (str|DocumentType, optional): The type of document to parse.

         Returns:
             List[Document]: List of Document objects representing files.
@@ -520,56 +527,69 @@ class RepoLoader:
         """
         docs = []
         file_paths = []
-
-
-        if path_obj.is_file():
-            file_paths.append(str(path_obj))
+        if isinstance(path, bytes):
+            file_paths.append(path)
         else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            path_obj = Path(path).resolve()
+
+            if path_obj.is_file():
+                file_paths.append(str(path_obj))
+            else:
+                path_depth = len(path_obj.parts)
+                for root, dirs, files in os.walk(path):
+                    # Exclude directories if needed
+                    if exclude_dirs:
+                        dirs[:] = [d for d in dirs if d not in exclude_dirs]
+
+                    current_depth = len(Path(root).resolve().parts) - path_depth
+                    if depth == -1 or current_depth <= depth:
+                        for file in files:
+                            file_path = str(Path(root) / file)
+                            if (
+                                file_types is None
+                                or RepoLoader._file_type(file_path) in file_types
+                                or os.path.basename(file_path) in file_types
+                                or file_path in file_types
+                            ):
+                                file_paths.append(file_path)

         for file_path in file_paths:
-
-
-            doc_parser = DocumentParser.create(
+            docs.extend(
+                DocumentParser.chunks_from_path_or_bytes(
                     file_path,
-                parser
-
-
-            if len(new_chunks) == 0 and file_extension.lower() == ".pdf":
-                doc_parser = ImagePdfParser(file_path, parser.config)
-                new_chunks = doc_parser.get_doc_chunks()
-            docs.extend(new_chunks)
-            else:
-                with open(file_path, "r") as f:
-                    if lines is not None:
-                        file_lines = list(itertools.islice(f, lines))
-                        content = "\n".join(line.strip() for line in file_lines)
-                    else:
-                        content = f.read()
-                soup = BeautifulSoup(content, "html.parser")
-                text = soup.get_text()
-                docs.append(
-                    Document(
-                        content=text,
-                        metadata=DocMetaData(source=str(file_path)),
-                    )
+                    parser,
+                    doc_type=doc_type,
+                    lines=lines,
                 )
+            )
+            # dtype: DocumentType = DocumentParser._document_type(file_path, doc_type)
+            # if dtype in [DocumentType.PDF, DocumentType.DOC, DocumentType.DOCX]:
+            #     doc_parser = DocumentParser.create(
+            #         file_path,
+            #         parser.config,
+            #         doc_type=doc_type,
+            #     )
+            #     new_chunks = doc_parser.get_doc_chunks()
+            #     if len(new_chunks) == 0 and file_extension.lower() == ".pdf":
+            #         doc_parser = ImagePdfParser(file_path, parser.config)
+            #         new_chunks = doc_parser.get_doc_chunks()
+            #     docs.extend(new_chunks)
+            # else:
+            #     # try getting as plain text; these will be chunked downstream
+            #     with open(file_path, "r") as f:
+            #         if lines is not None:
+            #             file_lines = list(itertools.islice(f, lines))
+            #             content = "\n".join(line.strip() for line in file_lines)
+            #         else:
+            #             content = f.read()
+            #     soup = BeautifulSoup(content, "html.parser")
+            #     text = soup.get_text()
+            #     docs.append(
+            #         Document(
+            #             content=text,
+            #             metadata=DocMetaData(source=str(file_path)),
+            #         )
+            #     )

         return docs
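With chunks_from_path_or_bytes doing the per-file work (the old inline logic survives above only as comments), get_documents reduces to collecting file paths under a directory, or passing a single bytes payload straight through. A sketch with hypothetical inputs and default parser config:

    from langroid.parsing.parser import Parser, ParsingConfig
    from langroid.parsing.repo_loader import RepoLoader

    parser = Parser(ParsingConfig())
    # Directory walk, honoring exclude_dirs / file_types / depth:
    docs = RepoLoader.get_documents("docs/", parser=parser, exclude_dirs=[".git"])
    # Bytes payload, e.g. already fetched from an API, with no temp file written:
    raw = open("manual.docx", "rb").read()
    more_docs = RepoLoader.get_documents(raw, parser=parser, doc_type="docx")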
langroid/parsing/urls.py
CHANGED
@@ -112,26 +112,35 @@ def is_url(s: str) -> bool:
     return False


-def
+def get_urls_paths_bytes_indices(
+    inputs: List[str | bytes],
+) -> Tuple[List[int], List[int], List[int]]:
     """
-    Given a list of inputs, return a
+    Given a list of inputs, return a
+    list of indices of URLs, list of indices of paths, list of indices of byte-contents.
     Args:
-        inputs: list of strings
+        inputs: list of strings or bytes
     Returns:
-        list of
+        list of Indices of URLs,
+        list of indices of paths,
+        list of indices of byte-contents
     """
     urls = []
     paths = []
-
+    byte_list = []
+    for i, item in enumerate(inputs):
+        if isinstance(item, bytes):
+            byte_list.append(i)
+            continue
         try:
-
-            urls.append(
+            Url(url=parse_obj_as(HttpUrl, item))
+            urls.append(i)
         except ValidationError:
             if os.path.exists(item):
-                paths.append(
+                paths.append(i)
             else:
                 logger.warning(f"{item} is neither a URL nor a path.")
-    return urls, paths
+    return urls, paths, byte_list


 def crawl_url(url: str, max_urls: int = 1) -> List[str]:
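Note the shape of the return value: the new helper returns three lists of indices into the input rather than the items themselves, which lets callers keep positional metadata aligned with each input (as ingest_doc_paths does above). A small sketch; whether an item lands in the path list depends on it actually existing locally:

    from langroid.parsing.urls import get_urls_paths_bytes_indices

    items = ["https://example.com/a.html", "/etc/hosts", b"raw bytes content"]
    url_idxs, path_idxs, byte_idxs = get_urls_paths_bytes_indices(items)
    # -> [0], [1], [2]  (assuming /etc/hosts exists on this machine)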
langroid/parsing/utils.py
CHANGED
@@ -10,10 +10,11 @@ import nltk
 from faker import Faker

 from langroid.mytypes import Document
+from langroid.parsing.document_parser import DocumentType
 from langroid.parsing.parser import Parser, ParsingConfig
 from langroid.parsing.repo_loader import RepoLoader
 from langroid.parsing.url_loader import URLLoader
-from langroid.parsing.urls import
+from langroid.parsing.urls import get_urls_paths_bytes_indices

 Faker.seed(23)
 random.seed(43)
@@ -314,37 +315,54 @@ def extract_numbered_segments(s: str, specs: str) -> str:


 def extract_content_from_path(
-    path: str | List[str],
+    path: bytes | str | List[bytes | str],
+    parsing: ParsingConfig,
+    doc_type: str | DocumentType | None = None,
 ) -> str | List[str]:
     """
     Extract the content from a file path or URL, or a list of file paths or URLs.

     Args:
-        path (str | List[str]): The file path or URL, or a list of file paths or
+        path (bytes | str | List[str]): The file path or URL, or a list of file paths or
+            URLs, or bytes content. The bytes option is meant to support cases
+            where upstream code may have already loaded the content (e.g., from a
+            database or API) and we want to avoid having to copy the content to a
+            temporary file.
         parsing (ParsingConfig): The parsing configuration.
+        doc_type (str | DocumentType | None): The document type if known.
+            If multiple paths are given, this MUST apply to ALL docs.

     Returns:
         str | List[str]: The extracted content if a single file path or URL is provided,
             or a list of extracted contents if a
             list of file paths or URLs is provided.
     """
-    if isinstance(path, str):
-
+    if isinstance(path, str) or isinstance(path, bytes):
+        paths = [path]
     elif isinstance(path, list) and len(path) == 0:
         return ""
-
+    else:
+        paths = path
+
+    url_idxs, path_idxs, byte_idxs = get_urls_paths_bytes_indices(paths)
+    urls = [paths[i] for i in url_idxs]
+    path_list = [paths[i] for i in path_idxs]
+    byte_list = [paths[i] for i in byte_idxs]
+    path_list.extend(byte_list)
     parser = Parser(parsing)
     docs: List[Document] = []
     try:
         if len(urls) > 0:
-            loader = URLLoader(urls=urls, parser=parser)
+            loader = URLLoader(urls=urls, parser=parser)  # type: ignore
             docs = loader.load()
         if len(path_list) > 0:
             for p in path_list:
-                path_docs = RepoLoader.get_documents(
+                path_docs = RepoLoader.get_documents(
+                    p, parser=parser, doc_type=doc_type
+                )
                 docs.extend(path_docs)
     except Exception as e:
-        logger.warning(f"Error loading path {
+        logger.warning(f"Error loading path {paths}: {e}")
         return ""
     if len(docs) == 1:
         return docs[0].content
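A sketch of the widened signature: a single str or bytes input typically yields one string, a list yields a list, and a failed load logs a warning and returns "". File names are hypothetical and the parsing config is left at defaults:

    from langroid.parsing.parser import ParsingConfig
    from langroid.parsing.utils import extract_content_from_path

    text = extract_content_from_path("notes.txt", ParsingConfig())
    texts = extract_content_from_path(
        ["https://example.com/a.html", open("report.pdf", "rb").read()],
        ParsingConfig(),
    )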
langroid/utils/system.py
CHANGED
@@ -131,7 +131,7 @@ def generate_user_id(org: str = "") -> str:
 def update_hash(hash: str | None = None, s: str = "") -> str:
     """
     Takes a SHA256 hash string and a new string, updates the hash with the new string,
-    and returns the updated hash string
+    and returns the updated hash string.

     Args:
         hash (str): A SHA256 hash string.
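Going by the docstring alone, the helper threads a SHA256 hex string through successive updates; a minimal sketch of the intended call pattern:

    from langroid.utils.system import update_hash

    h = update_hash(None, "first chunk")  # start a fresh SHA256 chain
    h = update_hash(h, "second chunk")    # fold in more text, get the updated hash string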
{langroid-0.1.218.dist-info → langroid-0.1.219.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: langroid
-Version: 0.1.218
+Version: 0.1.219
 Summary: Harness LLMs with Multi-Agent Programming
 License: MIT
 Author: Prasad Chalasani
@@ -85,7 +85,7 @@ Requires-Dist: pytest-redis (>=3.0.2,<4.0.0)
 Requires-Dist: python-docx (>=1.1.0,<2.0.0)
 Requires-Dist: python-dotenv (>=1.0.0,<2.0.0)
 Requires-Dist: python-socketio (>=5.11.0,<6.0.0) ; extra == "chainlit"
-Requires-Dist: qdrant-client (>=1.
+Requires-Dist: qdrant-client (>=1.8.0,<2.0.0)
 Requires-Dist: rank-bm25 (>=0.2.2,<0.3.0)
 Requires-Dist: redis (>=5.0.1,<6.0.0)
 Requires-Dist: requests (>=2.31.0,<3.0.0)
{langroid-0.1.218.dist-info → langroid-0.1.219.dist-info}/RECORD
CHANGED
@@ -10,7 +10,7 @@ langroid/agent/helpers.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 langroid/agent/junk,sha256=LxfuuW7Cijsg0szAzT81OjWWv1PMNI-6w_-DspVIO2s,339
 langroid/agent/openai_assistant.py,sha256=QTLBgnH6Btf2GWzN-WApvra-vPQWvYcXcAOULuIy4Ig,32702
 langroid/agent/special/__init__.py,sha256=XPE076zD-roskxNBn-A1hnh4AHoMiQN9gk1UDjPaBaU,1201
-langroid/agent/special/doc_chat_agent.py,sha256
+langroid/agent/special/doc_chat_agent.py,sha256=-jMgaAvjMEIVL1iPpxhGYq3_YoIvSfic3em5FzoKtWQ,53342
 langroid/agent/special/lance_doc_chat_agent.py,sha256=USp0U3eTaJzwF_3bdqE7CedSLbaqAi2tm-VzygcyLaA,10175
 langroid/agent/special/lance_rag/__init__.py,sha256=QTbs0IVE2ZgDg8JJy1zN97rUUg4uEPH7SLGctFNumk4,174
 langroid/agent/special/lance_rag/critic_agent.py,sha256=pi_9eMBxEycbWTddtq_yz-mOb2V4SgGm3zfsOH1HU-Q,5775
@@ -75,19 +75,19 @@ langroid/parsing/agent_chats.py,sha256=sbZRV9ujdM5QXvvuHVjIi2ysYSYlap-uqfMMUKulr
 langroid/parsing/code-parsing.md,sha256=--cyyNiSZSDlIwcjAV4-shKrSiRe2ytF3AdSoS_hD2g,3294
 langroid/parsing/code_parser.py,sha256=BbDAzp35wkYQ9U1dpf1ARL0lVyi0tfqEc6_eox2C090,3727
 langroid/parsing/config.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-langroid/parsing/document_parser.py,sha256=
+langroid/parsing/document_parser.py,sha256=uf1YhpC8-Z1RF7R0Yfy39VOHGf4YWwJjnDRrDIl3Q3E,22307
 langroid/parsing/image_text.py,sha256=sbLIQ5nHe2UnYUksBaQsmZGaX-X0qgEpPd7CEzi_z5M,910
 langroid/parsing/para_sentence_split.py,sha256=AJBzZojP3zpB-_IMiiHismhqcvkrVBQ3ZINoQyx_bE4,2000
 langroid/parsing/parse_json.py,sha256=tgB_oatcrgt6L9ZplC-xBBXjLzL1gjSQf1L2_W5kwFA,4230
 langroid/parsing/parser.py,sha256=vE5j1LVDeFQPmLrXCWBfvuoPsjjvVIGHcsIWCBR8HDM,10617
-langroid/parsing/repo_loader.py,sha256=
+langroid/parsing/repo_loader.py,sha256=nyVBvkhh2nXTLFwMcnsayqMrjvtLKXXj89RTBzXBcng,30781
 langroid/parsing/search.py,sha256=plQtjarB9afGfJLB0CyPXPq3mM4m7kRsfd0_4brziEI,8846
 langroid/parsing/spider.py,sha256=w_mHR1B4KOmxsBLoVI8kMkMTEbwTzeK3ath9fOMJrTk,3043
 langroid/parsing/table_loader.py,sha256=qNM4obT_0Y4tjrxNBCNUYjKQ9oETCZ7FbolKBTcz-GM,3410
 langroid/parsing/url_loader.py,sha256=Na2TBlKuQkloZzkE2d7xl6mh9olS3CbpgCsJbJ-xhIA,4472
 langroid/parsing/url_loader_cookies.py,sha256=Lg4sNpRz9MByWq2mde6T0hKv68VZSV3mtMjNEHuFeSU,2327
-langroid/parsing/urls.py,sha256=
-langroid/parsing/utils.py,sha256=
+langroid/parsing/urls.py,sha256=5B0-2MM4LoFC7jHUJ0rft7Mx5GUrnmz8oFioO0iaMt8,7975
+langroid/parsing/utils.py,sha256=pbSAbfwA28EBNESpQRJee_Kp1b44qze-2_2b9qJOKfM,12646
 langroid/parsing/web_search.py,sha256=XSiSHB4c1Wa8RjWkC4Yh-ac8S7a2WPPYj0n-Ma716RY,4759
 langroid/prompts/__init__.py,sha256=B0vpJzIJlMR3mFRtoQwyALsFzBHvLp9f92acD8xJA_0,185
 langroid/prompts/chat-gpt4-system-prompt.md,sha256=Q3uLCJTPQvmUkZN2XDnkBC7M2K3X0F3C3GIQBaFvYvw,5329
@@ -110,7 +110,7 @@ langroid/utils/output/printing.py,sha256=5EsYB1O4qKhocW19aebOUzK82RD9U5nygbY21yo
 langroid/utils/output/status.py,sha256=VoSXmWDuddo1ipCzDAA6qlgffr5E4lSmBD0rIdNxxcs,774
 langroid/utils/pandas_utils.py,sha256=UctS986Jtl_MvU5rA7-GfrjEHXP7MNu8ePhepv0bTn0,755
 langroid/utils/pydantic_utils.py,sha256=yb-ghaQYL7EIYeiZ0tailvZvAuJZNF7UBXkd3z35OYc,21728
-langroid/utils/system.py,sha256=
+langroid/utils/system.py,sha256=tWoEbzHzJ6ywdsoa9EwsQrZfGk2t7q87_zKNwau2C8s,4546
 langroid/utils/web/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 langroid/utils/web/login.py,sha256=1iz9eUAHa87vpKIkzwkmFa00avwFWivDSAr7QUhK7U0,2528
 langroid/vector_store/__init__.py,sha256=D82ioqPWxKTTbN0qiPNB-I1GjovhLw1MgDuYhcB3hCs,831
@@ -121,7 +121,7 @@ langroid/vector_store/meilisearch.py,sha256=d2huA9P-NoYRuAQ9ZeXJmMKr7ry8u90RUSR2
 langroid/vector_store/momento.py,sha256=9cui31TTrILid2KIzUpBkN2Ey3g_CZWOQVdaFsA4Ors,10045
 langroid/vector_store/qdrant_cloud.py,sha256=3im4Mip0QXLkR6wiqVsjV1QvhSElfxdFSuDKddBDQ-4,188
 langroid/vector_store/qdrantdb.py,sha256=_egbsP9SWBwmI827EDYSSOqfIQSmwNsmJfFTxrLpWYE,13457
-langroid-0.1.
-langroid-0.1.
-langroid-0.1.
-langroid-0.1.
+langroid-0.1.219.dist-info/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
+langroid-0.1.219.dist-info/METADATA,sha256=hPGE8zril18HUqkbbqKiSsFGwyMyCr0232TvF1HZx0Q,47945
+langroid-0.1.219.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
+langroid-0.1.219.dist-info/RECORD,,
{langroid-0.1.218.dist-info → langroid-0.1.219.dist-info}/LICENSE
File without changes
{langroid-0.1.218.dist-info → langroid-0.1.219.dist-info}/WHEEL
File without changes