langroid 0.1.85__py3-none-any.whl → 0.1.219__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
- langroid/__init__.py +95 -0
- langroid/agent/__init__.py +40 -0
- langroid/agent/base.py +222 -91
- langroid/agent/batch.py +264 -0
- langroid/agent/callbacks/chainlit.py +608 -0
- langroid/agent/chat_agent.py +247 -101
- langroid/agent/chat_document.py +41 -4
- langroid/agent/openai_assistant.py +842 -0
- langroid/agent/special/__init__.py +50 -0
- langroid/agent/special/doc_chat_agent.py +837 -141
- langroid/agent/special/lance_doc_chat_agent.py +258 -0
- langroid/agent/special/lance_rag/__init__.py +9 -0
- langroid/agent/special/lance_rag/critic_agent.py +136 -0
- langroid/agent/special/lance_rag/lance_rag_task.py +80 -0
- langroid/agent/special/lance_rag/query_planner_agent.py +180 -0
- langroid/agent/special/lance_tools.py +44 -0
- langroid/agent/special/neo4j/__init__.py +0 -0
- langroid/agent/special/neo4j/csv_kg_chat.py +174 -0
- langroid/agent/special/neo4j/neo4j_chat_agent.py +370 -0
- langroid/agent/special/neo4j/utils/__init__.py +0 -0
- langroid/agent/special/neo4j/utils/system_message.py +46 -0
- langroid/agent/special/relevance_extractor_agent.py +127 -0
- langroid/agent/special/retriever_agent.py +32 -198
- langroid/agent/special/sql/__init__.py +11 -0
- langroid/agent/special/sql/sql_chat_agent.py +47 -23
- langroid/agent/special/sql/utils/__init__.py +22 -0
- langroid/agent/special/sql/utils/description_extractors.py +95 -46
- langroid/agent/special/sql/utils/populate_metadata.py +28 -21
- langroid/agent/special/table_chat_agent.py +43 -9
- langroid/agent/task.py +475 -122
- langroid/agent/tool_message.py +75 -13
- langroid/agent/tools/__init__.py +13 -0
- langroid/agent/tools/duckduckgo_search_tool.py +66 -0
- langroid/agent/tools/google_search_tool.py +11 -0
- langroid/agent/tools/metaphor_search_tool.py +67 -0
- langroid/agent/tools/recipient_tool.py +16 -29
- langroid/agent/tools/run_python_code.py +60 -0
- langroid/agent/tools/sciphi_search_rag_tool.py +79 -0
- langroid/agent/tools/segment_extract_tool.py +36 -0
- langroid/cachedb/__init__.py +9 -0
- langroid/cachedb/base.py +22 -2
- langroid/cachedb/momento_cachedb.py +26 -2
- langroid/cachedb/redis_cachedb.py +78 -11
- langroid/embedding_models/__init__.py +34 -0
- langroid/embedding_models/base.py +21 -2
- langroid/embedding_models/models.py +120 -18
- langroid/embedding_models/protoc/embeddings.proto +19 -0
- langroid/embedding_models/protoc/embeddings_pb2.py +33 -0
- langroid/embedding_models/protoc/embeddings_pb2.pyi +50 -0
- langroid/embedding_models/protoc/embeddings_pb2_grpc.py +79 -0
- langroid/embedding_models/remote_embeds.py +153 -0
- langroid/language_models/__init__.py +45 -0
- langroid/language_models/azure_openai.py +80 -27
- langroid/language_models/base.py +117 -12
- langroid/language_models/config.py +5 -0
- langroid/language_models/openai_assistants.py +3 -0
- langroid/language_models/openai_gpt.py +558 -174
- langroid/language_models/prompt_formatter/__init__.py +15 -0
- langroid/language_models/prompt_formatter/base.py +4 -6
- langroid/language_models/prompt_formatter/hf_formatter.py +135 -0
- langroid/language_models/utils.py +18 -21
- langroid/mytypes.py +25 -8
- langroid/parsing/__init__.py +46 -0
- langroid/parsing/document_parser.py +260 -63
- langroid/parsing/image_text.py +32 -0
- langroid/parsing/parse_json.py +143 -0
- langroid/parsing/parser.py +122 -59
- langroid/parsing/repo_loader.py +114 -52
- langroid/parsing/search.py +68 -63
- langroid/parsing/spider.py +3 -2
- langroid/parsing/table_loader.py +44 -0
- langroid/parsing/url_loader.py +59 -11
- langroid/parsing/urls.py +85 -37
- langroid/parsing/utils.py +298 -4
- langroid/parsing/web_search.py +73 -0
- langroid/prompts/__init__.py +11 -0
- langroid/prompts/chat-gpt4-system-prompt.md +68 -0
- langroid/prompts/prompts_config.py +1 -1
- langroid/utils/__init__.py +17 -0
- langroid/utils/algorithms/__init__.py +3 -0
- langroid/utils/algorithms/graph.py +103 -0
- langroid/utils/configuration.py +36 -5
- langroid/utils/constants.py +4 -0
- langroid/utils/globals.py +2 -2
- langroid/utils/logging.py +2 -5
- langroid/utils/output/__init__.py +21 -0
- langroid/utils/output/printing.py +47 -1
- langroid/utils/output/status.py +33 -0
- langroid/utils/pandas_utils.py +30 -0
- langroid/utils/pydantic_utils.py +616 -2
- langroid/utils/system.py +98 -0
- langroid/vector_store/__init__.py +40 -0
- langroid/vector_store/base.py +203 -6
- langroid/vector_store/chromadb.py +59 -32
- langroid/vector_store/lancedb.py +463 -0
- langroid/vector_store/meilisearch.py +10 -7
- langroid/vector_store/momento.py +262 -0
- langroid/vector_store/qdrantdb.py +104 -22
- {langroid-0.1.85.dist-info → langroid-0.1.219.dist-info}/METADATA +329 -149
- langroid-0.1.219.dist-info/RECORD +127 -0
- {langroid-0.1.85.dist-info → langroid-0.1.219.dist-info}/WHEEL +1 -1
- langroid/agent/special/recipient_validator_agent.py +0 -157
- langroid/parsing/json.py +0 -64
- langroid/utils/web/selenium_login.py +0 -36
- langroid-0.1.85.dist-info/RECORD +0 -94
- /langroid/{scripts → agent/callbacks}/__init__.py +0 -0
- {langroid-0.1.85.dist-info → langroid-0.1.219.dist-info}/LICENSE +0 -0
langroid/parsing/parser.py
CHANGED
@@ -1,7 +1,6 @@
 import logging
 from enum import Enum
-from
-from typing import List
+from typing import Dict, List, Literal

 import tiktoken
 from pydantic import BaseSettings
@@ -20,11 +19,21 @@ class Splitter(str, Enum):


 class PdfParsingConfig(BaseSettings):
-    library:
+    library: Literal[
+        "fitz",
+        "pdfplumber",
+        "pypdf",
+        "unstructured",
+        "pdf2image",
+    ] = "pdfplumber"


 class DocxParsingConfig(BaseSettings):
-    library:
+    library: Literal["python-docx", "unstructured"] = "unstructured"
+
+
+class DocParsingConfig(BaseSettings):
+    library: Literal["unstructured"] = "unstructured"


 class ParsingConfig(BaseSettings):
@@ -36,10 +45,12 @@ class ParsingConfig(BaseSettings):
     min_chunk_chars: int = 350
     discard_chunk_chars: int = 5  # discard chunks with fewer than this many chars
     n_similar_docs: int = 4
+    n_neighbor_ids: int = 5  # window size to store around each chunk
     separators: List[str] = ["\n\n", "\n", " ", ""]
     token_encoding_model: str = "text-embedding-ada-002"
     pdf: PdfParsingConfig = PdfParsingConfig()
     docx: DocxParsingConfig = DocxParsingConfig()
+    doc: DocParsingConfig = DocParsingConfig()


 class Parser:
@@ -51,72 +62,122 @@ class Parser:
         tokens = self.tokenizer.encode(text)
         return len(tokens)

+    def add_window_ids(self, chunks: List[Document]) -> None:
+        """Chunks may belong to multiple docs, but for each doc,
+        they appear consecutively. Add window_ids in metadata"""
+
+        # discard empty chunks
+        chunks = [c for c in chunks if c.content.strip() != ""]
+        if len(chunks) == 0:
+            return
+        # The original metadata.id (if any) is ignored since it will be same for all
+        # chunks and is useless. We want a distinct id for each chunk.
+        orig_ids = [c.metadata.id for c in chunks]
+        ids = [Document.hash_id(str(c)) for c in chunks]
+        id2chunk = {id: c for id, c in zip(ids, chunks)}
+
+        # group the ids by orig_id
+        orig_id_to_ids: Dict[str, List[str]] = {}
+        for orig_id, id in zip(orig_ids, ids):
+            if orig_id not in orig_id_to_ids:
+                orig_id_to_ids[orig_id] = []
+            orig_id_to_ids[orig_id].append(id)
+
+        # now each orig_id maps to a sequence of ids within a single doc
+
+        k = self.config.n_neighbor_ids
+        for orig, ids in orig_id_to_ids.items():
+            # ids are consecutive chunks in a single doc
+            n = len(ids)
+            window_ids = [ids[max(0, i - k) : min(n, i + k + 1)] for i in range(n)]
+            for i, _ in enumerate(ids):
+                c = id2chunk[ids[i]]
+                c.metadata.window_ids = window_ids[i]
+                c.metadata.id = ids[i]
+                c.metadata.is_chunk = True
+
     def split_simple(self, docs: List[Document]) -> List[Document]:
         if len(self.config.separators) == 0:
             raise ValueError("Must have at least one separator")
-
-
-
-
-
-        )
-
-
+        final_docs = []
+
+        for d in docs:
+            if d.content.strip() == "":
+                continue
+            chunks = remove_extra_whitespace(d.content).split(self.config.separators[0])
+            chunk_docs = [
+                Document(
+                    content=c, metadata=d.metadata.copy(update=dict(is_chunk=True))
+                )
+                for c in chunks
+                if c.strip() != ""
+            ]
+            self.add_window_ids(chunk_docs)
+            final_docs += chunk_docs
+        return final_docs

     def split_para_sentence(self, docs: List[Document]) -> List[Document]:
-        final_chunks = []
         chunks = docs
         while True:
-
-
-
-            if
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            )
+            un_splittables = 0
+            split_chunks = []
+            for c in chunks:
+                if c.content.strip() == "":
+                    continue
+                if self.num_tokens(c.content) <= 1.3 * self.config.chunk_size:
+                    # small chunk: no need to split
+                    split_chunks.append(c)
+                    continue
+                splits = self._split_para_sentence_once([c])
+                un_splittables += len(splits) == 1
+                split_chunks += splits
+            if len(split_chunks) == len(chunks):
+                if un_splittables > 0:
+                    max_len = max([self.num_tokens(p.content) for p in chunks])
+                    logger.warning(
+                        f"""
+                        Unable to split {un_splittables} chunks
+                        using chunk_size = {self.config.chunk_size}.
+                        Max chunk size is {max_len} tokens.
+                        """
+                    )
                 break  # we won't be able to shorten them with current settings
+            chunks = split_chunks.copy()

-
+        self.add_window_ids(chunks)
+        return chunks

     def _split_para_sentence_once(self, docs: List[Document]) -> List[Document]:
-
-
-
-
-
+        final_chunks = []
+        for d in docs:
+            if d.content.strip() == "":
+                continue
+            chunks = create_chunks(d.content, self.config.chunk_size, self.num_tokens)
+            chunk_docs = [
+                Document(
+                    content=c, metadata=d.metadata.copy(update=dict(is_chunk=True))
                 )
-
+                for c in chunks
+                if c.strip() != ""
             ]
-
-
-        return
+            final_chunks += chunk_docs
+
+        return final_chunks

     def split_chunk_tokens(self, docs: List[Document]) -> List[Document]:
-
-
-
-
-
+        final_docs = []
+        for d in docs:
+            chunks = self.chunk_tokens(d.content)
+            chunk_docs = [
+                Document(
+                    content=c, metadata=d.metadata.copy(update=dict(is_chunk=True))
+                )
+                for c in chunks
+                if c.strip() != ""
             ]
-
-
-        return
+            self.add_window_ids(chunk_docs)
+            final_docs += chunk_docs
+        return final_docs

     def chunk_tokens(
         self,
@@ -198,17 +259,19 @@ class Parser:
             # Increment the number of chunks
             num_chunks += 1

-        #
-
-        remaining_text = self.tokenizer.decode(tokens).replace("\n", " ").strip()
-        if len(remaining_text) > self.config.discard_chunk_chars:
-            chunks.append(remaining_text)
+        # There may be remaining tokens, but we discard them
+        # since we have already reached the maximum number of chunks

         return chunks

     def split(self, docs: List[Document]) -> List[Document]:
         if len(docs) == 0:
             return []
+        # create ids in metadata of docs if absent:
+        # we need this to distinguish docs later in add_window_ids
+        for d in docs:
+            if d.metadata.id in [None, ""]:
+                d.metadata.id = d._unique_hash_id()
         # some docs are already splits, so don't split them further!
         chunked_docs = [d for d in docs if d.metadata.is_chunk]
         big_docs = [d for d in docs if not d.metadata.is_chunk]
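The parser changes above add a neighbor-window mechanism: every chunk produced by split_simple, split_para_sentence, or split_chunk_tokens now gets a distinct metadata.id plus window_ids covering up to n_neighbor_ids neighboring chunks from the same source document. A minimal sketch of how the new config fields might be exercised; the field names and Parser.split are taken from the hunks above, while the concrete values and the sample Document are hypothetical:

from langroid.mytypes import DocMetaData, Document
from langroid.parsing.parser import Parser, ParsingConfig, PdfParsingConfig

# Hypothetical values; field names come from the diff above.
cfg = ParsingConfig(
    chunk_size=200,                    # tokens per chunk
    n_neighbor_ids=5,                  # NEW: neighbor-window size stored per chunk
    pdf=PdfParsingConfig(library="pdfplumber"),  # NEW: library is a Literal choice
)
parser = Parser(cfg)

doc = Document(content="some long text ...", metadata=DocMetaData(source="example.txt"))
for chunk in parser.split([doc]):
    # add_window_ids() stamped each chunk with its own id and a window of
    # neighboring chunk ids from the same source document.
    print(chunk.metadata.id, chunk.metadata.window_ids)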
langroid/parsing/repo_loader.py
CHANGED
@@ -10,15 +10,15 @@ from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple, Union
 from urllib.parse import urlparse

-from bs4 import BeautifulSoup
 from dotenv import load_dotenv
 from github import Github
 from github.ContentFile import ContentFile
+from github.Label import Label
 from github.Repository import Repository
-from pydantic import BaseSettings
+from pydantic import BaseModel, BaseSettings, Field

 from langroid.mytypes import DocMetaData, Document
-from langroid.parsing.document_parser import DocumentParser
+from langroid.parsing.document_parser import DocumentParser, DocumentType
 from langroid.parsing.parser import Parser, ParsingConfig

 logger = logging.getLogger(__name__)
@@ -43,6 +43,22 @@ def _has_files(directory: str) -> bool:
     return False


+# Pydantic model for GitHub issue data
+class IssueData(BaseModel):
+    state: str = Field(..., description="State of issue e.g. open or closed")
+    year: int = Field(..., description="Year issue was created")
+    month: int = Field(..., description="Month issue was created")
+    day: int = Field(..., description="Day issue was created")
+    assignee: Optional[str] = Field(..., description="Assignee of issue")
+    size: Optional[str] = Field(..., description="Size of issue, e.g. XS, S, M, L, XXL")
+    text: str = Field(..., description="Text of issue, i.e. description body")
+
+
+def get_issue_size(labels: List[Label]) -> str | None:
+    sizes = ["XS", "S", "M", "L", "XL", "XXL"]
+    return next((label.name for label in labels if label.name in sizes), None)
+
+
 class RepoLoaderConfig(BaseSettings):
     """
     Configuration for RepoLoader.
@@ -155,6 +171,27 @@ class RepoLoader:
     def _get_dir_name(self) -> str:
         return urlparse(self.url).path.replace("/", "_")

+    def get_issues(self, k: int | None = 100) -> List[IssueData]:
+        """Get up to k issues from the GitHub repo."""
+        if k is None:
+            issues = self.repo.get_issues(state="all")
+        else:
+            issues = self.repo.get_issues(state="all")[:k]
+        issue_data_list = []
+        for issue in issues:
+            issue_data = IssueData(
+                state=issue.state,
+                year=issue.created_at.year,
+                month=issue.created_at.month,
+                day=issue.created_at.day,
+                assignee=issue.assignee.login if issue.assignee else None,
+                size=get_issue_size(issue.labels),
+                text=issue.body or "No issue description body.",
+            )
+            issue_data_list.append(issue_data)
+
+        return issue_data_list
+
     @staticmethod
     def _file_type(name: str) -> str:
         """
@@ -336,8 +373,8 @@ class RepoLoader:

         Returns:
             Tuple of (dict, List_of_Documents):
-
-
+                A dictionary containing file and directory names, with file
+                contents, and a list of Document objects for each file.
         """
         if path is None:
             if self.clone_path is None or not _has_files(self.clone_path):
@@ -382,8 +419,8 @@ class RepoLoader:

         Returns:
             Tuple of (dict, List_of_Documents):
-
-
+                A dictionary containing file and directory names, with file contents.
+                A list of Document objects for each file.
         """

         folder_structure = {
@@ -453,18 +490,25 @@ class RepoLoader:

     @staticmethod
     def get_documents(
-        path: str,
+        path: str | bytes,
         parser: Parser = Parser(ParsingConfig()),
         file_types: Optional[List[str]] = None,
         exclude_dirs: Optional[List[str]] = None,
         depth: int = -1,
         lines: Optional[int] = None,
+        doc_type: str | DocumentType | None = None,
     ) -> List[Document]:
         """
         Recursively get all files under a path as Document objects.

         Args:
-            path (str): The path to the directory or file.
+            path (str|bytes): The path to the directory or file, or bytes content.
+                The bytes option is meant to support the case where the content
+                has already been read from a file in an upstream process
+                (e.g. from an API or a database), and we want to avoid having to
+                write it to a temporary file just to read it again.
+                (which can be very slow for large files,
+                especially in a docker container)
             parser (Parser): Parser to use to parse files.
             file_types (List[str], optional): List of file extensions OR
                 filenames OR file_path_names to include.
@@ -475,6 +519,7 @@ class RepoLoader:
                 which includes all depths.
             lines (int, optional): Number of lines to read from each file.
                 Defaults to None, which reads all lines.
+            doc_type (str|DocumentType, optional): The type of document to parse.

         Returns:
             List[Document]: List of Document objects representing files.
@@ -482,52 +527,69 @@ class RepoLoader:
         """
         docs = []
         file_paths = []
-
-
-        if path_obj.is_file():
-            file_paths.append(str(path_obj))
+        if isinstance(path, bytes):
+            file_paths.append(path)
         else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            path_obj = Path(path).resolve()
+
+            if path_obj.is_file():
+                file_paths.append(str(path_obj))
+            else:
+                path_depth = len(path_obj.parts)
+                for root, dirs, files in os.walk(path):
+                    # Exclude directories if needed
+                    if exclude_dirs:
+                        dirs[:] = [d for d in dirs if d not in exclude_dirs]
+
+                    current_depth = len(Path(root).resolve().parts) - path_depth
+                    if depth == -1 or current_depth <= depth:
+                        for file in files:
+                            file_path = str(Path(root) / file)
+                            if (
+                                file_types is None
+                                or RepoLoader._file_type(file_path) in file_types
+                                or os.path.basename(file_path) in file_types
+                                or file_path in file_types
+                            ):
+                                file_paths.append(file_path)

         for file_path in file_paths:
-
-
-            doc_parser = DocumentParser.create(
+            docs.extend(
+                DocumentParser.chunks_from_path_or_bytes(
                     file_path,
-                parser
-
-
-            else:
-                with open(file_path, "r") as f:
-                    if lines is not None:
-                        file_lines = list(itertools.islice(f, lines))
-                        content = "\n".join(line.strip() for line in file_lines)
-                    else:
-                        content = f.read()
-                soup = BeautifulSoup(content, "html.parser")
-                text = soup.get_text()
-                docs.append(
-                    Document(
-                        content=text,
-                        metadata=DocMetaData(source=str(file_path)),
-                    )
+                    parser,
+                    doc_type=doc_type,
+                    lines=lines,
                 )
+            )
+            # dtype: DocumentType = DocumentParser._document_type(file_path, doc_type)
+            # if dtype in [DocumentType.PDF, DocumentType.DOC, DocumentType.DOCX]:
+            #     doc_parser = DocumentParser.create(
+            #         file_path,
+            #         parser.config,
+            #         doc_type=doc_type,
+            #     )
+            #     new_chunks = doc_parser.get_doc_chunks()
+            #     if len(new_chunks) == 0 and file_extension.lower() == ".pdf":
+            #         doc_parser = ImagePdfParser(file_path, parser.config)
+            #         new_chunks = doc_parser.get_doc_chunks()
+            #     docs.extend(new_chunks)
+            # else:
+            #     # try getting as plain text; these will be chunked downstream
+            #     with open(file_path, "r") as f:
+            #         if lines is not None:
+            #             file_lines = list(itertools.islice(f, lines))
+            #             content = "\n".join(line.strip() for line in file_lines)
+            #         else:
+            #             content = f.read()
+            #     soup = BeautifulSoup(content, "html.parser")
+            #     text = soup.get_text()
+            #     docs.append(
+            #         Document(
+            #             content=text,
+            #             metadata=DocMetaData(source=str(file_path)),
+            #         )
+            #     )

         return docs

@@ -543,8 +605,8 @@ class RepoLoader:
             of lines per file (if any of these are specified).

         Args:
-            k(int): max number of files to load, or None for all files
-            depth(int): max depth to recurse, or None for infinite depth
+            k (int): max number of files to load, or None for all files
+            depth (int): max depth to recurse, or None for infinite depth
             lines (int): max number of lines to get, from a file, or None for all lines

         Returns:
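The repo_loader changes add a typed IssueData model with RepoLoader.get_issues, and get_documents now accepts raw bytes and an explicit doc_type, delegating parsing to DocumentParser.chunks_from_path_or_bytes. A rough usage sketch under stated assumptions: the RepoLoader constructor is not shown in this diff and is assumed here to take a repo URL, and the issue count, file name, and the "pdf" string passed as doc_type are hypothetical:

from langroid.parsing.parser import Parser, ParsingConfig
from langroid.parsing.repo_loader import RepoLoader

loader = RepoLoader("https://github.com/langroid/langroid")  # assumed constructor

# NEW: fetch up to 20 issues as structured IssueData records.
issues = loader.get_issues(k=20)
open_issues = [i for i in issues if i.state == "open"]

# NEW: pass bytes that were already read upstream, avoiding a temporary file.
with open("report.pdf", "rb") as f:  # hypothetical file
    pdf_bytes = f.read()
docs = RepoLoader.get_documents(
    pdf_bytes,
    parser=Parser(ParsingConfig()),
    doc_type="pdf",  # assumed string alternative to a DocumentType value
)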