langroid 0.1.85__py3-none-any.whl → 0.1.219__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (107)
  1. langroid/__init__.py +95 -0
  2. langroid/agent/__init__.py +40 -0
  3. langroid/agent/base.py +222 -91
  4. langroid/agent/batch.py +264 -0
  5. langroid/agent/callbacks/chainlit.py +608 -0
  6. langroid/agent/chat_agent.py +247 -101
  7. langroid/agent/chat_document.py +41 -4
  8. langroid/agent/openai_assistant.py +842 -0
  9. langroid/agent/special/__init__.py +50 -0
  10. langroid/agent/special/doc_chat_agent.py +837 -141
  11. langroid/agent/special/lance_doc_chat_agent.py +258 -0
  12. langroid/agent/special/lance_rag/__init__.py +9 -0
  13. langroid/agent/special/lance_rag/critic_agent.py +136 -0
  14. langroid/agent/special/lance_rag/lance_rag_task.py +80 -0
  15. langroid/agent/special/lance_rag/query_planner_agent.py +180 -0
  16. langroid/agent/special/lance_tools.py +44 -0
  17. langroid/agent/special/neo4j/__init__.py +0 -0
  18. langroid/agent/special/neo4j/csv_kg_chat.py +174 -0
  19. langroid/agent/special/neo4j/neo4j_chat_agent.py +370 -0
  20. langroid/agent/special/neo4j/utils/__init__.py +0 -0
  21. langroid/agent/special/neo4j/utils/system_message.py +46 -0
  22. langroid/agent/special/relevance_extractor_agent.py +127 -0
  23. langroid/agent/special/retriever_agent.py +32 -198
  24. langroid/agent/special/sql/__init__.py +11 -0
  25. langroid/agent/special/sql/sql_chat_agent.py +47 -23
  26. langroid/agent/special/sql/utils/__init__.py +22 -0
  27. langroid/agent/special/sql/utils/description_extractors.py +95 -46
  28. langroid/agent/special/sql/utils/populate_metadata.py +28 -21
  29. langroid/agent/special/table_chat_agent.py +43 -9
  30. langroid/agent/task.py +475 -122
  31. langroid/agent/tool_message.py +75 -13
  32. langroid/agent/tools/__init__.py +13 -0
  33. langroid/agent/tools/duckduckgo_search_tool.py +66 -0
  34. langroid/agent/tools/google_search_tool.py +11 -0
  35. langroid/agent/tools/metaphor_search_tool.py +67 -0
  36. langroid/agent/tools/recipient_tool.py +16 -29
  37. langroid/agent/tools/run_python_code.py +60 -0
  38. langroid/agent/tools/sciphi_search_rag_tool.py +79 -0
  39. langroid/agent/tools/segment_extract_tool.py +36 -0
  40. langroid/cachedb/__init__.py +9 -0
  41. langroid/cachedb/base.py +22 -2
  42. langroid/cachedb/momento_cachedb.py +26 -2
  43. langroid/cachedb/redis_cachedb.py +78 -11
  44. langroid/embedding_models/__init__.py +34 -0
  45. langroid/embedding_models/base.py +21 -2
  46. langroid/embedding_models/models.py +120 -18
  47. langroid/embedding_models/protoc/embeddings.proto +19 -0
  48. langroid/embedding_models/protoc/embeddings_pb2.py +33 -0
  49. langroid/embedding_models/protoc/embeddings_pb2.pyi +50 -0
  50. langroid/embedding_models/protoc/embeddings_pb2_grpc.py +79 -0
  51. langroid/embedding_models/remote_embeds.py +153 -0
  52. langroid/language_models/__init__.py +45 -0
  53. langroid/language_models/azure_openai.py +80 -27
  54. langroid/language_models/base.py +117 -12
  55. langroid/language_models/config.py +5 -0
  56. langroid/language_models/openai_assistants.py +3 -0
  57. langroid/language_models/openai_gpt.py +558 -174
  58. langroid/language_models/prompt_formatter/__init__.py +15 -0
  59. langroid/language_models/prompt_formatter/base.py +4 -6
  60. langroid/language_models/prompt_formatter/hf_formatter.py +135 -0
  61. langroid/language_models/utils.py +18 -21
  62. langroid/mytypes.py +25 -8
  63. langroid/parsing/__init__.py +46 -0
  64. langroid/parsing/document_parser.py +260 -63
  65. langroid/parsing/image_text.py +32 -0
  66. langroid/parsing/parse_json.py +143 -0
  67. langroid/parsing/parser.py +122 -59
  68. langroid/parsing/repo_loader.py +114 -52
  69. langroid/parsing/search.py +68 -63
  70. langroid/parsing/spider.py +3 -2
  71. langroid/parsing/table_loader.py +44 -0
  72. langroid/parsing/url_loader.py +59 -11
  73. langroid/parsing/urls.py +85 -37
  74. langroid/parsing/utils.py +298 -4
  75. langroid/parsing/web_search.py +73 -0
  76. langroid/prompts/__init__.py +11 -0
  77. langroid/prompts/chat-gpt4-system-prompt.md +68 -0
  78. langroid/prompts/prompts_config.py +1 -1
  79. langroid/utils/__init__.py +17 -0
  80. langroid/utils/algorithms/__init__.py +3 -0
  81. langroid/utils/algorithms/graph.py +103 -0
  82. langroid/utils/configuration.py +36 -5
  83. langroid/utils/constants.py +4 -0
  84. langroid/utils/globals.py +2 -2
  85. langroid/utils/logging.py +2 -5
  86. langroid/utils/output/__init__.py +21 -0
  87. langroid/utils/output/printing.py +47 -1
  88. langroid/utils/output/status.py +33 -0
  89. langroid/utils/pandas_utils.py +30 -0
  90. langroid/utils/pydantic_utils.py +616 -2
  91. langroid/utils/system.py +98 -0
  92. langroid/vector_store/__init__.py +40 -0
  93. langroid/vector_store/base.py +203 -6
  94. langroid/vector_store/chromadb.py +59 -32
  95. langroid/vector_store/lancedb.py +463 -0
  96. langroid/vector_store/meilisearch.py +10 -7
  97. langroid/vector_store/momento.py +262 -0
  98. langroid/vector_store/qdrantdb.py +104 -22
  99. {langroid-0.1.85.dist-info → langroid-0.1.219.dist-info}/METADATA +329 -149
  100. langroid-0.1.219.dist-info/RECORD +127 -0
  101. {langroid-0.1.85.dist-info → langroid-0.1.219.dist-info}/WHEEL +1 -1
  102. langroid/agent/special/recipient_validator_agent.py +0 -157
  103. langroid/parsing/json.py +0 -64
  104. langroid/utils/web/selenium_login.py +0 -36
  105. langroid-0.1.85.dist-info/RECORD +0 -94
  106. /langroid/{scripts → agent/callbacks}/__init__.py +0 -0
  107. {langroid-0.1.85.dist-info → langroid-0.1.219.dist-info}/LICENSE +0 -0
langroid/parsing/parser.py
@@ -1,7 +1,6 @@
 import logging
 from enum import Enum
-from functools import reduce
-from typing import List
+from typing import Dict, List, Literal
 
 import tiktoken
 from pydantic import BaseSettings
@@ -20,11 +19,21 @@ class Splitter(str, Enum):
 
 
 class PdfParsingConfig(BaseSettings):
-    library: str = "pdfplumber"
+    library: Literal[
+        "fitz",
+        "pdfplumber",
+        "pypdf",
+        "unstructured",
+        "pdf2image",
+    ] = "pdfplumber"
 
 
 class DocxParsingConfig(BaseSettings):
-    library: str = "unstructured"
+    library: Literal["python-docx", "unstructured"] = "unstructured"
+
+
+class DocParsingConfig(BaseSettings):
+    library: Literal["unstructured"] = "unstructured"
 
 
 class ParsingConfig(BaseSettings):
@@ -36,10 +45,12 @@ class ParsingConfig(BaseSettings):
     min_chunk_chars: int = 350
     discard_chunk_chars: int = 5  # discard chunks with fewer than this many chars
     n_similar_docs: int = 4
+    n_neighbor_ids: int = 5  # window size to store around each chunk
     separators: List[str] = ["\n\n", "\n", " ", ""]
     token_encoding_model: str = "text-embedding-ada-002"
     pdf: PdfParsingConfig = PdfParsingConfig()
     docx: DocxParsingConfig = DocxParsingConfig()
+    doc: DocParsingConfig = DocParsingConfig()
 
 
 class Parser:
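
(Illustration, not part of the diff: the new config fields above can be set directly when constructing a ParsingConfig. A minimal sketch using only names shown in these hunks; the values are arbitrary examples.)

    from langroid.parsing.parser import ParsingConfig, PdfParsingConfig

    # choose a PDF backend from the Literal choices, and a neighbor-window size
    cfg = ParsingConfig(
        n_neighbor_ids=3,  # keep up to 3 chunk ids on each side of a chunk
        pdf=PdfParsingConfig(library="fitz"),
    )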
@@ -51,72 +62,122 @@ class Parser:
         tokens = self.tokenizer.encode(text)
         return len(tokens)
 
+    def add_window_ids(self, chunks: List[Document]) -> None:
+        """Chunks may belong to multiple docs, but for each doc,
+        they appear consecutively. Add window_ids in metadata"""
+
+        # discard empty chunks
+        chunks = [c for c in chunks if c.content.strip() != ""]
+        if len(chunks) == 0:
+            return
+        # The original metadata.id (if any) is ignored since it will be same for all
+        # chunks and is useless. We want a distinct id for each chunk.
+        orig_ids = [c.metadata.id for c in chunks]
+        ids = [Document.hash_id(str(c)) for c in chunks]
+        id2chunk = {id: c for id, c in zip(ids, chunks)}
+
+        # group the ids by orig_id
+        orig_id_to_ids: Dict[str, List[str]] = {}
+        for orig_id, id in zip(orig_ids, ids):
+            if orig_id not in orig_id_to_ids:
+                orig_id_to_ids[orig_id] = []
+            orig_id_to_ids[orig_id].append(id)
+
+        # now each orig_id maps to a sequence of ids within a single doc
+
+        k = self.config.n_neighbor_ids
+        for orig, ids in orig_id_to_ids.items():
+            # ids are consecutive chunks in a single doc
+            n = len(ids)
+            window_ids = [ids[max(0, i - k) : min(n, i + k + 1)] for i in range(n)]
+            for i, _ in enumerate(ids):
+                c = id2chunk[ids[i]]
+                c.metadata.window_ids = window_ids[i]
+                c.metadata.id = ids[i]
+                c.metadata.is_chunk = True
+
     def split_simple(self, docs: List[Document]) -> List[Document]:
         if len(self.config.separators) == 0:
             raise ValueError("Must have at least one separator")
-        return [
-            Document(content=chunk.strip(), metadata=d.metadata)
-            for d in docs
-            for chunk in remove_extra_whitespace(d.content).split(
-                self.config.separators[0]
-            )
-            if chunk.strip() != ""
-        ]
+        final_docs = []
+
+        for d in docs:
+            if d.content.strip() == "":
+                continue
+            chunks = remove_extra_whitespace(d.content).split(self.config.separators[0])
+            chunk_docs = [
+                Document(
+                    content=c, metadata=d.metadata.copy(update=dict(is_chunk=True))
+                )
+                for c in chunks
+                if c.strip() != ""
+            ]
+            self.add_window_ids(chunk_docs)
+            final_docs += chunk_docs
+        return final_docs
 
     def split_para_sentence(self, docs: List[Document]) -> List[Document]:
-        final_chunks = []
         chunks = docs
         while True:
-            long_chunks = [
-                p
-                for p in chunks
-                if self.num_tokens(p.content) > 1.3 * self.config.chunk_size
-            ]
-            if len(long_chunks) == 0:
-                break
-            short_chunks = [
-                p
-                for p in chunks
-                if self.num_tokens(p.content) <= 1.3 * self.config.chunk_size
-            ]
-            final_chunks += short_chunks
-            chunks = self._split_para_sentence_once(long_chunks)
-            if len(chunks) == len(long_chunks):
-                max_len = max([self.num_tokens(p.content) for p in long_chunks])
-                logger.warning(
-                    f"""
-                    Unable to split {len(long_chunks)} long chunks
-                    using chunk_size = {self.config.chunk_size}.
-                    Max chunk size is 46,840 tokens.
-                    """
-                )
+            un_splittables = 0
+            split_chunks = []
+            for c in chunks:
+                if c.content.strip() == "":
+                    continue
+                if self.num_tokens(c.content) <= 1.3 * self.config.chunk_size:
+                    # small chunk: no need to split
+                    split_chunks.append(c)
+                    continue
+                splits = self._split_para_sentence_once([c])
+                un_splittables += len(splits) == 1
+                split_chunks += splits
+            if len(split_chunks) == len(chunks):
+                if un_splittables > 0:
+                    max_len = max([self.num_tokens(p.content) for p in chunks])
+                    logger.warning(
+                        f"""
+                        Unable to split {un_splittables} chunks
+                        using chunk_size = {self.config.chunk_size}.
+                        Max chunk size is 46,840 tokens.
+                        """
+                    )
                 break  # we won't be able to shorten them with current settings
+            chunks = split_chunks.copy()
 
-        return final_chunks + chunks
+        self.add_window_ids(chunks)
+        return chunks
 
     def _split_para_sentence_once(self, docs: List[Document]) -> List[Document]:
-        chunked_docs = [
-            [
-                Document(content=chunk.strip(), metadata=d.metadata)
-                for chunk in create_chunks(
-                    d.content, self.config.chunk_size, self.num_tokens
+        final_chunks = []
+        for d in docs:
+            if d.content.strip() == "":
+                continue
+            chunks = create_chunks(d.content, self.config.chunk_size, self.num_tokens)
+            chunk_docs = [
+                Document(
+                    content=c, metadata=d.metadata.copy(update=dict(is_chunk=True))
                 )
-                if chunk.strip() != ""
+                for c in chunks
+                if c.strip() != ""
             ]
-            for d in docs
-        ]
-        return reduce(lambda x, y: x + y, chunked_docs)
+            final_chunks += chunk_docs
+
+        return final_chunks
 
     def split_chunk_tokens(self, docs: List[Document]) -> List[Document]:
-        chunked_docs = [
-            [
-                Document(content=chunk.strip(), metadata=d.metadata)
-                for chunk in self.chunk_tokens(d.content)
-                if chunk.strip() != ""
+        final_docs = []
+        for d in docs:
+            chunks = self.chunk_tokens(d.content)
+            chunk_docs = [
+                Document(
+                    content=c, metadata=d.metadata.copy(update=dict(is_chunk=True))
+                )
+                for c in chunks
+                if c.strip() != ""
             ]
-            for d in docs
-        ]
-        return reduce(lambda x, y: x + y, chunked_docs)
+            self.add_window_ids(chunk_docs)
+            final_docs += chunk_docs
+        return final_docs
 
     def chunk_tokens(
         self,
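
(Illustration, not part of the diff: the core of add_window_ids is the sliding-window expression over consecutive chunk ids. A standalone sketch of just that computation; the helper name is made up.)

    from typing import List

    def neighbor_windows(ids: List[str], k: int) -> List[List[str]]:
        # same expression as in add_window_ids: for chunk i, keep the ids of
        # up to k neighboring chunks on each side, plus the chunk itself
        n = len(ids)
        return [ids[max(0, i - k) : min(n, i + k + 1)] for i in range(n)]

    # neighbor_windows(["a", "b", "c", "d", "e"], k=1)
    # -> [["a", "b"], ["a", "b", "c"], ["b", "c", "d"], ["c", "d", "e"], ["d", "e"]]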
@@ -198,17 +259,19 @@ class Parser:
             # Increment the number of chunks
             num_chunks += 1
 
-        # Handle the remaining tokens
-        if tokens:
-            remaining_text = self.tokenizer.decode(tokens).replace("\n", " ").strip()
-            if len(remaining_text) > self.config.discard_chunk_chars:
-                chunks.append(remaining_text)
+        # There may be remaining tokens, but we discard them
+        # since we have already reached the maximum number of chunks
 
         return chunks
 
     def split(self, docs: List[Document]) -> List[Document]:
         if len(docs) == 0:
             return []
+        # create ids in metadata of docs if absent:
+        # we need this to distinguish docs later in add_window_ids
+        for d in docs:
+            if d.metadata.id in [None, ""]:
+                d.metadata.id = d._unique_hash_id()
         # some docs are already splits, so don't split them further!
         chunked_docs = [d for d in docs if d.metadata.is_chunk]
         big_docs = [d for d in docs if not d.metadata.is_chunk]
langroid/parsing/repo_loader.py
@@ -10,15 +10,15 @@ from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple, Union
 from urllib.parse import urlparse
 
-from bs4 import BeautifulSoup
 from dotenv import load_dotenv
 from github import Github
 from github.ContentFile import ContentFile
+from github.Label import Label
 from github.Repository import Repository
-from pydantic import BaseSettings
+from pydantic import BaseModel, BaseSettings, Field
 
 from langroid.mytypes import DocMetaData, Document
-from langroid.parsing.document_parser import DocumentParser
+from langroid.parsing.document_parser import DocumentParser, DocumentType
 from langroid.parsing.parser import Parser, ParsingConfig
 
 logger = logging.getLogger(__name__)
@@ -43,6 +43,22 @@ def _has_files(directory: str) -> bool:
     return False
 
 
+# Pydantic model for GitHub issue data
+class IssueData(BaseModel):
+    state: str = Field(..., description="State of issue e.g. open or closed")
+    year: int = Field(..., description="Year issue was created")
+    month: int = Field(..., description="Month issue was created")
+    day: int = Field(..., description="Day issue was created")
+    assignee: Optional[str] = Field(..., description="Assignee of issue")
+    size: Optional[str] = Field(..., description="Size of issue, e.g. XS, S, M, L, XXL")
+    text: str = Field(..., description="Text of issue, i.e. description body")
+
+
+def get_issue_size(labels: List[Label]) -> str | None:
+    sizes = ["XS", "S", "M", "L", "XL", "XXL"]
+    return next((label.name for label in labels if label.name in sizes), None)
+
+
 class RepoLoaderConfig(BaseSettings):
     """
     Configuration for RepoLoader.
@@ -155,6 +171,27 @@ class RepoLoader:
     def _get_dir_name(self) -> str:
         return urlparse(self.url).path.replace("/", "_")
 
+    def get_issues(self, k: int | None = 100) -> List[IssueData]:
+        """Get up to k issues from the GitHub repo."""
+        if k is None:
+            issues = self.repo.get_issues(state="all")
+        else:
+            issues = self.repo.get_issues(state="all")[:k]
+        issue_data_list = []
+        for issue in issues:
+            issue_data = IssueData(
+                state=issue.state,
+                year=issue.created_at.year,
+                month=issue.created_at.month,
+                day=issue.created_at.day,
+                assignee=issue.assignee.login if issue.assignee else None,
+                size=get_issue_size(issue.labels),
+                text=issue.body or "No issue description body.",
+            )
+            issue_data_list.append(issue_data)
+
+        return issue_data_list
+
     @staticmethod
     def _file_type(name: str) -> str:
         """
@@ -336,8 +373,8 @@ class RepoLoader:
 
         Returns:
             Tuple of (dict, List_of_Documents):
-                A dictionary containing file and directory names, with file contents, and
-                A list of Document objects for each file.
+                A dictionary containing file and directory names, with file
+                contents, and a list of Document objects for each file.
         """
         if path is None:
             if self.clone_path is None or not _has_files(self.clone_path):
@@ -382,8 +419,8 @@ class RepoLoader:
 
         Returns:
             Tuple of (dict, List_of_Documents):
-            A dictionary containing file and directory names, with file contents.
-            A list of Document objects for each file.
+                A dictionary containing file and directory names, with file contents.
+                A list of Document objects for each file.
         """
 
         folder_structure = {
@@ -453,18 +490,25 @@ class RepoLoader:
 
     @staticmethod
     def get_documents(
-        path: str,
+        path: str | bytes,
         parser: Parser = Parser(ParsingConfig()),
         file_types: Optional[List[str]] = None,
         exclude_dirs: Optional[List[str]] = None,
         depth: int = -1,
         lines: Optional[int] = None,
+        doc_type: str | DocumentType | None = None,
     ) -> List[Document]:
         """
         Recursively get all files under a path as Document objects.
 
         Args:
-            path (str): The path to the directory or file.
+            path (str|bytes): The path to the directory or file, or bytes content.
+                The bytes option is meant to support the case where the content
+                has already been read from a file in an upstream process
+                (e.g. from an API or a database), and we want to avoid having to
+                write it to a temporary file just to read it again.
+                (which can be very slow for large files,
+                especially in a docker container)
             parser (Parser): Parser to use to parse files.
             file_types (List[str], optional): List of file extensions OR
                 filenames OR file_path_names to include.
@@ -475,6 +519,7 @@ class RepoLoader:
                 which includes all depths.
             lines (int, optional): Number of lines to read from each file.
                 Defaults to None, which reads all lines.
+            doc_type (str|DocumentType, optional): The type of document to parse.
 
         Returns:
             List[Document]: List of Document objects representing files.
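
(Illustration, not part of the diff: with the widened path: str | bytes signature and the doc_type argument above, already-loaded bytes can be parsed without writing a temporary file. A hedged sketch; DocumentType.PDF is taken from the commented-out code in the next hunk, and the byte string is a placeholder.)

    from langroid.parsing.document_parser import DocumentType
    from langroid.parsing.parser import Parser, ParsingConfig
    from langroid.parsing.repo_loader import RepoLoader

    pdf_bytes: bytes = b"%PDF-1.4 ..."  # placeholder: content fetched upstream

    docs = RepoLoader.get_documents(
        pdf_bytes,                       # bytes instead of a filesystem path
        parser=Parser(ParsingConfig()),
        doc_type=DocumentType.PDF,       # the parameter also accepts a str, per the hint
    )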
@@ -482,52 +527,69 @@
         """
         docs = []
         file_paths = []
-        path_obj = Path(path).resolve()
-
-        if path_obj.is_file():
-            file_paths.append(str(path_obj))
+        if isinstance(path, bytes):
+            file_paths.append(path)
         else:
-            path_depth = len(path_obj.parts)
-            for root, dirs, files in os.walk(path):
-                # Exclude directories if needed
-                if exclude_dirs:
-                    dirs[:] = [d for d in dirs if d not in exclude_dirs]
-
-                current_depth = len(Path(root).resolve().parts) - path_depth
-                if depth == -1 or current_depth <= depth:
-                    for file in files:
-                        file_path = str(Path(root) / file)
-                        if (
-                            file_types is None
-                            or RepoLoader._file_type(file_path) in file_types
-                            or os.path.basename(file_path) in file_types
-                            or file_path in file_types
-                        ):
-                            file_paths.append(file_path)
+            path_obj = Path(path).resolve()
+
+            if path_obj.is_file():
+                file_paths.append(str(path_obj))
+            else:
+                path_depth = len(path_obj.parts)
+                for root, dirs, files in os.walk(path):
+                    # Exclude directories if needed
+                    if exclude_dirs:
+                        dirs[:] = [d for d in dirs if d not in exclude_dirs]
+
+                    current_depth = len(Path(root).resolve().parts) - path_depth
+                    if depth == -1 or current_depth <= depth:
+                        for file in files:
+                            file_path = str(Path(root) / file)
+                            if (
+                                file_types is None
+                                or RepoLoader._file_type(file_path) in file_types
+                                or os.path.basename(file_path) in file_types
+                                or file_path in file_types
+                            ):
+                                file_paths.append(file_path)
 
         for file_path in file_paths:
-            _, file_extension = os.path.splitext(file_path)
-            if file_extension.lower() in [".pdf", ".docx"]:
-                doc_parser = DocumentParser.create(
+            docs.extend(
+                DocumentParser.chunks_from_path_or_bytes(
                     file_path,
-                    parser.config,
-                )
-                docs.extend(doc_parser.get_doc_chunks())
-            else:
-                with open(file_path, "r") as f:
-                    if lines is not None:
-                        file_lines = list(itertools.islice(f, lines))
-                        content = "\n".join(line.strip() for line in file_lines)
-                    else:
-                        content = f.read()
-                soup = BeautifulSoup(content, "html.parser")
-                text = soup.get_text()
-                docs.append(
-                    Document(
-                        content=text,
-                        metadata=DocMetaData(source=str(file_path)),
-                    )
+                    parser,
+                    doc_type=doc_type,
+                    lines=lines,
                 )
+            )
+            # dtype: DocumentType = DocumentParser._document_type(file_path, doc_type)
+            # if dtype in [DocumentType.PDF, DocumentType.DOC, DocumentType.DOCX]:
+            #     doc_parser = DocumentParser.create(
+            #         file_path,
+            #         parser.config,
+            #         doc_type=doc_type,
+            #     )
+            #     new_chunks = doc_parser.get_doc_chunks()
+            #     if len(new_chunks) == 0 and file_extension.lower() == ".pdf":
+            #         doc_parser = ImagePdfParser(file_path, parser.config)
+            #         new_chunks = doc_parser.get_doc_chunks()
+            #     docs.extend(new_chunks)
+            # else:
+            #     # try getting as plain text; these will be chunked downstream
+            #     with open(file_path, "r") as f:
+            #         if lines is not None:
+            #             file_lines = list(itertools.islice(f, lines))
+            #             content = "\n".join(line.strip() for line in file_lines)
+            #         else:
+            #             content = f.read()
+            #     soup = BeautifulSoup(content, "html.parser")
+            #     text = soup.get_text()
+            #     docs.append(
+            #         Document(
+            #             content=text,
+            #             metadata=DocMetaData(source=str(file_path)),
+            #         )
+            #     )
 
         return docs
 
@@ -543,8 +605,8 @@ class RepoLoader:
         of lines per file (if any of these are specified).
 
         Args:
-            k(int): max number of files to load, or None for all files
-            depth(int): max depth to recurse, or None for infinite depth
+            k (int): max number of files to load, or None for all files
+            depth (int): max depth to recurse, or None for infinite depth
             lines (int): max number of lines to get, from a file, or None for all lines
 
         Returns: