langroid 0.1.64__py3-none-any.whl → 0.1.65__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -19,7 +19,7 @@ from langroid.embedding_models.models import OpenAIEmbeddingsConfig
19
19
  from langroid.language_models.base import StreamingIfAllowed
20
20
  from langroid.language_models.openai_gpt import OpenAIChatModel, OpenAIGPTConfig
21
21
  from langroid.mytypes import DocMetaData, Document, Entity
22
- from langroid.parsing.parser import ParsingConfig, Splitter
22
+ from langroid.parsing.parser import Parser, ParsingConfig, Splitter
23
23
  from langroid.parsing.repo_loader import RepoLoader
24
24
  from langroid.parsing.url_loader import URLLoader
25
25
  from langroid.parsing.urls import get_urls_and_paths
@@ -86,7 +86,8 @@ class DocChatAgentConfig(ChatAgentConfig):
86
86
  ]
87
87
  parsing: ParsingConfig = ParsingConfig( # modify as needed
88
88
  splitter=Splitter.TOKENS,
89
- chunk_size=500, # aim for this many tokens per chunk
89
+ chunk_size=800, # aim for this many tokens per chunk
90
+ overlap=100, # overlap between chunks
90
91
  max_chunks=10_000,
91
92
  # aim to have at least this many chars per chunk when
92
93
  # truncating due to punctuation
@@ -145,12 +146,13 @@ class DocChatAgent(ChatAgent):
145
146
  return
146
147
  urls, paths = get_urls_and_paths(self.config.doc_paths)
147
148
  docs: List[Document] = []
149
+ parser = Parser(self.config.parsing)
148
150
  if len(urls) > 0:
149
- loader = URLLoader(urls=urls)
151
+ loader = URLLoader(urls=urls, parser=parser)
150
152
  docs = loader.load()
151
153
  if len(paths) > 0:
152
154
  for p in paths:
153
- path_docs = RepoLoader.get_documents(p)
155
+ path_docs = RepoLoader.get_documents(p, parser=parser)
154
156
  docs.extend(path_docs)
155
157
  n_docs = len(docs)
156
158
  n_splits = self.ingest_docs(docs)
langroid/mytypes.py CHANGED
@@ -25,6 +25,7 @@ class DocMetaData(BaseModel):
25
25
  """Metadata for a document."""
26
26
 
27
27
  source: str = "context"
28
+ is_chunk: bool = False # if it is a chunk, don't split
28
29
 
29
30
  class Config:
30
31
  extra = Extra.allow
@@ -199,11 +199,18 @@ class Parser:
199
199
  def split(self, docs: List[Document]) -> List[Document]:
200
200
  if len(docs) == 0:
201
201
  return []
202
+ # some docs are already splits, so don't split them further!
203
+ chunked_docs = [d for d in docs if d.metadata.is_chunk]
204
+ big_docs = [d for d in docs if not d.metadata.is_chunk]
205
+ if len(big_docs) == 0:
206
+ return chunked_docs
202
207
  if self.config.splitter == Splitter.PARA_SENTENCE:
203
- return self.split_para_sentence(docs)
208
+ big_doc_chunks = self.split_para_sentence(big_docs)
204
209
  elif self.config.splitter == Splitter.TOKENS:
205
- return self.split_chunk_tokens(docs)
210
+ big_doc_chunks = self.split_chunk_tokens(big_docs)
206
211
  elif self.config.splitter == Splitter.SIMPLE:
207
- return self.split_simple(docs)
212
+ big_doc_chunks = self.split_simple(big_docs)
208
213
  else:
209
214
  raise ValueError(f"Unknown splitter: {self.config.splitter}")
215
+
216
+ return chunked_docs + big_doc_chunks
@@ -1,15 +1,20 @@
1
1
  from io import BytesIO
2
+ from typing import List
2
3
 
3
4
  import requests
4
5
  from pypdf import PdfReader
5
6
 
6
7
  from langroid.mytypes import DocMetaData, Document
7
- from langroid.parsing.parser import Parser, ParsingConfig
8
+ from langroid.parsing.parser import Parser
8
9
 
9
10
 
10
11
  class PdfParser(Parser):
11
- def __init__(self, config: ParsingConfig):
12
- super().__init__(config)
12
+ def __init__(self, parser: Parser):
13
+ super().__init__(parser.config)
14
+
15
+ @classmethod
16
+ def from_Parser(cls, parser: Parser) -> "PdfParser":
17
+ return cls(parser)
13
18
 
14
19
  @staticmethod
15
20
  def _text_from_pdf_reader(reader: PdfReader) -> str:
@@ -25,7 +30,7 @@ class PdfParser(Parser):
25
30
  text += page.extract_text()
26
31
  return text
27
32
 
28
- def _chunk_docs_from_pdf_reader(
33
+ def _doc_chunks_from_pdf_reader(
29
34
  self,
30
35
  reader: PdfReader,
31
36
  doc: str,
@@ -46,7 +51,8 @@ class PdfParser(Parser):
46
51
  overlap (int): number of tokens to overlap between chunks
47
52
 
48
53
  Returns:
49
- List[Document]: a list of `Document` objects, each containing a chunk of text
54
+ List[Document]: a list of `Document` objects,
55
+ each containing a chunk of text
50
56
  """
51
57
 
52
58
  split = [] # tokens in curr split
@@ -64,7 +70,10 @@ class PdfParser(Parser):
64
70
  docs.append(
65
71
  Document(
66
72
  content=self.tokenizer.decode(split[:chunk_tokens]),
67
- metadata=DocMetaData(source=f"{doc} pages {pg}"),
73
+ metadata=DocMetaData(
74
+ source=f"{doc} pages {pg}",
75
+ is_chunk=True,
76
+ ),
68
77
  )
69
78
  )
70
79
  split = split[chunk_tokens - overlap :]
@@ -74,14 +83,45 @@ class PdfParser(Parser):
74
83
  docs.append(
75
84
  Document(
76
85
  content=self.tokenizer.decode(split[:chunk_tokens]),
77
- metadata=DocMetaData(source=f"{doc} pages {pg}"),
86
+ metadata=DocMetaData(
87
+ source=f"{doc} pages {pg}",
88
+ is_chunk=True,
89
+ ),
78
90
  )
79
91
  )
80
92
  return docs
81
93
 
94
+ @staticmethod
95
+ def doc_chunks_from_pdf_url(url: str, parser: Parser) -> List[Document]:
96
+ """
97
+ Get chunks of text from pdf URL as a list of Document objects,
98
+ using the parser's chunk_size and overlap settings.
99
+
100
+ Args:
101
+ url (str): contains the URL to the PDF file
102
+ Returns:
103
+ List[Document]: chunks of the pdf file's text, each a `Document`
104
+ whose metadata source records the url and pages
105
+ """
106
+
107
+ pdfParser = PdfParser.from_Parser(parser)
108
+ response = requests.get(url)
109
+ response.raise_for_status()
110
+ with BytesIO(response.content) as f:
111
+ reader = PdfReader(f)
112
+ docs = pdfParser._doc_chunks_from_pdf_reader(
113
+ reader,
114
+ doc=url,
115
+ chunk_tokens=parser.config.chunk_size,
116
+ overlap=parser.config.overlap,
117
+ )
118
+ return docs
119
+
82
120
  @staticmethod
83
121
  def get_doc_from_pdf_url(url: str) -> Document:
84
122
  """
123
+ Get entire text from pdf URL as a single document.
124
+
85
125
  Args:
86
126
  url (str): contains the URL to the PDF file
87
127
  Returns:
@@ -92,9 +132,32 @@ class PdfParser(Parser):
92
132
  response.raise_for_status()
93
133
  with BytesIO(response.content) as f:
94
134
  reader = PdfReader(f)
95
- text = _text_from_pdf_reader(reader)
135
+ text = PdfParser._text_from_pdf_reader(reader)
96
136
  return Document(content=text, metadata=DocMetaData(source=str(url)))
97
137
 
138
+ @staticmethod
139
+ def doc_chunks_from_pdf_path(path: str, parser: Parser) -> List[Document]:
140
+ """
141
+ Get chunks of text from pdf path as a list of Document objects,
142
+ using the parser's chunk_size and overlap settings.
143
+
144
+ Args:
145
+ path (str): path to the PDF file
146
+ Returns:
147
+ List[Document]: chunks of the pdf file's text, each a `Document`
148
+ whose metadata source records the path and pages
149
+ """
150
+
151
+ pdfParser = PdfParser.from_Parser(parser)
152
+ reader = PdfReader(path)
153
+ docs = pdfParser._doc_chunks_from_pdf_reader(
154
+ reader,
155
+ doc=path,
156
+ chunk_tokens=parser.config.chunk_size,
157
+ overlap=parser.config.overlap,
158
+ )
159
+ return docs
160
+
98
161
  @staticmethod
99
162
  def get_doc_from_pdf_file(path: str) -> Document:
100
163
  """
@@ -107,5 +170,5 @@ class PdfParser(Parser):
107
170
  and metadata containing path/url
108
171
  """
109
172
  reader = PdfReader(path)
110
- text = _text_from_pdf_reader(reader)
173
+ text = PdfParser._text_from_pdf_reader(reader)
111
174
  return Document(content=text, metadata=DocMetaData(source=str(path)))
@@ -18,7 +18,8 @@ from github.Repository import Repository
18
18
  from pydantic import BaseSettings
19
19
 
20
20
  from langroid.mytypes import DocMetaData, Document
21
- from langroid.parsing.pdf_parser import get_doc_from_pdf_file
21
+ from langroid.parsing.parser import Parser
22
+ from langroid.parsing.pdf_parser import PdfParser
22
23
 
23
24
  logger = logging.getLogger(__name__)
24
25
 
@@ -443,6 +444,7 @@ class RepoLoader:
443
444
  exclude_dirs: Optional[List[str]] = None,
444
445
  depth: int = -1,
445
446
  lines: Optional[int] = None,
447
+ parser: Optional[Parser] = None,
446
448
  ) -> List[Document]:
447
449
  """
448
450
  Recursively get all files under a path as Document objects.
@@ -458,6 +460,7 @@ class RepoLoader:
458
460
  which includes all depths.
459
461
  lines (int, optional): Number of lines to read from each file.
460
462
  Defaults to None, which reads all lines.
463
+ parser (Parser, optional): used to chunk PDF files; if None, PDFs load whole.
461
464
 
462
465
  Returns:
463
466
  List[Document]: List of Document objects representing files.
@@ -490,8 +493,11 @@ class RepoLoader:
490
493
 
491
494
  for file_path in file_paths:
492
495
  _, file_extension = os.path.splitext(file_path)
493
- if file_extension == ".pdf":
494
- docs.append(get_doc_from_pdf_file(file_path))
496
+ if file_extension.lower() == ".pdf":
497
+ if parser is None:
498
+ docs.append(PdfParser.get_doc_from_pdf_file(file_path))
499
+ else:
500
+ docs.extend(PdfParser.doc_chunks_from_pdf_path(file_path, parser))
495
501
  else:
496
502
  with open(file_path, "r") as f:
497
503
  if lines is not None:
@@ -9,7 +9,8 @@ from trafilatura.downloads import (
9
9
  )
10
10
 
11
11
  from langroid.mytypes import DocMetaData, Document
12
- from langroid.parsing.pdf_parser import get_doc_from_pdf_url
12
+ from langroid.parsing.parser import Parser
13
+ from langroid.parsing.pdf_parser import PdfParser
13
14
 
14
15
  logging.getLogger("trafilatura").setLevel(logging.ERROR)
15
16
 
@@ -26,8 +27,9 @@ class URLLoader:
26
27
  the "accept" button on the cookie dialog.
27
28
  """
28
29
 
29
- def __init__(self, urls: List[str]):
30
+ def __init__(self, urls: List[str], parser: Parser | None = None):
30
31
  self.urls = urls
32
+ self.parser = parser
31
33
 
32
34
  @no_type_check
33
35
  def load(self) -> List[Document]:
@@ -43,7 +45,10 @@ class URLLoader:
43
45
  )
44
46
  for url, result in buffered_downloads(buffer, threads):
45
47
  if url.lower().endswith(".pdf"):
46
- docs.append(get_doc_from_pdf_url(url))
48
+ if self.parser is None:
49
+ docs.append(PdfParser.get_doc_from_pdf_url(url))
50
+ else:
51
+ docs.extend(PdfParser.doc_chunks_from_pdf_url(url, self.parser))
47
52
  else:
48
53
  text = trafilatura.extract(
49
54
  result,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: langroid
3
- Version: 0.1.64
3
+ Version: 0.1.65
4
4
  Summary: Harness LLMs with Multi-Agent Programming
5
5
  License: MIT
6
6
  Author: Prasad Chalasani
@@ -6,7 +6,7 @@ langroid/agent/chat_document.py,sha256=k7Klav3FIBTf2w95bQtxgqBrf2fMo1ydSlklQvv4R
6
6
  langroid/agent/helpers.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
7
  langroid/agent/junk,sha256=LxfuuW7Cijsg0szAzT81OjWWv1PMNI-6w_-DspVIO2s,339
8
8
  langroid/agent/special/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
- langroid/agent/special/doc_chat_agent.py,sha256=lKenTsnbR91-NeZSvFyWJcmY_8_fKckCuR9oQQVnMXw,18097
9
+ langroid/agent/special/doc_chat_agent.py,sha256=RmYRHclTz7D8dH3eIBFWINm8Cl5zvICt2Ri-6AHRPqc,18227
10
10
  langroid/agent/special/recipient_validator_agent.py,sha256=R3Rit93BNWQar_9stuDBGzmLr2W-IYOQ7oq-tlNNlps,6035
11
11
  langroid/agent/special/retriever_agent.py,sha256=DeOB5crFjXBvDEZT9k9ZVinOfFM2VgS6tQWWFyXSk9o,7204
12
12
  langroid/agent/special/sql/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -40,18 +40,18 @@ langroid/language_models/prompt_formatter/__init__.py,sha256=47DEQpj8HBSa-_TImW-
40
40
  langroid/language_models/prompt_formatter/base.py,sha256=2y_GcwhstvB5ih3haS7l5Fv79jVnFJ_vEw1jqWJzB9k,1247
41
41
  langroid/language_models/prompt_formatter/llama2_formatter.py,sha256=YdcO88qyBeuMENVIVvVqSYuEpvYSTndUe_jd6hVTko4,2899
42
42
  langroid/language_models/utils.py,sha256=rmnSn-sJ3aKl_wBdeLPkck0Li4Ed6zkCxZYYl7n1V34,4668
43
- langroid/mytypes.py,sha256=_0TnRjIRFvJh1MThFGqtD8hUzq1W3LqzTS_WCFeodzw,1559
43
+ langroid/mytypes.py,sha256=nJyTaX-nAe2dwRdF8NZKxUNAy_Hvxgtkn1c9buT-d14,1619
44
44
  langroid/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
45
45
  langroid/parsing/agent_chats.py,sha256=sbZRV9ujdM5QXvvuHVjIi2ysYSYlap-uqfMMUKulrW0,1068
46
46
  langroid/parsing/code-parsing.md,sha256=--cyyNiSZSDlIwcjAV4-shKrSiRe2ytF3AdSoS_hD2g,3294
47
47
  langroid/parsing/code_parser.py,sha256=BbDAzp35wkYQ9U1dpf1ARL0lVyi0tfqEc6_eox2C090,3727
48
48
  langroid/parsing/json.py,sha256=MVqBUfInALQm1QKbcfEvLzWxBz_UztCIyGk7AK5uFPo,1650
49
49
  langroid/parsing/para_sentence_split.py,sha256=AJBzZojP3zpB-_IMiiHismhqcvkrVBQ3ZINoQyx_bE4,2000
50
- langroid/parsing/parser.py,sha256=GDYJSQAY-kHoGNpYll8HlSrsFpa_fJmlEfV1HWuXSJ0,7478
51
- langroid/parsing/pdf_parser.py,sha256=YRH4ZT0UsdMAaDi7vy0DBQTXOTxOF1xXERrs9nmoQ0c,3836
52
- langroid/parsing/repo_loader.py,sha256=ILlvBH-wSvfdLLQKHklAzuxU980_ajts_bz7_9IwtLY,27017
50
+ langroid/parsing/parser.py,sha256=R1yvqjBvXKOcX1opCj5nTYuDK2HLAfkonM055DM5CP8,7826
51
+ langroid/parsing/pdf_parser.py,sha256=0FHrxFXwJ5m9xLQlyKrlRGeApGjkJ3gRk1a3fs1h7AI,5890
52
+ langroid/parsing/repo_loader.py,sha256=lQ_9tceOEiPWATf_SL43iubz05G1XXoF4vINsKJHmkY,27324
53
53
  langroid/parsing/table_loader.py,sha256=uqbupGr4y_7os18RtaY5GpD0hWcgzROoNy8dQIHB4kc,1767
54
- langroid/parsing/url_loader.py,sha256=ROXkdkzFeQ9lxJhaqYu95Eh9nWoZjwj2rlKWapj7p0Q,1839
54
+ langroid/parsing/url_loader.py,sha256=MfYr2zK-1pOMEEc9y_vrMdtC052XMlr57X2ptYKpDX4,2103
55
55
  langroid/parsing/url_loader_cookies.py,sha256=Lg4sNpRz9MByWq2mde6T0hKv68VZSV3mtMjNEHuFeSU,2327
56
56
  langroid/parsing/urls.py,sha256=_Bcf1iRdT7cQrQ8hnbPX0Jtzxc0lVFaucTS5rJoKA14,3709
57
57
  langroid/parsing/utils.py,sha256=zqvZWpZktRJTKx_JAqxaIyoudMdKVdB1zzjnOhVYHS4,2196
@@ -82,7 +82,7 @@ langroid/vector_store/base.py,sha256=QZx3NUNwf2I0r3A7iuoUHIRGbqt_pFGD0hq1R-Yg8iM
82
82
  langroid/vector_store/chromadb.py,sha256=s5pQkKjaMP-Tt5A8M10EInFzttaALPbJAq7q4gf0TKg,5235
83
83
  langroid/vector_store/qdrant_cloud.py,sha256=3im4Mip0QXLkR6wiqVsjV1QvhSElfxdFSuDKddBDQ-4,188
84
84
  langroid/vector_store/qdrantdb.py,sha256=KRvIIj1IZG2zFqejofMnRs2hT86B-27LgBEnuczdqOU,9072
85
- langroid-0.1.64.dist-info/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
86
- langroid-0.1.64.dist-info/WHEEL,sha256=vVCvjcmxuUltf8cYhJ0sJMRDLr1XsPuxEId8YDzbyCY,88
87
- langroid-0.1.64.dist-info/METADATA,sha256=Zag_MPQ33R4KqBF3Uiptj39sRMAVT4o7PWoaiRYrvb4,35776
88
- langroid-0.1.64.dist-info/RECORD,,
85
+ langroid-0.1.65.dist-info/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
86
+ langroid-0.1.65.dist-info/WHEEL,sha256=vVCvjcmxuUltf8cYhJ0sJMRDLr1XsPuxEId8YDzbyCY,88
87
+ langroid-0.1.65.dist-info/METADATA,sha256=B5O6tKm5lGdBFcm6Npepa4nYHHZ8zMq0VNkUxSOEo1g,35776
88
+ langroid-0.1.65.dist-info/RECORD,,