langroid-0.1.63-py3-none-any.whl → langroid-0.1.65-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
langroid/agent/special/doc_chat_agent.py CHANGED
@@ -19,7 +19,7 @@ from langroid.embedding_models.models import OpenAIEmbeddingsConfig
 from langroid.language_models.base import StreamingIfAllowed
 from langroid.language_models.openai_gpt import OpenAIChatModel, OpenAIGPTConfig
 from langroid.mytypes import DocMetaData, Document, Entity
-from langroid.parsing.parser import ParsingConfig, Splitter
+from langroid.parsing.parser import Parser, ParsingConfig, Splitter
 from langroid.parsing.repo_loader import RepoLoader
 from langroid.parsing.url_loader import URLLoader
 from langroid.parsing.urls import get_urls_and_paths
@@ -86,7 +86,8 @@ class DocChatAgentConfig(ChatAgentConfig):
     ]
     parsing: ParsingConfig = ParsingConfig(  # modify as needed
         splitter=Splitter.TOKENS,
-        chunk_size=500,  # aim for this many tokens per chunk
+        chunk_size=800,  # aim for this many tokens per chunk
+        overlap=100,  # overlap between chunks
         max_chunks=10_000,
         # aim to have at least this many chars per chunk when
         # truncating due to punctuation
@@ -145,12 +146,13 @@ class DocChatAgent(ChatAgent):
             return
         urls, paths = get_urls_and_paths(self.config.doc_paths)
         docs: List[Document] = []
+        parser = Parser(self.config.parsing)
         if len(urls) > 0:
-            loader = URLLoader(urls=urls)
+            loader = URLLoader(urls=urls, parser=parser)
             docs = loader.load()
         if len(paths) > 0:
             for p in paths:
-                path_docs = RepoLoader.get_documents(p)
+                path_docs = RepoLoader.get_documents(p, parser=parser)
                 docs.extend(path_docs)
         n_docs = len(docs)
         n_splits = self.ingest_docs(docs)
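
Net effect of the two hunks above: DocChatAgentConfig now defaults to 800-token chunks with a 100-token overlap, and ingest() builds a single Parser from that config and hands it to both loaders. A minimal usage sketch (the doc_paths value is hypothetical):

    from langroid.agent.special.doc_chat_agent import DocChatAgent, DocChatAgentConfig
    from langroid.parsing.parser import ParsingConfig, Splitter

    # Mirrors the new defaults shown in the hunk above; the path is illustrative.
    cfg = DocChatAgentConfig(
        doc_paths=["docs/handbook.pdf"],
        parsing=ParsingConfig(splitter=Splitter.TOKENS, chunk_size=800, overlap=100),
    )
    agent = DocChatAgent(cfg)
    # Ingestion now passes one Parser to both URLLoader and
    # RepoLoader.get_documents, as the hunk above shows.
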
langroid/agent/special/table_chat_agent.py CHANGED
@@ -45,6 +45,10 @@ If you receive a null or other unexpected result, see if you have made an assump
 in your code, and try another way, or use `run_code` to explore the dataframe
 before submitting your final code.
 
+Once you have the answer to the question, say DONE and show me the answer.
+If you receive an error message, try using the `run_code` tool/function
+again with the corrected code.
+
 Start by asking me what I want to know about the data.
 """
 
langroid/mytypes.py CHANGED
@@ -25,6 +25,7 @@ class DocMetaData(BaseModel):
     """Metadata for a document."""
 
     source: str = "context"
+    is_chunk: bool = False  # if it is a chunk, don't split
 
     class Config:
         extra = Extra.allow
langroid/parsing/parser.py CHANGED
@@ -22,6 +22,7 @@ class Splitter(str, Enum):
 class ParsingConfig(BaseSettings):
     splitter: str = Splitter.TOKENS
     chunk_size: int = 200  # aim for this many tokens per chunk
+    overlap: int = 50  # overlap between chunks
     max_chunks: int = 10_000
     # aim to have at least this many chars per chunk when truncating due to punctuation
     min_chunk_chars: int = 350
@@ -198,11 +199,18 @@ class Parser:
     def split(self, docs: List[Document]) -> List[Document]:
         if len(docs) == 0:
             return []
+        # some docs are already splits, so don't split them further!
+        chunked_docs = [d for d in docs if d.metadata.is_chunk]
+        big_docs = [d for d in docs if not d.metadata.is_chunk]
+        if len(big_docs) == 0:
+            return chunked_docs
         if self.config.splitter == Splitter.PARA_SENTENCE:
-            return self.split_para_sentence(docs)
+            big_doc_chunks = self.split_para_sentence(big_docs)
         elif self.config.splitter == Splitter.TOKENS:
-            return self.split_chunk_tokens(docs)
+            big_doc_chunks = self.split_chunk_tokens(big_docs)
         elif self.config.splitter == Splitter.SIMPLE:
-            return self.split_simple(docs)
+            big_doc_chunks = self.split_simple(big_docs)
         else:
             raise ValueError(f"Unknown splitter: {self.config.splitter}")
+
+        return chunked_docs + big_doc_chunks
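
Two things happen in parser.py: split() now passes through documents already flagged is_chunk=True (the new DocMetaData field), and the new overlap setting gives token chunks a shared margin, with each window advancing by chunk_size - overlap tokens. A standalone sketch of that sliding-window arithmetic (a simplification for illustration, not the library's implementation):

    def chunk_with_overlap(tokens: list, chunk_tokens: int, overlap: int) -> list:
        # Assumes overlap < chunk_tokens. Each chunk holds chunk_tokens
        # tokens; consecutive chunks share `overlap` tokens, so the
        # window advances by chunk_tokens - overlap per step.
        chunks = []
        while len(tokens) > chunk_tokens:
            chunks.append(tokens[:chunk_tokens])
            tokens = tokens[chunk_tokens - overlap:]
        if len(tokens) > overlap:  # keep the tail unless it is pure overlap
            chunks.append(tokens)
        return chunks

    # 10 tokens, chunk_tokens=4, overlap=1 -> [0..3], [3..6], [6..9]
    print(chunk_with_overlap(list(range(10)), 4, 1))
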
langroid/parsing/pdf_parser.py CHANGED
@@ -1,51 +1,174 @@
 from io import BytesIO
+from typing import List
 
 import requests
 from pypdf import PdfReader
 
 from langroid.mytypes import DocMetaData, Document
+from langroid.parsing.parser import Parser
 
 
-def _text_from_pdf_reader(reader: PdfReader) -> str:
-    """
-    Extract text from a `PdfReader` object.
-    Args:
-        reader (PdfReader): a `PdfReader` object
-    Returns:
-        str: the extracted text
-    """
-    text = ""
-    for page in reader.pages:
-        text += page.extract_text()
-    return text
-
-
-def get_doc_from_pdf_url(url: str) -> Document:
-    """
-    Args:
-        url (str): contains the URL to the PDF file
-    Returns:
-        a `Document` object containing the content of the pdf file,
-        and metadata containing url
-    """
-    response = requests.get(url)
-    response.raise_for_status()
-    with BytesIO(response.content) as f:
-        reader = PdfReader(f)
-        text = _text_from_pdf_reader(reader)
-    return Document(content=text, metadata=DocMetaData(source=str(url)))
-
-
-def get_doc_from_pdf_file(path: str) -> Document:
-    """
-    Given local path to a PDF file, extract the text content.
-    Args:
-        path (str): full path to the PDF file
-            PDF file obtained via URL
-    Returns:
-        a `Document` object containing the content of the pdf file,
-        and metadata containing path/url
-    """
-    reader = PdfReader(path)
-    text = _text_from_pdf_reader(reader)
-    return Document(content=text, metadata=DocMetaData(source=str(path)))
+class PdfParser(Parser):
+    def __init__(self, parser: Parser):
+        super().__init__(parser.config)
+
+    @classmethod
+    def from_Parser(cls, parser: Parser) -> "PdfParser":
+        return cls(parser)
+
+    @staticmethod
+    def _text_from_pdf_reader(reader: PdfReader) -> str:
+        """
+        Extract text from a `PdfReader` object.
+        Args:
+            reader (PdfReader): a `PdfReader` object
+        Returns:
+            str: the extracted text
+        """
+        text = ""
+        for page in reader.pages:
+            text += page.extract_text()
+        return text
+
+    def _doc_chunks_from_pdf_reader(
+        self,
+        reader: PdfReader,
+        doc: str,
+        chunk_tokens: int,
+        overlap: int = 0,
+    ) -> List[Document]:
+        """
+        Get document chunks from a PdfReader object,
+        with page references in the document metadata.
+
+        Adapted from
+        https://github.com/whitead/paper-qa/blob/main/paperqa/readers.py
+
+        Args:
+            reader (PdfReader): a `PdfReader` object
+            doc: URL or filename of the PDF file
+            chunk_tokens (int): number of tokens in each chunk
+            overlap (int): number of tokens to overlap between chunks
+
+        Returns:
+            List[Document]: a list of `Document` objects,
+            each containing a chunk of text
+        """
+
+        split = []  # tokens in curr split
+        pages: List[str] = []
+        docs: List[Document] = []
+        for i, page in enumerate(reader.pages):
+            split += self.tokenizer.encode(page.extract_text())
+            pages.append(str(i + 1))
+            # split could be so long it needs to be split
+            # into multiple chunks. Or it could be so short
+            # that it needs to be combined with the next chunk.
+            while len(split) > chunk_tokens:
+                # pretty formatting of pages (e.g. 1-3, 4, 5-7)
+                pg = "-".join([pages[0], pages[-1]])
+                docs.append(
+                    Document(
+                        content=self.tokenizer.decode(split[:chunk_tokens]),
+                        metadata=DocMetaData(
+                            source=f"{doc} pages {pg}",
+                            is_chunk=True,
+                        ),
+                    )
+                )
+                split = split[chunk_tokens - overlap :]
+                pages = [str(i + 1)]
+        if len(split) > overlap:
+            pg = "-".join([pages[0], pages[-1]])
+            docs.append(
+                Document(
+                    content=self.tokenizer.decode(split[:chunk_tokens]),
+                    metadata=DocMetaData(
+                        source=f"{doc} pages {pg}",
+                        is_chunk=True,
+                    ),
+                )
+            )
+        return docs
+
+    @staticmethod
+    def doc_chunks_from_pdf_url(url: str, parser: Parser) -> List[Document]:
+        """
+        Get chunks of text from pdf URL as a list of Document objects,
+        using the parser's chunk_size and overlap settings.
+
+        Args:
+            url (str): contains the URL to the PDF file
+        Returns:
+            a `Document` object containing the content of the pdf file,
+            and metadata containing url
+        """
+
+        pdfParser = PdfParser.from_Parser(parser)
+        response = requests.get(url)
+        response.raise_for_status()
+        with BytesIO(response.content) as f:
+            reader = PdfReader(f)
+            docs = pdfParser._doc_chunks_from_pdf_reader(
+                reader,
+                doc=url,
+                chunk_tokens=parser.config.chunk_size,
+                overlap=parser.config.overlap,
+            )
+        return docs
+
+    @staticmethod
+    def get_doc_from_pdf_url(url: str) -> Document:
+        """
+        Get entire text from pdf URL as a single document.
+
+        Args:
+            url (str): contains the URL to the PDF file
+        Returns:
+            a `Document` object containing the content of the pdf file,
+            and metadata containing url
+        """
+        response = requests.get(url)
+        response.raise_for_status()
+        with BytesIO(response.content) as f:
+            reader = PdfReader(f)
+            text = PdfParser._text_from_pdf_reader(reader)
+        return Document(content=text, metadata=DocMetaData(source=str(url)))
+
+    @staticmethod
+    def doc_chunks_from_pdf_path(path: str, parser: Parser) -> List[Document]:
+        """
+        Get chunks of text from pdf path as a list of Document objects,
+        using the parser's chunk_size and overlap settings.
+
+        Args:
+            url (str): contains the URL to the PDF file
+        Returns:
+            a `Document` object containing the content of the pdf file,
+            and metadata containing url
+        """
+
+        pdfParser = PdfParser.from_Parser(parser)
+        reader = PdfReader(path)
+        docs = pdfParser._doc_chunks_from_pdf_reader(
+            reader,
+            doc=path,
+            chunk_tokens=parser.config.chunk_size,
+            overlap=parser.config.overlap,
+        )
+        return docs
+
+    @staticmethod
+    def get_doc_from_pdf_file(path: str) -> Document:
+        """
+        Given local path to a PDF file, extract the text content.
+        Args:
+            path (str): full path to the PDF file
+                PDF file obtained via URL
+        Returns:
+            a `Document` object containing the content of the pdf file,
+            and metadata containing path/url
+        """
+        reader = PdfReader(path)
+        text = PdfParser._text_from_pdf_reader(reader)
+        return Document(content=text, metadata=DocMetaData(source=str(path)))
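
The old module-level helpers survive as static methods, and the new doc_chunks_from_pdf_* variants chunk at read time using the parser's chunk_size and overlap, recording each chunk's page range in metadata.source. A usage sketch (the filename is hypothetical):

    from langroid.parsing.parser import Parser, ParsingConfig
    from langroid.parsing.pdf_parser import PdfParser

    parser = Parser(ParsingConfig(chunk_size=800, overlap=100))

    # Chunked: one Document per ~800-token window, with sources like
    # "paper.pdf pages 1-3"; each chunk is marked is_chunk=True.
    chunks = PdfParser.doc_chunks_from_pdf_path("paper.pdf", parser)

    # Whole-document extraction still works as before:
    doc = PdfParser.get_doc_from_pdf_file("paper.pdf")
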
langroid/parsing/repo_loader.py CHANGED
@@ -18,7 +18,8 @@ from github.Repository import Repository
 from pydantic import BaseSettings
 
 from langroid.mytypes import DocMetaData, Document
-from langroid.parsing.pdf_parser import get_doc_from_pdf_file
+from langroid.parsing.parser import Parser
+from langroid.parsing.pdf_parser import PdfParser
 
 logger = logging.getLogger(__name__)
 
@@ -443,6 +444,7 @@ class RepoLoader:
         exclude_dirs: Optional[List[str]] = None,
         depth: int = -1,
         lines: Optional[int] = None,
+        parser: Optional[Parser] = None,
    ) -> List[Document]:
        """
        Recursively get all files under a path as Document objects.
@@ -458,6 +460,7 @@
                 which includes all depths.
             lines (int, optional): Number of lines to read from each file.
                 Defaults to None, which reads all lines.
+            parser (Parser, optional): Parser to use to parse files.
 
         Returns:
             List[Document]: List of Document objects representing files.
@@ -490,8 +493,11 @@
 
         for file_path in file_paths:
             _, file_extension = os.path.splitext(file_path)
-            if file_extension == ".pdf":
-                docs.append(get_doc_from_pdf_file(file_path))
+            if file_extension.lower() == ".pdf":
+                if parser is None:
+                    docs.append(PdfParser.get_doc_from_pdf_file(file_path))
+                else:
+                    docs.extend(PdfParser.doc_chunks_from_pdf_path(file_path, parser))
             else:
                 with open(file_path, "r") as f:
                     if lines is not None:
langroid/parsing/url_loader.py CHANGED
@@ -9,7 +9,8 @@ from trafilatura.downloads import (
 )
 
 from langroid.mytypes import DocMetaData, Document
-from langroid.parsing.pdf_parser import get_doc_from_pdf_url
+from langroid.parsing.parser import Parser
+from langroid.parsing.pdf_parser import PdfParser
 
 logging.getLogger("trafilatura").setLevel(logging.ERROR)
 
@@ -26,8 +27,9 @@ class URLLoader:
     the "accept" button on the cookie dialog.
     """
 
-    def __init__(self, urls: List[str]):
+    def __init__(self, urls: List[str], parser: Parser | None = None):
         self.urls = urls
+        self.parser = parser
 
     @no_type_check
     def load(self) -> List[Document]:
@@ -43,7 +45,10 @@
         )
         for url, result in buffered_downloads(buffer, threads):
             if url.lower().endswith(".pdf"):
-                docs.append(get_doc_from_pdf_url(url))
+                if self.parser is None:
+                    docs.append(PdfParser.get_doc_from_pdf_url(url))
+                else:
+                    docs.extend(PdfParser.doc_chunks_from_pdf_url(url, self.parser))
             else:
                 text = trafilatura.extract(
                     result,
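
Because the parser argument defaults to None in both loaders, existing callers keep the old one-Document-per-PDF behavior; passing a parser opts in to chunking. A sketch of the two modes (the URL is illustrative):

    from langroid.parsing.parser import Parser, ParsingConfig
    from langroid.parsing.url_loader import URLLoader

    urls = ["https://example.com/report.pdf"]  # hypothetical URL

    whole = URLLoader(urls=urls).load()  # one Document per PDF, as before
    chunked = URLLoader(
        urls=urls,
        parser=Parser(ParsingConfig(chunk_size=800, overlap=100)),
    ).load()  # overlapping token chunks with is_chunk=True metadata
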
langroid-0.1.63.dist-info/METADATA → langroid-0.1.65.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: langroid
-Version: 0.1.63
+Version: 0.1.65
 Summary: Harness LLMs with Multi-Agent Programming
 License: MIT
 Author: Prasad Chalasani
langroid-0.1.63.dist-info/RECORD → langroid-0.1.65.dist-info/RECORD RENAMED
@@ -6,7 +6,7 @@ langroid/agent/chat_document.py,sha256=k7Klav3FIBTf2w95bQtxgqBrf2fMo1ydSlklQvv4R
 langroid/agent/helpers.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 langroid/agent/junk,sha256=LxfuuW7Cijsg0szAzT81OjWWv1PMNI-6w_-DspVIO2s,339
 langroid/agent/special/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-langroid/agent/special/doc_chat_agent.py,sha256=lKenTsnbR91-NeZSvFyWJcmY_8_fKckCuR9oQQVnMXw,18097
+langroid/agent/special/doc_chat_agent.py,sha256=RmYRHclTz7D8dH3eIBFWINm8Cl5zvICt2Ri-6AHRPqc,18227
 langroid/agent/special/recipient_validator_agent.py,sha256=R3Rit93BNWQar_9stuDBGzmLr2W-IYOQ7oq-tlNNlps,6035
 langroid/agent/special/retriever_agent.py,sha256=DeOB5crFjXBvDEZT9k9ZVinOfFM2VgS6tQWWFyXSk9o,7204
 langroid/agent/special/sql/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -16,7 +16,7 @@ langroid/agent/special/sql/utils/description_extractors.py,sha256=GcQ82IhKPInS_3
 langroid/agent/special/sql/utils/populate_metadata.py,sha256=zRjw31a1ZXvpx9bcmbtC2mngdHl-bp1ZNHStcPG8_Qk,2712
 langroid/agent/special/sql/utils/system_message.py,sha256=qKLHkvQWRQodTtPLPxr1GSLUYUFASZU8x-ybV67cB68,1885
 langroid/agent/special/sql/utils/tools.py,sha256=6uB2424SLtmapui9ggcEr0ZTiB6_dL1-JRGgN8RK9Js,1332
-langroid/agent/special/table_chat_agent.py,sha256=PTCE7MmunQj7tFiKAMIh7kvdIeQYU5ceXgBabwsxdg8,7244
+langroid/agent/special/table_chat_agent.py,sha256=2nRGW25WDEbR-ukQjeV3mzsC0qk2gOgl4MnLI6gejTs,7425
 langroid/agent/task.py,sha256=UqbjZP4hiG3yRrPWf-nqIyLtK8i0c3fWUEYKbcZ3n50,28275
 langroid/agent/tool_message.py,sha256=8I59BMkqfH_qpWazhv9_rpPjlaG826vVG5dyJGeOn3o,5936
 langroid/agent/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -40,18 +40,18 @@ langroid/language_models/prompt_formatter/__init__.py,sha256=47DEQpj8HBSa-_TImW-
 langroid/language_models/prompt_formatter/base.py,sha256=2y_GcwhstvB5ih3haS7l5Fv79jVnFJ_vEw1jqWJzB9k,1247
 langroid/language_models/prompt_formatter/llama2_formatter.py,sha256=YdcO88qyBeuMENVIVvVqSYuEpvYSTndUe_jd6hVTko4,2899
 langroid/language_models/utils.py,sha256=rmnSn-sJ3aKl_wBdeLPkck0Li4Ed6zkCxZYYl7n1V34,4668
-langroid/mytypes.py,sha256=_0TnRjIRFvJh1MThFGqtD8hUzq1W3LqzTS_WCFeodzw,1559
+langroid/mytypes.py,sha256=nJyTaX-nAe2dwRdF8NZKxUNAy_Hvxgtkn1c9buT-d14,1619
 langroid/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 langroid/parsing/agent_chats.py,sha256=sbZRV9ujdM5QXvvuHVjIi2ysYSYlap-uqfMMUKulrW0,1068
 langroid/parsing/code-parsing.md,sha256=--cyyNiSZSDlIwcjAV4-shKrSiRe2ytF3AdSoS_hD2g,3294
 langroid/parsing/code_parser.py,sha256=BbDAzp35wkYQ9U1dpf1ARL0lVyi0tfqEc6_eox2C090,3727
 langroid/parsing/json.py,sha256=MVqBUfInALQm1QKbcfEvLzWxBz_UztCIyGk7AK5uFPo,1650
 langroid/parsing/para_sentence_split.py,sha256=AJBzZojP3zpB-_IMiiHismhqcvkrVBQ3ZINoQyx_bE4,2000
-langroid/parsing/parser.py,sha256=6C9rpymc7R-FOILbTgS15IWwX7R4zLM5vU0UQLI-3C0,7430
-langroid/parsing/pdf_parser.py,sha256=RFrck50VvqYl37xzUp-cj4uSC4FDIvJqTwv100Dilgg,1432
-langroid/parsing/repo_loader.py,sha256=ILlvBH-wSvfdLLQKHklAzuxU980_ajts_bz7_9IwtLY,27017
+langroid/parsing/parser.py,sha256=R1yvqjBvXKOcX1opCj5nTYuDK2HLAfkonM055DM5CP8,7826
+langroid/parsing/pdf_parser.py,sha256=0FHrxFXwJ5m9xLQlyKrlRGeApGjkJ3gRk1a3fs1h7AI,5890
+langroid/parsing/repo_loader.py,sha256=lQ_9tceOEiPWATf_SL43iubz05G1XXoF4vINsKJHmkY,27324
 langroid/parsing/table_loader.py,sha256=uqbupGr4y_7os18RtaY5GpD0hWcgzROoNy8dQIHB4kc,1767
-langroid/parsing/url_loader.py,sha256=ROXkdkzFeQ9lxJhaqYu95Eh9nWoZjwj2rlKWapj7p0Q,1839
+langroid/parsing/url_loader.py,sha256=MfYr2zK-1pOMEEc9y_vrMdtC052XMlr57X2ptYKpDX4,2103
 langroid/parsing/url_loader_cookies.py,sha256=Lg4sNpRz9MByWq2mde6T0hKv68VZSV3mtMjNEHuFeSU,2327
 langroid/parsing/urls.py,sha256=_Bcf1iRdT7cQrQ8hnbPX0Jtzxc0lVFaucTS5rJoKA14,3709
 langroid/parsing/utils.py,sha256=zqvZWpZktRJTKx_JAqxaIyoudMdKVdB1zzjnOhVYHS4,2196
@@ -82,7 +82,7 @@ langroid/vector_store/base.py,sha256=QZx3NUNwf2I0r3A7iuoUHIRGbqt_pFGD0hq1R-Yg8iM
 langroid/vector_store/chromadb.py,sha256=s5pQkKjaMP-Tt5A8M10EInFzttaALPbJAq7q4gf0TKg,5235
 langroid/vector_store/qdrant_cloud.py,sha256=3im4Mip0QXLkR6wiqVsjV1QvhSElfxdFSuDKddBDQ-4,188
 langroid/vector_store/qdrantdb.py,sha256=KRvIIj1IZG2zFqejofMnRs2hT86B-27LgBEnuczdqOU,9072
-langroid-0.1.63.dist-info/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
-langroid-0.1.63.dist-info/WHEEL,sha256=vVCvjcmxuUltf8cYhJ0sJMRDLr1XsPuxEId8YDzbyCY,88
-langroid-0.1.63.dist-info/METADATA,sha256=5oka-JCHZrPQdVKsd7gIOXKlKDaDvZ6MWSnO4pmvWLc,35776
-langroid-0.1.63.dist-info/RECORD,,
+langroid-0.1.65.dist-info/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
+langroid-0.1.65.dist-info/WHEEL,sha256=vVCvjcmxuUltf8cYhJ0sJMRDLr1XsPuxEId8YDzbyCY,88
+langroid-0.1.65.dist-info/METADATA,sha256=B5O6tKm5lGdBFcm6Npepa4nYHHZ8zMq0VNkUxSOEo1g,35776
+langroid-0.1.65.dist-info/RECORD,,